# Importing packages

In [1]:
import pandas as pd
import numpy as np

# Loading Dataset

In [2]:
# Load the feature table from CSV and preview the first rows.
features_path = "./dataset/features.csv"
df = pd.read_csv(features_path)
df.head()

Unnamed: 0,base_line,letter_size,line_spacing,word_spacing,top_margin,pen_pressure,slant_of_letters,labels
0,-0.39,5.7,12.29,4.37,2.46,171.38,-15,Openness
1,0.05,3.07,17.29,2.89,1.74,194.71,-15,Conscientiousness
2,-1.1,2.02,16.36,3.14,1.65,170.29,-15,Agreeableness
3,-0.01,1.91,15.73,3.32,1.6,165.56,180,Neuroticism
4,0.0,2.1,15.7,3.25,1.69,171.55,-15,Agreeableness


### checking dataset information

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(5, 8)

In [4]:
# Summarise each column: non-null count and dtype, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   base_line         5 non-null      float64
 1   letter_size       5 non-null      float64
 2   line_spacing      5 non-null      float64
 3   word_spacing      5 non-null      float64
 4   top_margin        5 non-null      float64
 5   pen_pressure      5 non-null      float64
 6   slant_of_letters  5 non-null      int64  
 7   labels            5 non-null      object 
dtypes: float64(6), int64(1), object(1)
memory usage: 448.0+ bytes


### Showing the number of null values in each feature (column)

In [5]:
# Count missing entries per column.
df.isna().sum()

base_line           0
letter_size         0
line_spacing        0
word_spacing        0
top_margin          0
pen_pressure        0
slant_of_letters    0
labels              0
dtype: int64

### Dropping rows that contain null values

In [6]:
# Rebind `df` to the frame returned by dropna() rather than mutating it in place,
# so the object shown by earlier cells is not silently changed; then re-count the
# nulls per column to confirm none remain.
df = df.dropna()
df.isnull().sum()

base_line           0
letter_size         0
line_spacing        0
word_spacing        0
top_margin          0
pen_pressure        0
slant_of_letters    0
labels              0
dtype: int64

# Feature Engineering

In [7]:
# Feature matrix: every column except the target.
X = df.drop(columns=["labels"])
# Target vector: the "labels" column.
Y = df["labels"]

In [8]:
# Preview the feature matrix (the "labels" column has been removed).
X.head()

Unnamed: 0,base_line,letter_size,line_spacing,word_spacing,top_margin,pen_pressure,slant_of_letters
0,-0.39,5.7,12.29,4.37,2.46,171.38,-15
1,0.05,3.07,17.29,2.89,1.74,194.71,-15
2,-1.1,2.02,16.36,3.14,1.65,170.29,-15
3,-0.01,1.91,15.73,3.32,1.6,165.56,180
4,0.0,2.1,15.7,3.25,1.69,171.55,-15


In [9]:
# Preview the target labels.
Y.head()

0             Openness
1    Conscientiousness
2        Agreeableness
3          Neuroticism
4        Agreeableness
Name: labels, dtype: object

# Split train and test data

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# partition the same on every run.
partitions = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)
x_train, x_test, y_train, y_test = partitions

In [13]:
# Print the shape of each partition, in the order: train features, train labels,
# test features, test labels.
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

(3, 7)
(3,)
(2, 7)
(2,)


# Training Random Forest Classifier

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters for the forest: 100 estimators, Gini criterion, fixed seed.
forest_params = {"n_estimators": 100, "criterion": "gini", "random_state": 42}
rf = RandomForestClassifier(**forest_params)
# Fit on the training partition; kept as the last expression so the fitted
# estimator is displayed as the cell result.
rf.fit(x_train, y_train)

In [29]:
# Predict a label for every row of the held-out test partition and display them.
y_pred = rf.predict(x_test)
y_pred

array(['Agreeableness', 'Agreeableness'], dtype=object)

In [30]:
# NOTE(review): confusion_matrix is imported here but not used in the visible cells.
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the predictions against the held-out true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.5

In [31]:
# Importance scores from the fitted forest, one value per input column
# (presumably in the column order of x_train; verify before labelling them).
rf.feature_importances_

array([0.18823529, 0.19411765, 0.14117647, 0.10588235, 0.1       ,
       0.17647059, 0.09411765])