In [61]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

In [62]:
# Load the feature table from disk and keep its column labels as a NumPy
# array; later cells index this array with [-1] to pick the label column.
trainData = pd.read_csv("dataset/features.csv")
data_cols = np.array(trainData.columns)
data_cols

array(['letter_slant', 'line_slant', 'margin_slope', 'letter_size',
       'word_spacing', 'personality'], dtype=object)

In [63]:
# Preview the first rows of the loaded table as a sanity check.
trainData.head()

Unnamed: 0,letter_slant,line_slant,margin_slope,letter_size,word_spacing,personality
0,2.0,-75.1,4.1,931.7,15.0,Agreeableness
1,-11.0,9.6,15.3,486.2,22.0,Agreeableness
2,18.0,-2.3,1.0,112.8,1.0,Agreeableness
3,6.0,-4.0,-1.1,30.6,1.0,Agreeableness
4,-6.0,0.0,2.2,152.9,9.0,Agreeableness


In [64]:
# Print the per-column dtype and non-null count summary.
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   letter_slant  221 non-null    float64
 1   line_slant    221 non-null    float64
 2   margin_slope  221 non-null    float64
 3   letter_size   221 non-null    float64
 4   word_spacing  221 non-null    float64
 5   personality   221 non-null    object 
dtypes: float64(5), object(1)
memory usage: 10.5+ KB


In [65]:
# Count missing values per column (isna is the canonical name for isnull).
trainData.isna().sum()

letter_slant    0
line_slant      0
margin_slope    0
letter_size     0
word_spacing    0
personality     0
dtype: int64

In [66]:
# Discard every row that contains a missing value (row-wise is dropna's
# default axis), then re-print the schema summary to confirm the result.
trainData = trainData.dropna()
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   letter_slant  221 non-null    float64
 1   line_slant    221 non-null    float64
 2   margin_slope  221 non-null    float64
 3   letter_size   221 non-null    float64
 4   word_spacing  221 non-null    float64
 5   personality   221 non-null    object 
dtypes: float64(5), object(1)
memory usage: 10.5+ KB


In [67]:
# The last column is the class label; everything before it is a feature.
x = trainData.drop(columns=data_cols[-1])
y = trainData[data_cols[-1]]

In [68]:
# Display the feature matrix (label column removed).
x

Unnamed: 0,letter_slant,line_slant,margin_slope,letter_size,word_spacing
0,2.0,-75.1,4.1,931.7,15.0
1,-11.0,9.6,15.3,486.2,22.0
2,18.0,-2.3,1.0,112.8,1.0
3,6.0,-4.0,-1.1,30.6,1.0
4,-6.0,0.0,2.2,152.9,9.0
...,...,...,...,...,...
216,11.0,-6.3,-6.2,190.4,9.0
217,5.0,-4.0,25.5,71.5,20.0
218,5.0,-1.1,-0.2,55.7,50.0
219,13.0,-5.9,-20.1,187.8,62.0


In [69]:
# Display the target labels that the classifier will learn to predict.
y

0      Agreeableness
1      Agreeableness
2      Agreeableness
3      Agreeableness
4      Agreeableness
           ...      
216         Openness
217         Openness
218         Openness
219         Openness
220         Openness
Name: personality, Length: 221, dtype: object

In [70]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation.
# random_state is pinned so the shuffle -- and therefore the membership of the
# small test set and any score computed from it -- is identical on every
# Restart & Run All instead of changing from run to run.
# NOTE(review): consider adding stratify=y so the test set keeps the class
# proportions; confirm first that every class has at least two samples.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.10,
    shuffle=True,
    random_state=42,
)

In [71]:
# Fit a decision tree with default hyper-parameters on the training split.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split and breaks ties between equally good splits at random, so an
# unseeded tree (and its feature_importances_) can differ between runs even
# on identical training data.
dtree = DecisionTreeClassifier(random_state=42)
dtree_res = dtree.fit(x_train, y_train)  # fit() returns the estimator itself

In [72]:
# Predict a class label for every held-out row and print the raw label array.
y_pred = dtree.predict(x_test)
print(y_pred)

['Neuroticism' 'Openness' 'Neuroticism' 'Openness' 'Openness' 'Openness'
 'Openness' 'Conscientiousness' 'Openness' 'Openness' 'Conscientiousness'
 'Conscientiousness' 'Openness' 'Openness' 'Agreeableness' 'Neuroticism'
 'Agreeableness' 'Conscientiousness' 'Openness' 'Openness' 'Openness'
 'Openness' 'Conscientiousness']


In [73]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.391304347826087

In [74]:
# Show the fitted tree's per-feature importance scores, one value per column
# of the matrix passed to fit().
dtree.feature_importances_

array([0.20688717, 0.25498754, 0.17241342, 0.20512184, 0.16059003])

In [75]:
import joblib
import os

# Persist the fitted tree to disk with joblib compression level 3.
# joblib.dump does not create missing parent directories, so the target
# folder is created first; otherwise a fresh checkout without ./saved_models
# fails with FileNotFoundError.
model_path = "./saved_models/decision_tree.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(dtree, model_path, compress=3)

# Report the on-disk size of the saved artifact in mebibytes (bytes / 1024 / 1024).
model_size_mb = os.path.getsize(model_path) / 1024 / 1024
print(f"Saved model size: {np.round(model_size_mb, 3)} MB")

Saved model size: 0.003 MB
