In [1072]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,confusion_matrix

In [1073]:
trainData = pd.read_csv("dataset/my_features.csv")
data_cols = np.array(trainData.columns)
data_cols

array(['letter_slant', 'line_slant', 'letter_size', 'margin_slope',
       'word_spacing', 'personality'], dtype=object)

In [1074]:
trainData.head()

Unnamed: 0,letter_slant,line_slant,letter_size,margin_slope,word_spacing,personality
0,backward,upperside,1307.2,straight,small,Agreeableness
1,backward,upperside,932.0,right,small,Agreeableness
2,forward,upperside,891.4,straight,small,Agreeableness
3,forward,upperside,279.6,straight,small,Agreeableness
4,vertical,upperside,766.0,straight,small,Agreeableness


In [1075]:
letter_slant_mapping = {'backward': -1, 'forward': 1, 'vertical': 0}
line_slant_mapping = {'lowerside': -1, 'baseline': 0, 'upperside': 1}
margin_slope_mapping = {'left': -1, 'straight': 0, 'right': 1}
word_spacing_mapping = {'small': -1, 'medium': 0, 'large': 1}

In [1076]:
trainData["letter_slant"] = trainData["letter_slant"].map(letter_slant_mapping)
trainData["line_slant"] = trainData["line_slant"].map(line_slant_mapping)
trainData["margin_slope"] = trainData["margin_slope"].map(margin_slope_mapping)
trainData["word_spacing"] = trainData["word_spacing"].map(word_spacing_mapping)

In [1077]:
trainData.head()

Unnamed: 0,letter_slant,line_slant,letter_size,margin_slope,word_spacing,personality
0,-1,1,1307.2,0,-1,Agreeableness
1,-1,1,932.0,1,-1,Agreeableness
2,1,1,891.4,0,-1,Agreeableness
3,1,1,279.6,0,-1,Agreeableness
4,0,1,766.0,0,-1,Agreeableness


In [1078]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   letter_slant  221 non-null    int64  
 1   line_slant    221 non-null    int64  
 2   letter_size   221 non-null    float64
 3   margin_slope  221 non-null    int64  
 4   word_spacing  221 non-null    int64  
 5   personality   221 non-null    object 
dtypes: float64(1), int64(4), object(1)
memory usage: 10.5+ KB


In [1079]:
trainData.isnull().sum()

letter_slant    0
line_slant      0
letter_size     0
margin_slope    0
word_spacing    0
personality     0
dtype: int64

In [1080]:
trainData = trainData.dropna(axis=0)
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   letter_slant  221 non-null    int64  
 1   line_slant    221 non-null    int64  
 2   letter_size   221 non-null    float64
 3   margin_slope  221 non-null    int64  
 4   word_spacing  221 non-null    int64  
 5   personality   221 non-null    object 
dtypes: float64(1), int64(4), object(1)
memory usage: 10.5+ KB


In [1081]:
aggreableness_data = trainData[trainData["personality"] == "Agreeableness"]
conscientiousness_data = trainData[trainData["personality"] == "Conscientiousness"]
extraversion_data = trainData[trainData["personality"] == "Extraversion"]
neuroticism_data = trainData[trainData["personality"] == "Neuroticism"]
openness_data = trainData[trainData["personality"] == "Openness"]

In [1082]:
aggreableness_x = aggreableness_data.drop(data_cols[-1], axis=1)
aggreableness_y = aggreableness_data[data_cols[-1]]

conscientiousness_x = conscientiousness_data.drop(data_cols[-1], axis=1)
conscientiousness_y = conscientiousness_data[data_cols[-1]]

extraversion_x = extraversion_data.drop(data_cols[-1], axis=1)
extraversion_y = extraversion_data[data_cols[-1]]

neuroticism_x = neuroticism_data.drop(data_cols[-1], axis=1)
neuroticism_y = neuroticism_data[data_cols[-1]]

openness_x = openness_data.drop(data_cols[-1], axis=1)
openness_y = openness_data[data_cols[-1]]

In [1083]:
test_ratio = 0.1

aggreableness_x_train, aggreableness_x_test, aggreableness_y_train, aggreableness_y_test = train_test_split(aggreableness_x, aggreableness_y, test_size=test_ratio, shuffle=True)

conscientiousness_x_train, conscientiousness_x_test, conscientiousness_y_train, conscientiousness_y_test = train_test_split(conscientiousness_x, conscientiousness_y, test_size=test_ratio, shuffle=True)

extraversion_x_train, extraversion_x_test, extraversion_y_train, extraversion_y_test = train_test_split(extraversion_x, extraversion_y, test_size=test_ratio, shuffle=True)

neuroticism_x_train, neuroticism_x_test, neuroticism_y_train, neuroticism_y_test = train_test_split(neuroticism_x, neuroticism_y, test_size=test_ratio, shuffle=True)

openness_x_train, openness_x_test, openness_y_train, openness_y_test = train_test_split(openness_x, openness_y, test_size=test_ratio, shuffle=True)

x_train = pd.concat([aggreableness_x_train, conscientiousness_x_train, extraversion_x_train, neuroticism_x_train, openness_x_train], ignore_index=True)
y_train = pd.concat([aggreableness_y_train, conscientiousness_y_train, extraversion_y_train, neuroticism_y_train, openness_y_train], ignore_index=True)

x_test = pd.concat([aggreableness_x_test, conscientiousness_x_test, extraversion_x_test, neuroticism_x_test, openness_x_test], ignore_index=True)
y_test = pd.concat([aggreableness_y_test, conscientiousness_y_test, extraversion_y_test, neuroticism_y_test, openness_y_test], ignore_index=True)

In [1084]:
print(f"{x_train.shape, x_test.shape, y_train.shape, y_test.shape}")
print(f"Aggreableness: {aggreableness_x.shape, aggreableness_x_train.shape, aggreableness_x_test.shape}")
print(f"Conscientiousness: {conscientiousness_x.shape, conscientiousness_x_train.shape, conscientiousness_x_test.shape}")
print(f"Extraversion: {extraversion_x.shape, extraversion_x_train.shape, extraversion_x_test.shape}")
print(f"Neuroticism: {neuroticism_x.shape, neuroticism_x_train.shape, neuroticism_x_test.shape}")
print(f"Openness: {openness_x.shape, openness_x_train.shape, openness_x_test.shape}")

((197, 5), (24, 5), (197,), (24,))
Aggreableness: ((38, 5), (34, 5), (4, 5))
Conscientiousness: ((35, 5), (31, 5), (4, 5))
Extraversion: ((10, 5), (9, 5), (1, 5))
Neuroticism: ((44, 5), (39, 5), (5, 5))
Openness: ((94, 5), (84, 5), (10, 5))


In [1085]:
accuracies = []
for i in range(1, x_train.shape[0] + 1):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    accuracies.append(accuracy_score(y_test, y_pred))

In [1086]:
accuracies

[0.16666666666666666,
 0.20833333333333334,
 0.25,
 0.25,
 0.25,
 0.2916666666666667,
 0.25,
 0.2916666666666667,
 0.3333333333333333,
 0.2916666666666667,
 0.3333333333333333,
 0.3333333333333333,
 0.3333333333333333,
 0.375,
 0.4166666666666667,
 0.4583333333333333,
 0.5416666666666666,
 0.5833333333333334,
 0.5416666666666666,
 0.5416666666666666,
 0.5,
 0.5,
 0.5,
 0.375,
 0.4583333333333333,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4583333333333333,
 0.4166666666666667,
 0.4583333333333333,
 0.4583333333333333,
 0.4583333333333333,
 0.4583333333333333,
 0.4166666666666667,
 0.4583333333333333,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.4166666666666667,
 0.416666666

In [1087]:
max_accuracy = max(accuracies)
max_accuracy

0.5833333333333334

In [1088]:
max_index = accuracies.index(max_accuracy)

In [1089]:
# k = int(input("Enter number of neighbours: "))
knn = KNeighborsClassifier(n_neighbors=max_index+1)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)
y_pred

array(['Openness', 'Openness', 'Openness', 'Agreeableness',
       'Conscientiousness', 'Conscientiousness', 'Neuroticism',
       'Openness', 'Openness', 'Agreeableness', 'Openness', 'Neuroticism',
       'Neuroticism', 'Openness', 'Openness', 'Openness', 'Openness',
       'Openness', 'Openness', 'Openness', 'Openness',
       'Conscientiousness', 'Openness', 'Openness'], dtype=object)

In [1090]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5833333333333334

In [1091]:
import joblib
import os
joblib.dump(knn, "./saved_models/knn1.joblib", compress=3)
print(f"Saved model size: {np.round(os.path.getsize('./saved_models/knn1.joblib') / 1024 / 1024, 3) } MB")

Saved model size: 0.003 MB


In [1092]:
loaded_knn = joblib.load("./saved_models/knn1.joblib")
loaded_predict = loaded_knn.predict(x_test)
accuracy_score(y_test, loaded_predict)

0.5833333333333334