# Package import

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,confusion_matrix, ConfusionMatrixDisplay

# Reading dataset

In [None]:
# Load both feature files and stack them into a single frame; the train/test
# partition is rebuilt further down in this notebook.
# Forward slashes are accepted on Windows as well as POSIX systems, whereas
# backslash-separated paths only resolve on Windows.
# ignore_index=True gives every row a unique label instead of repeating each
# file's own 0..n-1 index.
df = pd.concat(
    [
        pd.read_csv("dataset/my_test_features.csv"),
        pd.read_csv("dataset/my_train_features.csv"),
    ],
    ignore_index=True,
)

In [None]:
# Preview the first rows of the combined raw data (shown as the cell output).
df.head()

# Data encoding

In [None]:
# Numeric codes for the categorical handwriting features: every category of a
# feature is encoded as -1, 0 or 1.
letter_slant_mapping = {
    "backward": -1,
    "forward": 1,
    "vertical": 0,
}
line_slant_mapping = {
    "lowerside": -1,
    "baseline": 0,
    "upperside": 1,
}
margin_slope_mapping = {
    "left": -1,
    "straight": 0,
    "right": 1,
}
word_spacing_mapping = {
    "small": -1,
    "medium": 0,
    "large": 1,
}

In [None]:
# Replace each categorical text column with its numeric code.
# Series.map turns any value that is missing from the mapping into NaN.
column_encodings = {
    "letter_slant": letter_slant_mapping,
    "line_slant": line_slant_mapping,
    "margin_slope": margin_slope_mapping,
    "word_spacing": word_spacing_mapping,
}
for column_name, encoding in column_encodings.items():
    df[column_name] = df[column_name].map(encoding)

In [None]:
# Preview the frame again to inspect the encoded columns.
df.head()

# Data cleaning

## Removing null values

In [None]:
# Print the frame summary (column dtypes and non-null counts) before cleaning.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Drop every row that contains at least one missing value, modifying df in place.
df.dropna(inplace=True)

In [None]:
# Print the frame summary again to confirm the result of dropping rows.
df.info()

In [None]:
# X = df.drop("personality",axis=1)
# y = df["personality"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)
# print(f"{X_train.shape, y_train.shape, X_test.shape, y_test.shape}")

## Splitting the data into a separate dataframe for each personality

In [None]:
# Slice the cleaned frame into one sub-frame per personality label so that
# each class can be handled on its own.
personality_labels = df["personality"]

aggreableness_data = df[personality_labels == "Agreeableness"]
conscientiousness_data = df[personality_labels == "Conscientiousness"]
extraversion_data = df[personality_labels == "Extraversion"]
openness_data = df[personality_labels == "Openness"]
neuroticism_data = df[personality_labels == "Neuroticism"]

## Splitting features & labels for each per-personality dataframe

In [None]:
# For every per-class frame, separate the feature columns (x) from the
# "personality" label column (y).
aggreableness_y = aggreableness_data["personality"]
aggreableness_x = aggreableness_data.drop(columns="personality")

conscientiousness_y = conscientiousness_data["personality"]
conscientiousness_x = conscientiousness_data.drop(columns="personality")

extraversion_y = extraversion_data["personality"]
extraversion_x = extraversion_data.drop(columns="personality")

openness_y = openness_data["personality"]
openness_x = openness_data.drop(columns="personality")

neuroticism_y = neuroticism_data["personality"]
neuroticism_x = neuroticism_data.drop(columns="personality")

## Splitting each dataframe into train & test data at 9:1
Each class is split separately so that every class contributes train and test samples at the same 9:1 ratio and no class is left out of either set by the random split.

In [None]:
# Split every class 90/10 on its own. The shuffle is seeded so that re-running
# the notebook reproduces the same partition (and therefore the same metrics
# and saved train/test files), in line with the seeded random forest below.
split_seed = 42

aggreableness_x_train, aggreableness_x_test, aggreableness_y_train, aggreableness_y_test = train_test_split(
    aggreableness_x, aggreableness_y, test_size=0.1, shuffle=True, random_state=split_seed
)

conscientiousness_x_train, conscientiousness_x_test, conscientiousness_y_train, conscientiousness_y_test = train_test_split(
    conscientiousness_x, conscientiousness_y, test_size=0.1, shuffle=True, random_state=split_seed
)

extraversion_x_train, extraversion_x_test, extraversion_y_train, extraversion_y_test = train_test_split(
    extraversion_x, extraversion_y, test_size=0.1, shuffle=True, random_state=split_seed
)

openness_x_train, openness_x_test, openness_y_train, openness_y_test = train_test_split(
    openness_x, openness_y, test_size=0.1, shuffle=True, random_state=split_seed
)

neuroticism_x_train, neuroticism_x_test, neuroticism_y_train, neuroticism_y_test = train_test_split(
    neuroticism_x, neuroticism_y, test_size=0.1, shuffle=True, random_state=split_seed
)

## Merging the per-class train & test data into a single train set and a single test set

In [None]:
# Stack the per-class pieces back together. The class order is the same in all
# four lists so that the rows of X_* and y_* stay aligned.
train_feature_parts = [
    aggreableness_x_train,
    conscientiousness_x_train,
    extraversion_x_train,
    neuroticism_x_train,
    openness_x_train,
]
train_label_parts = [
    aggreableness_y_train,
    conscientiousness_y_train,
    extraversion_y_train,
    neuroticism_y_train,
    openness_y_train,
]
test_feature_parts = [
    aggreableness_x_test,
    conscientiousness_x_test,
    extraversion_x_test,
    neuroticism_x_test,
    openness_x_test,
]
test_label_parts = [
    aggreableness_y_test,
    conscientiousness_y_test,
    extraversion_y_test,
    neuroticism_y_test,
    openness_y_test,
]

X_train = pd.concat(train_feature_parts)
y_train = pd.concat(train_label_parts)
X_test = pd.concat(test_feature_parts)
y_test = pd.concat(test_label_parts)

In [None]:
# Show the shapes of the merged train and test sets as one tuple.
split_shapes = (X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(f"{split_shapes}")

# Training Models

## Random Forest

In [None]:
# Fit a seeded 100-tree random forest (Gini criterion) on the training set;
# fit() returns the estimator itself, so it can be chained.
rf = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out test set; the trailing expression displays the result
# as the cell output.
y_pred_rf = rf.predict(X_test)
y_pred_rf

## KNN

### Finding optimum number of neighbors (k)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Choose k by stratified 5-fold cross-validation on the training data only.
# Scoring the candidates against X_test would tune the model on the very data
# used for the final evaluation and make the reported test accuracy
# optimistically biased.
cv_splitter = StratifiedKFold(n_splits=5)

# Each CV fit only sees the samples outside its validation fold, and
# KNeighborsClassifier requires n_neighbors <= number of fitted samples, so
# the largest usable k is the size of the smallest fitting subset.
smallest_fit_size = min(
    len(fit_indices) for fit_indices, _ in cv_splitter.split(X_train, y_train)
)

accuracies = []
for k in range(1, smallest_fit_size + 1):
    fold_scores = cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train,
        y_train,
        cv=cv_splitter,
        scoring="accuracy",
    )
    # accuracies[k - 1] holds the mean validation accuracy for this k.
    accuracies.append(fold_scores.mean())

# max() followed by index() keeps the smallest k when several candidates tie.
max_accuracy_knn = max(accuracies)
max_index_knn = accuracies.index(max_accuracy_knn)
print(f"Best cross-validated accuracy {round(max_accuracy_knn * 100, 2)}% for KNN with k = {max_index_knn + 1}")

### Training the KNN model

In [None]:
# The accuracy list starts at k = 1, so the best k is its best index plus one.
best_k = max_index_knn + 1

# Fit the final KNN model with that k and predict the test set; the trailing
# expression displays the predictions as the cell output.
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
y_pred_knn

# Output

## Accuracy score

### Accuracy for Random Forest

In [None]:
# Test-set accuracy of the random forest, as a percentage rounded to 2 decimals.
rf_accuracy_pct = round(accuracy_score(y_test, y_pred_rf) * 100, 2)
print(f"Accuracy for Random Forest: {rf_accuracy_pct}%")

### Accuracy for KNN

In [None]:
# Test-set accuracy of the KNN model, as a percentage rounded to 2 decimals.
knn_accuracy_pct = round(accuracy_score(y_test, y_pred_knn) * 100, 2)
print(f"Accuracy for KNN: {knn_accuracy_pct}%")

## Confusion Matrix

In [None]:
import matplotlib.pyplot as plt

# Pin the row/column order of each matrix to the model's classes_. The same
# array is passed as display_labels below, so the tick labels always line up
# with the matrix, even if a class is absent from both y_test and the
# predictions.
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=rf.classes_)
cm_knn = confusion_matrix(y_test, y_pred_knn, labels=knn.classes_)

# One row, two panels: random forest on the left, KNN on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

disp1 = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=rf.classes_)
disp1.plot(ax=ax1, xticks_rotation="vertical")
ax1.set_title(f"Random Forest (Accuracy: {round(accuracy_score(y_test, y_pred_rf) * 100, 2)}%)")

disp2 = ConfusionMatrixDisplay(confusion_matrix=cm_knn, display_labels=knn.classes_)
disp2.plot(ax=ax2, xticks_rotation="vertical")
ax2.set_title(f"KNN (Accuracy: {round(accuracy_score(y_test, y_pred_knn) * 100, 2)}%)")

fig.suptitle("Confusion matrices")
plt.tight_layout()
plt.show()


## Assessment metrics

In [None]:
from sklearn.metrics import classification_report

### Random Forest

In [None]:
# Per-class precision/recall/F1 for the random forest, requested as a dict and
# turned into a table with one row per report entry.
report1 = classification_report(y_test, y_pred_rf, output_dict=True)
report1_df = pd.DataFrame(report1).T
report1_df

### KNN

In [None]:
# Per-class precision/recall/F1 for the KNN model, requested as a dict and
# turned into a table with one row per report entry.
report2 = classification_report(y_test, y_pred_knn, output_dict=True)
report2_df = pd.DataFrame(report2).T
report2_df

# Saving the trained models & the current train/test split for later use

In [None]:
import joblib
import os

# All artefacts of this run go into one directory. Create it up front:
# neither joblib.dump nor DataFrame.to_csv creates missing parent directories,
# so saving would otherwise fail on a fresh checkout.
save_dir = "./saved_models/type2"
os.makedirs(save_dir, exist_ok=True)

# Persist both fitted models (compressed) and report their on-disk size in MB.
rf_path = f"{save_dir}/rf.joblib"
joblib.dump(rf, rf_path, compress=3)
print(f"Random forest model saved: {np.round(os.path.getsize(rf_path) / 1024 / 1024, 2) } MB")

knn_path = f"{save_dir}/knn.joblib"
joblib.dump(knn, knn_path, compress=3)
print(f"KNN model saved: {np.round(os.path.getsize(knn_path) / 1024 / 1024, 2) } MB")

# Store the exact split used above (features with the label as last column)
# so that the saved models can be re-evaluated on the same data later.
df_train_save = pd.concat([X_train, y_train], axis=1)
df_train_save.to_csv(f"{save_dir}/train.csv", index=False)
df_test_save = pd.concat([X_test, y_test], axis=1)
df_test_save.to_csv(f"{save_dir}/test.csv", index=False)