# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.metrics import confusion_matrix, classification_report

# Importing Cleaned Data

In [3]:
# Load the cleaned survey data, using the 'Unnamed: 0' column as the DataFrame index.
data = pd.read_csv(
    './MIES_Dev_Data/cleaned_data.csv',
    index_col='Unnamed: 0',
)

In [4]:
# Preview the first five rows to sanity-check the load.
data.head(n = 5)

Unnamed: 0,Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,...,Q85A,Q86A,Q87A,Q88A,Q89A,Q90A,Q91A,gender,age,IE
0,5,3,1,2,3,2,3,3,4,5,...,1,4,2,5,4,3,3,2,23,3
1,5,5,1,5,2,2,5,2,1,3,...,2,1,3,4,4,4,3,1,25,2
2,3,4,5,3,4,5,5,5,5,5,...,5,4,5,3,2,1,1,1,19,1
3,5,2,1,1,5,5,5,4,4,2,...,5,3,5,4,4,3,3,1,23,1
4,1,2,1,1,3,3,5,1,3,4,...,1,3,1,2,5,5,5,1,18,2


## A quick glance at the average score

In [9]:
# 5-fold cross-validation of a 500-tree random forest, predicting `IE` from all
# other columns. The forest is seeded so the fold scores are reproducible after
# Restart Kernel -> Run All (an unseeded forest gives different scores each run).
# NOTE(review): an earlier run showed fold scores ranging from ~0.74 to ~0.90;
# the rows may be ordered in some way. Consider a shuffled StratifiedKFold if
# that ordering is not meaningful - confirm against how the CSV was produced.
cvs = cross_val_score(
    RandomForestClassifier(n_estimators = 500, random_state = 42),
    data.drop(['IE'], axis = 1),
    data['IE'],
    cv = 5,
)
print("Cross Validation Scores :", cvs)
print("Average Score :", cvs.mean())

Cross Validation Scores : [0.73901515 0.78181818 0.88707844 0.90337249 0.90261463]
Average Score : 0.8427797776935708


### Now let us hold out a random test set and get the predictions on it for computing the metrics

In [10]:
# Hold out 30% of the rows as a test set for the metric analysis below.
# random_state makes the split repeatable on a fresh kernel; stratify keeps the
# `IE` class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['IE'], axis = 1),
    data['IE'],
    test_size = 0.3,
    random_state = 42,
    stratify = data['IE'],
)

In [11]:
# Train a 500-tree random forest on the training split.
# The seed fixes the bootstrap samples and feature sub-sampling, so the fitted
# model (and every metric derived from it) is the same on each re-run.
rf = RandomForestClassifier(n_estimators = 500, random_state = 42)
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500)

In [12]:
# Predicted `IE` class for every row of the held-out test split.
predictions = rf.predict(X = X_test)

## Metric Analysis

In [13]:
# Per-class precision / recall / F1 on the held-out test split.
report_text = classification_report(y_true = y_test, y_pred = predictions)
print(report_text)

              precision    recall  f1-score   support

           1       0.83      0.86      0.85      1313
           2       0.91      0.92      0.91      1271
           3       0.82      0.78      0.80      1376

    accuracy                           0.85      3960
   macro avg       0.85      0.85      0.85      3960
weighted avg       0.85      0.85      0.85      3960



In [14]:
# Confusion matrix as a labelled DataFrame (rows = true class, columns = predicted).
# The class codes are passed to confusion_matrix explicitly so each row/column
# name is tied to its code rather than to the implicit sorted order of whatever
# labels happen to appear in y_test / predictions.
# NOTE(review): assumes IE codes 1 / 2 / 3 mean Introvert / Extravert / Ambivert,
# which is the order the original hard-coded names relied on - confirm against
# the dataset codebook.
class_names = {1: 'Introvert', 2: 'Extravert', 3: 'Ambivert'}
cm = pd.DataFrame(
    confusion_matrix(y_test, predictions, labels = list(class_names)),
    index = ['True_' + name for name in class_names.values()],
    columns = ['Predicted_' + name for name in class_names.values()],
)
cm

Unnamed: 0,Predicted_Introvert,Predicted_Extravert,Predicted_Ambivert
True_Introvert,1134,22,157
True_Extravert,24,1172,75
True_Ambivert,205,99,1072


### Most of the results are self-explanatory from the metrics; let us focus on blunders such as predicting introvert for extraverted data & vice versa

In [15]:
# Fraction of truly extraverted test rows that were predicted as introvert:
# the (True_Extravert, Predicted_Introvert) count divided by the row total.
print("Probability of predicting as introvert for an extravert data : ", end = "")
extravert_row = cm.loc['True_Extravert']
print(extravert_row['Predicted_Introvert'] / extravert_row.sum())

Probability of predicting as introvert for an extravert data : 0.01888276947285602


In [16]:
# Fraction of truly introverted test rows that were predicted as extravert:
# the (True_Introvert, Predicted_Extravert) count divided by the row total.
print("Probability of predicting as extravert for an introvert data : ", end = "")
introvert_row = cm.loc['True_Introvert']
print(introvert_row['Predicted_Extravert'] / introvert_row.sum())

Probability of predicting as extravert for an introvert data : 0.016755521706016754


### As we can see, the predictions are balanced & much more accurate compared to Logistic Regression