# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report

# Importing balanced Data

In [2]:
data = pd.read_csv('./MIES_Dev_Data/cleaned_data.csv', index_col = 'Unnamed: 0')

In [3]:
data.head()

Unnamed: 0,Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,...,Q85A,Q86A,Q87A,Q88A,Q89A,Q90A,Q91A,gender,age,IE
0,5,3,1,2,3,2,3,3,4,5,...,1,4,2,5,4,3,3,2,23,3
1,5,5,1,5,2,2,5,2,1,3,...,2,1,3,4,4,4,3,1,25,2
2,3,4,5,3,4,5,5,5,5,5,...,5,4,5,3,2,1,1,1,19,1
3,5,2,1,1,5,5,5,4,4,2,...,5,3,5,4,4,3,3,1,23,1
4,1,2,1,1,3,3,5,1,3,4,...,1,3,1,2,5,5,5,1,18,2


## Converting labels to \[0, numclass)  

In [4]:
data['IE'] = data['IE'].apply(lambda x : x % 3)

## Train Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['IE'], axis = 1), data['IE'], test_size = 0.3)
train = xgb.DMatrix(X_train, label = y_train)
test = xgb.DMatrix(X_test, label = y_test)

## XGBoost

In [6]:
model = xgb.train({ 'eta' : 0.2, 'objective' : 'multi:softmax', 'num_class' : 3 }, train, 100)

In [7]:
predictions = model.predict(test)

## Metric Analysis

In [8]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.78      0.80      1364
           1       0.83      0.86      0.84      1292
           2       0.92      0.93      0.93      1304

    accuracy                           0.86      3960
   macro avg       0.86      0.86      0.86      3960
weighted avg       0.86      0.86      0.86      3960



In [9]:
cm = pd.DataFrame(confusion_matrix(y_test, predictions), index = ['True_Introvert', 'True_Extravert', 'True_Ambivert'], columns = ['Predicted_Introvert', 'Predicted_Extravert', 'Predicted_Ambivert'])
cm

Unnamed: 0,Predicted_Introvert,Predicted_Extravert,Predicted_Ambivert
True_Introvert,1063,209,92
True_Extravert,161,1114,17
True_Ambivert,60,27,1217


### Most of the results are self-explanatory through the metrics, let us focus on blunders such as prediction results to an introvert for an extraverted data & vice versa

In [10]:
print("Probability of predicting as introvert for an extravert data : ", end = "")
print(cm['Predicted_Introvert']['True_Extravert'] / cm.loc['True_Extravert'].sum())

Probability of predicting as introvert for an extravert data : 0.12461300309597523


In [11]:
print("Probability of predicting as extravert for an introvert data : ", end = "")
print(cm['Predicted_Extravert']['True_Introvert'] / cm.loc['True_Introvert'].sum())

Probability of predicting as extravert for an introvert data : 0.1532258064516129


### Although we receive a good overall accuracy, the edge case predictions are blunders!