# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report

# Importing Cleaned Data

In [2]:
data = pd.read_csv('./MIES_Dev_Data/cleaned_data.csv', index_col = 'Unnamed: 0')

In [3]:
data.head()

Unnamed: 0,Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,...,Q85A,Q86A,Q87A,Q88A,Q89A,Q90A,Q91A,gender,age,IE
0,5,3,1,2,3,2,3,3,4,5,...,1,4,2,5,4,3,3,2,23,3
1,5,5,1,5,2,2,5,2,1,3,...,2,1,3,4,4,4,3,1,25,2
2,3,4,5,3,4,5,5,5,5,5,...,5,4,5,3,2,1,1,1,19,1
3,5,2,1,1,5,5,5,4,4,2,...,5,3,5,4,4,3,3,1,23,1
4,1,2,1,1,3,3,5,1,3,4,...,1,3,1,2,5,5,5,1,18,2


# A quick glance at the average score

In [4]:
cvs = cross_val_score(LogisticRegression(max_iter = 4000), data.drop(['IE'], axis = 1), data['IE'], cv = 5)
print("Cross Validation Scores :", cvs)
print("Average Score :", cvs.mean())

Cross Validation Scores : [0.63712121 0.67537879 0.78097764 0.80560818 0.79537704]
Average Score : 0.738892572944297


### Now let us get the results for a random test set for computing the metrics

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['IE'], axis = 1), data['IE'], test_size = 0.3)

In [6]:
lr = LogisticRegression(max_iter = 4000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=4000)

In [7]:
predictions = lr.predict(X_test)

## Metric Analysis

In [8]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           1       0.80      0.82      0.81      1301
           2       0.81      0.84      0.83      1360
           3       0.67      0.62      0.65      1299

    accuracy                           0.76      3960
   macro avg       0.76      0.76      0.76      3960
weighted avg       0.76      0.76      0.76      3960



In [10]:
pd.DataFrame(confusion_matrix(y_test, predictions), index = ['True_Introvert', 'True_Extravert', 'True_Ambivert'], columns = ['Predicted_Introvert', 'Predicted_Extravert', 'Predicted_Ambivert'])

Unnamed: 0,Predicted_Introvert,Predicted_Extravert,Predicted_Ambivert
True_Introvert,1070,22,209
True_Extravert,23,1142,195
True_Ambivert,247,241,811
