# Importing Libraries

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.metrics import confusion_matrix, classification_report

# Importing Cleaned Data

In [6]:
data = pd.read_csv('./MIES_Dev_Data/cleaned_data.csv', index_col = 'Unnamed: 0')

In [7]:
data.head()

Unnamed: 0,Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,...,Q85A,Q86A,Q87A,Q88A,Q89A,Q90A,Q91A,gender,age,IE
0,5,3,1,2,3,2,3,3,4,5,...,1,4,2,5,4,3,3,2,23,3
1,5,5,1,5,2,2,5,2,1,3,...,2,1,3,4,4,4,3,1,25,2
2,3,4,5,3,4,5,5,5,5,5,...,5,4,5,3,2,1,1,1,19,1
3,5,2,1,1,5,5,5,4,4,2,...,5,3,5,4,4,3,3,1,23,1
4,1,2,1,1,3,3,5,1,3,4,...,1,3,1,2,5,5,5,1,18,2


# Grid Search

In [8]:
param_grid = {
    'C' : [0.1,1],
    'gamma' : [0.0001, 0.001, 0.01, 0.1],
    'kernel' : ['linear', 'rbf']
}

In [9]:
gs = GridSearchCV(SVC(), param_grid, verbose = 3, cv = 5)

In [10]:
gs.fit(data.drop(['IE'], axis = 1), data['IE'])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=0.1, gamma=0.0001, kernel=linear ..............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .. C=0.1, gamma=0.0001, kernel=linear, score=0.630, total=  10.7s
[CV] C=0.1, gamma=0.0001, kernel=linear ..............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.7s remaining:    0.0s


[CV] .. C=0.1, gamma=0.0001, kernel=linear, score=0.677, total=  12.8s
[CV] C=0.1, gamma=0.0001, kernel=linear ..............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   23.5s remaining:    0.0s


[CV] .. C=0.1, gamma=0.0001, kernel=linear, score=0.796, total=  12.6s
[CV] C=0.1, gamma=0.0001, kernel=linear ..............................
[CV] .. C=0.1, gamma=0.0001, kernel=linear, score=0.792, total=  12.8s
[CV] C=0.1, gamma=0.0001, kernel=linear ..............................
[CV] .. C=0.1, gamma=0.0001, kernel=linear, score=0.817, total=  12.7s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.661, total=  15.7s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.677, total=  15.9s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.737, total=  16.2s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.743, total=  16.1s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] .

[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.630, total=  32.7s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.676, total=  36.6s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.795, total=  46.2s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.792, total=  51.0s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.816, total=  44.7s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.731, total=  11.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.797, total=  11.9s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 27.3min finished


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 1], 'gamma': [0.0001, 0.001, 0.01, 0.1],
                         'kernel': ['linear', 'rbf']},
             verbose=3)

In [12]:
gs.best_params_

{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

### Now let us get the results for a random test set for computing the metrics

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['IE'], axis = 1), data['IE'], test_size = 0.3)

In [14]:
svc = SVC(C = 1, gamma = 0.01, kernel = 'rbf')
svc.fit(X_train, y_train)

SVC(C=1, gamma=0.01)

In [15]:
predictions = svc.predict(X_test)

## Metric Analysis

In [16]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           1       0.84      0.88      0.86      1320
           2       0.95      0.96      0.95      1342
           3       0.87      0.82      0.84      1298

    accuracy                           0.89      3960
   macro avg       0.89      0.89      0.89      3960
weighted avg       0.89      0.89      0.89      3960



In [17]:
cm = pd.DataFrame(confusion_matrix(y_test, predictions), index = ['True_Introvert', 'True_Extravert', 'True_Ambivert'], columns = ['Predicted_Introvert', 'Predicted_Extravert', 'Predicted_Ambivert'])
cm

Unnamed: 0,Predicted_Introvert,Predicted_Extravert,Predicted_Ambivert
True_Introvert,1163,19,138
True_Extravert,31,1285,26
True_Ambivert,184,51,1063


### Most of the results are self-explanatory through the metrics, let us focus on blunders such as prediction results to an introvert for an extraverted data & vice versa

In [18]:
print("Probability of predicting as introvert for an extravert data : ", end = "")
print(cm['Predicted_Introvert']['True_Extravert'] / cm.loc['True_Extravert'].sum())

Probability of predicting as introvert for an extravert data : 0.023099850968703428


In [19]:
print("Probability of predicting as extravert for an introvert data : ", end = "")
print(cm['Predicted_Extravert']['True_Introvert'] / cm.loc['True_Introvert'].sum())

Probability of predicting as extravert for an introvert data : 0.014393939393939395


### Hence, the accuracy is very close to Random Forests
### And the data is negligibly imbalanced