# Myers-Briggs ML Project: SVM Model

### Imports & Dependencies

In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
# Load the cleaned MBTI posts; the first CSV column becomes the row index.
cleaned_csv_path = '../../Resources/data/cleaned_mbti.csv'
cleaned_df = pd.read_csv(cleaned_csv_path, index_col=0)

# Preview the first rows to confirm the expected columns are present.
cleaned_df.head()

Unnamed: 0,type,posts,http_count,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,24,' and intj moments sportscenter not top ten...,' and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,10,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,5,"'Good one _____ course, to which I say I k...","'Good one _____ course, to which I say I k..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",2,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...,6,'You're fired.|||That's another silly misconce...,'You're fired. That's another silly misconcept...


### Split into 4 dataframes, one for each binary dimension

In [3]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance reused by the E-I, N-S, T-F and J-P cells below.
# Each of those cells calls fit_transform, which refits it, so `le` only
# remembers the labels of whichever dimension was encoded most recently.
le = LabelEncoder()

In [4]:
# E-I dimension: pull the first letter of the 4-letter type into its own column.
E_I_df = cleaned_df.copy()

# Capture the E/I letter that precedes the N/S letter.
# - The previous pattern '(.)[N,S]' also matched a literal comma inside the
#   character class, and its trailing positional `1` was received by
#   str.extract as the regex `flags` argument, not as a group selector.
# - expand=False returns a Series (one capture group) instead of a DataFrame.
E_I_df['E-I'] = E_I_df["type"].str.extract(r'([EI])[NS]', expand=False)

# LabelEncoder numbers the labels in sorted order: E -> 0, I -> 1.
# Note: this refits the shared `le`, discarding any previously fitted labels.
E_I_df['Encoded'] = le.fit_transform(E_I_df['E-I'])

# Keep only the columns needed for modelling.
E_I_df = E_I_df[['Encoded', 'E-I', 'type', 'text']]
E_I_df.head()

Unnamed: 0,Encoded,E-I,type,text
0,1,I,INFJ,' and intj moments sportscenter not top ten...
1,0,E,ENTP,'I'm finding the lack of me in these posts ver...
2,1,I,INTP,"'Good one _____ course, to which I say I k..."
3,1,I,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,0,E,ENTJ,'You're fired. That's another silly misconcept...


In [5]:
# N-S dimension: pull the second letter of the 4-letter type into its own column.
N_S_df = cleaned_df.copy()

# Capture the N/S letter sitting between the E/I and F/T letters.
# - The previous pattern '[E,I](.)[F,T]' also matched literal commas inside the
#   character classes, and its trailing positional `1` was received by
#   str.extract as the regex `flags` argument, not as a group selector.
# - expand=False returns a Series (one capture group) instead of a DataFrame.
N_S_df['N-S'] = N_S_df["type"].str.extract(r'[EI]([NS])[FT]', expand=False)

# LabelEncoder numbers the labels in sorted order: N -> 0, S -> 1.
# Note: this refits the shared `le`, discarding any previously fitted labels.
N_S_df['Encoded'] = le.fit_transform(N_S_df['N-S'])

# Keep only the columns needed for modelling.
N_S_df = N_S_df[['Encoded', 'N-S', 'type', 'text']]
N_S_df.head()

Unnamed: 0,Encoded,N-S,type,text
0,0,N,INFJ,' and intj moments sportscenter not top ten...
1,0,N,ENTP,'I'm finding the lack of me in these posts ver...
2,0,N,INTP,"'Good one _____ course, to which I say I k..."
3,0,N,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,0,N,ENTJ,'You're fired. That's another silly misconcept...


In [6]:
# T-F dimension: pull the third letter of the 4-letter type into its own column.
T_F_df = cleaned_df.copy()

# Capture the T/F letter sitting between the N/S and J/P letters.
# - The previous pattern '[N,S](.)[J,P]' also matched literal commas inside the
#   character classes, and its trailing positional `1` was received by
#   str.extract as the regex `flags` argument, not as a group selector.
# - expand=False returns a Series (one capture group) instead of a DataFrame.
T_F_df['T-F'] = T_F_df["type"].str.extract(r'[NS]([FT])[JP]', expand=False)

# LabelEncoder numbers the labels in sorted order: F -> 0, T -> 1.
# Note: this refits the shared `le`, discarding any previously fitted labels.
T_F_df['Encoded'] = le.fit_transform(T_F_df['T-F'])

# Keep only the columns needed for modelling.
T_F_df = T_F_df[['Encoded', 'T-F', 'type', 'text']]
T_F_df.head()

Unnamed: 0,Encoded,T-F,type,text
0,0,F,INFJ,' and intj moments sportscenter not top ten...
1,1,T,ENTP,'I'm finding the lack of me in these posts ver...
2,1,T,INTP,"'Good one _____ course, to which I say I k..."
3,1,T,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,1,T,ENTJ,'You're fired. That's another silly misconcept...


In [7]:
# J-P dimension: pull the fourth letter of the 4-letter type into its own column.
J_P_df = cleaned_df.copy()

# Capture the J/P letter that follows the F/T letter.
# - The previous pattern '[F,T](.)' also matched a literal comma inside the
#   character class, and its trailing positional `1` was received by
#   str.extract as the regex `flags` argument, not as a group selector.
# - expand=False returns a Series (one capture group) instead of a DataFrame.
J_P_df['J-P'] = J_P_df["type"].str.extract(r'[FT]([JP])', expand=False)

# LabelEncoder numbers the labels in sorted order: J -> 0, P -> 1.
# Note: this refits the shared `le`, discarding any previously fitted labels.
J_P_df['Encoded'] = le.fit_transform(J_P_df['J-P'])

# Keep only the columns needed for modelling.
J_P_df = J_P_df[['Encoded', 'J-P', 'type', 'text']]
J_P_df.head()

Unnamed: 0,Encoded,J-P,type,text
0,0,J,INFJ,' and intj moments sportscenter not top ten...
1,1,P,ENTP,'I'm finding the lack of me in these posts ver...
2,1,P,INTP,"'Good one _____ course, to which I say I k..."
3,0,J,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,0,J,ENTJ,'You're fired. That's another silly misconcept...


### TFIDF Vectorizer for 4 dimensions

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from mlxtend.preprocessing import DenseTransformer
# from sklearn.preprocessing import KBinsDiscretizer


In [9]:
# TF-IDF configuration; the resulting vectorizer is reused (and refit) by each
# of the four per-dimension cells below.
tfidf_settings = dict(
    ngram_range=(1, 3),    # unigrams, bigrams and trigrams
    stop_words="english",  # drop scikit-learn's built-in English stop words
    min_df=7,              # ignore terms found in fewer than 7 documents
    max_df=0.8,            # ignore terms found in more than 80% of documents
    max_features=17000,    # cap on vocabulary size
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# cvt = CountVectorizer(
#     max_features=17000,
#     min_df=7,
#     max_df=0.8,
#     stop_words="english",
#     ngram_range=(1,3)
# )

# est = KBinsDiscretizer(n_bins=2, strategy="uniform")

# tfid = TfidfTransformer()

# dtf = DenseTransformer()

In [10]:
# E-I: hold out a test split, then turn the post text into TF-IDF features.
X, y = E_I_df["text"].values, E_I_df["Encoded"].values

# random_state=42 keeps the split reproducible between runs.
text_train_EI, text_test_EI, y_train_EI, y_test_EI = train_test_split(
    X, y, random_state=42
)

# Fit the vocabulary on the training text only; the test text is transformed
# with that fitted vocabulary.
X_train_EI = vectorizer.fit_transform(text_train_EI)
X_test_EI = vectorizer.transform(text_test_EI)

# # E-I
# X_train_EI = cvt.fit_transform(X_train_EI)
# X_test_EI = cvt.transform(X_test_EI)

# X_train_EI = dtf.fit_transform(X_train_EI)
# X_test_EI = dtf.transform(X_test_EI)

# X_train_EI = est.fit_transform(X_train_EI)
# X_test_EI = est.transform(X_test_EI)

# X_train_EI = tfid.fit_transform(X_train_EI)
# X_test_EI = tfid.transform(X_test_EI)

In [11]:
# N-S: hold out a test split, then turn the post text into TF-IDF features.
X, y = N_S_df["text"].values, N_S_df["Encoded"].values

# random_state=42 keeps the split reproducible between runs.
text_train_NS, text_test_NS, y_train_NS, y_test_NS = train_test_split(
    X, y, random_state=42
)

# Fit the vocabulary on the training text only; the test text is transformed
# with that fitted vocabulary.
X_train_NS = vectorizer.fit_transform(text_train_NS)
X_test_NS = vectorizer.transform(text_test_NS)

# N-S
# X_train_NS = cvt.fit_transform(X_train_NS)
# X_test_NS = cvt.transform(X_test_NS)

# X_train_NS = dtf.fit_transform(X_train_NS)
# X_test_NS = dtf.transform(X_test_NS)

# X_train_NS = est.fit_transform(X_train_NS)
# X_test_NS = est.transform(X_test_NS)

# X_train_NS = tfid.fit_transform(X_train_NS)
# X_test_NS = tfid.transform(X_test_NS)

In [12]:
# T-F: hold out a test split, then turn the post text into TF-IDF features.
X, y = T_F_df["text"].values, T_F_df["Encoded"].values

# random_state=42 keeps the split reproducible between runs.
text_train_TF, text_test_TF, y_train_TF, y_test_TF = train_test_split(
    X, y, random_state=42
)

# Fit the vocabulary on the training text only; the test text is transformed
# with that fitted vocabulary.
X_train_TF = vectorizer.fit_transform(text_train_TF)
X_test_TF = vectorizer.transform(text_test_TF)

# T-F
# X_train_TF = cvt.fit_transform(X_train_TF)
# X_test_TF = cvt.transform(X_test_TF)

# X_train_TF = dtf.fit_transform(X_train_TF)
# X_test_TF = dtf.transform(X_test_TF)

# X_train_TF = est.fit_transform(X_train_TF)
# X_test_TF = est.transform(X_test_TF)

# X_train_TF = tfid.fit_transform(X_train_TF)
# X_test_TF = tfid.transform(X_test_TF)

In [13]:
# J-P: hold out a test split, then turn the post text into TF-IDF features.
X, y = J_P_df["text"].values, J_P_df["Encoded"].values

# random_state=42 keeps the split reproducible between runs.
text_train_JP, text_test_JP, y_train_JP, y_test_JP = train_test_split(
    X, y, random_state=42
)

# Fit the vocabulary on the training text only; the test text is transformed
# with that fitted vocabulary.
X_train_JP = vectorizer.fit_transform(text_train_JP)
X_test_JP = vectorizer.transform(text_test_JP)

# J-P
# X_train_JP = cvt.fit_transform(X_train_JP)
# X_test_JP = cvt.transform(X_test_JP)

# X_train_JP = dtf.fit_transform(X_train_JP)
# X_test_JP = dtf.transform(X_test_JP)

# X_train_JP = est.fit_transform(X_train_JP)
# X_test_JP = est.transform(X_test_JP)

# X_train_JP = tfid.fit_transform(X_train_JP)
# X_test_JP = tfid.transform(X_test_JP)

### 4 Models for 4 dimensions

In [14]:
from sklearn import svm

In [15]:
# E-I: train a linear SVM on the TF-IDF features of the training split.
# fit() returns the estimator itself, so construction and training are chained.
clf_EI = svm.LinearSVC(C=0.79, max_iter=2000, random_state=0).fit(
    X_train_EI, y_train_EI
)

# Put the test-set predictions next to the true labels for a quick visual check.
predictions_EI = clf_EI.predict(X_test_EI)
clf_EI_pred = pd.DataFrame(
    {
        "Prediction": predictions_EI,
        "Actual": y_test_EI,
    }
)
clf_EI_pred.head()

Unnamed: 0,Prediction,Actual
0,0,1
1,1,1
2,1,1
3,0,0
4,1,0


In [16]:
# N-S: train a linear SVM on the TF-IDF features of the training split.
# fit() returns the estimator itself, so construction and training are chained.
clf_NS = svm.LinearSVC(C=0.79, max_iter=2000, random_state=0).fit(
    X_train_NS, y_train_NS
)

# Put the test-set predictions next to the true labels for a quick visual check.
predictions_NS = clf_NS.predict(X_test_NS)
clf_NS_pred = pd.DataFrame(
    {
        "Prediction": predictions_NS,
        "Actual": y_test_NS,
    }
)
clf_NS_pred.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [17]:
# T-F: train a linear SVM on the TF-IDF features of the training split.
# fit() returns the estimator itself, so construction and training are chained.
clf_TF = svm.LinearSVC(C=0.79, max_iter=2000, random_state=0).fit(
    X_train_TF, y_train_TF
)

# Put the test-set predictions next to the true labels for a quick visual check.
predictions_TF = clf_TF.predict(X_test_TF)
clf_TF_pred = pd.DataFrame(
    {
        "Prediction": predictions_TF,
        "Actual": y_test_TF,
    }
)
clf_TF_pred.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,0
4,1,1


In [18]:
# J-P: train a linear SVM on the TF-IDF features of the training split.
# fit() returns the estimator itself, so construction and training are chained.
clf_JP = svm.LinearSVC(C=0.79, max_iter=2000, random_state=0).fit(
    X_train_JP, y_train_JP
)

# Put the test-set predictions next to the true labels for a quick visual check.
predictions_JP = clf_JP.predict(X_test_JP)
clf_JP_pred = pd.DataFrame(
    {
        "Prediction": predictions_JP,
        "Actual": y_test_JP,
    }
)
clf_JP_pred.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,0,1
4,0,0


### Accuracies for 4 Models

In [19]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [20]:
# Score each dimension's classifier on its own held-out test split, in the
# order E-I, N-S, T-F, J-P.
accEI, accNS, accTF, accJP = (
    model.score(features, labels)
    for model, features, labels in (
        (clf_EI, X_test_EI, y_test_EI),
        (clf_NS, X_test_NS, y_test_NS),
        (clf_TF, X_test_TF, y_test_TF),
        (clf_JP, X_test_JP, y_test_JP),
    )
)

# One-line-per-dimension summary of the four scores.
summary_template = "Accuracy of model at predicting MB types were: \n EI:{accEI} \n NS:{accNS} \n TF: {accTF} \n JP: {accJP}"
print(summary_template.format(accEI=accEI, accNS=accNS, accTF=accTF, accJP=accJP))

Accuracy of model at predicting MB types were: 
 EI:0.855232826187183 
 NS:0.900875979714154 
 TF: 0.8409405255878285 
 JP: 0.7939142461964038


In [21]:
# E-I: print the classification report and then the confusion matrix for the
# test-set predictions collected in clf_EI_pred.
actual_EI = clf_EI_pred["Actual"]
predicted_EI = clf_EI_pred["Prediction"]

print("E-I SVM Model")
for heading, metric in (
    ("Classification Report: \n", classification_report),
    ("Confusion Matrix: \n", confusion_matrix),
):
    print(heading)
    print(metric(actual_EI, predicted_EI))

E-I SVM Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.75      0.51      0.60       473
           1       0.87      0.95      0.91      1696

    accuracy                           0.86      2169
   macro avg       0.81      0.73      0.76      2169
weighted avg       0.85      0.86      0.84      2169

Confusion Matrix: 

[[ 239  234]
 [  80 1616]]


In [22]:
# N-S: print the classification report and then the confusion matrix for the
# test-set predictions collected in clf_NS_pred.
actual_NS = clf_NS_pred["Actual"]
predicted_NS = clf_NS_pred["Prediction"]

print("N-S SVM Model")
for heading, metric in (
    ("Classification Report: \n", classification_report),
    ("Confusion Matrix: \n", confusion_matrix),
):
    print(heading)
    print(metric(actual_NS, predicted_NS))

N-S SVM Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.91      0.99      0.94      1862
           1       0.83      0.38      0.52       307

    accuracy                           0.90      2169
   macro avg       0.87      0.68      0.73      2169
weighted avg       0.89      0.90      0.88      2169

Confusion Matrix: 

[[1838   24]
 [ 191  116]]


In [23]:
# T-F: print the classification report and then the confusion matrix for the
# test-set predictions collected in clf_TF_pred.
actual_TF = clf_TF_pred["Actual"]
predicted_TF = clf_TF_pred["Prediction"]

print("T-F SVM Model")
for heading, metric in (
    ("Classification Report: \n", classification_report),
    ("Confusion Matrix: \n", confusion_matrix),
):
    print(heading)
    print(metric(actual_TF, predicted_TF))

T-F SVM Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.86      0.85      0.85      1179
           1       0.82      0.83      0.83       990

    accuracy                           0.84      2169
   macro avg       0.84      0.84      0.84      2169
weighted avg       0.84      0.84      0.84      2169

Confusion Matrix: 

[[1002  177]
 [ 168  822]]


In [24]:
# J-P: print the classification report and then the confusion matrix for the
# test-set predictions collected in clf_JP_pred.
actual_JP = clf_JP_pred["Actual"]
predicted_JP = clf_JP_pred["Prediction"]

print("J-P SVM Model")
for heading, metric in (
    ("Classification Report: \n", classification_report),
    ("Confusion Matrix: \n", confusion_matrix),
):
    print(heading)
    print(metric(actual_JP, predicted_JP))

J-P SVM Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.77      0.67      0.72       842
           1       0.81      0.87      0.84      1327

    accuracy                           0.79      2169
   macro avg       0.79      0.77      0.78      2169
weighted avg       0.79      0.79      0.79      2169

Confusion Matrix: 

[[ 567  275]
 [ 172 1155]]
