# Myers Brigss ML Project: SVM Model

### Imports & Dependencies

In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
# import the cleaned df
cleaned_df = pd.read_csv('../../Resources/data/cleaned_mbti.csv', index_col=0)
cleaned_df.head()

Unnamed: 0,type,posts,http_count,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,24,' and intj moments sportscenter not top ten...,' and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,10,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,5,"'Good one _____ course, to which I say I k...","'Good one _____ course, to which I say I k..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",2,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...,6,'You're fired.|||That's another silly misconce...,'You're fired. That's another silly misconcept...


### Split into 4 dataframes for each boolean dimension

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [4]:
E_I_df = cleaned_df.copy()
E_I_df['E-I'] = cleaned_df["type"].str.extract('(.)[N,S]',1)
E_I_df['Encoded'] = le.fit_transform(E_I_df['E-I'])
E_I_df = E_I_df[['Encoded', 'E-I', 'type', 'text']]
E_I_df.head()

Unnamed: 0,Encoded,E-I,type,text
0,1,I,INFJ,' and intj moments sportscenter not top ten...
1,0,E,ENTP,'I'm finding the lack of me in these posts ver...
2,1,I,INTP,"'Good one _____ course, to which I say I k..."
3,1,I,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,0,E,ENTJ,'You're fired. That's another silly misconcept...


In [5]:
N_S_df = cleaned_df.copy()
N_S_df['N-S'] = cleaned_df["type"].str.extract('[E,I](.)[F,T]',1)
N_S_df['Encoded'] = le.fit_transform(N_S_df['N-S'])
N_S_df = N_S_df[['Encoded', 'N-S', 'type', 'text']]
N_S_df.head()

Unnamed: 0,Encoded,N-S,type,text
0,0,N,INFJ,' and intj moments sportscenter not top ten...
1,0,N,ENTP,'I'm finding the lack of me in these posts ver...
2,0,N,INTP,"'Good one _____ course, to which I say I k..."
3,0,N,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,0,N,ENTJ,'You're fired. That's another silly misconcept...


In [6]:
T_F_df = cleaned_df.copy()
T_F_df['T-F'] = cleaned_df["type"].str.extract('[N,S](.)[J,P]',1)
T_F_df['Encoded'] = le.fit_transform(T_F_df['T-F'])
T_F_df = T_F_df[['Encoded', 'T-F', 'type', 'text']]
T_F_df.head()

Unnamed: 0,Encoded,T-F,type,text
0,0,F,INFJ,' and intj moments sportscenter not top ten...
1,1,T,ENTP,'I'm finding the lack of me in these posts ver...
2,1,T,INTP,"'Good one _____ course, to which I say I k..."
3,1,T,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,1,T,ENTJ,'You're fired. That's another silly misconcept...


In [7]:
J_P_df = cleaned_df.copy()
J_P_df['J-P'] = cleaned_df["type"].str.extract('[F,T](.)',1)
J_P_df['Encoded'] = le.fit_transform(J_P_df['J-P'])
J_P_df = J_P_df[['Encoded', 'J-P', 'type','text']]
J_P_df.head()

Unnamed: 0,Encoded,J-P,type,text
0,0,J,INFJ,' and intj moments sportscenter not top ten...
1,1,P,ENTP,'I'm finding the lack of me in these posts ver...
2,1,P,INTP,"'Good one _____ course, to which I say I k..."
3,0,J,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,0,J,ENTJ,'You're fired. That's another silly misconcept...


### TFIDF Vectorizer for 4 dimensions

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [9]:
#Setting up Vectorizer
vectorizer = TfidfVectorizer(
    max_features=17000,
    min_df=7,
    max_df=0.8,
    stop_words="english",
    ngram_range=(1,3),
)

In [10]:
# E-I
X = E_I_df["text"].values
y = E_I_df["Encoded"].values
X_train_EI, X_test_EI, y_train_EI, y_test_EI = train_test_split(X, y, random_state=42)
X_train_EI = vectorizer.fit_transform(X_train_EI)
X_test_EI = vectorizer.transform(X_test_EI)

In [11]:
# N-S
X = N_S_df["text"].values
y = N_S_df["Encoded"].values
X_train_NS, X_test_NS, y_train_NS, y_test_NS = train_test_split(X, y, random_state=42)
X_train_NS = vectorizer.fit_transform(X_train_NS)
X_test_NS = vectorizer.transform(X_test_NS)

In [12]:
# T-F
X = T_F_df["text"].values
y = T_F_df["Encoded"].values
X_train_TF, X_test_TF, y_train_TF, y_test_TF = train_test_split(X, y, random_state=42)
X_train_TF = vectorizer.fit_transform(X_train_TF)
X_test_TF = vectorizer.transform(X_test_TF)

In [13]:
# J-P
X = J_P_df["text"].values
y = J_P_df["Encoded"].values
X_train_JP, X_test_JP, y_train_JP, y_test_JP = train_test_split(X, y, random_state=42)
X_train_JP = vectorizer.fit_transform(X_train_JP)
X_test_JP = vectorizer.transform(X_test_JP)

### KBinsDiscretizer

In [64]:
from sklearn.preprocessing import KBinsDiscretizer
from mlxtend.preprocessing import DenseTransformer

ModuleNotFoundError: No module named 'mlxtend'

In [62]:
est = KBinsDiscretizer(
    encode='ordinal', 
    strategy='uniform'
)

X_train_EI.todense()
X_test_EI.todense()

X_train_NS.todense()
X_test_NS.todense()

X_train_TF.todense()
X_test_TF.todense()

X_train_JP.todense()
X_test_JP.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [63]:
# E-I
X_train_EI = est.fit_transform(X_train_EI)
X_test_EI = est.transform(X_test_EI)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
# N-S
X_train_NS = est.fit_transform(X_train_NS)
X_test_NS = est.transform(X_test_NS)

In [None]:
# T-F
X_train_TF = est.fit_transform(X_train_TF)
X_test_TF = est.transform(X_test_TF)

In [None]:
# J-P
X_train_JP = est.fit_transform(X_train_JP)
X_test_JP = est.transform(X_test_JP)

### 4 Model for 4 dimensions

In [None]:
from sklearn import svm

In [None]:
# E-I
clf_EI = svm.LinearSVC(random_state=0, tol=1e-6, max_iter=2000, C=0.79)
clf_EI.fit(X_train_EI, y_train_EI)

# Predict outcomes for E-I test data set
predictions_EI = clf_EI.predict(X_test_EI)
clf_EI_pred = pd.DataFrame({"Prediction": predictions_EI, "Actual": y_test_EI})
clf_EI_pred.head()

In [None]:
# N-S
clf_NS = svm.LinearSVC(random_state=0, tol=1e-6, max_iter=2000, C=0.79)
clf_NS.fit(X_train_NS, y_train_NS)

# Predict outcomes for N-S test data set
predictions_NS = clf_NS.predict(X_test_NS)
clf_NS_pred = pd.DataFrame({"Prediction": predictions_NS, "Actual": y_test_NS})
clf_NS_pred.head()

In [None]:
# T-F
clf_TF = svm.LinearSVC(random_state=0, tol=1e-6, max_iter=2000, C=0.79)
clf_TF.fit(X_train_TF, y_train_TF)

# Predict outcomes for T-F test data set
predictions_TF = clf_TF.predict(X_test_TF)
clf_TF_pred = pd.DataFrame({"Prediction": predictions_TF, "Actual": y_test_TF})
clf_TF_pred.head()

In [None]:
# J-P
clf_JP = svm.LinearSVC(random_state=0, tol=1e-6, max_iter=2000, C=0.79)
clf_JP.fit(X_train_JP, y_train_JP)

# Predict outcomes for J-P test data set
predictions_JP = clf_JP.predict(X_test_JP)
clf_JP_pred = pd.DataFrame({"Prediction": predictions_JP, "Actual": y_test_JP})
clf_JP_pred.head()

### Accuracies for 4 Models

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
accEI = clf_EI.score(X_test_EI, y_test_EI)
accNS = clf_NS.score(X_test_NS, y_test_NS)
accTF = clf_TF.score(X_test_TF, y_test_TF)
accJP = clf_JP.score(X_test_JP, y_test_JP)
print("Accuracy of model at predicting MB types were: \n EI:{accEI} \n NS:{accNS} \n TF: {accTF} \n JP: {accJP}".format(accEI=accEI, accNS=accNS, accTF=accTF, accJP=accJP ))

In [None]:
print("E-I SVM Model")
print("Classification Report: \n")
print(classification_report(clf_EI_pred.Actual, clf_EI_pred.Prediction))
print("Confusion Matrix: \n")
print(confusion_matrix(clf_EI_pred.Actual, clf_EI_pred.Prediction))

In [None]:
print("N-S SVM Model")
print("Classification Report: \n")
print(classification_report(clf_NS_pred.Actual, clf_NS_pred.Prediction))
print("Confusion Matrix: \n")
print(confusion_matrix(clf_NS_pred.Actual, clf_NS_pred.Prediction))

In [None]:
print("T-F SVM Model")
print("Classification Report: \n")
print(classification_report(clf_TF_pred.Actual, clf_TF_pred.Prediction))
print("Confusion Matrix: \n")
print(confusion_matrix(clf_TF_pred.Actual, clf_TF_pred.Prediction))

In [None]:
print("J-P SVM Model")
print("Classification Report: \n")
print(classification_report(clf_JP_pred.Actual, clf_JP_pred.Prediction))
print("Confusion Matrix: \n")
print(confusion_matrix(clf_JP_pred.Actual, clf_JP_pred.Prediction))