# Myers-Briggs ML Project: RandomForest Model

### Imports & Dependencies

In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
# Read the pre-cleaned MBTI posts from disk, using the first CSV column as the
# row index, then preview the first rows.
cleaned_df = pd.read_csv(
    '../../Resources/data/cleaned_mbti.csv',
    index_col=0,
)
cleaned_df.head()

Unnamed: 0,type,posts,http_count,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,24,' and intj moments sportscenter not top ten...,and intj moments sportscenter not top ten p...
1,ENTP,'I'm finding the lack of me in these posts ver...,10,'I'm finding the lack of me in these posts ver...,i m finding the lack of me in these posts very...
2,INTP,'Good one _____ https://www.youtube.com/wat...,5,"'Good one _____ course, to which I say I k...",good one course to which i say i kn...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",2,"'Dear INTP, I enjoyed our conversation the o...",dear intp i enjoyed our conversation the ot...
4,ENTJ,'You're fired.|||That's another silly misconce...,6,'You're fired.|||That's another silly misconce...,you re fired that s another silly misconcepti...


### Split into 4 dataframes, one per binary dimension

In [3]:
from sklearn.preprocessing import LabelEncoder
# Single shared encoder: every dimension cell below calls le.fit_transform() on it
# again, so after those cells run it only reflects the last column it was fitted on.
le = LabelEncoder()

In [4]:
# E-I dimension: the first letter of the four-letter type.
# Positional slicing replaces the previous str.extract() call, whose second
# positional argument (1) was taken as regex `flags` and whose character class
# also matched a literal comma.
E_I_df = cleaned_df.copy()
E_I_df['E-I'] = E_I_df["type"].str[0]
# LabelEncoder numbers the classes in sorted order, so E -> 0 and I -> 1.
E_I_df['Encoded'] = le.fit_transform(E_I_df['E-I'])
# Keep only the columns used downstream, target first.
E_I_df = E_I_df[['Encoded', 'E-I', 'type', 'text']]
E_I_df.head()

Unnamed: 0,Encoded,E-I,type,text
0,1,I,INFJ,and intj moments sportscenter not top ten p...
1,0,E,ENTP,i m finding the lack of me in these posts very...
2,1,I,INTP,good one course to which i say i kn...
3,1,I,INTJ,dear intp i enjoyed our conversation the ot...
4,0,E,ENTJ,you re fired that s another silly misconcepti...


In [5]:
# N-S dimension: the second letter of the four-letter type.
# Positional slicing replaces the previous str.extract() call, whose second
# positional argument (1) was taken as regex `flags` and whose character classes
# also matched a literal comma.
N_S_df = cleaned_df.copy()
N_S_df['N-S'] = N_S_df["type"].str[1]
# LabelEncoder numbers the classes in sorted order, so N -> 0 and S -> 1.
N_S_df['Encoded'] = le.fit_transform(N_S_df['N-S'])
# Keep only the columns used downstream, target first.
N_S_df = N_S_df[['Encoded', 'N-S', 'type', 'text']]
N_S_df.head()

Unnamed: 0,Encoded,N-S,type,text
0,0,N,INFJ,and intj moments sportscenter not top ten p...
1,0,N,ENTP,i m finding the lack of me in these posts very...
2,0,N,INTP,good one course to which i say i kn...
3,0,N,INTJ,dear intp i enjoyed our conversation the ot...
4,0,N,ENTJ,you re fired that s another silly misconcepti...


In [6]:
# T-F dimension: the third letter of the four-letter type.
# Positional slicing replaces the previous str.extract() call, whose second
# positional argument (1) was taken as regex `flags` and whose character classes
# also matched a literal comma.
T_F_df = cleaned_df.copy()
T_F_df['T-F'] = T_F_df["type"].str[2]
# LabelEncoder numbers the classes in sorted order, so F -> 0 and T -> 1.
T_F_df['Encoded'] = le.fit_transform(T_F_df['T-F'])
# Keep only the columns used downstream, target first.
T_F_df = T_F_df[['Encoded', 'T-F', 'type', 'text']]
T_F_df.head()

Unnamed: 0,Encoded,T-F,type,text
0,0,F,INFJ,and intj moments sportscenter not top ten p...
1,1,T,ENTP,i m finding the lack of me in these posts very...
2,1,T,INTP,good one course to which i say i kn...
3,1,T,INTJ,dear intp i enjoyed our conversation the ot...
4,1,T,ENTJ,you re fired that s another silly misconcepti...


In [7]:
# J-P dimension: the fourth letter of the four-letter type.
# Positional slicing replaces the previous str.extract() call, whose second
# positional argument (1) was taken as regex `flags` and whose character class
# also matched a literal comma.
J_P_df = cleaned_df.copy()
J_P_df['J-P'] = J_P_df["type"].str[3]
# LabelEncoder numbers the classes in sorted order, so J -> 0 and P -> 1.
J_P_df['Encoded'] = le.fit_transform(J_P_df['J-P'])
# Keep only the columns used downstream, target first.
J_P_df = J_P_df[['Encoded', 'J-P', 'type', 'text']]
J_P_df.head()

Unnamed: 0,Encoded,J-P,type,text
0,0,J,INFJ,and intj moments sportscenter not top ten p...
1,1,P,ENTP,i m finding the lack of me in these posts very...
2,1,P,INTP,good one course to which i say i kn...
3,0,J,INTJ,dear intp i enjoyed our conversation the ot...
4,0,J,ENTJ,you re fired that s another silly misconcepti...


### TF-IDF Vectorizer for 4 Dimensions

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [9]:
# One TF-IDF configuration, reused (and re-fitted) by each dimension cell below.
vectorizer = TfidfVectorizer(
    ngram_range=(1,3),     # unigrams through trigrams
    stop_words="english",  # drop scikit-learn's built-in English stop words
    min_df=7,              # ignore terms that occur in fewer than 7 documents
    max_df=0.8,            # ignore terms that occur in more than 80% of documents
    max_features=17000,    # cap the vocabulary at 17,000 terms
)

In [10]:
# E-I: hold out a test split, then learn the TF-IDF vocabulary on the training text only.
X, y = E_I_df["text"].values, E_I_df["Encoded"].values
X_train_EI, X_test_EI, y_train_EI, y_test_EI = train_test_split(
    X, y, random_state=42
)
# Fit on the training text first, then project the test text onto that vocabulary.
X_train_EI, X_test_EI = (
    vectorizer.fit_transform(X_train_EI),
    vectorizer.transform(X_test_EI),
)

In [11]:
# N-S: hold out a test split, then learn the TF-IDF vocabulary on the training text only.
X, y = N_S_df["text"].values, N_S_df["Encoded"].values
X_train_NS, X_test_NS, y_train_NS, y_test_NS = train_test_split(
    X, y, random_state=42
)
# Fit on the training text first, then project the test text onto that vocabulary.
X_train_NS, X_test_NS = (
    vectorizer.fit_transform(X_train_NS),
    vectorizer.transform(X_test_NS),
)

In [12]:
# T-F: hold out a test split, then learn the TF-IDF vocabulary on the training text only.
X, y = T_F_df["text"].values, T_F_df["Encoded"].values
X_train_TF, X_test_TF, y_train_TF, y_test_TF = train_test_split(
    X, y, random_state=42
)
# Fit on the training text first, then project the test text onto that vocabulary.
X_train_TF, X_test_TF = (
    vectorizer.fit_transform(X_train_TF),
    vectorizer.transform(X_test_TF),
)

In [13]:
# J-P: hold out a test split, then learn the TF-IDF vocabulary on the training text only.
X, y = J_P_df["text"].values, J_P_df["Encoded"].values
X_train_JP, X_test_JP, y_train_JP, y_test_JP = train_test_split(
    X, y, random_state=42
)
# Fit on the training text first, then project the test text onto that vocabulary.
X_train_JP, X_test_JP = (
    vectorizer.fit_transform(X_train_JP),
    vectorizer.transform(X_test_JP),
)

### 4 Models for 4 Dimensions

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [15]:
# E-I
# max_features='sqrt' is the explicit form of what 'auto' meant for classifiers;
# 'auto' is deprecated and rejected by newer scikit-learn releases.
clf_EI = RandomForestClassifier(
    n_estimators=2000,
    max_depth=None,
    max_features='sqrt',
    random_state=0,
)
clf_EI.fit(X_train_EI, y_train_EI)

# Predict outcomes for E-I test data set and pair them with the true labels
predictions_EI = clf_EI.predict(X_test_EI)
clf_EI_pred = pd.DataFrame({"Prediction": predictions_EI, "Actual": y_test_EI})
clf_EI_pred.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,0
4,1,0


In [16]:
# N-S
# max_features='sqrt' is the explicit form of what 'auto' meant for classifiers;
# 'auto' is deprecated and rejected by newer scikit-learn releases.
clf_NS = RandomForestClassifier(
    n_estimators=2000,
    max_depth=None,
    max_features='sqrt',
    random_state=0,
)
clf_NS.fit(X_train_NS, y_train_NS)

# Predict outcomes for N-S test data set and pair them with the true labels
predictions_NS = clf_NS.predict(X_test_NS)
clf_NS_pred = pd.DataFrame({"Prediction": predictions_NS, "Actual": y_test_NS})
clf_NS_pred.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [17]:
# T-F
# max_features='sqrt' is the explicit form of what 'auto' meant for classifiers;
# 'auto' is deprecated and rejected by newer scikit-learn releases.
clf_TF = RandomForestClassifier(
    n_estimators=2000,
    max_depth=None,
    max_features='sqrt',
    random_state=0,
)
clf_TF.fit(X_train_TF, y_train_TF)

# Predict outcomes for T-F test data set and pair them with the true labels
predictions_TF = clf_TF.predict(X_test_TF)
clf_TF_pred = pd.DataFrame({"Prediction": predictions_TF, "Actual": y_test_TF})
clf_TF_pred.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,1
2,1,1
3,0,0
4,1,1


In [18]:
# J-P
# max_features='sqrt' is the explicit form of what 'auto' meant for classifiers;
# 'auto' is deprecated and rejected by newer scikit-learn releases.
clf_JP = RandomForestClassifier(
    n_estimators=2000,
    max_depth=None,
    max_features='sqrt',
    random_state=0,
)
clf_JP.fit(X_train_JP, y_train_JP)

# Predict outcomes for J-P test data set and pair them with the true labels
predictions_JP = clf_JP.predict(X_test_JP)
clf_JP_pred = pd.DataFrame({"Prediction": predictions_JP, "Actual": y_test_JP})
clf_JP_pred.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,1,1
4,0,0


### Accuracies for 4 Models

In [19]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [20]:
# Mean test-set accuracy of each baseline forest, scored in E-I, N-S, T-F, J-P order.
accEI, accNS, accTF, accJP = (
    model.score(X_eval, y_eval)
    for model, X_eval, y_eval in (
        (clf_EI, X_test_EI, y_test_EI),
        (clf_NS, X_test_NS, y_test_NS),
        (clf_TF, X_test_TF, y_test_TF),
        (clf_JP, X_test_JP, y_test_JP),
    )
)
print(
    "Accuracy of model at predicting MB types were: \n EI:{accEI} \n NS:{accNS} \n TF: {accTF} \n JP: {accJP}".format(
        accEI=accEI,
        accNS=accNS,
        accTF=accTF,
        accJP=accJP,
    )
)

Accuracy of model at predicting MB types were: 
 EI:0.8049792531120332 
 NS:0.859843245735362 
 TF: 0.8331028123559244 
 JP: 0.7381281696634394


In [21]:
# Per-class metrics and confusion matrix for the E-I predictions held in clf_EI_pred.
print("E-I RForest Model")
print("Classification Report: \n")
print(classification_report(y_true=clf_EI_pred["Actual"], y_pred=clf_EI_pred["Prediction"]))
print("Confusion Matrix: \n")
print(confusion_matrix(y_true=clf_EI_pred["Actual"], y_pred=clf_EI_pred["Prediction"]))

E-I RForest Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.98      0.11      0.19       473
           1       0.80      1.00      0.89      1696

    accuracy                           0.80      2169
   macro avg       0.89      0.55      0.54      2169
weighted avg       0.84      0.80      0.74      2169

Confusion Matrix: 

[[  51  422]
 [   1 1695]]


In [22]:
# Per-class metrics and confusion matrix for the N-S predictions held in clf_NS_pred.
print("N-S RForest Model")
print("Classification Report: \n")
print(classification_report(y_true=clf_NS_pred["Actual"], y_pred=clf_NS_pred["Prediction"]))
print("Confusion Matrix: \n")
print(confusion_matrix(y_true=clf_NS_pred["Actual"], y_pred=clf_NS_pred["Prediction"]))

N-S RForest Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.86      1.00      0.92      1862
           1       1.00      0.01      0.02       307

    accuracy                           0.86      2169
   macro avg       0.93      0.50      0.47      2169
weighted avg       0.88      0.86      0.80      2169

Confusion Matrix: 

[[1862    0]
 [ 304    3]]


In [23]:
# Per-class metrics and confusion matrix for the T-F predictions held in clf_TF_pred.
print("T-F RForest Model")
print("Classification Report: \n")
print(classification_report(y_true=clf_TF_pred["Actual"], y_pred=clf_TF_pred["Prediction"]))
print("Confusion Matrix: \n")
print(confusion_matrix(y_true=clf_TF_pred["Actual"], y_pred=clf_TF_pred["Prediction"]))

T-F RForest Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.82      0.89      0.85      1179
           1       0.85      0.77      0.81       990

    accuracy                           0.83      2169
   macro avg       0.84      0.83      0.83      2169
weighted avg       0.83      0.83      0.83      2169

Confusion Matrix: 

[[1045  134]
 [ 228  762]]


In [24]:
# Per-class metrics and confusion matrix for the J-P predictions held in clf_JP_pred.
print("J-P RForest Model")
print("Classification Report: \n")
print(classification_report(y_true=clf_JP_pred["Actual"], y_pred=clf_JP_pred["Prediction"]))
print("Confusion Matrix: \n")
print(confusion_matrix(y_true=clf_JP_pred["Actual"], y_pred=clf_JP_pred["Prediction"]))

J-P RForest Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.90      0.37      0.52       842
           1       0.71      0.97      0.82      1327

    accuracy                           0.74      2169
   macro avg       0.80      0.67      0.67      2169
weighted avg       0.78      0.74      0.70      2169

Confusion Matrix: 

[[ 308  534]
 [  34 1293]]


In [25]:
##Random Forest with Oversampling
from imblearn.over_sampling import RandomOverSampler

# Oversample the E-I training split only; the test split is left untouched.
ros = RandomOverSampler(random_state=1)
X_resampled_EI, y_resampled_EI = ros.fit_resample(X_train_EI, y_train_EI)

# NOTE: this rebinds clf_EI, predictions_EI and clf_EI_pred, replacing the
# baseline E-I objects created earlier in the notebook.
# max_features='sqrt' is the explicit form of what 'auto' meant for classifiers;
# 'auto' is deprecated and rejected by newer scikit-learn releases.
clf_EI = RandomForestClassifier(
    n_estimators=2000,
    max_depth=None,
    max_features='sqrt',
    random_state=0,
)
clf_EI.fit(X_resampled_EI, y_resampled_EI)

# Evaluate on the original (not resampled) test split.
predictions_EI = clf_EI.predict(X_test_EI)
clf_EI_pred = pd.DataFrame({"Prediction": predictions_EI, "Actual": y_test_EI})
clf_EI_pred.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,0
4,1,0


In [26]:
# Oversample the N-S training split only; the test split is left untouched.
ros = RandomOverSampler(random_state=1)
X_resampled_NS, y_resampled_NS = ros.fit_resample(X_train_NS, y_train_NS)

# NOTE: this rebinds clf_NS, predictions_NS and clf_NS_pred, replacing the
# baseline N-S objects created earlier in the notebook.
# max_features='sqrt' is the explicit form of what 'auto' meant for classifiers;
# 'auto' is deprecated and rejected by newer scikit-learn releases.
clf_NS = RandomForestClassifier(
    n_estimators=2000,
    max_depth=None,
    max_features='sqrt',
    random_state=0,
)
clf_NS.fit(X_resampled_NS, y_resampled_NS)

# Evaluate on the original (not resampled) test split.
predictions_NS = clf_NS.predict(X_test_NS)
clf_NS_pred = pd.DataFrame({"Prediction": predictions_NS, "Actual": y_test_NS})
clf_NS_pred.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [27]:
# Oversample the T-F training split only; the test split is left untouched.
ros = RandomOverSampler(random_state=1)
X_resampled_TF, y_resampled_TF = ros.fit_resample(X_train_TF, y_train_TF)

# NOTE: this rebinds clf_TF, predictions_TF and clf_TF_pred, replacing the
# baseline T-F objects created earlier in the notebook.
# max_features='sqrt' is the explicit form of what 'auto' meant for classifiers;
# 'auto' is deprecated and rejected by newer scikit-learn releases.
clf_TF = RandomForestClassifier(
    n_estimators=2000,
    max_depth=None,
    max_features='sqrt',
    random_state=0,
)
clf_TF.fit(X_resampled_TF, y_resampled_TF)

# Evaluate on the original (not resampled) test split. The result is stored in
# predictions_TF, matching the E-I and N-S cells, so that name does not keep
# pointing at the baseline model's predictions.
predictions_TF = clf_TF.predict(X_test_TF)
clf_TF_pred = pd.DataFrame({"Prediction": predictions_TF, "Actual": y_test_TF})
clf_TF_pred.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,1
2,1,1
3,0,0
4,1,1


In [28]:
# Oversample the J-P training split only; the test split is left untouched.
ros = RandomOverSampler(random_state=1)
X_resampled_JP, y_resampled_JP = ros.fit_resample(X_train_JP, y_train_JP)

# NOTE: this rebinds clf_JP, predictions_JP and clf_JP_pred, replacing the
# baseline J-P objects created earlier in the notebook.
# max_features='sqrt' is the explicit form of what 'auto' meant for classifiers;
# 'auto' is deprecated and rejected by newer scikit-learn releases.
clf_JP = RandomForestClassifier(
    n_estimators=2000,
    max_depth=None,
    max_features='sqrt',
    random_state=0,
)
clf_JP.fit(X_resampled_JP, y_resampled_JP)

# Evaluate on the original (not resampled) test split. The result is stored in
# predictions_JP, matching the E-I and N-S cells, so that name does not keep
# pointing at the baseline model's predictions.
predictions_JP = clf_JP.predict(X_test_JP)
clf_JP_pred = pd.DataFrame({"Prediction": predictions_JP, "Actual": y_test_JP})
clf_JP_pred.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,1,1
4,0,0


In [29]:
# Mean test-set accuracy of each forest currently bound to clf_EI/NS/TF/JP
# (the oversampled fits at this point), scored in E-I, N-S, T-F, J-P order.
accEI, accNS, accTF, accJP = (
    model.score(X_eval, y_eval)
    for model, X_eval, y_eval in (
        (clf_EI, X_test_EI, y_test_EI),
        (clf_NS, X_test_NS, y_test_NS),
        (clf_TF, X_test_TF, y_test_TF),
        (clf_JP, X_test_JP, y_test_JP),
    )
)
print(
    "Accuracy of oversampled model at predicting MB types were: \n EI:{accEI} \n NS:{accNS} \n TF: {accTF} \n JP: {accJP}".format(
        accEI=accEI,
        accNS=accNS,
        accTF=accTF,
        accJP=accJP,
    )
)

Accuracy of oversampled model at predicting MB types were: 
 EI:0.8086675887505763 
 NS:0.8570769940064545 
 TF: 0.8395573997233748 
 JP: 0.7597971415398801


In [30]:
# Per-class metrics and confusion matrix for clf_EI_pred, which at this point
# holds the predictions of the forest trained on the oversampled E-I split.
print("E-I RForest Model")
print("Classification Report: \n")
print(classification_report(y_true=clf_EI_pred["Actual"], y_pred=clf_EI_pred["Prediction"]))
print("Confusion Matrix: \n")
print(confusion_matrix(y_true=clf_EI_pred["Actual"], y_pred=clf_EI_pred["Prediction"]))

E-I RForest Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.89      0.14      0.24       473
           1       0.81      1.00      0.89      1696

    accuracy                           0.81      2169
   macro avg       0.85      0.57      0.57      2169
weighted avg       0.82      0.81      0.75      2169

Confusion Matrix: 

[[  66  407]
 [   8 1688]]


In [31]:
# Per-class metrics and confusion matrix for clf_NS_pred, which at this point
# holds the predictions of the forest trained on the oversampled N-S split.
print("N-S RForest Model")
print("Classification Report: \n")
print(classification_report(y_true=clf_NS_pred["Actual"], y_pred=clf_NS_pred["Prediction"]))
print("Confusion Matrix: \n")
print(confusion_matrix(y_true=clf_NS_pred["Actual"], y_pred=clf_NS_pred["Prediction"]))

N-S RForest Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.86      0.99      0.92      1862
           1       0.43      0.03      0.05       307

    accuracy                           0.86      2169
   macro avg       0.64      0.51      0.49      2169
weighted avg       0.80      0.86      0.80      2169

Confusion Matrix: 

[[1850   12]
 [ 298    9]]


In [32]:
# Per-class metrics and confusion matrix for clf_TF_pred, which at this point
# holds the predictions of the forest trained on the oversampled T-F split.
print("T-F RForest Model")
print("Classification Report: \n")
print(classification_report(y_true=clf_TF_pred["Actual"], y_pred=clf_TF_pred["Prediction"]))
print("Confusion Matrix: \n")
print(confusion_matrix(y_true=clf_TF_pred["Actual"], y_pred=clf_TF_pred["Prediction"]))

T-F RForest Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.84      0.87      0.85      1179
           1       0.84      0.81      0.82       990

    accuracy                           0.84      2169
   macro avg       0.84      0.84      0.84      2169
weighted avg       0.84      0.84      0.84      2169

Confusion Matrix: 

[[1021  158]
 [ 190  800]]


In [33]:
# Per-class metrics and confusion matrix for clf_JP_pred, which at this point
# holds the predictions of the forest trained on the oversampled J-P split.
print("J-P RForest Model")
print("Classification Report: \n")
print(classification_report(y_true=clf_JP_pred["Actual"], y_pred=clf_JP_pred["Prediction"]))
print("Confusion Matrix: \n")
print(confusion_matrix(y_true=clf_JP_pred["Actual"], y_pred=clf_JP_pred["Prediction"]))

J-P RForest Model
Classification Report: 

              precision    recall  f1-score   support

           0       0.84      0.47      0.60       842
           1       0.74      0.94      0.83      1327

    accuracy                           0.76      2169
   macro avg       0.79      0.71      0.72      2169
weighted avg       0.78      0.76      0.74      2169

Confusion Matrix: 

[[ 395  447]
 [  74 1253]]
