# Myers-Briggs ML Project: Random Forest Model

### Imports & Dependencies

In [43]:
import pandas as pd
import numpy as np

In [44]:
# Load the cleaned MBTI dataset, using the first CSV column as the row index
cleaned_mbti_path = '../../Resources/data/cleaned_mbti.csv'
cleaned_df = pd.read_csv(cleaned_mbti_path, index_col=0)
cleaned_df.head()

Unnamed: 0,type,posts,http_count,no_url,text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,24,' and intj moments sportscenter not top ten...,' and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,10,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,5,"'Good one _____ course, to which I say I k...","'Good one _____ course, to which I say I k..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",2,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...,6,'You're fired.|||That's another silly misconce...,'You're fired. That's another silly misconcept...


### Split the type column into its 4 letter pairs (E-I, N-S, T-F, J-P)

In [45]:
# Split types into four binary columns
# Each MBTI type is a four-letter code with one letter per dichotomy, so the
# letter at position i is the value for the i-th dichotomy. Positional string
# indexing replaces the earlier regex extraction, whose character classes
# (e.g. '[N,S]') also matched a literal comma and whose stray second argument
# was being passed as the regex `flags` parameter rather than a group index.
split_df = cleaned_df[['type','text']].copy()
for letter_position, dichotomy in enumerate(['E-I', 'N-S', 'T-F', 'J-P']):
    split_df[dichotomy] = split_df['type'].str[letter_position]
split_df.head()

Unnamed: 0,type,text,E-I,N-S,T-F,J-P
0,INFJ,' and intj moments sportscenter not top ten...,I,N,F,J
1,ENTP,'I'm finding the lack of me in these posts ver...,E,N,T,P
2,INTP,"'Good one _____ course, to which I say I k...",I,N,T,P
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",I,N,T,J
4,ENTJ,'You're fired. That's another silly misconcept...,E,N,T,J


In [46]:
# Encode letters to numeric values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Encoded column name -> letter column in split_df it is derived from.
# The encoded names spell out the letter-to-integer mapping (e.g. E -> 0, I -> 1).
encoded_from_letter = {
    'E0-I1': 'E-I',
    'N0-S1': 'N-S',
    'F0-T1': 'T-F',
    'J0-P1': 'J-P',
}

encoded_df = cleaned_df[['type','text']].copy()
# The single encoder is refit for every column, so afterwards `le` only
# remembers the classes of the last column encoded.
for encoded_name, letter_name in encoded_from_letter.items():
    encoded_df[encoded_name] = le.fit_transform(split_df[letter_name])

encoded_df.head()

Unnamed: 0,type,text,E0-I1,N0-S1,F0-T1,J0-P1
0,INFJ,' and intj moments sportscenter not top ten...,1,0,0,0
1,ENTP,'I'm finding the lack of me in these posts ver...,0,0,1,1
2,INTP,"'Good one _____ course, to which I say I k...",1,0,1,1
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",1,0,1,0
4,ENTJ,'You're fired. That's another silly misconcept...,0,0,1,0


In [47]:
# List the columns of the encoded frame: the original type/text plus the four encoded trait columns
encoded_df.columns

Index(['type', 'text', 'E0-I1', 'N0-S1', 'F0-T1', 'J0-P1'], dtype='object')

### TF-IDF Vectorizer

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [49]:
# Features are the cleaned post text; targets are every column of encoded_df
# other than 'type' and 'text' (the four 0/1 trait columns)
X = cleaned_df["text"].values
y_all = encoded_df.drop(columns=['type', 'text'])

# Split the raw text first so the TF-IDF vocabulary and document frequencies
# are learned from the training posts only
X_train_text, X_test_text, y_all_train, y_all_test = train_test_split(
    X, y_all, random_state=0
)

# 1- to 3-gram TF-IDF features, capped at 17000 terms; min_df / max_df drop
# very rare and very common terms, and no stop-word list is applied
tfidf_settings = dict(
    max_features=17000,
    min_df=7,
    max_df=0.8,
    stop_words=None,
    ngram_range=(1,3),
)
vectorizer = TfidfVectorizer(**tfidf_settings)

X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The targets are 0/1 class labels and the validation cells use accuracy and
# classification reports, so a classifier is required. The previous code
# referenced RandomForestRegressor, which is never imported and raised a
# NameError.
# NOTE: this single estimator is refit for each trait in the cells that follow,
# so it only ever holds the most recently fitted model.
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)

### Extraversion-Introversion (E-I)

In [None]:
# Pull the E-I target (column 'E0-I1') out of the train and test label frames
y_EI_train, y_EI_test = y_all_train['E0-I1'], y_all_test['E0-I1']

# Train the shared classifier on the E-I labels
classifier.fit(X_train, y_EI_train)

In [None]:
# Predict E-I for the test set and line the predictions up against the truth.
# The shared `classifier` is refit for every trait, so this must run right
# after the E-I fit cell and before the next trait is fitted.
y_EI_pred = classifier.predict(X_test)
EI_columns = {"Prediction": y_EI_pred, "Actual": y_EI_test}
EI_result = pd.DataFrame(EI_columns)
EI_result.head()

### Intuition-Sensing (N-S)

In [None]:
# Pull the N-S target (column 'N0-S1') out of the train and test label frames
y_NS_train, y_NS_test = y_all_train['N0-S1'], y_all_test['N0-S1']

# Train the shared classifier on the N-S labels
classifier.fit(X_train, y_NS_train)

In [None]:
# Predict N-S for the test set and line the predictions up against the truth.
# The shared `classifier` is refit for every trait, so this must run right
# after the N-S fit cell and before the next trait is fitted.
y_NS_pred = classifier.predict(X_test)
NS_columns = {"Prediction": y_NS_pred, "Actual": y_NS_test}
NS_result = pd.DataFrame(NS_columns)
NS_result.head()

### Feeling-Thinking (F-T)

In [None]:
# Pull the F-T target (column 'F0-T1') out of the train and test label frames
y_FT_train, y_FT_test = y_all_train['F0-T1'], y_all_test['F0-T1']

# Train the shared classifier on the F-T labels
classifier.fit(X_train, y_FT_train)

In [None]:
# Predict F-T for the test set and line the predictions up against the truth.
# The shared `classifier` is refit for every trait, so this must run right
# after the F-T fit cell and before the next trait is fitted.
y_FT_pred = classifier.predict(X_test)
FT_columns = {"Prediction": y_FT_pred, "Actual": y_FT_test}
FT_result = pd.DataFrame(FT_columns)
FT_result.head()

### Judging-Perception (J-P)

In [None]:
# Pull the J-P target (column 'J0-P1') out of the train and test label frames
y_JP_train, y_JP_test = y_all_train['J0-P1'], y_all_test['J0-P1']

# Train the shared classifier on the J-P labels
classifier.fit(X_train, y_JP_train)

In [None]:
# Predict J-P for the test set and line the predictions up against the truth.
# The shared `classifier` is refit for every trait, so this must run right
# after the J-P fit cell.
y_JP_pred = classifier.predict(X_test)
JP_columns = {"Prediction": y_JP_pred, "Actual": y_JP_test}
JP_result = pd.DataFrame(JP_columns)
JP_result.head()

### Model Validation

In [None]:
# Calculate accuracy score for each group
from sklearn.metrics import accuracy_score

# (trait label, true test labels, predicted labels) for each of the four models
trait_predictions = [
    ("E-I", y_EI_test, y_EI_pred),
    ("N-S", y_NS_test, y_NS_pred),
    ("F-T", y_FT_test, y_FT_pred),
    ("J-P", y_JP_test, y_JP_pred),
]
for trait, actual, predicted in trait_predictions:
    trait_accuracy = accuracy_score(actual, predicted)
    print(f" Random Forest model accuracy for {trait}: {trait_accuracy:.3f}")

In [None]:
# Display classification report
from sklearn.metrics import confusion_matrix, classification_report

# Build one text report per trait from its test labels and predictions
report_EI = classification_report(y_EI_test, y_EI_pred)
report_NS = classification_report(y_NS_test, y_NS_pred)
report_FT = classification_report(y_FT_test, y_FT_pred)
report_JP = classification_report(y_JP_test, y_JP_pred)

labelled_reports = [
    ("E0-I1", report_EI),
    ("N0-S1", report_NS),
    ("F0-T1", report_FT),
    ("J0-P1", report_JP),
]
divider = "--------------------------"

# Print each report under its heading, with a divider between consecutive
# reports (none after the last one)
for position, (group, report) in enumerate(labelled_reports):
    if position > 0:
        print(divider)
    print(f"Classification report for {group} group:")
    print(report)