## Setep 5: Machine Learning Model

### 5-1: Random Forest Model
<br>
1- Can be used for Classification or Regression<br>
2- Easily handles outliers, missing values, etc.<br>
3- Accepts various types of inputs (continues, ordinal, ...)<br>
4- Less likely to overfit<br>
5- Outputs feature importance<br>



In [25]:
#Load information from prevous steps

import pandas as pd
import numpy as np
from scipy import sparse

mbti_Dataset = pd.read_csv('mbti_Dataset.csv')
mbti_FE = pd.read_csv('mbti_FE.csv')

full_Lem_CV = sparse.load_npz('full_Lem_CV.npz')
full_Lem_Ngram = sparse.load_npz('full_Lem_Ngram.npz')
full_Lem_tfidf = sparse.load_npz('full_Lem_tfidf.npz')

In [26]:
#Quick clean up
for name in dir():
    if not name.startswith('_') and name not in ['mbti_FE','mbti_Dataset', 'full_Lem_CV', 'full_Lem_Ngram', 'full_Lem_tfidf']:
        del globals()[name]

In [27]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pandas as pd

rf = RandomForestClassifier(n_jobs=-1) #n_jobs=-1 allow us to create trees in parallel
k_Fold = KFold(n_splits=5)

#additonal_Features = ['No_Characters', 'No_Words', 'No_Char-Capital', 'No_Words-Capital', 'No_Punctuations', 'No_WordsInQuotes', 'No_Sentences', 'No_UniqueWords', 'No_Stopwords', 'Avg_WordLength', 'Avg_SentLength', 'UniqueWrd_vs_NoWrd', 'Stopwords_vs_NoWrd','Sentiment_Score']
vectorizer_List = {'Count Vectorization' : full_Lem_CV, 'N-gram Vectorizing' : full_Lem_Ngram, 'TFIDF' : full_Lem_tfidf}

for title, vect in vectorizer_List.items():
    if title == 'N-gram Vectorizing': #Ignore N-gram Vectorizing for now(low memory)
        continue
    #X_Features = pd.concat( [mbti_FE[additonal_Features], pd.DataFrame(vect)], axis=1)
    #print(title)
    X_Features = pd.DataFrame(vect.toarray())
    print(title + ' for IE', cross_val_score(rf, vect, mbti_Dataset['IE'], cv=k_Fold, scoring='accuracy', n_jobs=-1))
    print(title + ' for NS', cross_val_score(rf, vect, mbti_Dataset['NS'], cv=k_Fold, scoring='accuracy', n_jobs=-1))
    print(title + ' for FT', cross_val_score(rf, vect, mbti_Dataset['FT'], cv=k_Fold, scoring='accuracy', n_jobs=-1))
    print(title + ' for PJ', cross_val_score(rf, vect, mbti_Dataset['PJ'], cv=k_Fold, scoring='accuracy', n_jobs=-1))

Count Vectorization
Count Vectorization for IE [0.7740634  0.77233429 0.75677233 0.77694524 0.77175793]
Count Vectorization for NS [0.86801153 0.85475504 0.86916427 0.85994236 0.85878963]
Count Vectorization for FT [0.73256484 0.72391931 0.71873199 0.73198847 0.71585014]
Count Vectorization for PJ [0.63573487 0.60691643 0.62420749 0.63688761 0.62074928]
TFIDF
TFIDF for IE [0.7740634  0.77291066 0.75677233 0.77752161 0.77060519]
TFIDF for NS [0.86801153 0.85475504 0.86916427 0.85994236 0.85821326]
TFIDF for FT [0.71700288 0.71642651 0.72103746 0.71873199 0.7314121 ]
TFIDF for PJ [0.63861671 0.60691643 0.61786744 0.63804035 0.61902017]


In [28]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

X_Features = full_Lem_CV.toarray()
X_Features = pd.DataFrame(X_Features)

X_train, X_test, Y_train, Y_test = train_test_split(X_Features, mbti_Dataset['IE'], test_size=0.2) # 20% of our dataset is test set
rf = RandomForestClassifier(n_estimators=50, max_depth=20 ,n_jobs=-1)#Max depth of tree is 20
rf_model = rf.fit(X_train, Y_train)
#sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10] #List top 10 feature importances

Y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(Y_test, Y_pred, pos_label='I', average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), 
                                                        round(recall, 3), 
                                                        round((Y_pred==Y_test).sum() / len(Y_pred),3)))


Precision: 0.763 / Recall: 1.0 / Accuracy: 0.763
