# Setep 5: Machine Learning Model

<br>
Clean up any values left from any previous steps

In [1]:
#Quick clean up
for name in dir():
    if not name.startswith('_'): # and name not in ['mbti_FE','mbti_Dataset', 'full_Lem_CV', 'full_Lem_Ngram', 'full_Lem_tfidf']:
        del globals()[name]

Load Dataset and results from previous steps.

In [2]:
#Load information from prevous steps
import pandas as pd
import numpy as np
from scipy import sparse

mbti_Dataset = pd.read_csv('mbti_Dataset.csv')
mbti_FE = pd.read_csv('mbti_FE.csv')

full_Lem_CV = sparse.load_npz('full_Lem_CV.npz')
#full_Lem_Ngram = sparse.load_npz('full_Lem_Ngram.npz')
full_Lem_tfidf = sparse.load_npz('full_Lem_tfidf.npz')
features_Dic = {'Count Vectorizer': sparse.load_npz('full_Lem_CV.npz'), 
                'TFIDF Vectorizer': sparse.load_npz('full_Lem_tfidf.npz')}

#np.save('full_Lem_CV', full_Lem_CV.toarray())
#np.save('f:/full_Lem_Ngram', full_Lem_Ngram.toarray())
#np.save('full_Lem_tfidf', full_Lem_tfidf.toarray())

## 5-1: Random Forest Model
### 5-1-1: Random Forest with Holdout test set

In [7]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

#additonal_Features = ['No_Characters', 'No_Words', 'No_Char-Capital', 'No_Words-Capital', 'No_Punctuations', 'No_WordsInQuotes', 'No_Sentences', 'No_UniqueWords', 'No_Stopwords', 'Avg_WordLength', 'Avg_SentLength', 'UniqueWrd_vs_NoWrd', 'Stopwords_vs_NoWrd','Sentiment_Score']

classes = ['IE' , 'NS', 'FT', 'PJ']

def rfClassifier_HoldhoutSet(X_Features, dataset_PD, test_Szie, Lable, feature_type , is_print = True):
    if Lable == 'IE':
        Lable = 'I'
        predict_Lable = 'Introverts'
    elif Lable == 'NS':
        Lable = 'N'
        predict_Lable = 'Intuitives'
    elif Lable == 'FT':
        Lable = 'F'
        predict_Lable = 'Feelers'
    elif Lable == 'PJ':
        Lable = 'P'
        predict_Lable = 'Perceivers'
    X_train, X_test, Y_train, Y_test = train_test_split(X_Features, dataset_PD, test_size=test_Szie)#20% of dataset for test set
    rf = RandomForestClassifier(n_estimators=50, max_depth=20 ,n_jobs=-1)#Max depth of tree is 20
    rf_model = rf.fit(X_train, Y_train)
    Y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(Y_test, Y_pred, pos_label=Lable, average='binary')
    if is_print:
        print('Results for predicting being ' + predict_Lable + ' using ' + feature_type,
        '--- Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3), 
                                                        round((Y_pred==Y_test).sum() / len(Y_pred),3)))
    return([precision, recall, fscore, support])


for item in classes:
    for key, X_Features in features_Dic.items():
        X_Features = X_Features.toarray()
        X_Features = pd.DataFrame(X_Features)
        rfClassifier_HoldhoutSet(X_Features, mbti_Dataset[item], 0.2, item, key)
        print('\n')

Results for predicting being Introverts using Count Vectorizer --- Precision: 0.769 / Recall: 1.0 / Accuracy: 0.769


Results for predicting being Introverts using TFIDF Vectorizer --- Precision: 0.751 / Recall: 1.0 / Accuracy: 0.751


Results for predicting being Intuitives using Count Vectorizer --- Precision: 0.871 / Recall: 1.0 / Accuracy: 0.871


Results for predicting being Intuitives using TFIDF Vectorizer --- Precision: 0.868 / Recall: 1.0 / Accuracy: 0.868


Results for predicting being Feelers using Count Vectorizer --- Precision: 0.688 / Recall: 0.876 / Accuracy: 0.707


Results for predicting being Feelers using TFIDF Vectorizer --- Precision: 0.669 / Recall: 0.905 / Accuracy: 0.707


Results for predicting being Perceivers using Count Vectorizer --- Precision: 0.601 / Recall: 0.998 / Accuracy: 0.603


Results for predicting being Perceivers using TFIDF Vectorizer --- Precision: 0.608 / Recall: 0.99 / Accuracy: 0.607




### 5-1-2: Explorering Random Forest with Holdout test set + grid-search

In [8]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

classes = ['IE' , 'NS', 'FT', 'PJ']


def rfClassifier_GridSearch(X_Features, dataset_PD, test_Szie, Lable, feature_type, n_est, depth, is_print = True,):
    if Lable == 'IE':
        Lable = 'I'
        predict_Lable = 'Introverts'
    elif Lable == 'NS':
        Lable = 'N'
        predict_Lable = 'Intuitives'
    elif Lable == 'FT':
        Lable = 'F'
        predict_Lable = 'Feelers'
    elif Lable == 'PJ':
        Lable = 'P'
        predict_Lable = 'Perceivers'

    X_train, X_test, Y_train, Y_test = train_test_split(X_Features, dataset_PD, test_size=test_Szie)#20% of dataset is test set
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth ,n_jobs=-1)#Max depth of tree is 20
    rf_model = rf.fit(X_train, Y_train)
    Y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(Y_test, Y_pred, pos_label=Lable, average='binary')
    if is_print:
        print('Results for predicting being ' + predict_Lable + ' using ' + feature_type + ' for'  ,
        '{} Estimators / Max Depth of {} ----- Precision: {} / Recall: {} / Accuracy: {}'.format(
                                                        n_est,
                                                        depth,
                                                        round(precision, 3),
                                                        round(recall, 3), 
                                                        round((Y_pred==Y_test).sum() / len(Y_pred),3)))
    return([precision, recall, fscore, support])


for item in classes:
    for key, X_Features in features_Dic.items():
        for n_est in [10, 50, 100]:
            for depth in [10, 20, 30, None]:
                rfClassifier_GridSearch(X_Features, mbti_Dataset[item], 0.2, item, key, n_est, depth)
                print('\n')


Results for predicting being Introverts using Count Vectorizer for 10 Estimators / Max Depth of 10 ----- Precision: 0.766 / Recall: 1.0 / Accuracy: 0.766


Results for predicting being Introverts using Count Vectorizer for 10 Estimators / Max Depth of 20 ----- Precision: 0.768 / Recall: 1.0 / Accuracy: 0.769


Results for predicting being Introverts using Count Vectorizer for 10 Estimators / Max Depth of 30 ----- Precision: 0.779 / Recall: 0.996 / Accuracy: 0.778


Results for predicting being Introverts using Count Vectorizer for 10 Estimators / Max Depth of None ----- Precision: 0.766 / Recall: 0.982 / Accuracy: 0.759


Results for predicting being Introverts using Count Vectorizer for 50 Estimators / Max Depth of 10 ----- Precision: 0.752 / Recall: 1.0 / Accuracy: 0.752


Results for predicting being Introverts using Count Vectorizer for 50 Estimators / Max Depth of 20 ----- Precision: 0.774 / Recall: 1.0 / Accuracy: 0.774


Results for predicting being Introverts using Count Vector

### 5-1-3: Evaluation Random Forest Model

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd

pd.set_option('display.max_colwidth', 500)

def RF_Evaluation(param, X_Features, dataset_PD, lable, is_print = True):
    rf = RandomForestClassifier()
    gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)#cv=5 means 5 folde validation
    gs_fit = gs.fit(X_Features, dataset_PD)
    if is_print:
        print(pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[['param_max_depth',
        'param_n_estimators', 'std_test_score', 'mean_test_score', 'rank_test_score']][0:5])

    return(pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[['param_max_depth',
    'param_n_estimators', 'std_test_score', 'mean_test_score', 'rank_test_score']][0:5])


param = {'n_estimators' : [10, 150, 300],
        'max_depth' : [30, 60, 90, None]}

classes = ['IE' , 'NS', 'FT', 'PJ']
for item in classes:
    for key, X_Features in features_Dic.items():
        RF_Evaluation(param, X_Features, mbti_Dataset[item], item)
        print(key)


   param_max_depth param_n_estimators  std_test_score  mean_test_score  \
6               90                 10        0.004380         0.771873   
0               30                 10        0.001094         0.770605   
10            None                150        0.000565         0.770490   
11            None                300        0.000461         0.770375   
7               90                150        0.000461         0.770259   

    rank_test_score  
6                 1  
0                 2  
10                3  
11                4  
7                 5  
Count Vectorizer
   param_max_depth param_n_estimators  std_test_score  mean_test_score  \
10            None                150        0.000936         0.770259   
8               90                300        0.000431         0.770144   
11            None                300        0.000672         0.770144   
4               60                150        0.000365         0.770029   
5               60                30

## 5-2: Gradient Boosting Model
### 5-2-1: Gradient Boosting with Holdout test set

In [16]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier


def GBoosting_GridSearch(X_Features, dataset_PD, test_Szie, Lable, feature_type , n_est, depth, lr ,is_print = True,):
    if Lable == 'IE':
        Lable = 'I'
        predict_Lable = 'Introverts'
    elif Lable == 'NS':
        Lable = 'N'
        predict_Lable = 'Intuitives'
    elif Lable == 'FT':
        Lable = 'F'
        predict_Lable = 'Feelers'
    elif Lable == 'PJ':
        Lable = 'P'
        predict_Lable = 'Perceivers'
    X_train, X_test, Y_train, Y_test = train_test_split(X_Features, dataset_PD, test_size=test_Szie) # 20% of our dataset is test set
    gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=depth , learning_rate=lr)#Max depth of tree is 20
    gb_model = gb.fit(X_train, Y_train)
    Y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support = score(Y_test, Y_pred, pos_label=Lable, average='binary')
    if is_print:
        print('Being ' + predict_Lable + ' using ' + feature_type + ' : '  ,'Estimators {} / Max_Depth {} / LearningRate : {} -----> Precision: {} / Recall: {} / Accuracy: {}'.format(
                                                        n_est,
                                                        depth,
                                                        lr,
                                                        round(precision, 3),
                                                        round(recall, 3), 
                                                        round((Y_pred==Y_test).sum() / len(Y_pred),3)))
    return([precision, recall, fscore, support])



classes = ['IE' , 'NS', 'FT', 'PJ']

for item in classes:
    for n_est in [50, 100, 150]:
        for depth in [3, 7, 11, 15]:
            for lr in [0.01, 0.1, 1]:
                GBoosting_GridSearch(X_Features, mbti_Dataset[item], 0.2, item, key, n_est, depth, lr)


Results for predicting being Introverts using TFIDF Vectorizer for  50 Estimators / Max Depth of 3 and LearningRate : 0.01 ----- Precision: 0.77 / Recall: 1.0 / Accuracy: 0.77
Results for predicting being Introverts using TFIDF Vectorizer for  50 Estimators / Max Depth of 3 and LearningRate : 0.1 ----- Precision: 0.817 / Recall: 0.966 / Accuracy: 0.808
Results for predicting being Introverts using TFIDF Vectorizer for  50 Estimators / Max Depth of 3 and LearningRate : 1 ----- Precision: 0.831 / Recall: 0.891 / Accuracy: 0.777
Results for predicting being Introverts using TFIDF Vectorizer for  50 Estimators / Max Depth of 7 and LearningRate : 0.01 ----- Precision: 0.763 / Recall: 0.999 / Accuracy: 0.763
Results for predicting being Introverts using TFIDF Vectorizer for  50 Estimators / Max Depth of 7 and LearningRate : 0.1 ----- Precision: 0.832 / Recall: 0.955 / Accuracy: 0.817
Results for predicting being Introverts using TFIDF Vectorizer for  50 Estimators / Max Depth of 7 and Learni

KeyboardInterrupt: 

### 5-2-2: Evaluation Gradient Boosting Model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd

pd.set_option('display.max_colwidth', 500)


def GB_Evaluation(param, X_Features, dataset_PD, lable):
    gb = GradientBoostingClassifier()
    gs = GridSearchCV(gb, param, cv=5)#cv=5 meand 5 folde validation
    gs_fit = gs.fit(X_Features, dataset_PD)
    print(pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[['param_max_depth',
     'param_n_estimators', 'std_test_score', 'mean_test_score', 'rank_test_score']][0:5])

    return(pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[['param_max_depth',
     'param_n_estimators', 'std_test_score', 'mean_test_score', 'rank_test_score']][0:5])
  
param = {'n_estimators' : [50, 100, 150],
                'max_depth' : [7, 11, 15],
                'Learning_rate' : [0.1]}

classes = ['IE' , 'NS', 'FT', 'PJ']


for item in classes:
    for key, X_Features in features_Dic.items():
        GB_Evaluation(param, X_Features, mbti_Dataset[item], item)
        print(key)