<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Final project: NLP to predict Myers-Briggs Personality Type

## Imports

In [None]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import yellowbrick
from yellowbrick.model_selection import FeatureImportances
from yellowbrick.features import RadViz


# Text Processing
import re
import itertools
import spacy
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
from collections import Counter

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
import sklearn.cluster as cluster

# Ignore noise warning
import warnings
warnings.filterwarnings("ignore")

import pickle as pkl
from scipy import sparse
from numpy import asarray
from numpy import savetxt

# Fix imbalance
from imblearn.under_sampling import InstanceHardnessThreshold

# Model training and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score

#Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix, confusion_matrix
from sklearn.metrics import classification_report
#Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

## 3. Model building and evaluation: Machine Learning

From the 6 datasets created in the previous notebook, I will train several models, combining different algorithms (`GaussianNB`, `LogisticRegression`, `KNeighborsClassifier`, `DecisionTreeClassifier`, `RandomForestClassifier`, `GradientBoostingClassifier` and `MLPClassifier`) with each dataset and each target. Moreover, each combination will be trained twice: once on the original dataset and once on a resampled version of it.

Although more metrics will be analyzed, all of them weighted, I will focus on the F1 score. In this classification task precision is no more relevant than recall, nor the opposite, and F1 is much less prone than accuracy to overfitting or underfitting issues, especially considering that we will use weighted measures.

### Using types

#### Truncated SVD 

##### original sample

In [None]:
# Load the Truncated-SVD feature table and discard the index column the CSV export left behind.
result_svd_vec_types = pd.read_csv(
    "../input/2-mbti-preprocessing/result_svd_vec_types.csv"
).drop(columns=["Unnamed: 0"])

In [None]:
# Preview the first rows of the loaded frame.
result_svd_vec_types.head()

In [None]:
# (rows, columns) of the loaded frame.
result_svd_vec_types.shape

In [None]:
# Features: everything except the "type" label and the sixteen per-type columns.
X = result_svd_vec_types.drop(
    columns=[
        "type",
        "enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp",
        "infj", "infp", "intj", "intp", "isfj", "isfp", "istj", "istp",
    ]
).values
# Target: the personality type label.
y = result_svd_vec_types["type"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split and averaged over
    the classes appearing in the test labels or predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # One 2x2 matrix per class, laid out as [[tn, fp], [fn, tp]].
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    fp = mcm[:, 0, 1]
    # Average over the classes actually returned instead of a fixed 16, so
    # a class missing from the test split does not deflate the score.
    specificity = np.mean(tn / (tn + fp))

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity'  : [specificity]
                            })
    return df_model

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    logit=LogisticRegression(),
    knn=KNeighborsClassifier(),
    decisiontree=DecisionTreeClassifier(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

<img src="https://www.nicepng.com/png/detail/148-1486992_discover-the-most-powerful-ways-to-automate-your.png" width="1000"> 

In [None]:
#raise SystemExit("Here it comes a very consuming memory process that takes about 45 minutes")

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_svd = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_svd.to_csv("models_svd.csv")
models_svd

###### Feature importance

In [None]:
# Plot and save gradient-boosting feature importances for the current target.
# The plotting context is set before drawing so that it applies to this figure.
sns.set_context("talk")

xgboost = GradientBoostingClassifier().fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(10,20))
# Draw on the axes created above rather than on whatever axes are current.
viz = FeatureImportances(xgboost, ax=ax)
viz.fit(X, y)
# Save before displaying: a figure that has already been shown can be closed
# by the notebook backend, which would leave savefig with an empty canvas.
viz.show(outpath="feature_importance_types.png")
plt.show()

##### resampled

In [None]:
def sampling_k_elements(group, k=39, random_state=None):
    """Return at most ``k`` randomly chosen rows of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one class.
    k : int, default 39
        Maximum number of rows to keep.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a fixed value to make the
        draw reproducible. The default ``None`` keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself when it has fewer than ``k`` rows, otherwise a
        random sample of ``k`` rows.
    """
    if len(group) < k:
        return group
    return group.sample(k, random_state=random_state)

# Cap every "type" group with sampling_k_elements, then renumber the rows from 0.
# NOTE(review): no random_state reaches the sampler, so the balanced frame
# differs between runs even though the later train/test split is seeded.
balanced_svd = result_svd_vec_types.groupby("type").apply(sampling_k_elements).reset_index(drop=True)

In [None]:
# Features: everything except the "type" label and the sixteen per-type columns.
X = balanced_svd.drop(
    columns=[
        "type",
        "enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp",
        "infj", "infp", "intj", "intp", "isfj", "isfp", "istj", "istp",
    ]
).values
# Target: the personality type label.
y = balanced_svd["type"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split and averaged over
    the classes appearing in the test labels or predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # One 2x2 matrix per class, laid out as [[tn, fp], [fn, tp]].
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    fp = mcm[:, 0, 1]
    # Average over the classes actually returned instead of a fixed 16, so
    # a class missing from the test split does not deflate the score.
    specificity = np.mean(tn / (tn + fp))

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity': [specificity]
                            })
    return df_model

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    logit=LogisticRegression(),
    knn=KNeighborsClassifier(),
    decisiontree=DecisionTreeClassifier(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_svd_resampled = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_svd_resampled.to_csv("models_svd_resampled.csv")
models_svd_resampled

#### UMAP

##### original sample

In [None]:
# Load the UMAP feature table and discard the index column the CSV export left behind.
result_umap_types = pd.read_csv(
    "../input/2-mbti-preprocessing/result_umap_types.csv"
).drop(columns=["Unnamed: 0"])

In [None]:
# Preview the first rows of the loaded frame.
result_umap_types.head()

In [None]:
# (rows, columns) of the loaded frame.
result_umap_types.shape

In [None]:
# Features: everything except the "type" label and the sixteen per-type columns.
X = result_umap_types.drop(
    columns=[
        "type",
        "enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp",
        "infj", "infp", "intj", "intp", "isfj", "isfp", "istj", "istp",
    ]
).values
# Target: the personality type label.
y = result_umap_types["type"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split and averaged over
    the classes appearing in the test labels or predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # One 2x2 matrix per class, laid out as [[tn, fp], [fn, tp]].
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    fp = mcm[:, 0, 1]
    # Average over the classes actually returned instead of a fixed 16, so
    # a class missing from the test split does not deflate the score.
    specificity = np.mean(tn / (tn + fp))

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity': [specificity]
                            })
    return df_model

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    logit=LogisticRegression(),
    knn=KNeighborsClassifier(),
    decisiontree=DecisionTreeClassifier(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_umap = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_umap.to_csv("models_umap.csv")
models_umap

##### resampled

In [None]:
def sampling_k_elements(group, k=39, random_state=None):
    """Return at most ``k`` randomly chosen rows of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one class.
    k : int, default 39
        Maximum number of rows to keep.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a fixed value to make the
        draw reproducible. The default ``None`` keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself when it has fewer than ``k`` rows, otherwise a
        random sample of ``k`` rows.
    """
    if len(group) < k:
        return group
    return group.sample(k, random_state=random_state)

# Cap every "type" group with sampling_k_elements, then renumber the rows from 0.
# NOTE(review): no random_state reaches the sampler, so the balanced frame
# differs between runs even though the later train/test split is seeded.
balanced_umap = result_umap_types.groupby("type").apply(sampling_k_elements).reset_index(drop=True)

In [None]:
# Features: everything except the "type" label and the sixteen per-type columns.
X = balanced_umap.drop(
    columns=[
        "type",
        "enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp",
        "infj", "infp", "intj", "intp", "isfj", "isfp", "istj", "istp",
    ]
).values
# Target: the personality type label.
y = balanced_umap["type"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split and averaged over
    the classes appearing in the test labels or predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # One 2x2 matrix per class, laid out as [[tn, fp], [fn, tp]].
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    fp = mcm[:, 0, 1]
    # Average over the classes actually returned instead of a fixed 16, so
    # a class missing from the test split does not deflate the score.
    specificity = np.mean(tn / (tn + fp))

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity': [specificity]
                            })
    return df_model

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    logit=LogisticRegression(),
    knn=KNeighborsClassifier(),
    decisiontree=DecisionTreeClassifier(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_umap_resampled = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_umap_resampled.to_csv("models_umap_resampled.csv")
models_umap_resampled

#### UMAP on TSVD

##### original sample

In [None]:
# Load the UMAP-on-TSVD feature table and discard the index column the CSV export left behind.
result_umap_svd_types = pd.read_csv(
    "../input/2-mbti-preprocessing/result_umap_svd_types.csv"
).drop(columns=["Unnamed: 0"])

In [None]:
# Preview the first rows of the loaded frame.
result_umap_svd_types.head()

In [None]:
# (rows, columns) of the loaded frame.
result_umap_svd_types.shape

In [None]:
# Features: everything except the "type" label and the sixteen per-type columns.
X = result_umap_svd_types.drop(
    columns=[
        "type",
        "enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp",
        "infj", "infp", "intj", "intp", "isfj", "isfp", "istj", "istp",
    ]
).values
# Target: the personality type label.
y = result_umap_svd_types["type"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split and averaged over
    the classes appearing in the test labels or predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # One 2x2 matrix per class, laid out as [[tn, fp], [fn, tp]].
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    fp = mcm[:, 0, 1]
    # Average over the classes actually returned instead of a fixed 16, so
    # a class missing from the test split does not deflate the score.
    specificity = np.mean(tn / (tn + fp))

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity': [specificity]
                            })
    return df_model

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    logit=LogisticRegression(),
    knn=KNeighborsClassifier(),
    decisiontree=DecisionTreeClassifier(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_umap_svd = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_umap_svd.to_csv("models_umap_svd.csv")
models_umap_svd

##### resampled

In [None]:
def sampling_k_elements(group, k=39, random_state=None):
    """Return at most ``k`` randomly chosen rows of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one class.
    k : int, default 39
        Maximum number of rows to keep.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a fixed value to make the
        draw reproducible. The default ``None`` keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself when it has fewer than ``k`` rows, otherwise a
        random sample of ``k`` rows.
    """
    if len(group) < k:
        return group
    return group.sample(k, random_state=random_state)

# Cap every "type" group with sampling_k_elements, then renumber the rows from 0.
# NOTE(review): no random_state reaches the sampler, so the balanced frame
# differs between runs even though the later train/test split is seeded.
balanced_umap_svd = result_umap_svd_types.groupby("type").apply(sampling_k_elements).reset_index(drop=True)

In [None]:
# Features: everything except the "type" label and the sixteen per-type columns.
X = balanced_umap_svd.drop(
    columns=[
        "type",
        "enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp",
        "infj", "infp", "intj", "intp", "isfj", "isfp", "istj", "istp",
    ]
).values
# Target: the personality type label.
y = balanced_umap_svd["type"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split and averaged over
    the classes appearing in the test labels or predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # One 2x2 matrix per class, laid out as [[tn, fp], [fn, tp]].
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    fp = mcm[:, 0, 1]
    # Average over the classes actually returned instead of a fixed 16, so
    # a class missing from the test split does not deflate the score.
    specificity = np.mean(tn / (tn + fp))

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity': [specificity]
                            })
    return df_model

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    logit=LogisticRegression(),
    knn=KNeighborsClassifier(),
    decisiontree=DecisionTreeClassifier(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_umap_svd_resampled = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_umap_svd_resampled.to_csv("models_umap_svd_resampled.csv")
models_umap_svd_resampled

### Using dimensions

Before proceeding to Deep Learning methods and fine-tuning of the models previously evaluated, we will try training the models on each of the 4 dimensions, using the methods and sample that worked best.

#### Truncated SVD with the original sample

In [None]:
# Load the Truncated-SVD table with per-dimension targets and discard the exported index column.
result_svd_vec_dimensions = pd.read_csv(
    "../input/2-mbti-preprocessing/result_svd_vec_dimensions.csv"
).drop(columns=["Unnamed: 0"])

In [None]:
# Preview the first rows of the loaded frame.
result_svd_vec_dimensions.head()

In [None]:
# (rows, columns) of the loaded frame.
result_svd_vec_dimensions.shape

##### Introversion (I) – Extroversion (E)

In [None]:
# Features: everything except the "type" label and the four dimension columns.
X = result_svd_vec_dimensions.drop(
    columns=["type", "i-e", "n-s", "t-f", "j-p"]
).values
# Target: the "i-e" column.
y = result_svd_vec_dimensions["i-e"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` on a binary target and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity. The
        four-way unpacking below requires exactly two classes.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # confusion_matrix takes (y_true, y_pred); with the arguments swapped the
    # matrix is transposed and tn / (tn + fp) is no longer specificity.
    tn, fp, _, _ = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity'  : [specificity]
                            })
    return df_model

In [None]:
# Reduced candidate set for the per-dimension runs, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_ie = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_ie.to_csv("models_ie.csv")
models_ie

###### Feature importance

In [None]:
# Plot and save gradient-boosting feature importances for the current target.
# The plotting context is set before drawing so that it applies to this figure.
sns.set_context("talk")

xgboost = GradientBoostingClassifier().fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(10,20))
# Draw on the axes created above rather than on whatever axes are current.
viz = FeatureImportances(xgboost, ax=ax)
viz.fit(X, y)
# Save before displaying: a figure that has already been shown can be closed
# by the notebook backend, which would leave savefig with an empty canvas.
viz.show(outpath="feature_importance_i-e.png")
plt.show()

##### Intuition (N) – Sensing (S)

In [None]:
# Features: everything except the "type" label and the four dimension columns.
X = result_svd_vec_dimensions.drop(
    columns=["type", "i-e", "n-s", "t-f", "j-p"]
).values
# Target: the "n-s" column.
y = result_svd_vec_dimensions["n-s"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` on a binary target and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity. The
        four-way unpacking below requires exactly two classes.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # confusion_matrix takes (y_true, y_pred); with the arguments swapped the
    # matrix is transposed and tn / (tn + fp) is no longer specificity.
    tn, fp, _, _ = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity'  : [specificity]
                            })
    return df_model

In [None]:
# Reduced candidate set for the per-dimension runs, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_ns = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_ns.to_csv("models_ns.csv")
models_ns

###### Feature importance

In [None]:
# Plot and save gradient-boosting feature importances for the current target.
# The plotting context is set before drawing so that it applies to this figure.
sns.set_context("talk")

xgboost = GradientBoostingClassifier().fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(10,20))
# Draw on the axes created above rather than on whatever axes are current.
viz = FeatureImportances(xgboost, ax=ax)
viz.fit(X, y)
# Save before displaying: a figure that has already been shown can be closed
# by the notebook backend, which would leave savefig with an empty canvas.
viz.show(outpath="feature_importance_n-s.png")
plt.show()

##### Thinking (T) – Feeling (F)

In [None]:
# Features: everything except the "type" label and the four dimension columns.
X = result_svd_vec_dimensions.drop(
    columns=["type", "i-e", "n-s", "t-f", "j-p"]
).values
# Target: the "t-f" column.
y = result_svd_vec_dimensions["t-f"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` on a binary target and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity. The
        four-way unpacking below requires exactly two classes.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # confusion_matrix takes (y_true, y_pred); with the arguments swapped the
    # matrix is transposed and tn / (tn + fp) is no longer specificity.
    tn, fp, _, _ = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity'  : [specificity]
                            })
    return df_model

In [None]:
# Reduced candidate set for the per-dimension runs, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_tf = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_tf.to_csv("models_tf.csv")
models_tf

###### Feature importance

In [None]:
# Plot and save gradient-boosting feature importances for the current target.
# The plotting context is set before drawing so that it applies to this figure.
sns.set_context("talk")

xgboost = GradientBoostingClassifier().fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(10,20))
# Draw on the axes created above rather than on whatever axes are current.
viz = FeatureImportances(xgboost, ax=ax)
viz.fit(X, y)
# Save before displaying: a figure that has already been shown can be closed
# by the notebook backend, which would leave savefig with an empty canvas.
viz.show(outpath="feature_importance_t-f.png")
plt.show()

##### Judging (J) – Perceiving (P)

In [None]:
# Features: everything except the "type" label and the four dimension columns.
X = result_svd_vec_dimensions.drop(
    columns=["type", "i-e", "n-s", "t-f", "j-p"]
).values
# Target: the "j-p" column.
y = result_svd_vec_dimensions["j-p"].values

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` on a binary target and return a one-row DataFrame of its scores.

    Accuracy and weighted precision, recall and F1 are means over a
    shuffled 5-fold stratified cross-validation of the training split.
    Specificity is measured on the held-out test split.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used only for specificity. The
        four-way unpacking below requires exactly two classes.
    name : str
        Label written to the ``model`` column.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1score`` and ``specificity``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_validate

    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)

    # A single cross-validation run scores all four metrics on the same
    # folds; separate cross_val_score calls would reshuffle the folds and
    # refit the model once per metric.
    cv_scores = cross_validate(
        model, X_train, y_train, cv=strat_k_fold, n_jobs=-1,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = np.mean(cv_scores['test_accuracy'])
    precision = np.mean(cv_scores['test_precision_weighted'])
    recall = np.mean(cv_scores['test_recall_weighted'])
    f1score = np.mean(cv_scores['test_f1_weighted'])

    y_pred = model.predict(X_test)
    # confusion_matrix takes (y_true, y_pred); with the arguments swapped the
    # matrix is transposed and tn / (tn + fp) is no longer specificity.
    tn, fp, _, _ = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity'  : [specificity]
                            })
    return df_model

In [None]:
# Reduced candidate set for the per-dimension runs, keyed by report label.
models = dict(
    gnb=GaussianNB(),
    randomforest=RandomForestClassifier(),
    # Labelled "xgboost", but the estimator is scikit-learn's GradientBoostingClassifier.
    xgboost=GradientBoostingClassifier(),
    MLPC=MLPClassifier(),
)

In [None]:
# Score every candidate on the current split, rank by F1 (best first), persist and display.
models_jp = pd.concat(
    [
        baseline_report(clf, X_train, X_test, y_train, y_test, label)
        for label, clf in models.items()
    ]
).sort_values(by='f1score', ascending=False)
models_jp.to_csv("models_jp.csv")
models_jp

###### Feature importance

In [None]:
# Plot and save gradient-boosting feature importances for the current target.
# The plotting context is set before drawing so that it applies to this figure.
sns.set_context("talk")

xgboost = GradientBoostingClassifier().fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(10,20))
# Draw on the axes created above rather than on whatever axes are current.
viz = FeatureImportances(xgboost, ax=ax)
viz.fit(X, y)
# Save before displaying: a figure that has already been shown can be closed
# by the notebook backend, which would leave savefig with an empty canvas.
viz.show(outpath="feature_importance_j-p.png")
plt.show()

**Comments**

In [None]:
# Compare the best whole-type F1 with the product of the best per-dimension F1s.
# Every report frame was sorted by f1score in descending order, so its first
# row is the best model. The column is read by name rather than by position
# so the lookup survives any change in the report's column order.
dimensions = (
    models_ie['f1score'].iloc[0]
    * models_ns['f1score'].iloc[0]
    * models_tf['f1score'].iloc[0]
    * models_jp['f1score'].iloc[0]
)
types = models_svd['f1score'].iloc[0]

print("F1 Scores:")
print("Types =", types,"vs","Dimensions =", dimensions)

So, in the end, the model trained on types predicts better than applying the 4 dimension models consecutively, unless we are particularly interested in one of the personality dimensions. In that case, using the model for that particular dimension would be my recommendation.

It also seems, judging by the feature importances, that words_per_comment and variance_of_word_counts were not relevant when training the models to predict the types and dimensions.