<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Final project: NLP to predict Myers-Briggs Personality Type

## Imports

In [None]:
"""
You'll first need to download glove.6B.100d.txt from https://nlp.stanford.edu/projects/glove/ and save it to the following
path (NLP-to-predict-Myers-Briggs-Personality-Type/glove_data/glove.6B/) of your local version of my project's repository
"""

In [None]:
import re
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import math

import pickle as pkl

from sklearn.model_selection import train_test_split

import keras.metrics
from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.utils import class_weight
from keras.layers import *
from keras.models import Model

## 4. Model building and evaluation: Deep Learning

In [None]:
# Load spaCy's small English pipeline; the 'en_core_web_sm' model has to be available in the environment.
spacy_nlp = spacy.load('en_core_web_sm')

In [None]:
# English stop-word set bundled with spaCy; it is not referenced again in the visible part of this notebook.
stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
# Hyper-parameters and paths shared by both deep-learning models below (types and dimensions).
MAX_NB_WORDS = 100000    # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 200 # max length of each entry (sentence), including padding
VALIDATION_SPLIT = 0.2   # data for validation (not used in training)
EMBEDDING_DIM = 100      # embedding dimensions for word vectors (word2vec/GloVe)
# NOTE: despite its name, GLOVE_DIR is the full path of the GloVe vectors *file*; it is passed straight to open().
GLOVE_DIR = "../input/nlp-to-predict-myers-briggs-personality-type/NLP-to-predict-Myers-Briggs-Personality-Type/glove_data/glove.6B/glove.6B.100d.txt"

### Using Types

In [None]:
# Load the cleaned posts and the per-type label table (presumably outputs of the preprocessing notebook — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you produced yourself.
mbti_df_clean = pd.read_pickle("../input/2-mbti-preprocessing/mbti_clean_text.pkl")
result_umap_types  = pd.read_csv("../input/2-mbti-preprocessing/result_umap_types.csv")

In [None]:
# Preview the first rows of the cleaned-text frame.
mbti_df_clean.head()

In [None]:
# Preview the first rows of the per-type label table.
result_umap_types.head()

In [None]:
# Report how many posts carry each MBTI type label.
# Every type column is expected to hold only 0/1 values, so np.bincount yields exactly
# two counts (negatives, positives); the unpacking fails loudly if that is not the case.
mbti_types = ["enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp",
              "infj", "infp", "intj", "intp", "isfj", "isfp", "istj", "istp"]

for mbti_type in mbti_types:
    neg, pos = np.bincount(result_umap_types[mbti_type])
    total = neg + pos
    print('Total: {}\n    {}: {} ({:.2f}% of total)\n'.format(total, mbti_type, pos, 100 * pos / total))

In [None]:
# Hard-coded number of samples per class index (the 16 values add up to 8675).
# The index order presumably follows the alphabetical type order enfj ... istp used for the
# label columns — verify against the counts printed by the class-balance cell above.
labels_dict = dict(enumerate([
    190, 675, 231, 685,      # 0-3
    42, 48, 39, 89,          # 4-7
    1470, 1832, 1091, 1304,  # 8-11
    166, 271, 205, 337,      # 12-15
]))

In [None]:
def create_class_weight(labels_dict, total=8675, floor=1.0):
    """Compute log-scaled class weights from per-class sample counts.

    Each class gets ``log(total / count)``, floored at ``floor`` so that frequent
    classes are never weighted below it.

    Parameters
    ----------
    labels_dict : dict
        Maps class key -> number of samples of that class.
    total : int or float, optional
        Reference sample count used as the numerator. Defaults to 8675, the value
        that used to be hard-coded here.
    floor : float, optional
        Minimum weight returned for any class (default 1.0).

    Returns
    -------
    dict
        Maps each key of ``labels_dict`` to its weight.

    Raises
    ------
    ZeroDivisionError
        If a class count is 0.
    ValueError
        If ``total / count`` is negative (``math.log`` domain error).
    """
    # Named `weights` so the local does not shadow the imported sklearn `class_weight` module.
    weights = {}
    for key, count in labels_dict.items():
        score = math.log(total / float(count))
        weights[key] = score if score > floor else floor
    return weights

# Per-class loss weights for the 16-type model; passed to model.fit(class_weight=...) further down.
class_weights = create_class_weight(labels_dict)

In [None]:
# Column names of the 16 type indicators; their order defines the order of the model outputs.
# NOTE: `labels` is rebound to the shuffled label array in a later cell.
labels = ["enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp", "infj", "infp", "intj", "intp", "isfj", 
          "isfp", "istj", "istp"]
# Target matrix with one column per type, and the cleaned post text as model input.
y = result_umap_types[labels].values
X = mbti_df_clean["posts_clean"]

In [None]:
# Hold out 20% of the samples as a test set (fixed seed), then show the resulting shapes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Turn the training texts (presumably a pandas Series) into a plain Python list.
X_train = list(X_train)

In [None]:
# Separate list holding the training texts that are fed to the tokenizer.
texts = list(X_train)

In [None]:
# Fit the vocabulary on the training texts only; num_words caps how many words the tokenizer keeps.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

In [None]:
# Encode every text as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts)
# word -> id mapping of the fitted tokenizer; also used to size and fill the embedding matrix below.
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

In [None]:
# Bring every sequence to MAX_SEQUENCE_LENGTH ids; padding='post' appends the padding at the end.
data = pad_sequences(sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y_train.shape)

In [None]:
# Shuffle samples and labels with the same permutation. A dedicated, seeded RandomState
# makes the train/validation membership reproducible (same seed as train_test_split)
# without touching NumPy's global random state.
# NOTE: this cell is not idempotent — `data` is overwritten with its shuffled version and
# `y_train` is overwritten by the next cell, so re-run from the train_test_split cell onward.
indices = np.arange(data.shape[0])
np.random.RandomState(42).shuffle(indices)
data = data[indices]
labels = y_train[indices]

In [None]:
# Keep the last VALIDATION_SPLIT share of the shuffled samples for validation.
num_validation_samples = int(VALIDATION_SPLIT*data.shape[0])
# Slice with a positive index: `data[:-0]` would be empty, so negative slicing breaks
# when the validation share rounds down to 0 samples.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

print ((x_train.shape),(y_train.shape),(x_val.shape),(y_val.shape))

In [None]:
# Column sums of the label matrices = number of positive samples per output column.
print('Number of entries in each category:')
print('training: ', y_train.sum(axis=0))
print('validation: ', y_val.sum(axis=0))

In [None]:
# Show one encoded sample (row 10 of the shuffled data) together with its label row.
print('Tokenized sentences: \n', data[10])
print('One hot label: \n', labels[10])

In [None]:
# Parse the GloVe text file: each line is "<word> <v1> <v2> ...".
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(GLOVE_DIR, encoding="UTF-8") as f:
    print('Loading GloVe from:', GLOVE_DIR,'...', end='')
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print("Done.\n Proceeding with Embedding Matrix...", end="")

# One row per word id (+1 because row 0 is not covered by the ids taken from word_index here).
# Rows of words missing from GloVe keep their random initial values in [0, 1).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(" Completed!")

In [None]:
# Model input: one row of MAX_SEQUENCE_LENGTH integer word ids per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Embedding layer initialised from embedding_matrix and frozen (trainable=False),
# so the word vectors are not updated during training.
embedding_layer = Embedding(len(word_index) + 1,
                           EMBEDDING_DIM,
                           weights = [embedding_matrix],
                           input_length = MAX_SEQUENCE_LENGTH,
                           trainable=False,
                           name = 'embeddings')
embedded_sequences = embedding_layer(sequence_input)

In [None]:
# LSTM over the embedded sequence -> max over time steps -> small dense head with dropout
# -> 16-way softmax (one output unit per MBTI type column).
x = LSTM(60, return_sequences=True,name='lstm_layer')(embedded_sequences)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
preds = Dense(16, activation="softmax")(x)

In [None]:
# Adam with its default settings.
opt = keras.optimizers.Adam()

# Single-label, 16-class setup: categorical cross-entropy on the softmax output.
# Accuracy, precision and recall are registered as weighted metrics.
model = Model(sequence_input, preds)
model.compile(loss = 'categorical_crossentropy',
             optimizer=opt,
             weighted_metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])
model.summary()

In [None]:
print('Training progress:')
# Train for 15 epochs, evaluating on (x_val, y_val) after each one; class_weights rescales
# the loss contribution per class index.
history = model.fit(x_train, y_train, epochs = 15, batch_size=32, validation_data=(x_val, y_val), verbose=2, 
                    class_weight=class_weights)
# Reset Keras' global state after training.
# NOTE(review): presumably so the next model's metrics get un-suffixed names ("precision", "recall") — confirm.
keras.backend.clear_session()

In [None]:
# Per-epoch metrics of the types model as a DataFrame, saved to CSV.
# The CSV is written before 'val_f1' is added in the next cell, so it does not contain that column.
hist_df = pd.DataFrame(history.history) 
hist_df.to_csv("types_hist_df.csv")
# Alias, not a copy: types_hist_df and hist_df are the same object at this point.
types_hist_df = hist_df


In [None]:
# F1 is the harmonic mean of precision and recall: 2 * P * R / (P + R).
# (The factor 2 was missing, which reported only half of the actual F1.)
# Epochs where precision + recall == 0 yield NaN, which .max() skips later on.
types_hist_df['val_f1'] = ((2 * types_hist_df["val_precision"] * types_hist_df["val_recall"]) /
                           (types_hist_df["val_precision"] + types_hist_df["val_recall"]))

In [None]:
# Keep only the columns of interest, in a fixed order, and display the table.
# NOTE(review): raises KeyError if the history keys differ from these names (e.g. suffixed metric names) — confirm.
types_hist_df = types_hist_df[['val_loss', 'loss', 'val_accuracy', 'accuracy', 'val_precision', 'precision',
                  'val_recall', 'recall', 'val_f1']]
types_hist_df

In [None]:
# Set the seaborn context *before* creating the figure; calling it after savefig/plot
# (as before) had no effect on this figure and only changed later ones.
sns.set_context("talk")

# Validation loss and metrics per epoch for the types model.
plt.figure(figsize=(18,10))
plt.plot(types_hist_df[['val_loss', 'val_accuracy', 'val_precision', 'val_recall', 'val_f1']])

plt.title('Training and Validation: Types')
plt.xlabel('Epochs')
plt.ylabel('Metrics')
plt.legend(['val_loss', 'val_accuracy', 'val_precision', 'val_recall', 'val_f1'])
# Save before show(): the saved file should contain the fully drawn figure.
plt.savefig("types_history.png")

plt.show()

### Using dimensions

<img src="https://www.nicepng.com/png/detail/148-1486992_discover-the-most-powerful-ways-to-automate-your.png" width="1000"> 

In [None]:
#raise SystemExit("Here comes another quite memory-consuming process. You had better not start it until everything else has iterated properly")

In [None]:
# mbti_df_clean is read again from the same pickle as in the "Using Types" section;
# the per-dimension label table is new.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you produced yourself.
mbti_df_clean = pd.read_pickle("../input/2-mbti-preprocessing/mbti_clean_text.pkl")
result_umap_dimensions  = pd.read_csv("../input/2-mbti-preprocessing/result_umap_dimensions.csv")

In [None]:
# Preview the first rows of the cleaned-text frame.
mbti_df_clean.head()

In [None]:
# Preview the first rows of the per-dimension label table.
result_umap_dimensions.head()

In [None]:
# Report how many posts are positive on each of the four dimensions.
# Every dimension column is expected to hold only 0/1 values, so np.bincount yields exactly
# two counts (negatives, positives); the unpacking fails loudly if that is not the case.
for dimension in ["i-e", "n-s", "t-f", "j-p"]:
    neg, pos = np.bincount(result_umap_dimensions[dimension])
    total = neg + pos
    print('Total: {}\n    {}: {} ({:.2f}% of total)\n'.format(total, dimension, pos, 100 * pos / total))


In [None]:
# Hard-coded sample count per output index. The four values add up to 13931, i.e. more
# than the 8675 used as `total` in create_class_weight, so samples count towards several indices.
# The index order presumably follows i-e, n-s, t-f, j-p — verify against the counts printed above.
labels_dict = dict(enumerate([1999, 1997, 4694, 5241]))

In [None]:
def create_class_weight(labels_dict, total=8675, floor=1.0):
    """Compute log-scaled class weights from per-class sample counts.

    Each class gets ``log(total / count)``, floored at ``floor`` so that frequent
    classes are never weighted below it. This is a repeat of the definition in the
    "Using Types" section, kept identical so either cell can be run on its own.

    Parameters
    ----------
    labels_dict : dict
        Maps class key -> number of samples of that class.
    total : int or float, optional
        Reference sample count used as the numerator. Defaults to 8675, the value
        that used to be hard-coded here.
    floor : float, optional
        Minimum weight returned for any class (default 1.0).

    Returns
    -------
    dict
        Maps each key of ``labels_dict`` to its weight.

    Raises
    ------
    ZeroDivisionError
        If a class count is 0.
    ValueError
        If ``total / count`` is negative (``math.log`` domain error).
    """
    # Named `weights` so the local does not shadow the imported sklearn `class_weight` module.
    weights = {}
    for key, count in labels_dict.items():
        score = math.log(total / float(count))
        weights[key] = score if score > floor else floor
    return weights

# Per-index loss weights for the 4-dimension model; passed to model.fit(class_weight=...) further down.
class_weights = create_class_weight(labels_dict)

In [None]:
# Column names of the four dimension indicators; their order defines the order of the model outputs.
# NOTE: `labels` is rebound to the shuffled label array in a later cell.
labels = ["i-e", "n-s", "t-f", "j-p"]
# Target matrix with one 0/1 column per dimension, and the cleaned post text as model input.
y = result_umap_dimensions[labels].values
X = mbti_df_clean["posts_clean"]

In [None]:
# Hold out 20% of the samples as a test set (fixed seed), then show the resulting shapes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Turn the training texts (presumably a pandas Series) into a plain Python list.
X_train = list(X_train)

In [None]:
# Separate list holding the training texts that are fed to the tokenizer.
texts = list(X_train)

In [None]:
# Fresh tokenizer fitted on the training texts only; num_words caps how many words it keeps.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

In [None]:
# Encode every text as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts)
# word -> id mapping of the fitted tokenizer; also used to size and fill the embedding matrix below.
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

In [None]:
# Bring every sequence to MAX_SEQUENCE_LENGTH ids; padding='post' appends the padding at the end.
data = pad_sequences(sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y_train.shape)

In [None]:
# Shuffle samples and labels with the same permutation. A dedicated, seeded RandomState
# makes the train/validation membership reproducible (same seed as train_test_split)
# without touching NumPy's global random state.
# NOTE: this cell is not idempotent — `data` is overwritten with its shuffled version and
# `y_train` is overwritten by the next cell, so re-run from the train_test_split cell onward.
indices = np.arange(data.shape[0])
np.random.RandomState(42).shuffle(indices)
data = data[indices]
labels = y_train[indices]

In [None]:
# Keep the last VALIDATION_SPLIT share of the shuffled samples for validation.
num_validation_samples = int(VALIDATION_SPLIT*data.shape[0])
# Slice with a positive index: `data[:-0]` would be empty, so negative slicing breaks
# when the validation share rounds down to 0 samples.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

print ((x_train.shape),(y_train.shape),(x_val.shape),(y_val.shape))

In [None]:
# Column sums of the label matrices = number of positive samples per dimension column.
print('Number of entries in each category:')
print('training: ', y_train.sum(axis=0))
print('validation: ', y_val.sum(axis=0))

In [None]:
# Show one encoded sample (row 10 of the shuffled data) together with its label row.
# NOTE(review): in this section a label row may contain several 1s (one per dimension), despite the printed caption — confirm.
print('Tokenized sentences: \n', data[10])
print('One hot label: \n', labels[10])

In [None]:
# Parse the GloVe text file: each line is "<word> <v1> <v2> ...".
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(GLOVE_DIR, encoding="UTF-8") as f:
    print('Loading GloVe from:', GLOVE_DIR,'...', end='')
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print("Done.\n Proceeding with Embedding Matrix...", end="")

# One row per word id (+1 because row 0 is not covered by the ids taken from word_index here).
# Rows of words missing from GloVe keep their random initial values in [0, 1).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(" Completed!")

In [None]:
# Model input: one row of MAX_SEQUENCE_LENGTH integer word ids per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Embedding layer initialised from embedding_matrix and frozen (trainable=False),
# so the word vectors are not updated during training.
embedding_layer = Embedding(len(word_index) + 1,
                           EMBEDDING_DIM,
                           weights = [embedding_matrix],
                           input_length = MAX_SEQUENCE_LENGTH,
                           trainable=False,
                           name = 'embeddings')
embedded_sequences = embedding_layer(sequence_input)

In [None]:
x = LSTM(60, return_sequences=True,name='lstm_layer')(embedded_sequences)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
preds = Dense(4, activation="softmax")(x)

In [None]:
opt = keras.optimizers.Adam()

model = Model(sequence_input, preds)
model.compile(loss = 'categorical_crossentropy',
             optimizer=opt,
             weighted_metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])
model.summary()

In [None]:
print('Training progress:')
# Train for 10 epochs, evaluating on (x_val, y_val) after each one.
# NOTE(review): y_train has four independent 0/1 columns here, while class_weight is keyed by a
# single class index — confirm that it weights samples as intended for multi-label targets.
history = model.fit(x_train, y_train, epochs = 10, batch_size=32, validation_data=(x_val, y_val), verbose=2, 
                    class_weight=class_weights)
# Reset Keras' global state after training.
keras.backend.clear_session()

In [None]:
# Per-epoch metrics of the dimensions model as a DataFrame, saved to CSV.
# The CSV is written before 'val_f1' is added in the next cell, so it does not contain that column.
hist_df = pd.DataFrame(history.history) 
hist_df.to_csv("dimensions_hist_df.csv")
# Alias, not a copy: dimensions_hist_df and hist_df are the same object at this point.
dimensions_hist_df = hist_df
dimensions_hist_df

In [None]:
# F1 is the harmonic mean of precision and recall: 2 * P * R / (P + R).
# (The factor 2 was missing, which reported only half of the actual F1.)
# Epochs where precision + recall == 0 yield NaN, which .max() skips later on.
dimensions_hist_df['val_f1'] = ((2 * dimensions_hist_df["val_precision"] * dimensions_hist_df["val_recall"]) /
                                (dimensions_hist_df["val_precision"] + dimensions_hist_df["val_recall"]))

In [None]:
# Keep only the columns of interest, in a fixed order, and display the table.
# NOTE(review): raises KeyError if the history keys differ from these names (e.g. suffixed metric names) — confirm.
dimensions_hist_df = dimensions_hist_df[['val_loss', 'loss', 'val_accuracy', 'accuracy', 'val_precision', 'precision',
                  'val_recall', 'recall', 'val_f1']]
dimensions_hist_df

In [None]:
# Set the seaborn context *before* creating the figure so this cell renders the same way
# whether or not an earlier plotting cell has already been run.
sns.set_context("talk")

# Validation metrics per epoch for the dimensions model (val_loss is not plotted here).
plt.figure(figsize=(18,10))
plt.plot(dimensions_hist_df[['val_accuracy', 'val_precision', 'val_recall', 'val_f1']])

plt.title('Training and Validation: Dimensions')
plt.xlabel('Epochs')
plt.ylabel('Metrics')
plt.legend(['val_accuracy', 'val_precision', 'val_recall', 'val_f1'])
# Save before show(): the saved file should contain the fully drawn figure.
plt.savefig("dimensions_history.png")

plt.show()

#### Comments

In [None]:
# Best validation F1 reached by each model over all epochs (types first, then dimensions).
types, dimensions = (
    history_frame["val_f1"].max()
    for history_frame in (types_hist_df, dimensions_hist_df)
)

print("Best F1 Scores for both models, the one using types and the one using dimensions are of" ,types, "and" ,dimensions, " respectively, still much lower than the scores obtained using ML models.")

Best F1 Scores for both models, the one using types and the one using dimensions are of 0.205942 and 0.130249 respectively, still much lower than the scores obtained using ML models.