# Myers–Briggs Type Indicator (MBTI) Personality Prediction

## Libraries and Global Settings

In [16]:
import spacy
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS
import re
import pickle
import numpy as np
import warnings
from ipywidgets import widgets, interact
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

warnings.filterwarnings('ignore')

## Preprocessing

In [None]:
df = pd.read_csv('mbti_1.csv', encoding='utf8')
df.head(10)

In [17]:
df = pd.read_csv('mbti_normalized.csv', encoding='utf8')

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       8675 non-null   int64 
 1   type             8675 non-null   object
 2   posts            8675 non-null   object
 3   normalized_text  8674 non-null   object
dtypes: int64(1), object(3)
memory usage: 271.2+ KB


In [19]:
df.posts[0]

"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...

In [None]:
#train_data,test_data = train_test_split(df,test_size=0.2,random_state=42,stratify=df.type)

### Text Normalization

In [None]:
def normalizer(sentence):
 
    # Remove ||| from kaggle dataset
    sentence = re.sub("[]|||[]", " ", sentence)

    # remove reddit subreddit urls
    sentence = re.sub("/r/[0-9A-Za-z]", "", sentence)

    # remove MBTI types
    MBTI_types = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
              'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ',
              'MBTI']
    MBTI_types = [ti.lower() for ti in MBTI_types] + [ti.lower() + 's' for ti in MBTI_types]

    tokens = nlp(sentence)

    tokens = [ti for ti in tokens if ti.lower_ not in STOP_WORDS]
    tokens = [ti for ti in tokens if not ti.is_space]
    tokens = [ti for ti in tokens if not ti.is_punct]
    tokens = [ti for ti in tokens if not ti.like_num]
    tokens = [ti for ti in tokens if not ti.like_url]
    tokens = [ti for ti in tokens if not ti.like_email]
    tokens = [ti for ti in tokens if ti.lower_ not in MBTI_types]


    # lemmatize
    tokens = [ti.lemma_ for ti in tokens if ti.lemma_ not in STOP_WORDS]
    tokens = [ti for ti in tokens if len(ti) > 1]
    
    text = " ".join(tokens)
    return text

In [None]:
df['normalized_text'] = df.posts.apply(normalizer)

In [None]:
df.to_csv('mbti_normalized.csv')

In [20]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,type,posts,normalized_text
0,0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,moment sportscenter play prank life change exp...
1,1,ENTP,'I'm finding the lack of me in these posts ver...,find lack post alarming sex boring position ex...
2,2,INTP,'Good one _____ https://www.youtube.com/wat...,good course know blessing curse absolutely pos...
3,3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear enjoy conversation day esoteric gabbe nat...
4,4,ENTJ,'You're fired.|||That's another silly misconce...,fire silly misconception approach logically ke...


In [22]:
# Drop unnamed columns
df = df.drop(df.columns[0], axis=1)
df = df.drop(df.columns[0], axis=1)
df

Unnamed: 0,normalized_text
0,moment sportscenter play prank life change exp...
1,find lack post alarming sex boring position ex...
2,good course know blessing curse absolutely pos...
3,dear enjoy conversation day esoteric gabbe nat...
4,fire silly misconception approach logically ke...
...,...
8670,IxFP think cat Fi dom reason especially websit...
8671,thread exist someplace heck delete ooop guess ...
8672,question thing purple pill pick win lottery nu...
8673,conflicted right come want child honestly mate...


### Vectorization

In [23]:
vectorizer=TfidfVectorizer()
vectorizer.fit_transform(df)

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [25]:
post = vectorizer.transform(df).toarray()

In [26]:
post.shape

(1, 1)

In [None]:
df['vectorized_text'] = post

## Machine Learning
### K-Nearest Neighbors (KNN) with GridSearchCV

In [None]:
train, test = train_test_split(df, test_size=.2)

In [None]:
train

In [None]:
X_train = 
y_train = 
X_test = 
y_test = 

In [None]:
# Elbow method to determine the optimal K (n_neighbors)
error_rate = []

for i in range(1, 300):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn_opn.predict(X_test)
    error_rate.append(numpy.mean(pred_i != numpy.asarray(y_test)))

# Plotting a Line graphs of the error rate
plt.figure(figsize=(15,9))
plt.plot(range(1, 300), error_rate_opn, color='blue', linestyle='dashed', label='Openness')
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()

In [None]:
# Optimal K (n_neighbors)
k = 25

In [None]:
# Lists of hyperparameters
leaf_size = list(range(1, 30))
p=[1, 2]
weights = ['uniform', 'distance'],
metric = ['minkowski', 'euclidean', 'manhattan']

# Hyperparameters dictionary
hyperparameters = dict(leaf_size=leaf_size, p=p, metric=metric)

In [None]:
# Build KNN classifier with hyperparameter tuning
model = GridSearchCV(KNeighborsClassifier(n_neighbors=k), hyperparameters, scoring='accuracy', cv=5)
model = model.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)