In [208]:
import pandas as pd
import numpy as np
import re

# plotting
import matplotlib.pyplot as plt
%matplotlib inline

# read data
mbti = pd.read_csv('.\data\\training\\mbti.csv') 
mbti.head(10)

Unnamed: 0,type,posts
0,ENFJ,'https://www.youtube.com/watch?v=PLAaiKvHvZs||...
1,ENFJ,https://www.youtube.com/watch?v=AwgF14ySLpw I...
2,ENFJ,'That sounds like a beautiful relationship alr...
3,ENFJ,'I've always thought of Tony Stark as more of ...
4,ENFJ,'ABILITY TO TRANSFORM. Form of... a bucket of...
5,ENFJ,It burns!! Haha|||http://personalitycafe.com/m...
6,ENFJ,'http://www.youtube.com/watch?v=3mokC24vTPI|||...
7,ENFJ,"'I had an ESTJ boss, who was a kinda control f..."
8,ENFJ,'ENFJ with a concussion: A Case Study. :dry: ...
9,ENFJ,"What arguments? There were none. You stated, b..."


In [209]:
#[p.split('|||') for p in data.head(2).posts.values]

In [210]:
b_Pers = {'I':0, 'E':1, 'N':0, 'S':1, 'F':0, 'T':1, 'J':0, 'P':1}
b_Pers_list = [{0:'I', 1:'E'}, {0:'N', 1:'S'}, {0:'F', 1:'T'}, {0:'J', 1:'P'}]

def translate_personality(personality):
    # transform mbti to binary vector
    
    return [b_Pers[l] for l in personality]

def translate_back(personality):
    # transform binary vector to mbti personality
    
    s = ""
    for i, l in enumerate(personality):
        s += b_Pers_list[i][l]
    return s

# Check ...
d = mbti.head(4)
list_personality_bin = np.array([translate_personality(p) for p in d.type])
print(d.type)
print("Binarize MBTI list: \n%s" % list_personality_bin)

0    ENFJ
1    ENFJ
2    ENFJ
3    ENFJ
Name: type, dtype: object
Binarize MBTI list: 
[[1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]]


In [211]:
##### Compute list of subject with Type | list of comments 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk import word_tokenize

# We want to remove these from the psosts
unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
       'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
  
unique_type_list = [x.lower() for x in unique_type_list]


# Lemmatize
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

# Cache the stop words for speed 
cachedStopWords = stopwords.words("english")

def pre_process_data(data, remove_stop_words=True, remove_mbti_profiles=True):

    list_personality = []
    list_posts = []
    len_data = len(data)
    i = 0
    
    for row in data.iterrows():
        i+=1
        if (i % 500 == 0 or i == 1 or i == len_data):
            print("%s of %s rows" % (i, len_data))

        ##### Remove and clean comments
        posts = row[1].posts
        temp = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', posts)
        temp = re.sub("[^a-zA-Z]", " ", temp)
        temp = re.sub(' +', ' ', temp).lower()
        if remove_stop_words:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ') if w not in cachedStopWords])
        else:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ')])
            
        if remove_mbti_profiles:
            for t in unique_type_list:
                temp = temp.replace(t,"")

        type_labelized = translate_personality(row[1].type)
        list_personality.append(type_labelized)
        list_posts.append(temp)

    list_posts = np.array(list_posts)
    list_personality = np.array(list_personality)
    
    return list_posts, list_personality

In [212]:
list_posts, list_personality  = pre_process_data(mbti, remove_stop_words=True)

1 of 8675 rows
500 of 8675 rows
1000 of 8675 rows
1500 of 8675 rows
2000 of 8675 rows
2500 of 8675 rows
3000 of 8675 rows
3500 of 8675 rows
4000 of 8675 rows
4500 of 8675 rows
5000 of 8675 rows
5500 of 8675 rows
6000 of 8675 rows
6500 of 8675 rows
7000 of 8675 rows
7500 of 8675 rows
8000 of 8675 rows
8500 of 8675 rows
8675 of 8675 rows


In [213]:
print("Num posts and personalities: ",  list_posts.shape, list_personality.shape)
list_posts[0]
list_personality[0]

Num posts and personalities:  (8675,) (8675, 4)


array([1, 0, 0, 0])

In [214]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

# Posts to a matrix of token counts
cntizer = CountVectorizer(  analyzer="word", 
                            max_features=1000, 
                            tokenizer=None,    
                            preprocessor=None, 
                            stop_words=None,  
                            max_df=0.7,
                            min_df=0.1) 

# Learn the vocabulary dictionary and return term-document matrix
print("CountVectorizer...")
X_cnt = cntizer.fit_transform(list_posts)

# Transform the count matrix to a normalized tf or tf-idf representation
tfizer = TfidfTransformer()

print("Tf-idf...")
# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
X_tfidf =  tfizer.fit_transform(X_cnt)

CountVectorizer...
Tf-idf...


In [215]:
feature_names = list(enumerate(cntizer.get_feature_names()))
print(feature_names)
print("X: Posts in tf-idf representation \n* 1st row:\n%s" % X_tfidf[0])

[(0, 'ability'), (1, 'able'), (2, 'absolutely'), (3, 'accept'), (4, 'accurate'), (5, 'across'), (6, 'act'), (7, 'action'), (8, 'actual'), (9, 'actually'), (10, 'add'), (11, 'admit'), (12, 'advice'), (13, 'afraid'), (14, 'age'), (15, 'ago'), (16, 'agree'), (17, 'ah'), (18, 'almost'), (19, 'alone'), (20, 'along'), (21, 'already'), (22, 'although'), (23, 'amazing'), (24, 'amount'), (25, 'angry'), (26, 'animal'), (27, 'annoying'), (28, 'another'), (29, 'answer'), (30, 'anxiety'), (31, 'anymore'), (32, 'anyone'), (33, 'anything'), (34, 'anyway'), (35, 'apparently'), (36, 'appreciate'), (37, 'approach'), (38, 'area'), (39, 'argument'), (40, 'around'), (41, 'art'), (42, 'ask'), (43, 'asked'), (44, 'asking'), (45, 'aspect'), (46, 'assume'), (47, 'attention'), (48, 'attracted'), (49, 'avatar'), (50, 'avoid'), (51, 'aware'), (52, 'away'), (53, 'awesome'), (54, 'awkward'), (55, 'baby'), (56, 'back'), (57, 'bad'), (58, 'based'), (59, 'basically'), (60, 'beautiful'), (61, 'become'), (62, 'bed'), (6

In [216]:
type_indicators = [ "IE: Introversion (I) / Extroversion (E)", "NS: Intuition (N) – Sensing (S)", 
                   "FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"  ]

for l in range(len(type_indicators)):
    print(type_indicators[l])

IE: Introversion (I) / Extroversion (E)
NS: Intuition (N) – Sensing (S)
FT: Feeling (F) - Thinking (T)
JP: Judging (J) – Perceiving (P)


In [217]:
print("MBTI 1st row: %s" % translate_back(list_personality[0,:]))
print("Y: Binarized MBTI 1st row: %s" % list_personality[0,:])

MBTI 1st row: ENFJ
Y: Binarized MBTI 1st row: [1 0 0 0]


In [218]:
# First XGBoost model for MBTI dataset
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score


# Posts in tf-idf representation
X = X_tfidf

# Let's train type indicator individually
for l in range(len(type_indicators)):
    print("%s ..." % (type_indicators[l]))
    
    Y = list_personality[:,l]
    
    X_resampled, y_resampled = SMOTE(random_state=0).fit_sample(X, Y)
    
    # model building
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=1234)
    model = XGBClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluation, validation score
    labels = predictions
    guesses = y_test

    accuracy = accuracy_score(labels, guesses)
    print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))
    print(recall_score(labels, guesses, average=None))
    print(precision_score(labels, guesses, average=None))
    print(f1_score(labels, guesses, average=None))
    print(Counter(Y))
    print(Counter(y_resampled))



skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
score = cross_val_score(model, X_train, y_train, cv=skf )
print(score)
print(score.mean())

IE: Introversion (I) / Extroversion (E) ...
* IE: Introversion (I) / Extroversion (E) Accuracy: 84.37%
[0.78047741 0.93313253]
[0.94284243 0.7504845 ]
[0.85401119 0.83190118]
Counter({0: 6676, 1: 1999})
Counter({1: 6676, 0: 6676})
NS: Intuition (N) – Sensing (S) ...
* NS: Intuition (N) – Sensing (S) Accuracy: 91.44%
[0.87883749 0.95694716]
[0.9606264  0.86856128]
[0.91791364 0.91061453]
Counter({0: 7478, 1: 1197})
Counter({0: 7478, 1: 7478})
FT: Feeling (F) - Thinking (T) ...
* FT: Feeling (F) - Thinking (T) Accuracy: 74.55%
[0.73825967 0.75310446]
[0.75977257 0.73120567]
[0.74886165 0.74199352]
Counter({0: 4694, 1: 3981})
Counter({0: 4694, 1: 4694})
JP: Judging (J) – Perceiving (P) ...
* JP: Judging (J) – Perceiving (P) Accuracy: 70.21%
[0.74085138 0.67331118]
[0.62705436 0.77799104]
[0.67921945 0.72187593]
Counter({1: 5241, 0: 3434})
Counter({0: 5241, 1: 5241})
[0.69618529 0.69414169 0.68779823 0.68166326 0.69120654]
0.6901990010940046


# Harry Potter mbti

In [219]:
#sample my_posts is ndarray
hp = pd.read_csv('.\data\\test\\HP_RON.csv')
hp_list = hp['Sentence'].tolist()
hp_string = ''.join([str(elem) for elem in hp_list])
print(hp_string)

# The type is just a dummy so that the data prep fucntion can be reused
mydata = pd.DataFrame(data={'type': ['ENFJ'], 'posts': [hp_string]})

hp_string, dummy = pre_process_data(mydata, remove_stop_words=True)

my_X_cnt = cntizer.transform(hp_string)
my_X_tfidf =  tfizer.transform(my_X_cnt)

1 of 1 rows


In [220]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score

# setup parameters for xgboost
param = {}
param['n_estimators'] = 200
param['max_depth'] = 2
param['nthread'] = 8
param['learning_rate'] = 0.2

result = []
# Let's train type indicator individually
for l in range(len(type_indicators)):
    print("%s ..." % (type_indicators[l]))
    
    Y = list_personality[:,l]

    # modeling
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1234)
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    
    # make predictions for my  data
    y_pred = model.predict(my_X_tfidf)
    result.append(y_pred[0])
    

IE: Introversion (I) / Extroversion (E) ...
NS: Intuition (N) – Sensing (S) ...
FT: Feeling (F) - Thinking (T) ...
JP: Judging (J) – Perceiving (P) ...


In [221]:
print("oo's MBTI is...: ", translate_back(result))

oo's MBTI is...:  INFP
