In [1]:
import pandas as pd

# Load the MBTI dataset from a CSV file that sits next to the notebook.
MBTI_CSV_PATH = 'mbti_1.csv'
data = pd.read_csv(MBTI_CSV_PATH)

# Peek at the first rows (rich display as the cell's last expression).
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [2]:
# Split the frame into its two columns by position: the first column holds
# the MBTI type label, the second the raw text written by each user.
MBTI = data[data.columns[0]]

# Lower-case every user's text in one vectorised call. This builds a new
# Series instead of assigning element by element into a Series taken from
# `data`, so `data` itself is left untouched and the cell can be re-run
# safely without chained-assignment warnings.
posts = data[data.columns[1]].str.lower()

# Sanity check: type label of the first user.
print(MBTI[0])

INFJ


In [3]:
# Each user's text is a single string in which individual posts are
# separated by '|||'; break it up into one list of posts per user.
posts_by_user = [posts[row].split('|||') for row in range(len(posts))]

In [4]:
# Number of posts per user, in the same order as posts_by_user.
user_post_num = [len(user_posts) for user_posts in posts_by_user]

In [5]:
# tokenize
import nltk
from nltk.tokenize import word_tokenize

# Turn every post into a list of tokens; the nesting becomes
# user -> post -> token.
posts_by_user = [list(map(word_tokenize, user_posts)) for user_posts in posts_by_user]

# Show one tokenised post as a spot check.
print(posts_by_user[0][13])

['all', 'things', 'in', 'moderation', '.', 'sims', 'is', 'indeed', 'a', 'video', 'game', ',', 'and', 'a', 'good', 'one', 'at', 'that', '.', 'note', ':', 'a', 'good', 'one', 'at', 'that', 'is', 'somewhat', 'subjective', 'in', 'that', 'i', 'am', 'not', 'completely', 'promoting', 'the', 'death', 'of', 'any', 'given', 'sim', '...']


In [6]:
import re

# Drop every token that contains a digit, contains any non-word character
# (punctuation, symbols) or contains the substring "http".
# The three checks are combined into one precompiled pattern; it is written
# as a raw string so that the backslash escapes reach the regex engine
# unchanged.
_unwanted_token = re.compile(r'\d|\W|http')

for user_posts in posts_by_user:
    for tokens in user_posts:
        # Slice assignment filters the existing list in place in a single
        # pass, rather than removing items one at a time while indexing
        # into the list that is being shrunk.
        tokens[:] = [tok for tok in tokens if not _unwanted_token.search(tok)]

# Same sample post as before, now without punctuation tokens.
print(posts_by_user[0][13])

['all', 'things', 'in', 'moderation', 'sims', 'is', 'indeed', 'a', 'video', 'game', 'and', 'a', 'good', 'one', 'at', 'that', 'note', 'a', 'good', 'one', 'at', 'that', 'is', 'somewhat', 'subjective', 'in', 'that', 'i', 'am', 'not', 'completely', 'promoting', 'the', 'death', 'of', 'any', 'given', 'sim']


In [7]:
# stem: words are reduced to their root form
from nltk.stem import PorterStemmer

# A single Porter stemmer instance is reused for every token.
porter = PorterStemmer()

# Replace each post's token list with the list of stemmed tokens.
for user_posts in posts_by_user:
    for post_idx, tokens in enumerate(user_posts):
        user_posts[post_idx] = [porter.stem(tok) for tok in tokens]

# Spot check on the sample post.
print(posts_by_user[0][13])

['all', 'thing', 'in', 'moder', 'sim', 'is', 'inde', 'a', 'video', 'game', 'and', 'a', 'good', 'one', 'at', 'that', 'note', 'a', 'good', 'one', 'at', 'that', 'is', 'somewhat', 'subject', 'in', 'that', 'i', 'am', 'not', 'complet', 'promot', 'the', 'death', 'of', 'ani', 'given', 'sim']


In [8]:
# lemmatize: map each token to its WordNet lemma. lemmatize() is called without a part-of-speech tag, so NLTK's default (noun) applies; verb tense and person are not normalised by this step.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Swap every post's tokens for their lemmatised forms, one user at a time.
for user_posts in posts_by_user:
    user_posts[:] = [
        [lemmatizer.lemmatize(tok) for tok in tokens]
        for tokens in user_posts
    ]

In [9]:
# remove stopwords
from nltk.corpus import stopwords

# English stopword list as a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

# NOTE(review): the tokens filtered here were already stemmed earlier in the
# notebook, so a stopword whose stem differs from its dictionary form appears
# to slip through (this cell's sample output still contains 'ani', the stem
# of 'any'). Consider filtering before stemming - confirm intent.
for user_posts in posts_by_user:
    for post_idx, tokens in enumerate(user_posts):
        user_posts[post_idx] = [tok for tok in tokens if tok not in stop_words]

# Spot check on the sample post.
print(posts_by_user[0][13])

['thing', 'moder', 'sim', 'inde', 'video', 'game', 'good', 'one', 'note', 'good', 'one', 'somewhat', 'subject', 'complet', 'promot', 'death', 'ani', 'given', 'sim']


In [10]:
# Collapse each post's token list back into one space-separated string.
for user_posts in posts_by_user:
    user_posts[:] = [' '.join(tokens) for tokens in user_posts]

# Spot check on the sample post.
print(posts_by_user[0][13])

thing moder sim inde video game good one note good one somewhat subject complet promot death ani given sim


In [11]:
# Merge all of a user's posts into a single space-separated document, so
# posts_by_user ends up holding one string per user.
# Caution: running this cell a second time would join the characters of
# each string with spaces, because the elements are already strings.
posts_by_user[:] = [' '.join(user_posts) for user_posts in posts_by_user]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-n-grams features: raw counts (binary=False) of unigrams and
# bigrams, learned from and applied to the per-user documents.
ngram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    binary=False,
)
ngram_user = ngram_vectorizer.fit_transform(posts_by_user)

# (number of users, vocabulary size)
print(ngram_user.shape)

(8675, 1986881)


In [14]:
import statsmodels.api as sm
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from scipy import sparse
from scipy import stats
from scipy.sparse import coo_matrix, vstack
from sklearn.svm import SVC

# k-fold cross-validation of one binary SVM per MBTI letter position.
k = 6
user_num = len(MBTI)
num_validation_samples = user_num // k

# NOTE: the rows are not shuffled. Each fold validates on a contiguous block
# of num_validation_samples rows, and the trailing user_num % k rows are
# always part of the training set and never validated.

# Convert to CSR once; row slicing below reuses this matrix.
ngram_user_csr = ngram_user.tocsr()

# One binary target vector per letter position of the type string.
# A row is 1.0 when its letter equals the alphabetically first letter seen at
# that position and 0.0 otherwise. This is the first column of a sorted
# one-hot encoding, computed with numpy only (no dependency on the deprecated
# statsmodels `categorical` helper).
y_by_dimension = []
for dimension in range(4):
    letters = np.asarray([mbti_type[dimension] for mbti_type in MBTI])
    y_by_dimension.append((letters == np.unique(letters)[0]).astype(float))

# acc[d] collects the validation accuracy of dimension d for every fold.
acc = [[] for i in range(4)]

for fold in range(k):
    print(fold+1, " fold")
    val_start = num_validation_samples * fold
    val_stop = num_validation_samples * (fold + 1)

    # The feature split depends only on the fold, so it is built once here
    # and shared by all four dimensions.
    x_val = ngram_user_csr[val_start:val_stop]
    x_train = vstack([ngram_user_csr[:val_start], ngram_user_csr[val_stop:]])

    for dimension in range(4):
        y_label = y_by_dimension[dimension]
        y_train = np.concatenate([y_label[:val_start], y_label[val_stop:]])
        y_val = y_label[val_start:val_stop]

        # Default SVC; fit on the training rows, score on the held-out block.
        svm = SVC()
        svm.fit(x_train, y_train)
        y_pred = svm.predict(x_val)
        acc_score = accuracy_score(y_val, y_pred)
        print(acc_score)
        acc[dimension].append(acc_score)

1  fold
0.7716262975778547
0.8678200692041522
0.556401384083045
0.6235294117647059
2  fold
0.7550173010380623
0.8581314878892734
0.5155709342560554
0.5965397923875433
3  fold
0.7750865051903114
0.8532871972318339
0.5446366782006921
0.5896193771626298
4  fold
0.7647058823529411
0.8823529411764706
0.554325259515571
0.6
5  fold
0.7826989619377163
0.8505190311418686
0.5370242214532872
0.6145328719723183
6  fold
0.7681660899653979
0.8602076124567474
0.5377162629757786
0.5993079584775086


In [16]:
# Average the per-fold accuracies of each MBTI dimension (one row of `acc`
# per dimension, one column per fold).
acc_matrix = np.array(acc)
mean_acc = acc_matrix.mean(axis=1)

print(mean_acc)

[0.76955017 0.86205306 0.54094579 0.60392157]
