In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
# English stop words, stored as a set for fast membership tests in filter_word.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded already — confirm.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance shared by every preprocess_post call.
porter = PorterStemmer()

# Load the dataset from the working directory; later cells read its 'posts' and 'type' columns.
df = pd.read_csv('mbti_1.csv')

In [2]:
def filter_word(word):
    """Decide whether a token should be kept in the vocabulary.

    A token is rejected when it
      * contains a digit,
      * contains any non-word character (anything the regex engine does not
        treat as a letter, digit or underscore),
      * contains the substring "http", or
      * is a member of the module-level ``stop_words`` collection.

    Args:
        word: Token to test. It is compared to ``stop_words`` as-is, so the
            caller is responsible for any case normalisation.

    Returns:
        bool: True if the token should be kept, False otherwise.
    """
    # One raw-string pattern replaces three separate scans; the raw prefix
    # avoids the invalid-escape warning a plain '[^\w]' literal triggers.
    if re.search(r'\d|\W|http', word):
        return False
    return word not in stop_words

def preprocess_post(post):
    """Normalise one post into a space-separated string of stemmed tokens.

    The post is split with ``word_tokenize``; each token is lower-cased,
    checked with ``filter_word`` and, if kept, stemmed with the shared
    ``porter`` stemmer.

    Args:
        post: Raw text of a single post.

    Returns:
        str: The kept, stemmed tokens joined by single spaces (empty string
        when no token survives filtering).
    """
    kept_stems = []
    for token in word_tokenize(post):
        lowered = token.lower()
        if filter_word(lowered):
            kept_stems.append(porter.stem(lowered))
    return ' '.join(kept_stems)

In [3]:
def get_bag_of_words(df, word_column, min_df):
    """Build a bag-of-words count matrix with one document per DataFrame row.

    Each cell of ``word_column`` is split on the literal separator ``'|||'``
    into individual posts; every post is run through ``preprocess_post`` and
    the results are re-joined into a single document for that row.

    Args:
        df: DataFrame holding the raw text.
        word_column: Name of the column whose values are '|||'-separated posts.
        min_df: Passed straight to ``CountVectorizer(min_df=...)``.

    Returns:
        tuple: ``(feature_names, X)`` where ``feature_names`` is a list of
        vocabulary terms in column order and ``X`` is the document-term
        matrix returned by ``CountVectorizer.fit_transform``.
    """
    # Iterate the column directly; iterrows() would build a Series per row
    # only to read a single field from it.
    corpus = [
        ' '.join(preprocess_post(post) for post in user_posts.split('|||'))
        for user_posts in df[word_column]
    ]

    vectorizer = CountVectorizer(min_df=min_df)
    X = vectorizer.fit_transform(corpus)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement but keep working on older installs. The
    # result is converted to a plain list so callers see the same type as before.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
    else:
        feature_names = vectorizer.get_feature_names()
    return feature_names, X

In [4]:
# One document per row of df; min_df=2 is forwarded to CountVectorizer.
words, bag_of_words = get_bag_of_words(df, word_column='posts', min_df=2)
print(words)

['__', '___', '____', '_____', '______', '_______', '________', '_________', '__________', '___________', '_____________', '_______________', '_________________', '_____________________', '___________________________________________________________________________________', '__fp', '_ionic', '_nfp', '_ntj', '_observer_', '_sfp', 'aa', 'aaa', 'aaaa', 'aaaaaaaaa', 'aaaaaaaaaaaaaaaaa', 'aaaaaaaaah', 'aaaaaaaargh', 'aaaaaaah', 'aaaaaah', 'aaaaaand', 'aaaaah', 'aaaaall', 'aaaaand', 'aaaah', 'aaaahh', 'aaaahhh', 'aaaahhhh', 'aaaahhhhh', 'aaaall', 'aaaand', 'aaaargh', 'aaadd', 'aaag', 'aaah', 'aaahh', 'aaall', 'aaand', 'aaannnnddd', 'aaby', 'aag', 'aah', 'aahhh', 'aaliyah', 'aan', 'aand', 'aang', 'aapprriil', 'aargh', 'aaron', 'ab', 'aba', 'aback', 'abagnal', 'abandon', 'abas', 'abash', 'abasi', 'abat', 'abba', 'abbey', 'abbi', 'abbigailiu', 'abbott', 'abbrevi', 'abc', 'abcd', 'abdc', 'abdomen', 'abdomin', 'abduct', 'abdurrahman', 'abe', 'abel', 'aber', 'abercrombi', 'aberr', 'abhor', 'abhorr

In [5]:
import numpy as np
from scipy import sparse
from scipy import stats
import statsmodels.api as sm
from scipy.sparse import vstack
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Label column passed to svm_kfold, which reads characters 0-3 of each value.
MBTI = df['type']

def svm_kfold(feature, label, k):
    """Cross-validate one default ``SVC`` per label character position.

    The samples are split, in their existing order (no shuffling), into ``k``
    contiguous validation folds of ``len(label) // k`` rows each. Any
    remainder rows at the end are never used for validation but are always
    part of the training data. For every fold, four independent binary
    classifiers are trained, one for each of the first four character
    positions of the label strings.

    Args:
        feature: Sparse feature matrix with one row per sample (anything
            exposing ``.tocsr()``).
        label: Sequence (list, pandas Series, ...) of label strings with at
            least four characters each; only positions 0-3 are read.
        k: Number of folds; must satisfy ``2 <= k <= len(label)``.

    Returns:
        tuple: ``(mean_acc, overall_acc)`` where ``mean_acc`` is a length-4
        numpy array holding the accuracy of each position averaged over the
        folds, and ``overall_acc`` is a list with, per fold, the fraction of
        validation samples for which all four positions were predicted
        correctly.

    Raises:
        ValueError: If ``k`` is out of range or ``feature`` and ``label``
            disagree on the number of samples.
    """
    n_dimensions = 4
    n_samples = len(label)
    if not 2 <= k <= n_samples:
        raise ValueError(
            "k must be between 2 and the number of samples (%d), got %r" % (n_samples, k)
        )

    # Convert once; row slicing below relies on the CSR layout.
    features = feature.tocsr()
    if features.shape[0] != n_samples:
        raise ValueError(
            "feature has %d rows but label has %d entries" % (features.shape[0], n_samples)
        )

    # Positional iteration works for lists and for a Series with any index
    # (label[i] on a Series is label-based and fails on non-default indexes).
    label_strings = list(label)

    # Binary target per position: 1.0 where the character equals the
    # alphabetically first character seen at that position, else 0.0. This
    # reproduces statsmodels' removed `categorical(y, drop=True)[:, 0]`.
    targets = []
    for dimension in range(n_dimensions):
        letters = np.asarray([value[dimension] for value in label_strings])
        targets.append((letters == np.unique(letters)[0]).astype(float))

    acc = [[] for _ in range(n_dimensions)]
    overall_acc = []
    num_validation_samples = n_samples // k

    # k-fold cross validation
    for fold in range(k):
        print("fold ", fold + 1)
        start = num_validation_samples * fold
        stop = num_validation_samples * (fold + 1)

        # The feature split does not depend on the dimension, so build it once per fold.
        x_val = features[start:stop]
        x_train = vstack([features[:start], features[stop:]])

        # Tracks, per validation sample, whether every position so far was predicted correctly.
        all_correct = np.ones(num_validation_samples, dtype=bool)

        # each dimension in MBTI
        for dimension, y in enumerate(targets):
            y_train = np.concatenate([y[:start], y[stop:]])
            y_val = y[start:stop]

            svm = SVC()
            svm.fit(x_train, y_train)
            y_pred = svm.predict(x_val)

            acc_score = accuracy_score(y_val, y_pred)
            acc[dimension].append(acc_score)
            print(acc_score)

            all_correct &= (np.asarray(y_pred) == y_val)

        # overall accuracy: share of samples with all four positions correct
        fold_overall = int(np.count_nonzero(all_correct)) / num_validation_samples
        overall_acc.append(fold_overall)
        print("overall acc:", fold_overall)

    mean_acc = np.mean(np.array(acc), axis=1)
    return mean_acc, overall_acc
        
# 6-fold run: print mean accuracy per label position, then the mean all-positions-correct accuracy.
acc, overallacc = svm_kfold(bag_of_words, MBTI, k=6)
print(acc)
print(np.mean(overallacc))

fold  1




0.7937716262975778
0.8678200692041522
0.8505190311418686
0.7487889273356402
overall acc: 0.4602076124567474
fold  2
0.7771626297577855
0.8581314878892734
0.8207612456747405
0.726643598615917
overall acc: 0.43252595155709345
fold  3
0.8034602076124567
0.8532871972318339
0.8512110726643599
0.726643598615917
overall acc: 0.46782006920415226
fold  4
0.7965397923875432
0.8823529411764706
0.8380622837370242
0.7321799307958478
overall acc: 0.4539792387543253
fold  5
0.8069204152249135
0.8505190311418686
0.8394463667820069
0.7280276816608997
overall acc: 0.4422145328719723
fold  6
0.7923875432525952
0.8602076124567474
0.8429065743944637
0.7169550173010381
overall acc: 0.43944636678200694
[0.79504037 0.86205306 0.84048443 0.72987313]
0.44936562860438295
