In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
# English stop words from NLTK's `stopwords` corpus, stored as a set so the
# membership test in filter_word is a hash lookup rather than a list scan.
# NOTE(review): presumably needs the NLTK 'stopwords' data to be downloaded
# beforehand (nothing in this notebook downloads it) - confirm.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance, reused by preprocess_post for every token.
porter = PorterStemmer()

# Raw dataset, read relative to the current working directory. Later cells use
# its 'type' column as labels; 'posts' is the default text column of get_ngrams.
df = pd.read_csv('mbti_1.csv')

In [2]:
def filter_word(word):
    """
    Decide whether a (lower-cased) token should be kept in the corpus.

    input:
        - word: string, a single token
    output:
        - False if the token contains a digit, contains any non-word
          character (punctuation, symbols, ...), contains the substring
          'http', or is in the module-level `stop_words` set; True otherwise
    """
    # One raw-string pattern replaces the three separate searches. `\W` is
    # exactly `[^\w]`; the original wrote that class in a non-raw string,
    # which is an invalid escape sequence (SyntaxWarning on Python 3.12+).
    if re.search(r'[\d\W]|http', word):
        return False
    return word not in stop_words

def preprocess_post(post):
    """
    Turn one raw post into a space-separated string of stemmed tokens.

    The post is tokenized with `word_tokenize`; each token is lower-cased,
    dropped if `filter_word` rejects it, and otherwise stemmed with the
    shared `porter` stemmer.

    input:
        - post: string containing the text of a single post
    output:
        - string of the surviving stems joined by single spaces
    """
    stems = []
    for token in word_tokenize(post):
        lowered = token.lower()
        if filter_word(lowered):
            stems.append(porter.stem(lowered))
    return ' '.join(stems)

In [4]:
def get_ngrams(n, word_column='posts', df=None):
    '''
    Build a bag-of-n-grams count matrix with one row per row of `df`.

    Each cell of `word_column` is split on '|||' into individual posts, every
    post is cleaned with `preprocess_post`, and the cleaned posts of a row are
    joined into one document. N-grams that appear in fewer than 2 documents
    are discarded (min_df=2).

    input:
        - n: maximal length of ngrams (all lengths from 1 to n are counted)
        - word_column: string of column name that contains the corpus in the df
        - df: pandas dataframe; when None, 'mbti_1.csv' is read at call time
    output:
        - list of words in the bag of words
        - the bag of words matrix for each row and word
    '''
    if df is None:
        # Load lazily. A `pd.read_csv(...)` default argument is evaluated once
        # when the `def` runs, which re-reads the file on every cell execution
        # and fails at definition time if the file is missing.
        df = pd.read_csv('mbti_1.csv')

    # Iterate the column directly; iterrows() would build a Series per row
    # only to read this single field.
    corpus = [
        ' '.join(preprocess_post(post) for post in posts.split('|||'))
        for posts in df[word_column]
    ]

    ngram_vectorizer = CountVectorizer(binary=False, ngram_range=(1, n), min_df=2)
    X = ngram_vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations. Either way the
    # caller receives a plain list, as before.
    if hasattr(ngram_vectorizer, 'get_feature_names_out'):
        feature_names = ngram_vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = ngram_vectorizer.get_feature_names()

    return feature_names, X

In [5]:
# Build unigram + bigram counts (n=2) using get_ngrams' default column and
# data source, then show the matrix shape as (rows, vocabulary size).
ngrams, ngram_user = get_ngrams(2)
print(ngram_user.shape)

(8675, 566706)


In [6]:
import numpy as np
from scipy import sparse
from scipy import stats
import statsmodels.api as sm
from scipy.sparse import vstack
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Label column. The k-fold functions index each entry by character position
# 0..3, so every value is presumably a four-letter MBTI code - TODO confirm.
MBTI = df['type']

def lg_kfold(feature, label, k):
    """
    k-fold cross-validation of four independent logistic-regression
    classifiers, one per character position of the label strings.

    Folds are contiguous, unshuffled blocks of `len(label) // k` rows. The
    trailing `len(label) % k` rows are never used for validation and are part
    of every training split.

    input:
        - feature: scipy sparse matrix (anything with `.tocsr()`), one row
          per sample
        - label: sequence of strings with at least 4 characters each (pandas
          Series, list or array); it is accessed positionally, so a Series
          index does not matter
        - k: number of folds, 1 <= k <= len(label)
    output:
        - numpy array of 4 accuracies, one per label position, averaged over
          the folds
        - list of k "overall" accuracies: the fraction of validation samples
          in each fold for which all 4 positions were predicted correctly
    raises:
        - ValueError if k < 1 or k is larger than the number of samples
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    labels = np.asarray(label)
    num_validation_samples = len(labels) // k
    if num_validation_samples == 0:
        raise ValueError(
            "k=%r exceeds the number of samples (%d)" % (k, len(labels))
        )

    # Convert once; the original converted the matrix on every slice.
    rows = feature.tocsr()

    # Binary target per label position: 1.0 marks the alphabetically first
    # letter seen at that position, 0.0 anything else. This reproduces column
    # 0 of the dummy matrix from statsmodels' `categorical(y, drop=True)`,
    # which is deprecated and gone from newer statsmodels releases. The
    # targets do not depend on the fold, so they are built up front.
    targets = []
    for dimension in range(4):
        letters = np.asarray([code[dimension] for code in labels])
        targets.append((letters == np.unique(letters)[0]).astype(float))

    acc = [[] for _ in range(4)]
    overall_acc = []

    # k-fold cross validation
    for fold in range(k):
        print("fold ", fold + 1)
        start = num_validation_samples * fold
        stop = start + num_validation_samples

        # The feature split is shared by all four classifiers of this fold.
        x_val = rows[start:stop]
        x_train = vstack([rows[:start], rows[stop:]])

        blabel = []
        bpredict = []

        # each dimension in MBTI
        for dimension, y in enumerate(targets):
            y_train = np.concatenate([y[:start], y[stop:]])
            y_val = y[start:stop]
            blabel.append(y_val)

            clf = LogisticRegression(random_state=0, solver='liblinear')
            clf.fit(x_train, y_train)
            y_pred = clf.predict(x_val)
            bpredict.append(y_pred)

            acc_score = accuracy_score(y_val, y_pred)
            acc[dimension].append(acc_score)
            print(acc_score)

        # overall accuracy: a sample counts only if all 4 positions match
        exact_match = np.all(np.array(bpredict) == np.array(blabel), axis=0)
        fold_overall = int(exact_match.sum()) / exact_match.size
        overall_acc.append(fold_overall)
        print("overall acc:", fold_overall)

    mean_acc = np.mean(np.array(acc), axis=1)
    return mean_acc, overall_acc
        
# 6-fold cross-validation of the logistic-regression classifiers on the n-gram
# counts. Prints the per-dimension mean accuracies, then the mean of the
# per-fold overall (all four letters correct) accuracies.
acc, overallacc = lg_kfold(ngram_user, MBTI, 6)
print(acc)
print(np.mean(np.array(overallacc)))

fold  1
0.8463667820069204
0.9038062283737024
0.8491349480968858
0.7903114186851211
overall acc: 0.5612456747404845
fold  2
0.8484429065743945
0.9010380622837371
0.8394463667820069
0.7896193771626298
overall acc: 0.5647058823529412
fold  3
0.8505190311418686
0.8941176470588236
0.8491349480968858
0.8166089965397924
overall acc: 0.5778546712802768
fold  4
0.8581314878892734
0.9072664359861592
0.8484429065743945
0.7958477508650519
overall acc: 0.5674740484429066
fold  5
0.8519031141868512
0.8865051903114187
0.8422145328719723
0.7806228373702422
overall acc: 0.5494809688581315
fold  6
0.8422145328719723
0.9100346020761245
0.8422145328719723
0.7681660899653979
overall acc: 0.542560553633218
[0.84959631 0.90046136 0.84509804 0.79019608]
0.5605536332179931


In [None]:
from sklearn.svm import SVC

def svm_kfold(feature, label, k):
    """
    k-fold cross-validation of four independent support-vector classifiers
    (scikit-learn `SVC` with default settings), one per character position of
    the label strings.

    Folds are contiguous, unshuffled blocks of `len(label) // k` rows. The
    trailing `len(label) % k` rows are never used for validation and are part
    of every training split.

    input:
        - feature: scipy sparse matrix (anything with `.tocsr()`), one row
          per sample
        - label: sequence of strings with at least 4 characters each (pandas
          Series, list or array); it is accessed positionally, so a Series
          index does not matter
        - k: number of folds, 1 <= k <= len(label)
    output:
        - numpy array of 4 accuracies, one per label position, averaged over
          the folds
        - list of k "overall" accuracies: the fraction of validation samples
          in each fold for which all 4 positions were predicted correctly
    raises:
        - ValueError if k < 1 or k is larger than the number of samples
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    labels = np.asarray(label)
    num_validation_samples = len(labels) // k
    if num_validation_samples == 0:
        raise ValueError(
            "k=%r exceeds the number of samples (%d)" % (k, len(labels))
        )

    # Convert once; the original converted the matrix on every slice.
    rows = feature.tocsr()

    # Binary target per label position: 1.0 marks the alphabetically first
    # letter seen at that position, 0.0 anything else. This reproduces column
    # 0 of the dummy matrix from statsmodels' `categorical(y, drop=True)`,
    # which is deprecated and gone from newer statsmodels releases. The
    # targets do not depend on the fold, so they are built up front.
    targets = []
    for dimension in range(4):
        letters = np.asarray([code[dimension] for code in labels])
        targets.append((letters == np.unique(letters)[0]).astype(float))

    acc = [[] for _ in range(4)]
    overall_acc = []

    # k-fold cross validation
    for fold in range(k):
        print("fold ", fold + 1)
        start = num_validation_samples * fold
        stop = start + num_validation_samples

        # The feature split is shared by all four classifiers of this fold.
        x_val = rows[start:stop]
        x_train = vstack([rows[:start], rows[stop:]])

        blabel = []
        bpredict = []

        # each dimension in MBTI
        for dimension, y in enumerate(targets):
            y_train = np.concatenate([y[:start], y[stop:]])
            y_val = y[start:stop]
            blabel.append(y_val)

            svm = SVC()
            svm.fit(x_train, y_train)
            y_pred = svm.predict(x_val)
            bpredict.append(y_pred)

            acc_score = accuracy_score(y_val, y_pred)
            acc[dimension].append(acc_score)
            print(acc_score)

        # overall accuracy: a sample counts only if all 4 positions match
        exact_match = np.all(np.array(bpredict) == np.array(blabel), axis=0)
        fold_overall = int(exact_match.sum()) / exact_match.size
        overall_acc.append(fold_overall)
        print("overall acc:", fold_overall)

    mean_acc = np.mean(np.array(acc), axis=1)
    return mean_acc, overall_acc
        
# 6-fold cross-validation of the SVM classifiers on the same n-gram counts.
# Prints the per-dimension mean accuracies, then the mean of the per-fold
# overall (all four letters correct) accuracies.
acc, overallacc = svm_kfold(ngram_user, MBTI, 6)
print(acc)
print(np.mean(np.array(overallacc)))

fold  1




0.7716262975778547
0.8678200692041522
0.5570934256055363
0.6235294117647059
overall acc: 0.24083044982698962
fold  2
0.7550173010380623
0.8581314878892734
0.5155709342560554
0.5965397923875433
overall acc: 0.18685121107266436
fold  3
0.7750865051903114
0.8532871972318339
0.5453287197231834
0.5896193771626298
overall acc: 0.20346020761245676
fold  4
0.7647058823529411
0.8823529411764706
0.554325259515571
0.6
overall acc: 0.21868512110726643
fold  5
0.7826989619377163
0.8505190311418686
0.5370242214532872
0.6145328719723183
overall acc: 0.21730103806228374
fold  6
0.7681660899653979
0.8602076124567474
