In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer
import time
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import os
import plotly.express as px
import plotly
import seaborn as sns
import csv

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for current_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

'''
I used a slightly modified version of NaiveBayes classifier from AS1 here

'''

class MyBayesClassifier():
    """Bernoulli Naive Bayes classifier for binary (0/1) feature matrices.

    Each feature is treated as "present" (1) or "absent" (0) in a sample.
    Training estimates a prior per class and P(feature present | class);
    prediction scores every class with the log-posterior that accounts for
    both present and absent features.
    """

    def __init__(self, smooth=1):
        # Smoothing parameter: additive (Laplace) alpha for train(),
        # interpolation weight for train_JM_smooth().
        self._smooth = smooth

    def train(self, X, y):
        """Fit priors and likelihoods with additive smoothing.

        X: array of shape (n_samples, n_features) holding 0/1 values.
        y: array of shape (n_samples,) with the class label of each row.
        """
        alpha_smooth = self._smooth
        cls = np.unique(y)
        Ncls, Nfeat = len(cls), X.shape[1]  # number of classes / features

        self._cls = cls
        self._prior = np.zeros((1, Ncls))
        # Row i holds P(feature present | class i) for every feature.
        self._likehood = np.zeros((Ncls, Nfeat))
        for i in range(Ncls):
            x_cla = X[y == cls[i]]  # rows that belong to the current class
            # Smoothed class prior: alpha is added once per class.
            self._prior[0, i] = (x_cla.shape[0] + alpha_smooth) / (X.shape[0] + alpha_smooth * Ncls)
            # Smoothed presence frequency: 2*alpha because a feature has two outcomes.
            self._likehood[i, :] = (np.sum(x_cla, axis=0) + alpha_smooth) / (x_cla.shape[0] + alpha_smooth * 2)

    def train_JM_smooth(self, X, y, X_dataset):
        """Fit priors and likelihoods with Jelinek-Mercer style interpolation.

        The per-class estimate is mixed with a background estimate computed
        from X_dataset: (1 - alpha) * class_estimate + alpha * background.

        X, y: training matrix and labels, as in train().
        X_dataset: matrix used for the background feature frequencies.
        """
        alpha_smooth = self._smooth
        cls = np.unique(y)
        Ncls, Nfeat = len(cls), X.shape[1]

        size_dataset = X_dataset.shape[0]             # number of rows in the background set
        feature_dataset = np.sum(X_dataset, axis=0)   # per-feature occurrence count in the background set

        self._cls = cls
        self._prior = np.zeros((1, Ncls))
        self._likehood = np.zeros((Ncls, Nfeat))
        for i in range(Ncls):
            x_cla = X[y == cls[i]]
            self._prior[0, i] = x_cla.shape[0] / X.shape[0]   # unsmoothed class prior
            numerator1 = np.sum(x_cla, axis=0)                # per-feature count within the class
            denominator1 = np.sum(x_cla)                      # total feature occurrences within the class

            self._likehood[i, :] = (1 - alpha_smooth) * numerator1 / denominator1 + alpha_smooth * feature_dataset / size_dataset

    def predict(self, X):
        """Return the most probable class label for every row of X.

        X: array of shape (n_test, n_features) holding 0/1 values.
        Returns an array of shape (n_test,) with entries taken from the
        labels seen during training.
        """
        Ncls = len(self._cls)

        # (n_test x n_feat) . (n_feat x n_cls): sum of log P(present | class)
        # over the features that are present in each sample.
        log_appear = X.dot(np.log(self._likehood.T))
        # 1 - X flags the absent features; they contribute log P(absent | class).
        log_absence = (1 - X).dot(np.log(1 - self._likehood.T))

        log_post = log_appear + log_absence
        log_post = log_post + np.log(self._prior.reshape(1, Ncls))
        return self._cls[np.argmax(log_post, axis=1)]


In [None]:
# Load the MBTI dataset; header=0 takes the column names from the first CSV row.
df = pd.read_csv('/kaggle/input/mbti-type/mbti_1.csv',header=0)
# Display the first 10 rows as the cell output.
df.head(10)

In [None]:
# Count of samples per 16-way personality type.
# catplot is a figure-level seaborn function: it creates its own figure, so its
# size is controlled through height/aspect. Resizing plt.gcf() beforehand only
# produced an additional empty figure in the output.
sns.catplot(x="type", kind="count", data=df,height=8.27, aspect=11.7/8.27)

In [None]:
# Strip the label in case of any whitespace before or after the string.
df["type"] = df["type"].str.strip()
# Separate the four-letter label into one column per letter by position
# (e.g. "INFJ" -> "I", "N", "F", "J").
target_multi_label = pd.concat(
    [df["type"].str[position] for position in range(4)], axis=1
)
target_multi_label.columns = ["Personality-1","Personality-2","Personality-3","Personality-4"]

# Assign column by column so that re-running this cell overwrites the
# Personality-* columns instead of appending duplicates.
for col in target_multi_label.columns:
    df[col] = target_multi_label[col]
'''
personality_map = {
    "I":"Introvert",
    "E":"Extrovert",
    "N":"Intuitive",
    "S":"Sensitive",
    "F":"Emotional",
    "T":"Thinker",
    "J":"Judgemental",
    "P":"Perceiving"
}
for col in df.loc[: , "Personality-1":"Personality-4"].columns:
    df[col] = df[col].map(personality_map)
'''
df.head()


In [None]:
# One count plot per personality axis.
# catplot creates its own figure for every call, so no plt.gcf() resizing is
# needed (it only left an empty figure behind).
for personality_axis in ["Personality-1", "Personality-2", "Personality-3", "Personality-4"]:
    sns.catplot(x=personality_axis, kind="count", data=df,height=5, aspect=4/5)


In [None]:
#version1 of text pre-processing

#source:https://towardsdatascience.com/nlp-text-preprocessing-a-practical-guide-and-template-d80874676e79
!pip install Unidecode
!pip install contractions
!pip install BeautifulSoup4
import nltk
nltk.download('wordnet')

from bs4 import BeautifulSoup
import spacy
import unidecode 
#from word2number import w2n
import contractions
from nltk.stem import WordNetLemmatizer 
import re

def preprocessing_v1(text):
    """Clean one raw post string for bag-of-words vectorisation.

    Steps, in order: strip HTML markup, drop URLs, replace the '|||' post
    separator with a space, lower-case, expand contractions, transliterate
    accented characters to ASCII, and lemmatize every alphabetic word.

    text: the raw string of one sample.
    Returns the cleaned string.
    """
    # remove html information
    soup = BeautifulSoup(text, "html.parser")
    processed = soup.get_text(separator=" ")

    # remove links (anything starting with "http" up to the next whitespace)
    processed = re.sub(r"http\S+", "", processed)

    # replace the ||| separator with a space
    processed = re.sub(r'\|\|\|', r' ', processed)

    # lower case
    processed = processed.lower()

    # expand shortened words, e.g. don't to do not
    processed = contractions.fix(processed)

    # remove accented char
    processed = unidecode.unidecode(processed)

    # Lemmatize word by word. WordNetLemmatizer.lemmatize() expects a single
    # word; passing the whole document (as before) returned it unchanged.
    # Only alphabetic runs are replaced, so spacing and punctuation are kept.
    lemmatizer = WordNetLemmatizer()
    processed = re.sub(
        r"[a-z]+",
        lambda match: lemmatizer.lemmatize(match.group(0)),
        processed,
    )

    return processed

In [None]:
# Clean every post in place with preprocessing_v1; the 'posts' column is overwritten,
# so the raw text is no longer available in df after this cell.
df['posts'] = df['posts'].apply(preprocessing_v1)
# Display the first rows to inspect the cleaned text.
df.head()

In [None]:
# Split the data into training and test parts and vectorize the posts.
# The split is positional: the first `number_training` rows are used for training,
# the remaining rows for testing.



number_training = 6000
data_size = df['type'].shape[0]



all_data = df['posts'].astype('U').values
data_train = df['posts'][:number_training].astype('U').values
data_test = df['posts'][number_training:].astype('U').values

y_train = df['type'][:number_training].astype('U').values
y_test = df['type'][number_training:].astype('U').values


# Note: increasing max_features increases RAM usage and may crash the Colab session.
# By default over 140000 features are generated without any text preprocessing; with
# preprocessing this drops to nearly 100000, which is still not acceptable.
# Therefore an upper bound is set for max_features.
# binary=True makes every feature a 0/1 presence flag rather than a count.
# The vectorizer is fitted on all posts (training and test rows together).
vectorizer = CountVectorizer(
        lowercase=True, stop_words='english',
        max_df=1.0, min_df=1, max_features=2000,  binary=True
      )
processed_data = vectorizer.fit_transform(all_data).toarray()

X_train = processed_data[0:number_training, :]
X_test = processed_data[number_training:, :]

print("X_train.shape = {}".format(X_train.shape))
print("X_test.shape = {}".format(X_test.shape))

In [None]:
#### Temporary test cell, only used for debugging.
# Print the training feature matrix.
print(X_train)
# Sum of the first training row (with 0/1 features: how many features are set).
print(sum(X_train[0]))

In [None]:
# Perform Naive Bayes, predicting one of the 16 personality types at once.
clf = MyBayesClassifier(1.0)
clf.train(X_train, y_train);
y_pred = clf.predict(X_test)
print(X_test)
print(y_pred)
# Fraction of test rows whose full four-letter type was predicted exactly.
print("Absolute accuracy = {}".format(np.mean(y_test==y_pred)))

In [None]:
# For each personality axis (I/E, N/S, T/F, J/P) train a separate binary model and
# predict that letter; the four predicted letters are concatenated per test row to
# form the final four-letter type.
y_pred = np.array(['' for i in range(data_size-number_training)])
clf = MyBayesClassifier(1.0)

for col in df.loc[: , "Personality-1":"Personality-4"].columns:
    # Use the same positional split as X_train / X_test (number_training rows).
    y_train_sub = df[col][:number_training].astype('U').values
    clf.train(X_train, y_train_sub)
    y_pred_sub = clf.predict(X_test)
    y_test_sub = df[col][number_training:].astype('U').values
    print(col, "accuracy = {}".format(np.mean(y_test_sub==y_pred_sub)))

    # Element-wise string concatenation; np.char is the public alias of the
    # private np.core.defchararray module.
    y_pred = np.char.add(y_pred, y_pred_sub)

print(y_pred)
print(y_test)
# Fraction of test rows for which all four letters were predicted correctly.
print("Absolute accuracy = {}".format(np.mean(y_test==y_pred)))

In [None]:
#test the special case when max_feature is None
#print(k_Fold_CV(10, None, 1))

#Conclusion:
#the project will be forced to restart due to the excessive memory requirement

In [None]:
'''
This will load the csv
'''
class CsvToDf:
    '''
    Load a CSV file into pandas, either as one DataFrame or in batches.

    filename:  path of the CSV file.
    batchSize: number of rows returned by each getNextBatchCsv() call; this is
               for data that is too big to fit into memory.
    cols:      optional list of column names to load (all columns when None).

    Whole-file access and batched access keep separate state, so the two can
    be mixed on the same instance.
    '''
    def __init__(self,filename,batchSize=None,cols=None):
        self._cols = cols
        self._header = None
        self._filename = filename
        self._curIndex = 0     # kept for compatibility; not used by any method of this class
        self._isRead = False   # True once the whole file has been loaded into _df
        self._df = None        # cached full DataFrame (getWholeCsv)
        self._reader = None    # chunk iterator (getNextBatchCsv)
        self._storeHeader()
        self._batchSize = batchSize
    def _storeHeader(self):
        # Read only the first row of the file to get the column names.
        with open(self._filename, newline='') as csvFile:
            f = csv.reader(csvFile)
            self._header = next(f)
    def getWholeCsv(self):
        '''Return the whole file as a DataFrame; it is read once and cached.'''
        if not self._isRead:
            # usecols=None (the pandas default) loads every column.
            self._df = pd.read_csv(self._filename,usecols=self._cols)
            self._isRead = True
        return self._df
    def getHeader(self):
        '''Return the list of column names found in the first row of the file.'''
        return self._header
    def _checkIfRead(self):
        # Open the chunk iterator on first use.
        # Returns False if it was opened by this call, True if it already existed.
        if self._reader is None:
            self._reader = pd.read_csv(self._filename,iterator=True,chunksize=self._batchSize,usecols=self._cols)
            return False
        return True
    def getNextBatchCsv(self):
        '''Return the next batch of rows as a DataFrame.

        Raises StopIteration once the file is exhausted.
        '''
        self._checkIfRead()
        return next(self._reader)

In [None]:
'''
test for small Dset

test = CsvToDf("../input/mbti-type/mbti_1.csv",batchSize=200)
print(test.getNextBatchCsv())
res = test.getWholeCsv()
print(type(res))
print(test.getWholeCsv())
print(test.getHeader())
'''

In [None]:
'''
test big dset

test2 = CsvToDf("../input/mbti-full-pull-samplecsv/mbti_full_pull_sample.csv",batchSize=100,cols=['title','type'])
#test2.eliminateCols(['created_utc', 'subreddit', 'author', 'domain', 'url', 'num_comments', 'score', 'ups', 'downs', 'selftext', 'saved', 'id', 'from_kind', 'gilded', 'from', 'stickied', 'retrieved_on', 'over_18', 'thumbnail', 'subreddit_id', 'hide_score', 'link_flair_css_class', 'author_flair_css_class', 'archived', 'is_self', 'from_id', 'permalink', 'name', 'author_flair_text', 'quarantine', 'link_flair_text', 'distinguished'])
print(type(test2.getNextBatchCsv()))
print(test2.getNextBatchCsv())
print(test2.getNextBatchCsv())
print(test2.getHeader())
'''

In [None]:
'''
Turn data into matrix
'''
def formatData(data,label,trainSize):
    """Vectorize the texts and split features and labels positionally.

    data:      pandas Series of texts.
    label:     pandas Series of labels, aligned with `data`.
    trainSize: number of leading rows that form the training part.

    Returns (train_features, test_features, train_labels, test_labels).
    The vectorizer is fitted on all rows of `data` before the split.
    """
    bag_of_words = CountVectorizer(
        lowercase=True,
        stop_words='english',
        max_df=1.0,
        min_df=1,
        max_features=2000,
        binary=True,
    )
    texts = data.astype('U').values
    features = bag_of_words.fit_transform(texts).toarray()
    labels = label.astype('U').values

    train_features, test_features = features[:trainSize], features[trainSize:]
    train_labels, test_labels = labels[:trainSize], labels[trainSize:]
    return (train_features, test_features, train_labels, test_labels)

In [None]:
'''
format data test
'''
# Read only the 'comment' and 'type' columns, 200 rows per batch.
test = CsvToDf("../input/reddit-data-3/mbti9k_comments.csv",cols=['comment','type'],batchSize=200)
# Take the first batch and split it: first 100 rows for training, the rest for testing.
data = test.getNextBatchCsv()
xTrain,xTest,yTrain,yTest = formatData(data['comment'],data['type'],100)
print(xTrain)
print(yTrain)

In [None]:
####Zepeng Xiao Version
###In this version,we only need train data once to capture needed probability
#could change smooth factor by cls._smooth = smoothfactor or update_smooth and call predict method
####
class BayesClassifier_smooth():
    """Bernoulli Naive Bayes that stores raw counts at training time.

    Because train() only counts, the smoothing factor can be changed after
    training (via update_smooth or by assigning _smooth) and is applied when
    predict() / predict_JM() turn the counts into probabilities.
    """

    def __init__(self, smooth=1):
        self._smooth = smooth  # smoothing parameter used at prediction time

    def update_smooth(self, i):
        """Set the smoothing factor used by subsequent predictions."""
        self._smooth = i

    def train(self, X, y):
        """Collect the counts needed for prediction.

        X: array of shape (n_samples, n_features) with 0/1 values.
        y: array of shape (n_samples,) with the label of every row.
        """
        cls = np.unique(y)
        Ncls, Nfeat = len(cls), X.shape[1]  # number of classes / features

        self._train_size = X.shape[0]              # number of training rows
        self._feature_count = np.sum(X, axis=0)    # per-feature occurrence count over the training set
        self._cls = cls                            # class labels, needed to map indices back in predict()
        self._prior = np.zeros((Ncls, 1))          # per-class row COUNT (normalised at prediction time)
        self._word_count = np.zeros((Ncls, 1))     # per-class total of all feature occurrences
        self._likehood = np.zeros((Ncls, Nfeat))   # per-class, per-feature occurrence COUNT

        for i in range(Ncls):
            x_cla = X[y == cls[i]]                       # rows of the current class
            self._prior[i, 0] = x_cla.shape[0]
            self._likehood[i, :] = np.sum(x_cla, axis=0)
            self._word_count[i, :] = np.sum(x_cla)       # used by the JM interpolation

    def predict_JM(self, X, X_dataset):
        """Predict using Jelinek-Mercer style interpolation.

        X:         array of shape (n_test, n_features) with 0/1 values.
        X_dataset: matrix supplying the background feature frequencies that
                   are mixed in to avoid zero probabilities.

        Returns the argmax class INDEX for every row of X.
        NOTE(review): predict() returns class labels (self._cls[...]) whereas
        this method returns indices into self._cls - confirm which one callers
        expect before unifying.
        """
        alpha_smooth = self._smooth
        Ncls = len(self._cls)
        X_not = np.logical_not(X)                       # True where the feature is absent

        size_dataset = X_dataset.shape[0]               # number of rows in the background set
        feature_dataset = np.sum(X_dataset, axis=0)     # per-feature count in the background set

        # prior probability = class count / training size
        prior = self._prior / self._train_size

        # interpolate the class estimate with the background estimate
        likelyhood = (1 - alpha_smooth) * (self._likehood / self._word_count) + (alpha_smooth) * (feature_dataset / size_dataset)

        not_likelyhood = np.log(1 - likelyhood)
        likelyhood = np.log(likelyhood)

        # dot products sum the log-probabilities of present / absent features
        log_appear = np.dot(X, likelyhood.T)
        log_absence = np.dot(X_not, not_likelyhood.T)

        log_post = log_appear + log_absence + np.log(prior.reshape(1, Ncls))

        return np.argmax(log_post, axis=1)

    def predict(self, X):
        '''
        Predict with additive smoothing.

        X: array of shape (n_test, n_features) with 0/1 values.
        Returns a numpy array of shape X.shape[0] (i.e. of shape "number of
        testing examples") holding the predicted class labels.
        '''
        alpha_smooth = self._smooth
        Ncls = len(self._cls)
        X_not = np.logical_not(X)                       # True where the feature is absent

        # prior probability = class count / training size
        prior = self._prior / self._train_size

        # P(feature present | class): alpha in the numerator, 2*alpha in the
        # denominator (a feature is either present or absent).
        likelyhood = (self._likehood + alpha_smooth) / (self._prior + alpha_smooth * 2)

        # (n_test x n_feat) . (n_feat x n_cls) -> sum of log P(present | class)
        log_appear = X.dot(np.log(likelyhood.T))
        # absent features contribute log P(absent | class) = log(1 - P(present | class))
        log_absence = X_not.dot(np.log(1 - likelyhood.T))

        log_post = log_appear + log_absence
        log_post = log_post + np.log(prior.reshape(1, Ncls))

        # pick the class with the largest log-posterior
        return self._cls[np.argmax(log_post, axis=1)]

In [None]:
'''
NB classifier that can train on batches
'''
class BayesClassifier_smooth_I():
    """Bernoulli Naive Bayes over the 16 MBTI types that can be trained in batches.

    The class list is fixed (lower-case four-letter types), and train() adds
    to the stored counts, so it can be called once per batch. Predictions are
    indices into the class list, matching the encoding of strLabelToInt().
    """

    def __init__(self,smooth=1):
        self._smooth = smooth # This is for additive smoothing
        self._cls = ['estj', 'estp', 'esfj', 'esfp', 'entj', 'entp', 'enfj', 'enfp', 'istj', 'istp', 'isfj', 'isfp', 'intj', 'intp', 'infj', 'infp']
        self._prior = np.zeros((16,1))   # per-class row COUNT, accumulated over batches
        self._likehood = None            # per-class, per-feature COUNT; allocated on the first train() call
        self._notInit = True
        self._prob_matrix = None         # smoothed P(feature present | class), rebuilt by predict()

    def update_smooth(self, i):
        """Set the smoothing factor used by subsequent predictions."""
        self._smooth = i

    def train(self, X, y):
        """Add the counts of one batch.

        X: array of shape (n_samples, n_features) with 0/1 values.
        y: numpy array of lower-case type labels, one per row of X.
        """
        Nfeat = X.shape[1] #Nfeat: number of features.
        if self._notInit:
            self._likehood = np.zeros((16,Nfeat))
            self._notInit = False
        for i in range(16):
            cla = self._cls[i]                              # current class
            x_cla = X[y==cla]                               # rows of the batch that belong to it
            self._prior[i,0] += x_cla.shape[0]              # accumulate class frequency
            self._likehood[i,:] += np.sum(x_cla, axis=0)    # accumulate per-feature frequency

    def _compute_prediction(self,data_point):
        '''
        precondition: datapoint must be a list of integers that are either 0 or 1. And its length must be the same as the number of features.
        postcondition: no side effects
        this will return an array of 16 floats, the log-posterior score (up to a
        constant) of each of the possible personality types
        '''
        # Repeat the sample once per class so it lines up with _prob_matrix.
        data_point_matrix = np.array([data_point,]*self._prob_matrix.shape[0])
        # Present features contribute P(present | class) ...
        true_matrix = data_point_matrix * self._prob_matrix
        # ... absent ones ((x+1)%2 flips 0/1) contribute 1 - P(present | class).
        false_matrix = ((data_point_matrix+1)%2) * (1-self._prob_matrix)
        label_array = np.squeeze(np.sum(np.log(true_matrix+false_matrix),axis=1))
        # Additively smoothed class prior.
        prior_array = np.squeeze(np.log((self._prior+self._smooth)/(16*self._smooth+np.sum(self._prior))))
        return np.squeeze(label_array+prior_array)

    def predict(self, X):
        '''
        Return a numpy array of shape X.shape[0] (i.e. of shape "number of
        testing examples") holding, for every row, the INDEX of the most
        probable class in the fixed class list.
        '''
        self._prob_matrix = (self._likehood+self._smooth)/(self._prior + self._smooth*2)
        # One row of 16 scores per sample.
        result = np.apply_along_axis(self._compute_prediction,axis=1,arr=X)
        pred = np.argmax(result,axis=1)

        return pred

    @staticmethod
    def formatData(data,label,trainSize):
        """Hash-vectorize the texts and split features/labels positionally.

        Returns (train_features, test_features, train_labels, test_labels);
        labels are lower-cased to match the class list of this classifier.
        """
        # norm=None keeps the binary features at exactly 0/1; the default
        # norm='l2' would rescale every row, which breaks the 0/1 precondition
        # of _compute_prediction.
        vectorizer = HashingVectorizer(
        lowercase=True, stop_words='english',binary=True,norm=None)
        out_data = vectorizer.fit_transform(data.astype('U').values).toarray()
        out_label = label.str.lower().astype('U').values
        return (out_data[:trainSize],out_data[trainSize:],out_label[:trainSize],out_label[trainSize:])

    @staticmethod
    def strLabelToInt(inpLabel):
        """Map type labels (any letter case) to their index in the class list.

        Labels that are not one of the 16 types map to -100.
        """
        out = np.zeros((inpLabel.shape))
        labels = ['estj', 'estp', 'esfj', 'esfp', 'entj', 'entp', 'enfj', 'enfp', 'istj', 'istp', 'isfj', 'isfp', 'intj', 'intp', 'infj', 'infp']
        for idx,i in enumerate(inpLabel):
            lowered = i.lower()
            if lowered in labels:
                # Look up the lowered label; the raw one is not in the list
                # when it is upper-case.
                out[idx] = labels.index(lowered)
            else:
                out[idx] = -100
        return out

In [None]:
'''
Naive Bayes Test 1
'''
'''classifier = BayesClassifier_smooth()
test = CsvToDf("../input/mbti-type/mbti_1.csv",batchSize=200,cols=['posts', 'type'])
print(test.getHeader())
data = test.getNextBatchCsv()
xTrain,xTest,yTrain,yTest = formatData(data['posts'],data['type'],100)
yTest = strLabelToInt(yTest)
classifier.train(xTrain,yTrain)
pred = classifier.predict(xTest)
print("accuracy = {}".format(np.mean((yTest-pred)==0))) '''

RNN implementation

In [None]:
full_type_labels = df['type']
# Numeric code of every personality type, so that a model can work with numbers.
personality_dict = {
    "ENTJ" : 0, "INTJ" : 1, "ENTP" : 2, "INTP" : 3,
    "INFJ" : 4, "INFP" : 5, "ENFJ" : 6, "ENFP" : 7,
    "ESTP" : 8, "ESTJ" : 9, "ISTP" : 10, "ISTJ" : 11,
    "ISFJ" : 12, "ISFP" : 13, "ESFJ" : 14, "ESFP" : 15,
}

# Translate every label into its code; a label missing from the dict raises KeyError.
type_labels = [personality_dict[label] for label in full_type_labels]

type_labels = np.array(type_labels)

print(type_labels)

In [None]:
'''import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Only considering the top 10000 most common words
vocab_size = 10000
max_length = 2016
# We have 8675 rows
training_size = 8675//2
training_posts = posts[0:training_size]
testing_posts = posts[training_size:]
# Right now is only predicting introversion or extroversion
training_labels = type_labels[0:training_size]
testing_labels = type_labels[training_size:]

# We only want to fit the tokenizer on the training, not the testing
tokenizer = Tokenizer(num_words = vocab_size, oov_token = "<OOV>")
tokenizer.fit_on_texts(training_posts)

word_index = tokenizer.word_index

# Puts the padding (which are 0) at the end of the vectorized sentence.
# The longest post in our dataset is 2016, but we should truncate='post' earlier than 2016 words
training_sequences = tokenizer.texts_to_sequences(training_posts)
training_padded = pad_sequences(training_sequences, padding = 'post', maxlen = max_length)
# training_sequences = np.array(training_sequences)
training_padded = np.array(training_padded)

testing_sequences = tokenizer.texts_to_sequences(testing_posts)
testing_padded = pad_sequences(testing_sequences, padding = 'post', maxlen=max_length)
# testing_sequences = np.array(testing_sequences)
training_padded = np.array(training_padded)


print(word_index)
print(training_padded[0])
print(training_padded.shape)
'''

In [None]:
# # Commenting out for similar reasons

# #Second parameter is the output dimension. Therefore, when we are changing this to predict 4 dimensions of personality we should change it to 4
# embedding_dim = 1

# model = tf.keras.Sequential([ 
#                              tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#                              tf.keras.layers.GlobalAveragePooling1D(),
#                              tf.keras.layers.Dense(24, activation='relu'),
#                              tf.keras.layers.Dense(1, activation='sigmoid')
# ])

# model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
'''Second parameter is the output dimension. Therefore, when we are changing this to predict 4 dimensions of personality we should change it to 4
# ^^ actually i dont know if that is true
embedding_dim = 256

#Embedding layer will always have vocab_size*embedding_dim parameters. Since vocab_size is 10,000 the number of parameters on this layer will always be large

model = tf.keras.Sequential([ 
                             tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
                            #  tf.keras.layers.LSTM(100),
                             tf.keras.layers.GlobalAveragePooling1D(),
                            #tf.keras.layers.Dense(1000, activation='relu'),
                            # tf.keras.layers.Dense(400, activation='relu'),
                             tf.keras.layers.Dense(128, activation='relu'),
                             tf.keras.layers.Dense(48, activation='relu'),
                             tf.keras.layers.Dense(16, activation='softmax')
])

model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
'''

In [None]:
#model.summary()

In [None]:
'''num_epochs = 10

history = model.fit(training_padded, training_labels, epochs = num_epochs, validation_data=(testing_padded, testing_labels), verbose = 1)'''

K_Fold_CV

In [None]:
#this one is the final version of the k_Fold_CV method
def k_Fold_CV(k, maxFeature):
    """k-fold cross-validation with per-fold selection of the smoothing factor.

    For every fold the rows are split into three disjoint parts:
      * test:       the fold itself, rows [fold*size, (fold+1)*size);
      * validation: the first quarter of the remaining rows;
      * training:   the rest of the remaining rows.
    Smoothing factors 0.1 ... 2.0 are evaluated on the validation part and the
    best one is used to score the test part.

    k:          number of folds.
    maxFeature: max_features passed to CountVectorizer (None = no limit, which
                may exhaust memory).

    Returns a list with the test accuracy of every fold.
    Reads the global DataFrame `df` (columns 'posts' and 'type').
    """
    all_data = df['posts'].astype('U').values
    all_type = df['type'].astype('U').values
    n_samples = len(all_data)
    #use int() to eliminate decimals
    data_fragment_size = int(n_samples / k)
    # The whole data set is vectorized up front; max_features bounds the memory use.
    vectorizer = CountVectorizer(
        lowercase=True, stop_words=None,
        max_df=1.0, min_df=1, max_features=maxFeature,  binary=True
    )
    processed_data = vectorizer.fit_transform(all_data).toarray()
    accuracyList = []

    # Same for every fold: a quarter of the rows left after removing the test fold.
    validationSize = int((n_samples - data_fragment_size) / 4)
    smooth_candidates = [step / 10 for step in range(1, 21)]

    for fold in range(0, k):
        lower_bound = fold * data_fragment_size
        upper_bound = lower_bound + data_fragment_size

        # Rows outside the test fold. Both slice ends are explicit on purpose:
        # np.r_[..., upper_bound:] expands to arange(0, upper_bound), which
        # silently puts the test fold into the training data.
        remaining_idx = np.r_[0:lower_bound, upper_bound:n_samples]
        validate_idx = remaining_idx[:validationSize]
        train_idx = remaining_idx[validationSize:]

        x_validate, y_validate = processed_data[validate_idx], all_type[validate_idx]
        x_train, y_train = processed_data[train_idx], all_type[train_idx]
        x_test = processed_data[lower_bound:upper_bound]
        y_test = all_type[lower_bound:upper_bound]

        # train() only collects counts, so one fit serves every smoothing factor.
        NBS = BayesClassifier_smooth(smooth_candidates[0])
        NBS.train(x_train, y_train)

        # validation: keep the smoothing factor with the best validation accuracy
        maxAccuracy = -1.0
        bestsf = smooth_candidates[0]
        for sf in smooth_candidates:
            NBS._smooth = sf
            y_predict = NBS.predict(x_validate)
            accuracy = np.mean(y_validate==y_predict)
            if accuracy > maxAccuracy:
                maxAccuracy = accuracy
                bestsf = sf

        # score the held-out fold with the selected smoothing factor
        NBS._smooth = bestsf
        y_predict = NBS.predict(x_test)
        accuracy = np.mean(y_test==y_predict)
        accuracyList.append(accuracy)

    return accuracyList

In [None]:
# 2-fold run. `maxFeature` is only a parameter name inside k_Fold_CV and is not
# defined at notebook level, so the feature cap is passed explicitly (2000, the
# same cap used for the other vectorizers in this notebook).
k_Fold_CV(2, 2000)

In [None]:
# Variant of k_Fold_CV that takes the smoothing factor as an explicit argument
def k_Fold_CV_sf(k, maxFeature, smoothFactor=1.0):
    """Run k-fold cross-validation of BayesClassifier_smooth with a fixed smoothing factor.

    The posts in the module-level ``df`` are turned into binary bag-of-words
    features, cut into ``k`` contiguous folds of equal size, and every fold is
    used once as the test set while all remaining rows form the training set.

    Parameters
    ----------
    k : int
        Number of folds. Each fold holds ``len(df) // k`` rows; when the row
        count is not a multiple of ``k`` the trailing remainder rows are never
        tested, but they are always part of the training set.
    maxFeature : int
        Forwarded to ``CountVectorizer`` as ``max_features``.
    smoothFactor : float, optional
        Smoothing factor handed to ``BayesClassifier_smooth``. Defaults to 1.0
        so the argument may be omitted.

    Returns
    -------
    list
        One accuracy value (fraction of correctly predicted test rows) per
        fold, in fold order.
    """
    all_data = df['posts'].astype('U').values
    all_type = df['type'].astype('U').values
    sample_count = len(all_data)
    data_fragment_size = sample_count // k

    # NOTE: the vocabulary is fitted on every row, including the rows that are
    # later held out as a test fold. max_features keeps the dense array built
    # by toarray() small enough to fit in RAM.
    vectorizer = CountVectorizer(
        lowercase=True, stop_words=None,
        max_df=1.0, min_df=1, max_features=maxFeature, binary=True
    )
    processed_data = vectorizer.fit_transform(all_data).toarray()
    accuracyList = []

    for i in range(k):
        lower_bound = i * data_fragment_size
        upper_bound = lower_bound + data_fragment_size

        # Select the training rows with a boolean mask. The previous
        # np.r_[:lower_bound, upper_bound:] was wrong: np.r_ turns the
        # open-ended slice into np.arange(upper_bound, None), which NumPy reads
        # as arange(0, upper_bound), so the test fold leaked into the training
        # set and the rows after it were dropped.
        train_mask = np.ones(sample_count, dtype=bool)
        train_mask[lower_bound:upper_bound] = False

        x_train = processed_data[train_mask]
        y_train = all_type[train_mask]
        x_test = processed_data[lower_bound:upper_bound]
        y_test = all_type[lower_bound:upper_bound]

        # Train on everything outside the fold, score on the fold itself.
        NBS = BayesClassifier_smooth(smoothFactor)
        NBS.train(x_train, y_train)
        y_predict = NBS.predict(x_test)
        accuracyList.append(np.mean(y_test == y_predict))

    return accuracyList

# Sweep the smoothing factor over 0.0, 0.1, ..., 5.0 with 10-fold CV at 2000
# features, keeping the mean and the best per-fold accuracy for each setting.
smoothFactorList = [tenths / 10 for tenths in range(51)]
avgList, maxList = [], []
for smoothFactor in smoothFactorList:
    print("smooth factor:", smoothFactor)
    accuracyList = k_Fold_CV_sf(10, 2000, smoothFactor)
    avgAccuracy, maxAccuracy = np.average(accuracyList), np.max(accuracyList)
    avgList.append(avgAccuracy)
    maxList.append(maxAccuracy)
    print(accuracyList, maxAccuracy)

# One figure per summary statistic, both against the smoothing factor.
for title, ylabel, series in (
    ("Average Accuracy vs Smooth Factor", "average accuracy", avgList),
    ("Maximum Accuracy vs Smooth Factor", "maximum accuracy", maxList),
):
    plt.plot(smoothFactorList, series)
    plt.title(title)
    plt.xlabel("smooth factor")
    plt.ylabel(ylabel)
    plt.show()

#test for different max features
# k_Fold_CV_sf needs a smoothing factor; the original call omitted it and
# raised TypeError. Use the factor with the best average accuracy from the
# smoothing sweep above, captured before avgList is reset for this sweep.
bestSmoothFactor = smoothFactorList[int(np.argmax(avgList))]

avgList = []
maxList = []
maxFeatureList = []
for maxFeature in range(1500, 10001, 500):
    # 3-fold CV at the chosen smoothing factor for each vocabulary size
    accuracyList = k_Fold_CV_sf(3, maxFeature, bestSmoothFactor)
    avgAccuracy = np.average(accuracyList)
    maxAccuracy = np.max(accuracyList)
    avgList.append(avgAccuracy)
    maxList.append(maxAccuracy)
    maxFeatureList.append(maxFeature)

# Mean per-fold accuracy as a function of the vocabulary size
plt.title("Average Accuracy vs Maximum Feature")
plt.xlabel("maximum feature")
plt.ylabel("average accuracy")
plt.plot(maxFeatureList, avgList)
plt.show()

# Best per-fold accuracy as a function of the vocabulary size
plt.title("Maximum Accuracy vs Maximum Feature")
plt.xlabel("maximum feature")
plt.ylabel("maximum accuracy")
plt.plot(maxFeatureList, maxList)
plt.show()

In [None]:
# Print the per-fold accuracies of k_Fold_CV at 2000 features for several
# fold counts, one line per setting.
for label, folds in (("k = 2: ", 2), ("k = 3: ", 3), ("k = 5: ", 5), ("k = 10: ", 10)):
    print(label, k_Fold_CV(folds, 2000))

In [None]:
# Sweep the vocabulary size from 2000 to 10000 in steps of 1000 and record the
# mean and the best per-fold accuracy that 3-fold k_Fold_CV reports for each.
maxFeatureList = list(range(2000, 10001, 1000))
avgList, maxList = [], []
for maxFeature in maxFeatureList:
    accuracyList = k_Fold_CV(3, maxFeature)
    avgAccuracy, maxAccuracy = np.average(accuracyList), np.max(accuracyList)
    avgList.append(avgAccuracy)
    maxList.append(maxAccuracy)

In [None]:
# Draw one figure per summary statistic of the max-feature sweep.
for title, ylabel, series in (
    ("Average Accuracy vs Maximum Feature", "average accuracy", avgList),
    ("Maximum Accuracy vs Maximum Feature", "maximum accuracy", maxList),
):
    plt.plot(maxFeatureList, series)
    plt.title(title)
    plt.xlabel("maximum feature")
    plt.ylabel(ylabel)
    plt.show()