In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

  import pandas.util.testing as tm


In [3]:
# Load the dataset; the cells below read its 'type' and 'posts' columns.
df = pd.read_csv('/data.csv')

# Lower-cased copies of every distinct label; these strings are later removed
# from the post text.
lbl_rmv = [label.lower() for label in df['type'].unique()]

# Download the NLTK stop-word corpus and keep the English word list.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from nltk.stem.porter import PorterStemmer

# Clean every post with vectorised string operations.
#
# The earlier per-row loop assigned through chained indexing
# (df['posts'][i] = ...), which pandas flags with SettingWithCopyWarning and
# which no longer updates df once copy-on-write is enabled. It also hard-coded
# the number of rows (8675) and the number of labels (16), so it broke or
# silently skipped rows on any other dataset size.
posts_clean = (
    df['posts']
    # Replace URLs with a single space.
    .str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', regex=True)
    # Replace every non-letter character with a space.
    .str.replace("[^a-zA-Z]", " ", regex=True)
    # Collapse runs of spaces, then lower-case.
    .str.replace(' +', ' ', regex=True)
    .str.lower()
)

# Remove each (lower-cased) label string from the text, one label at a time
# and in list order, exactly as the per-row loop did.
for label in lbl_rmv:
    posts_clean = posts_clean.str.replace(label, ' ', regex=True)

# Write the cleaned text back; later cells read df['posts'] in its cleaned form.
df['posts'] = posts_clean.str.strip()

In [5]:
def pre_process(post):
    """Lower-case a post, drop English stop words and stem what remains.

    Parameters
    ----------
    post : str
        Text of a single post.

    Returns
    -------
    str
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    # Build the stop-word set once per call; the original rebuilt it (and
    # called stopwords.words) again for every single token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Raw string: '\s' inside a normal string literal is an invalid escape.
    tokens = re.sub(r'\s+', ' ', post).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)
    
# Run pre_process on every entry of df["posts"]; the result is a new Series.
corpus = df["posts"].apply(pre_process)

In [6]:
# Preview the first rows of df; 'posts' holds the text written back by the cleaning cell.
df.head()

Unnamed: 0,type,posts
0,INFJ,and moments sportscenter not top ten plays p...
1,ENTP,i m finding the lack of me in these posts very...
2,INTP,good one of course to which i say i know that ...
3,INTJ,dear i enjoyed our conversation the other da...
4,ENTJ,you re fired that s another silly misconceptio...


In [7]:
# Display the output of pre_process for the posts.
corpus

0       moment sportscent top ten play prank life chan...
1       find lack post alarm sex bore posit often exam...
2       good one cours say know bless curs absolut pos...
3       dear enjoy convers day esoter gab natur univer...
4       fire anoth silli misconcept approach logic go ...
                              ...                        
8670    ixfp alway think cat fi dom reason especi webs...
8671    thread alreadi exist someplac els post heck de...
8672    mani question thing would take purpl pill pick...
8673    conflict right come want children honestli mat...
8674    long sinc personalitycaf although seem chang o...
Name: posts, Length: 8675, dtype: object

In [8]:
# Check what kind of object corpus is before converting it to a frame.
type(corpus)

pandas.core.series.Series

In [9]:
# Turn the Series into a DataFrame so label columns can be added next to it.
df_new = corpus.to_frame()

In [11]:
# Attach the 'type' column of df as 'labels' and preview the result.
df_new['labels']=df['type']
df_new.head()

Unnamed: 0,posts,labels
0,moment sportscent top ten play prank life chan...,INFJ
1,find lack post alarm sex bore posit often exam...,ENTP
2,good one cours say know bless curs absolut pos...,INTP
3,dear enjoy convers day esoter gab natur univer...,INTJ
4,fire anoth silli misconcept approach logic go ...,ENTJ


In [12]:
# Summarise df_new: columns, dtypes and non-null counts.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   posts   8675 non-null   object
 1   labels  8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [14]:
# One 0/1 encoding per character position (0-3) of the label string.
map1 = {"I": 0, "E": 1}
map2 = {"N": 0, "S": 1}
map3 = {"T": 0, "F": 1}
map4 = {"J": 0, "P": 1}

# For each position, take that character of the label and map it to 0/1 in a
# column named after the two letters it distinguishes.
label_text = df_new['labels'].astype(str)
for position, (column, encoding) in enumerate(
    [('I-E', map1), ('N-S', map2), ('T-F', map3), ('J-P', map4)]
):
    df_new[column] = label_text.str[position].map(encoding)

In [15]:
# Preview df_new with the four encoded columns added.
df_new.head()

Unnamed: 0,posts,labels,I-E,N-S,T-F,J-P
0,moment sportscent top ten play prank life chan...,INFJ,0,0,1,0
1,find lack post alarm sex bore posit often exam...,ENTP,1,0,0,1
2,good one cours say know bless curs absolut pos...,INTP,0,0,0,1
3,dear enjoy convers day esoter gab natur univer...,INTJ,0,0,0,0
4,fire anoth silli misconcept approach logic go ...,ENTJ,1,0,0,0


In [45]:
# Bag-of-words features: a CountVectorizer capped at 2000 vocabulary entries,
# fitted on the pre-processed posts and materialised as a dense array.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2000)
features = cv.fit_transform(df_new['posts']).toarray()

# Pick the four binary targets by column name instead of by position
# (iloc[:, 2] .. iloc[:, 5]), so adding or reordering columns in df_new cannot
# silently swap the targets.
IE = df_new['I-E'].values
NS = df_new['N-S'].values
TF = df_new['T-F'].values
JP = df_new['J-P'].values

In [21]:
from sklearn.model_selection import train_test_split

# Split the features and all four target vectors in a single call, holding out
# 20% of the rows with a fixed random_state.
(
    features_train, features_test,
    IE_train, IE_test,
    NS_train, NS_test,
    TF_train, TF_test,
    JP_train, JP_test,
) = train_test_split(
    features, IE, NS, TF, JP,
    test_size=0.20,
    random_state=0,
)

In [22]:
from xgboost import XGBClassifier


def _fit_and_score(target_train, target_test):
    """Fit a default XGBClassifier on features_train and score it on both splits.

    Parameters
    ----------
    target_train, target_test :
        Target values matching features_train and features_test respectively.

    Returns
    -------
    tuple
        (fitted model, model.score on the training split,
        model.score on the test split).
    """
    model = XGBClassifier()
    model.fit(features_train, target_train)
    return (
        model,
        model.score(features_train, target_train),
        model.score(features_test, target_test),
    )


# One independent classifier per target; previously the same
# fit/score block was pasted four times.
IEB, ieb_train, ieb_test = _fit_and_score(IE_train, IE_test)
NSB, nsb_train, nsb_test = _fit_and_score(NS_train, NS_test)
TFB, tfb_train, tfb_test = _fit_and_score(TF_train, TF_test)
JPB, jpb_train, jpb_test = _fit_and_score(JP_train, JP_test)

In [23]:
# Report the train and test score of each of the four classifiers.
for axis_name, train_score, test_score in (
    ('I-E', ieb_train, ieb_test),
    ('N-S', nsb_train, nsb_test),
    ('T-F', tfb_train, tfb_test),
    ('J-P', jpb_train, jpb_test),
):
    print(axis_name + ' train score is :', train_score)
    print(axis_name + ' test score is :', test_score)

I-E train score is : 0.8096541786743516
I-E test score is : 0.7740634005763689
N-S train score is : 0.8661383285302594
N-S test score is : 0.8778097982708933
T-F train score is : 0.8239193083573487
T-F test score is : 0.7515850144092219
J-P train score is : 0.7438040345821325
J-P test score is : 0.652449567723343


In [34]:
def convert(post):
    """Turn one raw post into the cleaned, stemmed string that is fed to `cv`.

    Steps: replace URLs with a space, replace non-letters with a space,
    collapse spaces and lower-case, remove every label string in `lbl_rmv`,
    drop English stop words, and Porter-stem the remaining tokens.

    Parameters
    ----------
    post : str
        Text of a single post.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # Replace URLs with a single space.
    post = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', post)
    # Keep letters only. The original bound this result to a misspelled name
    # (`posts`), so the substitution was discarded and punctuation and digits
    # survived into the tokens.
    post = re.sub("[^a-zA-Z]", " ", post)
    post = re.sub(' +', ' ', post).lower()

    # Remove every label string; iterate the list itself rather than a
    # hard-coded range(0, 16).
    for label in lbl_rmv:
        post = re.sub(label, ' ', post)

    # Build the stop-word set once instead of once per token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # split() already ignores leading, trailing and repeated whitespace, and the
    # text is lower-case at this point.
    tokens = post.split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

In [25]:
# Inspect the rows at positions 6 through 11 of df_new.
df_new.iloc[6:12,:]

Unnamed: 0,posts,labels,I-E,N-S,T-F,J-P
6,draw nail haha done profession nail ye gel mea...,INFJ,0,0,1,0
7,tend build collect thing desktop use frequent ...,INTJ,0,0,0,0
8,sure good question distinct two depend percept...,INFJ,0,0,1,0
9,posit actual let go person due variou reason u...,INTP,0,0,0,1
10,one time parent fight dad affair dad push mom ...,INFJ,0,0,1,0
11,went break month ago togeth year plan life aro...,ENFJ,1,0,1,0


In [47]:
# Pick the post with index label 8 as a sample input (df['posts'] was already
# overwritten with cleaned text by the cleaning cell).
post = df['posts'][8]

In [48]:
# Run the sample through convert, then encode it with the fitted vectorizer cv.
# The single string is wrapped in a list before being passed to transform.
post = convert(post)
post = cv.transform([post]).toarray()

In [49]:
# Show the encoded feature array for the sample.
post

array([[0, 0, 0, ..., 0, 0, 0]])

In [50]:
# Predicted I-E class for the sample (map1 encoding: 0 = I, 1 = E).
IEB.predict(post)

array([0])

In [51]:
# Predicted J-P class for the sample (map4 encoding: 0 = J, 1 = P).
JPB.predict(post)

array([1])

In [52]:
# Predicted T-F class for the sample (map3 encoding: 0 = T, 1 = F).
TFB.predict(post)

array([1])

In [53]:
# Predicted N-S class for the sample (map2 encoding: 0 = N, 1 = S).
NSB.predict(post)

array([0])