In [1]:
from textblob import TextBlob
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np 

In [2]:
import nltk
import re
import pickle
import gc
import spacy
import en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
nlp.max_length = 1308656
from collections import Counter

In [3]:
df = pd.read_csv("mbti_1.csv")
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [5]:
df["single_posts"] = df.posts.apply(lambda x: np.random.choice(x.split("|||")))

In [6]:
posts = df.single_posts
len(posts)

8675

In [7]:
df2 = df.loc[:, ["type", "single_posts"]]
# del df
# gc.collect()

Clean Data using lambda

In [8]:
df2['clean_text'] = (df2['single_posts'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x).lower()))
df2.clean_text.head(30)

0     basically come up with three items you ve dete...
1      d i m a nightowl  i wake up between    pm and...
2     oh  i didn t know that   what a pity   why not...
3     just stab in the dark here  i don t think that...
4                           what s this we thing about 
5                                                      
6     mental health is such a in thing nowadays a lo...
7     what do you mean by humanities  i like to thin...
8     frustration got the best of me today  and has ...
9     i cannot remember the last time i was this att...
10    i think the trade i m talking about is an se d...
11          https   www youtube com watch v plaaikvhvzs
12                                   some sort of isxx 
13    i agree with chanteuse  personally  it would d...
14    i m not sure about social protocol in the area...
15    my intuition usually allows me to remember suc...
16    roughly     of my friends that know their type...
17    i knew a guy that was homeless  he was one

In [9]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8675 non-null   object
 1   single_posts  8675 non-null   object
 2   clean_text    8675 non-null   object
dtypes: object(3)
memory usage: 203.4+ KB


In [10]:
tokens = nlp(''.join(str(df2.clean_text.tolist())))

In [11]:
len(tokens)

297048

In [12]:
from spacy.lang.en.stop_words import STOP_WORDS

In [13]:
stopwords = nlp.Defaults.stop_words


In [14]:
tokens2= [word for word in tokens if not word in stopwords]

In [15]:
items = [x.text for x in tokens2]
Counter(items).most_common(20)

[(' ', 20425),
 ("'", 17305),
 ('i', 13049),
 ('  ', 9463),
 (',', 8674),
 ('the', 6289),
 ('to', 6102),
 ('a', 5049),
 ('and', 4960),
 ('you', 3890),
 ('of', 3732),
 ('it', 3725),
 ('that', 3385),
 ('is', 2822),
 ('in', 2779),
 ('my', 2648),
 ('   ', 2457),
 ('t', 2275),
 ('s', 2275),
 ('but', 1904)]

In [16]:
MBTI_types = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
              'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ',
              'MBTI']
MBTI_types = [ti.lower() for ti in MBTI_types] + [ti.lower() + 's' for ti in MBTI_types]

In [17]:
words = [ti for ti in tokens2 if not ti.like_num]
words = [ti for ti in tokens2 if not ti.like_url]
words = [ti for ti in tokens2 if not ti.like_email]
words = [ti for ti in tokens2 if ti.lower_ not in MBTI_types]

In [18]:
words = [token.lemma_ for token in tokens2
            if not token.is_stop and not token.is_punct and not token.is_space]

In [19]:
items2 = [x for x in words if len(x)>2]
Counter(items2).most_common(20)

[('like', 1522),
 ('think', 1304),
 ('don', 1007),
 ('people', 985),
 ('know', 917),
 ('thing', 744),
 ('feel', 701),
 ('good', 677),
 ('time', 646),
 ('type', 542),
 ('want', 539),
 ('love', 499),
 ('com', 497),
 ('watch', 448),
 ('find', 444),
 ('friend', 439),
 ('way', 415),
 ('try', 414),
 ('lot', 389),
 ('get', 375)]

In [20]:
len(items2)

93532

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf=TfidfVectorizer()
text_tf= tf.fit_transform(items2)

In [22]:
tf_df2 = pd.DataFrame(text_tf.toarray(), columns=tf.get_feature_names())

In [23]:
sentiments = [((TextBlob(phrase).sentiment.polarity)) for phrase in items2 ]

tf_df2["Sentiment"] = sentiments

In [24]:
tf_df2.head()

Unnamed: 0,aaaa,aaaaaaaaa,aaaaaaaaaarrgggh,aaaaaaaaacw,aaaaaaaaaie,aaaaaaaabwq,aaaaaaaafug,aaaaaand,aaaah,aaah,...,zun,zuqk,zww,zwxs,zxkbd,zyjl,zylinder,zzlvav,zzzzz,Sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
tf_df2.shape

(93532, 12182)

In [26]:
tf_df2.Sentiment.describe()

count    93532.000000
mean         0.020616
std          0.182925
min         -1.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Sentiment, dtype: float64

In [27]:
tf_df2["_target"] = df2["type"]

In [28]:
tf_df2.head()

Unnamed: 0,aaaa,aaaaaaaaa,aaaaaaaaaarrgggh,aaaaaaaaacw,aaaaaaaaaie,aaaaaaaabwq,aaaaaaaafug,aaaaaand,aaaah,aaah,...,zuqk,zww,zwxs,zxkbd,zyjl,zylinder,zzlvav,zzzzz,Sentiment,_target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,INFJ
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENTP
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,INTP
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,INTJ
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENTJ


In [None]:
tf_df2.to_json('tf_df2.json', orient='records')