# MBTI Notebook

In [71]:
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
import re
import os
import time
import random
import asyncio

from urlextract import URLExtract
from googletrans import Translator

In [72]:
df = pd.read_csv('../dataset/mbti_1.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [74]:
df.describe()

Unnamed: 0,type,posts
count,8675,8675
unique,16,8675
top,INFP,'It has been too long since I have been on per...
freq,1832,1


In [75]:
df['posts'][0]

"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...

## Data pre-processing

## remove all url from the posts

In [76]:
def remove_url(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    
    for url in urls:
        text = text.replace(url, ' ')

    return text

In [77]:
# save or run is not availabe
if not os.path.exists('../dataset/preprocessed/mbti_no_urls.csv'):
    df['posts'] = df['posts'].apply(remove_url)
    df.to_csv('../dataset/preprocessed/mbti_no_urls.csv', index=False, header=True)
else:
    df = pd.read_csv('../dataset/preprocessed/mbti_no_urls.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,' and intj moments sportscenter not top t...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,"'Good one _____ course, to which I say I ..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


### Reload Dataset and strip off pipe characters and Special Characters

In [78]:
df_no_urls = pd.read_csv('../dataset/preprocessed/mbti_no_urls.csv')

In [79]:
def strip_pipe(text):
    return re.sub(r"\|", "", text)

In [80]:
df_no_urls['posts'] = df_no_urls['posts'].apply(strip_pipe)

df_no_urls.head()

Unnamed: 0,type,posts
0,INFJ,' and intj moments sportscenter not top t...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,"'Good one _____ course, to which I say I ..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.That's another silly misconcepti...


In [81]:
def strip_special_characters(text):
    text = re.sub(r"[^\w\s]|_", " ", text)
    return text

In [82]:
df_no_urls['posts'] = df_no_urls['posts'].apply(strip_special_characters)
df_no_urls.to_csv('../dataset/preprocessed/mbti_no_urls_cleaned.csv', index=False, header=True)
df_no_urls.head()

Unnamed: 0,type,posts
0,INFJ,and intj moments sportscenter not top t...
1,ENTP,I m finding the lack of me in these posts ver...
2,INTP,Good one course to which I say I ...
3,INTJ,Dear INTP I enjoyed our conversation the o...
4,ENTJ,You re fired That s another silly misconcepti...


### Perform Lemmatization on post

Convert all English words into their base words. For example eating, ate and eatten become eat

In [83]:
nlp  = spacy.load("en_core_web_sm")


def lemmatize_posts(text, allow_stop_words=False):
    doc = nlp(text)
    lemmatized_words = []
    
    for token in doc:
        if (not allow_stop_words and token.is_stop) or token.is_punct:
            continue
        lemmatized_words.append(token.lemma_)
        
    return " ".join(lemmatized_words)

In [84]:
if not os.path.exists('../dataset/preprocessed/mbti_lemmatized.csv'):
    df_lemmatized = pd.read_csv('../dataset/preprocessed/mbti_no_urls_cleaned.csv')
    df_lemmatized['posts'] = df_lemmatized['posts'].apply(lemmatize_posts)
    df_lemmatized.to_csv('../dataset/preprocessed/mbti_lemmatized.csv', index=False, header=True)
else:
    df_lemmatized = pd.read_csv('../dataset/preprocessed/mbti_lemmatized.csv')
    
df_lemmatized

Unnamed: 0,type,posts
0,INFJ,intj moment sportscenter play pr...
1,ENTP,m find lack post alarming Sex boring s posit...
2,INTP,good course know s blessing ...
3,INTJ,Dear INTP enjoy conversation day esot...
4,ENTJ,fire s silly misconception approach logica...
...,...,...
8670,ISFP,think cat Fi dom reason website have...
8671,ENFP,thread exist someplace heck delete...
8672,INTP,question thing purple pill pick win lot...
8673,INFP,conflicted right come want child honestly...
