In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#imports
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.tokenize import RegexpTokenizer
import string 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, RobustScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from collections import Counter
import wordcloud
# Use the fully-qualified option names. The bare 'max_rows' / 'max_columns' patterns are
# ambiguous on pandas >= 1.4 (they also match 'styler.render.max_rows' /
# 'styler.render.max_columns') and make set_option raise OptionError.
pd.set_option('display.max_rows',1000000)
pd.set_option('display.max_columns',10000)
sns.set(rc={'figure.figsize':(15,10)})
import spacy

In [None]:
# Load the MBTI dataset from the Kaggle input directory and preview its first rows.
df = pd.read_csv('/kaggle/input/mbti-type/mbti_1.csv')
df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

# EDA

In [None]:
# Count the rows of every column per personality type and order the types from the
# most to the least frequent (by the count of the "posts" column).
gr = (
    df.groupby('type')
      .count()
      .sort_values("posts", ascending=False)
)
gr

Let's plot the counts per personality type.

In [None]:
# Bar chart of the per-type counts computed in `gr`, drawn with the pandas plotting API.
# NOTE(review): each row of df appears to be one user (see the "Number of users" print
# further down), so these bars count users rather than individual posts - confirm and
# consider retitling.
plt.figure(figsize=(15,10))
gr['posts'].plot(kind='bar',title="Number of Posts per Personality type")

In [None]:
# Same per-type counts as the previous chart, drawn with seaborn and larger tick labels.
sns.barplot(x=gr.index,y='posts',data=gr,palette='rocket')
plt.title('Number of Posts per Personality type',fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

Let's look at the distribution of the total text length of each user's posts.

In [None]:
# LenP = number of characters in the raw "posts" string of each row
# (the "|||" separators between posts are included in the count).
df["LenP"] = df["posts"].apply(len)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour of
# histplot/displot - confirm the installed version before relying on it.
sns.distplot(df["LenP"]).set_title("Distribution of Lengths of all 50 Posts");

Plotting the number of posts per user.

In [None]:
# NumPosts = how many "|||"-separated pieces the "posts" string of each row contains.
df["NumPosts"] = df["posts"].apply(lambda x: len(x.split("|||")))

# Plain histogram (no density curve) of the per-row post counts.
sns.distplot(df["NumPosts"], kde=False).set_title("Number of Posts per User")

In [None]:
# Preview the frame again, now including the LenP and NumPosts columns.
df.head()

In [None]:
#Split to posts
def extract(posts, new_posts):
    """Append one ``(type, post)`` tuple to ``new_posts`` per individual post.

    Parameters
    ----------
    posts
        One user record. Either a label-indexed row (e.g. a DataFrame row
        Series or a dict) exposing ``"type"`` and ``"posts"``, or a plain
        sequence whose item 0 is the type and item 1 the joined posts string.
    new_posts : list
        Output list, extended in place.

    Returns
    -------
    None
    """
    try:
        # Address the fields by label: positional ``row[0]`` on a label-indexed
        # Series depends on column order and is deprecated in pandas 2.x.
        user_type, joined_posts = posts["type"], posts["posts"]
    except (KeyError, TypeError, IndexError):
        # Plain tuples/lists (or rows without those labels) keep the original
        # positional contract.
        user_type, joined_posts = posts[0], posts[1]

    # A user's posts are stored as a single string separated by "|||".
    new_posts.extend((user_type, post) for post in joined_posts.split("|||"))

# Flatten the per-user rows into one (type, post) tuple per individual post.
posts = []
# apply() is used only for its side effect of filling `posts`; its return value is discarded.
df.apply(lambda x: extract(x, posts), axis=1)
print("Number of users", len(df))
print("Number of posts", len(posts))

In [None]:
# Show only a small sample: echoing the whole list would render every
# (type, post) tuple into the cell output and bloat the notebook.
posts[:10]

In [None]:
# One row per individual post: the author's personality type and the post text.
new_df = pd.DataFrame(posts, columns=["type", "posts"])
new_df.head(100)

Finding the most common words in all posts.

In [None]:
# Split every post on whitespace and flatten the tokens into one list,
# then show the 40 most frequent tokens with their counts.
words = [token for post in new_df["posts"] for token in post.split()]
Counter(words).most_common(40)

Plotting the most common words as a word cloud.

In [None]:
# Build a word cloud from all tokens joined into one string; collocations=False
# is passed so that the cloud is built from single words rather than word pairs.
wc = wordcloud.WordCloud(width=1200, height=500, 
                         collocations=False, background_color="white", 
                         colormap="tab20b").generate(" ".join(words))
plt.figure(figsize=(25,10))
plt.imshow(wc, interpolation='bilinear')
# Assign to `_` so the axis() return value is not echoed as cell output.
_ = plt.axis("off")

In [None]:
def preprocess_text(df, remove_special=True):
    """Normalise the ``"posts"`` column of ``df`` for bag-of-words modelling.

    Every post goes through these steps, in order:

    1. ``|`` separators become spaces and ``http(s)://`` links are removed.
    2. ``.``, ``?`` and ``!`` are replaced by the marker words ``EOSTokenDot``,
       ``EOSTokenQuest`` and ``EOSTokenExs`` (lower-cased by step 4).
    3. Everything except ASCII letters and whitespace is dropped.
    4. The text is lower-cased.
    5. Words containing the same letter three or more times in a row are removed.
    6. Words of 1-3 characters and words of 30-1000 characters are removed.
    7. Optionally, MBTI type codes are replaced by `` PTypeToken ``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``"posts"``. The column is
        overwritten in place.
    remove_special : bool, default True
        When true, every occurrence of one of the 16 personality type codes
        (matched as a substring of the lower-cased text) is replaced by
        `` PTypeToken `` so that the label does not leak into the features.
        When false, the text is left untouched by this step.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Compile the type pattern only when it is needed; previously the pattern
    # was used unconditionally, so remove_special=False raised a NameError.
    type_pattern = None
    if remove_special:
        pers_types = ['INFP', 'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP', 'ISFP',
                      'ENTJ', 'ISTJ', 'ENFJ', 'ISFJ', 'ESTP', 'ESFP', 'ESFJ', 'ESTJ']
        type_pattern = re.compile("(" + "|".join(t.lower() for t in pers_types) + ")")

    def clean(text):
        # Remove links. A link ends at the first whitespace character; the old
        # character class `[\s+]` also ended it at a literal "+", leaving the
        # tail of such URLs in the text. The trailing space guarantees that a
        # link at the very end of the post is terminated too.
        text = re.sub(r'https?://\S*\s', '', text.replace("|", " ") + " ")

        # Keep end-of-sentence information as marker words
        text = re.sub(r'\.', ' EOSTokenDot ', text)
        text = re.sub(r'\?', ' EOSTokenQuest ', text)
        text = re.sub(r'!', ' EOSTokenExs ', text)

        # Strip punctuation, digits, underscores and any non-ASCII characters
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # To lower
        text = text.lower()

        # Remove words with a letter repeated 3+ times in a row (e.g. "sooo").
        # Only the offending word is removed; the old pattern consumed all the
        # text that followed it as well.
        text = re.sub(r'\b[a-z]*([a-z])\1{2,}[a-z]*\b', '', text)

        # Remove short/long words
        text = re.sub(r'(\b\w{0,3})?\b', '', text)
        text = re.sub(r'(\b\w{30,1000})?\b', '', text)

        # Remove personality type words.
        # This is crucial in order to get a valid accuracy estimate for unseen data.
        if type_pattern is not None:
            text = type_pattern.sub(' PTypeToken ', text)
        return text

    df["posts"] = df["posts"].apply(clean)
    return df

In [None]:
#Preprocess Text
# NOTE(review): the call below is disabled, so every later cell in this notebook works on
# the raw post text (links and personality-type mentions included).
#new_df = preprocess_text(new_df)

# MODELLING

In [None]:
#Remove posts with less than X words
min_words = 15
print("Number of posts", len(new_df))
# nw = number of \w+ tokens in each post
new_df["nw"] = new_df["posts"].apply(lambda x: len(re.findall(r'\w+', x)))
# .copy() detaches the filtered rows from the original frame, so columns can be
# added to new_df afterwards without pandas raising SettingWithCopyWarning.
new_df = new_df[new_df["nw"] >= min_words].copy()
print("Number of posts", len(new_df))

Encoding the personality types as integer labels.

In [None]:
# Map the personality type strings to integer class ids; `enc` is kept so the ids
# can be mapped back to type names later.
enc = LabelEncoder()
new_df['type_enc'] = enc.fit_transform(new_df['type'])
target = new_df['type_enc']
target.head()

In [None]:
# Preview the per-post frame with the added nw and type_enc columns.
new_df.head(100)

In [None]:
# Load the spaCy 'en_core_web_lg' pipeline.
# NOTE(review): in the cells visible here `nlp` is only referenced by the commented-out
# embedding code below - confirm it is still needed before paying for the load.
nlp = spacy.load('en_core_web_lg')
#train_vector = np.array([nlp(text).vector for text in new_df.posts])
#print(train_vector.shape)

Vectorizing the posts for the model.

In [None]:
# Bag-of-words counts with scikit-learn's built-in English stop-word list.
# Note: the vocabulary is fitted on all rows here, i.e. before the train/test split
# performed further down.
vect = CountVectorizer(stop_words='english') 
train =  vect.fit_transform(new_df["posts"])

In [None]:
# (number of posts, vocabulary size) of the count matrix.
train.shape

Splitting the data into train and test sets.

In [None]:
# Hold out 10% of the posts for testing, stratified by class, with a fixed seed.
# NOTE(review): the split is done per post, so posts written by the same user can land
# in both the train and the test part - confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.1, stratify=target, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Training the model.

In [None]:
# Fit an XGBoost classifier with its default hyper-parameters and report the
# accuracy on the held-out posts (displayed as the cell's output).
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
accuracy_score(y_test,xgb.predict(X_test))