This notebook attempts to classify MBTI personality type from text.
It covers:
1. Data Visualization
2. Text Preprocessing 
3. Converting Text to numbers
4. Training and Test Sets
5. Training Text Classification Model and Predicting Sentiment
6. Evaluating the Model 

The data are not equally distributed across the types. Therefore, I will use only 4 types: 2 Extroverted and 2 Introverted.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import matplotlib.pyplot as plt
from matplotlib.pyplot import *
import seaborn as sns
import plotly.express as px
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None

from nltk.corpus import stopwords

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier




# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        file_path = os.path.join(root, name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the MBTI CSV from the Kaggle input directory; the bare `df.head()` shows
# the first rows as the cell output.
df = pd.read_csv('../input/mbti-type/mbti_1.csv')
df.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Sorted copy of the data, ordered by type; `df` itself is left untouched
# (inplace=False).  The bare name displays the result as the cell output.
a = df.sort_values(by=["type"], ascending=True, inplace=False)
a

## Data Visualization

In this section, I visualize all of the data and then select 4 types that are roughly equally distributed to use in the rest of this notebook.

In [None]:
# Resize the current figure to 20x5 inches, then draw one bar per type showing
# how many rows carry that type.
fig = plt.gcf()
fig.set_size_inches(20, 5)
sns.countplot(x="type", data = df)

In [None]:
# Count rows per type, order the counts from smallest to largest, and plot them
# as an interactive bar chart.
grouped_data = (
    df.groupby(['type'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=True)
)
fig = px.bar(
    grouped_data,
    x='type',
    y='Count',
    title='Distribution Of Each Types',
)
fig.show()

In [None]:
# Row count for each type as a table indexed by type; displayed as cell output.
a = df.groupby('type').agg({'type':'count'})
a

Here we drop the data that we will not use.

In [None]:
# Drop the twelve types that are not studied here; assuming the data holds only
# the 16 standard MBTI codes, this leaves ISFJ, ENFJ, ISTJ and ENTJ.
# The explicit .copy() makes `b` an independent frame, so the columns added to
# it in later cells are not written to a filtered slice of `df`.
excluded_types = ['ESTJ', 'ESFJ', 'ESFP', 'ESTP', 'ISFP', 'ISTP',
                  'ENFP', 'ENTP', 'INTJ', 'INTP', 'INFJ', 'INFP']
b = df[~df['type'].isin(excluded_types)].copy()
b

In [None]:
# Same per-type count chart as before, restricted to the rows kept in `b`.
grouped_data = (
    b.groupby(['type'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=True)
)
fig = px.bar(
    grouped_data,
    x='type',
    y='Count',
    title='Distribution of ISFJ, ENFJ, ISTJ, ENTJ',
)
fig.show()

In [None]:
def var_row(row):
    """Return the variance of the per-post word counts in one row.

    `row` is a single string holding several posts separated by '|||'.
    Each post is split on whitespace to count its words, and the variance
    of those counts (np.var, i.e. population variance) is returned.
    """
    word_counts = [len(post.split()) for post in row.split('|||')]
    return np.var(word_counts)

# Two per-row text statistics derived from the raw posts.
# NOTE(review): the fixed divisor 50 presumably reflects 50 posts per row -
# confirm against the dataset.
posts_column = b['posts']
b['words_per_comment'] = posts_column.map(str.split).map(len) / 50
b['variance_of_word_counts'] = posts_column.map(var_row)
b.head()

In [None]:
# Swarm plot of the average words per comment, one column of points per type.
# x and y are passed by keyword (as the countplot above does): seaborn >= 0.12
# no longer accepts them as positional arguments.
plt.figure(figsize=(15,10))
sns.swarmplot(x="type", y="words_per_comment", data=b)

## **Text Preprocessing**

In this section, we remove all the special characters, numbers, and unwanted spaces from our text.

In [None]:
##### Compute list of subject with Type | list of comments 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
       'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']

# Encoder that maps each type code to an integer id; pre_process_data() reads it
# as a global.  It is fitted on the upper-case codes (before the list is
# lower-cased below) because the `type` column is filtered and mapped with
# upper-case codes elsewhere in this notebook.
lab_encoder = LabelEncoder().fit(unique_type_list)

unique_type_list = [x.lower() for x in unique_type_list]

# Stemmer and lemmatiser instances; pre_process_data() uses only the lemmatiser.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

# English stop words, fetched once here; pre_process_data() reads this global
# when remove_stop_words is true.
cachedStopWords = stopwords.words("english")

def pre_process_data(b, remove_stop_words=True):
    """Clean the posts of every row and encode its type label.

    For each row, URLs in the posts text are replaced by the word "link",
    every non-letter character is replaced by a space, the text is
    lower-cased, and each remaining word is lemmatised.  A progress line is
    printed every 500 rows.

    Relies on the module-level names `lemmatiser`, `lab_encoder` and (only
    when `remove_stop_words` is true) `cachedStopWords`.

    Args:
        b: DataFrame with a `posts` column (text) and a `type` column
            (labels known to `lab_encoder`).
        remove_stop_words: when true, words found in `cachedStopWords` are
            dropped before lemmatising.

    Returns:
        A tuple `(list_posts, list_personality)` of NumPy arrays holding the
        cleaned texts and the encoded labels, in row order.
    """
    # Compiled once instead of being re-parsed for every row.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    non_letter_pattern = re.compile(r"[^a-zA-Z]")

    # A set gives O(1) membership tests; scanning a list once per word is
    # needlessly slow.  An empty set keeps every word.
    stop_words = set(cachedStopWords) if remove_stop_words else frozenset()

    list_personality = []
    list_posts = []
    len_data = len(b)

    rows = zip(b["posts"], b["type"])
    for i, (posts, personality) in enumerate(rows, start=1):
        if i % 500 == 0:
            print("%s | %s rows" % (i, len_data))

        temp = url_pattern.sub('link', posts)
        temp = non_letter_pattern.sub(" ", temp).lower()

        # split() without an argument collapses runs of spaces and never yields
        # empty tokens, so no '' reaches the lemmatiser and the joined text has
        # no stray leading/trailing space.
        words = [lemmatiser.lemmatize(w) for w in temp.split() if w not in stop_words]

        list_posts.append(" ".join(words))
        list_personality.append(lab_encoder.transform([personality])[0])

    return np.array(list_posts), np.array(list_personality)

# Clean the posts in `b` (stop words removed) and encode the matching type labels.
list_posts, list_personality = pre_process_data(b, remove_stop_words=True)

In [None]:
# Sanity check: both arrays should report the same number of entries.
print("Num posts and personalities: ",  list_posts.shape, list_personality.shape)

In [None]:
# Show the first cleaned text to inspect the preprocessing result.
list_posts[0]

## **Converting Text to Numbers**

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

# Posts to a matrix of token counts
# Bag-of-words vectorizer over word tokens: at most 1500 features, with
# document-frequency bounds min_df=0.1 and max_df=0.7 (as floats these are
# proportions of documents).
cntizer = CountVectorizer(analyzer="word",
                             max_features=1500,
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_df=0.7,
                             min_df=0.1)

# Learn the vocabulary from the cleaned posts and build the term-document count matrix.
print("CountVectorizer...")
X_cnt = cntizer.fit_transform(list_posts)

# Transformer that re-weights a count matrix as tf-idf.
tfizer = TfidfTransformer()

print("Tf-idf...")
# Fit the idf weights on the counts, transform them, and convert the result to
# a dense array.
# NOTE(review): X_tfidf is not used by any later cell in this notebook.
X_tfidf =  tfizer.fit_transform(X_cnt).toarray()

In [None]:
# (index, word) pairs for the learned vocabulary, displayed as the cell output.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back to the old name on older releases.
if hasattr(cntizer, "get_feature_names_out"):
    vocabulary_terms = cntizer.get_feature_names_out()
else:
    vocabulary_terms = cntizer.get_feature_names()
feature_names = list(enumerate(vocabulary_terms))
feature_names

## **Text Preprocessing Version 2**

In [None]:
def preprocess_inputs(df):
    """Turn raw posts and type labels into padded sequences and binary labels.

    Texts are lower-cased, split on whitespace and stripped of English stop
    words, then converted to integer sequences with a Keras `Tokenizer`
    limited to `vocab_length` words and post-padded to the longest sequence.
    The labels ENTJ/ENFJ are mapped to 1 and ISFJ/ISTJ to 0 (1 when the code
    starts with 'E'); any other label value is left unchanged.

    Args:
        df: DataFrame with a `posts` column (text) and a `type` column.

    Returns:
        A tuple `(texts, labels, max_seq_length, vocab_length, label_mapping)`:
        the padded sequence array, the label array, the padded length, the
        tokenizer vocabulary size (10000) and the label-to-int dict.
    """
    # Read from the argument; the original body ignored `df` and used the
    # global `b` instead.
    texts = df['posts'].copy()
    labels = df['type'].copy()

    # Process text data.  A set makes the per-word stop-word test O(1).
    stop_words = set(stopwords.words('english'))

    # split() already yields whitespace-free tokens, so no extra strip is needed.
    texts = [
        [word for word in text.lower().split() if word not in stop_words]
        for text in texts
    ]

    vocab_length = 10000

    tokenizer = Tokenizer(num_words=vocab_length)
    tokenizer.fit_on_texts(texts)

    texts = tokenizer.texts_to_sequences(texts)

    max_seq_length = np.max([len(text) for text in texts])

    texts = pad_sequences(texts, maxlen=max_seq_length, padding='post')

    # Process label data
    label_values = ['ENTJ', 'ENFJ', 'ISFJ', 'ISTJ']

    # Built-in int(): the `np.int` alias was removed in NumPy 1.24.
    label_mapping = {label: int(label[0] == 'E') for label in label_values}

    labels = labels.replace(label_mapping)
    labels = np.array(labels)

    return texts, labels, max_seq_length, vocab_length, label_mapping

In [None]:
# Build padded token sequences and binary extrovert/introvert labels from `b`.
texts, labels, max_seq_length, vocab_length, label_mapping = preprocess_inputs(b)

In [None]:
# Report the shapes and settings returned by preprocess_inputs().
print("Text sequences:\n", texts.shape)
print("\nLabels:\n", labels.shape)
print("\nMax sequence length:\n", max_seq_length)
print("\nVocab length:\n", vocab_length)
print("\nLabel mapping:\n", label_mapping)

## **Training and Testing Sets**

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, train_size=0.7, random_state=123)

In [None]:
# Display the padded integer sequences.
texts

## **Training Text Classification Model and Predicting Sentiment**

In [None]:
# Random forest with 1000 trees and a fixed seed, fitted on the training split.
# NOTE(review): the features here are the padded token-id sequences, not the
# tf-idf matrix computed earlier - confirm this is intended.
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(texts_train, labels_train) 

In [None]:
# Predict labels for the held-out test sequences.
labels_pred = classifier.predict(texts_test)

## **Evaluating The Model**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Confusion matrix, per-class precision/recall/F1 report, and overall accuracy
# of the random-forest predictions on the test split.
print(confusion_matrix(labels_test,labels_pred))
print(classification_report(labels_test,labels_pred))
print(accuracy_score(labels_test, labels_pred))

The accuracy is only 45%. Let's try another approach: an RNN using TensorFlow.

**RNN using TensorFlow**

In [None]:
# Size of the vector learned for each token id.
embedding_dim = 512

# One padded sequence of token ids per sample.
inputs = tf.keras.Input(shape=(max_seq_length,))

# Look up a trainable embedding_dim-sized vector for each token id.
# NOTE(review): newer Keras releases deprecate the `input_length` argument -
# confirm against the installed version.
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim,
    input_length=max_seq_length
)(inputs)

# Bidirectional GRU; return_sequences=True keeps the output of every time step.
gru = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(
        units=256,
        return_sequences=True
    )
)(embedding)

# Flatten the per-step outputs into one vector per sample.
flatten = tf.keras.layers.Flatten()(gru)

# Single sigmoid unit for the binary label produced by preprocess_inputs().
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)


model = tf.keras.Model(inputs, outputs)


# Binary cross-entropy loss, tracking accuracy and AUC during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)


# Train for 5 epochs, holding out 20% of the training split for validation.
# The checkpoint callback writes weights only, and only when the monitored
# metric improves (save_best_only=True).
# NOTE(review): Keras 3 requires weight-only checkpoint paths to end in
# '.weights.h5' - confirm against the installed version before upgrading.
history = model.fit(
    texts_train,
    labels_train,
    validation_split=0.2,
    batch_size=32,
    epochs=5,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint('./model.h5', save_best_only=True, save_weights_only=True)
    ]
)

In [None]:
# Restore the weights written by the checkpoint callback during training.
model.load_weights('./model.h5')

In [None]:
# Compute the loss and the compiled metrics (accuracy, AUC) on the test split.
model.evaluate(texts_test, labels_test)

The accuracy using the RNN is 68%.