# Importing libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score
import pickle
import os.path
import plotly.offline as pyo
import plotly.graph_objs as go
import spacy
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
#

# Context

MBTI (Myers-Briggs Type Indicator) is an introspective self-report questionnaire indicating differing psychological preferences (cognitive functions) in how people perceive the world and make decisions

This study was made based on the kaggle dataset https://www.kaggle.com/zeyadkhalid/mbti-personality-types-500-dataset

# Content

    ~106K records of preprocessed posts and their authors' personality types.
    
    Posts are equal-sized: 500 words per sample   

# About the dataset

Posts are preprocessed texts:

    - No punctuations, stopwords, URLs
    
    - Lemmatization
    
    - Reconstruct samples to be equal-sized chunks (500 words per sample)
    
Personality types are 16 unique values

# Stop words

As said, this dataset doesn't has any stop words

"Stop words" are words that appears so frequently that don't require tagging as thoroughly as nouns, verbs and modifiers

Let's use the library Spacy to see examples of english stop words

Spacy is the Industrial Strength Natural Language Processing: https://spacy.io/

In [5]:
# Load one of the availables trained pipelines for English
# English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.
nlp = spacy.load('en_core_web_sm')


# stop words built in spacy (english)
print(nlp.Defaults.stop_words)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [4]:
print(f"Number of default stop words : {len(nlp.Defaults.stop_words)}")

Number of default stop words : 326


In [5]:
# Checking if a word is a stop word
nlp.vocab['is'].is_stop

True

In [6]:
nlp.vocab['below'].is_stop

True

In [7]:
nlp.vocab['btw'].is_stop

False

# Lemmatization

As said before, this dataset also has Lemmatization preprocess feature

In order to understand lemmatization, first we'll look at the concept of Stemming

### Stemming

Stemming is used to return similarities words on the search process. 
 
 - Example: search=boat, also returns "boats" and "boating"

Let's use a sophisticated stemmer, the SnowballStemmer from NLTK (natural language toolkit)

In [8]:
s_stemmer = SnowballStemmer(language='english')

In [9]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness','boats','boating']

In [10]:
for word in words:
    print(word+ ' ------> ' + s_stemmer.stem(word))

run ------> run
runner ------> runner
ran ------> ran
runs ------> run
easily ------> easili
fairly ------> fair
fairness ------> fair
boats ------> boat
boating ------> boat


Now, let's look about Lemmatization

### Lemmatization example

In constrast with stemming, Lemmatization looks beyond word reduction, and considers a language's full vocabulary to apply a morphological analysis to words.

In [11]:
# Function to display lemmas
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [12]:
doc = nlp(u"I saw eighteen mice today!")
show_lemmas(doc)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [13]:
doc = nlp(u"I am meeting him tomorrow at the meeting.")
show_lemmas(doc)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   1655312771067108281    he
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
.            PUNCT  12646065887601541794   .


Now it's time to talk about Text Feature Extraction

# Text Feature Extraction

After preprocess data it's time to extract features from the text in order to prepare the machine learning model

### Count Vectorization

1. Treats each word of a text individually as a feature

2. After that, counts each occurrence of each word in the document

3. Than, makes a matrix DTM (Document Term Matrix)

In [14]:
count_vect = CountVectorizer()

In [15]:
phrase = ["I'd like to have a glass of water please"]

In [16]:
# Fit Vectorizer to the Data (build a vocab, count the number of words...)
# Learn a vocabulary dictionary of all tokens in the raw documents
count_vect.fit(phrase)

CountVectorizer()

In [17]:
# Show features
count_vect.get_feature_names()

['glass', 'have', 'like', 'of', 'please', 'to', 'water']

In [18]:
# Learn the vocabulary dictionary and return document-term matrix
count_vect.fit_transform(phrase)

<1x7 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [19]:
# shows a mapping of terms to feature indices.
count_vect.vocabulary_

{'like': 2, 'to': 5, 'have': 1, 'glass': 0, 'of': 3, 'water': 6, 'please': 4}

## TfidVectorizer

An alternative to CountVectorizer is the TfidVectorizer

TfidVectorizer calculates an inverse frequency for each word

It converts a collection of raw documents to a matrix of TF-IDF features.

TfidVectorizer will be used to create the machine learning model for this study

---

---

# Read the dataset into a pandas dataframe

Now it's time to read the dataset and make a simple exploratory analysis

'G:\\내 드라이브\\Github\\2022-Data-ton'

In [24]:
df = pd.read_csv('../dataset/MBTI_train.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/MBTI_train.csv'

In [21]:
df.head()

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ


In [22]:
df['posts'][0]

'know intj tool use interaction people excuse antisocial truly enlighten mastermind know would count pet peeze something time matter people either whether group people mall never see best friend sit outside conversation jsut listen want interject sit formulate say wait inject argument thought find fascinate sit watch people talk people fascinate sit class watch different people find intrigue dad intj u stand look like line safeway watch people home talk people like think military job people voluntarily go job important show deference endanger live glorify way civilian think pretty ignorant general think military necessary defense mechanism political tactic feel like u specifically invest much money could put money education whatnot though personally sound budget aernative really comment one way base two politician eye year ago come name somewhat important kinda role model nowadays pick keep score individual level mean little vary accord number condition day may score high others low sw

In [23]:
df['type'][0]

'INTJ'

In [24]:
df['type'].unique()

array(['INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP', 'ENFJ', 'ENFP',
       'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP', 'INFJ', 'INFP'],
      dtype=object)

In [25]:
print(f"Total of {len(df['type'].unique())} types of classified MBTI posts")

Total of 16 types of classified MBTI posts


# Checking null values

In [26]:
df.isnull().sum()

posts    0
type     0
dtype: int64

# Checking the number of posts per type

In [27]:
df_bar_chart=df.groupby('type').count()


trace1 = go.Bar(x=df_bar_chart.index, y=df_bar_chart['posts'])

data = [trace1]
layout = go.Layout(title='MBTI # Classified Posts per Type')

fig = go.Figure(data=data, layout=layout)

fig.show()

# Recreate the model?

This machine learning model takes it's time to train data

To avoid waiting every time, We're going to use the feature dump/load from pickle

In [28]:
# Flag to re-create or not the machine learning model
recreate_model=False

In [29]:
# We'll save the model into a file:
filename = 'mbti_svm_v2.sav'

In [30]:
# If the model file doesn't exists
if not os.path.isfile(filename):
    recreate_model=True

# Model

The machine learning supervised model that we'll use here is a Classification kind, named Support Vector Machine

References:

 - https://en.wikipedia.org/wiki/Support-vector_machine

- https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

### Model Pipeline

We're going to need a pipelined model ir order to facilitate the entire process of CountVectorizer (TfidfVectorizer) and svm.LinearSVC model

To do that, we're going to use the Pipeline feature from sklearn.pipeline

References: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

# Split the data into train and test

In [31]:
X = df['posts'] # features
y = df['type']  # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model, save it to disk and open to make predictions

In [32]:
# Check if need to recreate the model
if recreate_model:    
    
    # Creating an instance to vectorizer:
    vectorizer = TfidfVectorizer()
    
    # Training the vectorizer:
    X_train_tfidf = vectorizer.fit_transform(X_train)
    
    # Training the classifier:
    clf = LinearSVC()
    clf.fit(X_train_tfidf, y_train)
    
    # Pipelining the vectorizer and the classifier
    text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])
    text_clf.fit(X_train, y_train)
    
    # saving the model to disk
    pickle.dump(text_clf, open(filename, 'wb'))

# If there is no need to recreate the model, just open the file from the disk    
else:
    # loading the model from disk
    text_clf = pickle.load(open(filename, 'rb'))

# Using the test data to make predictions and analyze the accurace of the model

In [33]:
predictions = text_clf.predict(X_test)

In [34]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        ENFJ       0.84      0.58      0.69       319
        ENFP       0.82      0.78      0.80      1249
        ENTJ       0.90      0.80      0.84       577
        ENTP       0.86      0.83      0.84      2324
        ESFJ       0.83      0.45      0.59        33
        ESFP       0.88      0.48      0.62        75
        ESTJ       0.90      0.84      0.87       105
        ESTP       0.95      0.90      0.92       398
        INFJ       0.81      0.84      0.83      2954
        INFP       0.80      0.82      0.81      2391
        INTJ       0.83      0.87      0.85      4531
        INTP       0.84      0.87      0.86      5033
        ISFJ       0.80      0.61      0.69       132
        ISFP       0.81      0.60      0.69       161
        ISTJ       0.86      0.68      0.76       253
        ISTP       0.89      0.79      0.84       679

    accuracy                           0.84     21214
   macro avg       0.85   

In [35]:
print(f"Overall accuracy of the model: {round(metrics.accuracy_score(y_test, predictions),2)}")

Overall accuracy of the model: 0.84


# End