# Importing libraries

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score
import pickle
import os.path
import plotly.offline as pyo
import plotly.graph_objs as go
# import spacy
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
import warnings

warnings.filterwarnings(action='ignore') 

# Text Feature Extraction

After preprocess data it's time to extract features from the text in order to prepare the machine learning model

### Count Vectorization

1. Treats each word of a text individually as a feature

2. After that, counts each occurrence of each word in the document

3. Than, makes a matrix DTM (Document Term Matrix)

In [29]:
count_vect = CountVectorizer()

In [30]:
phrase = ["I'd like to have a glass of water please"]

In [31]:
# Fit Vectorizer to the Data (build a vocab, count the number of words...)
# Learn a vocabulary dictionary of all tokens in the raw documents
count_vect.fit(phrase)

CountVectorizer()

In [32]:
# Show features
count_vect.get_feature_names()

['glass', 'have', 'like', 'of', 'please', 'to', 'water']

In [33]:
# Learn the vocabulary dictionary and return document-term matrix
count_vect.fit_transform(phrase)

<1x7 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [34]:
# shows a mapping of terms to feature indices.
count_vect.vocabulary_

{'like': 2, 'to': 5, 'have': 1, 'glass': 0, 'of': 3, 'water': 6, 'please': 4}

## TfidVectorizer

An alternative to CountVectorizer is the TfidVectorizer

TfidVectorizer calculates an inverse frequency for each word

It converts a collection of raw documents to a matrix of TF-IDF features.

TfidVectorizer will be used to create the machine learning model for this study

# Read the dataset into a pandas dataframe

Now it's time to read the dataset and make a simple exploratory analysis

In [35]:
df = pd.read_csv('G://내 드라이브/Github/2022-Data-ton/dataset/MBTI_train.csv', header=None)

In [36]:
df.head

<bound method NDFrame.head of           0                                                  1
0      INTP  say process model list like subscriber channel...
1      INFJ  upon much manipulate retail finish like sacrif...
2      INFJ  fit yes certain bff social feel goal go know n...
3      INTJ  complete love within someone ideal joke solvea...
4      ENTJ  public strictly thing person x question person...
...     ...                                                ...
74352  INTP  get consequence process life back kind great b...
74353  ENFP  mundane really right vibe natural u conscious ...
74354  INFP  diva message remember practical lot absorb tel...
74355  INTJ  increase impressive group make recognize get a...
74356  ENTJ  restaurant negative entj resent people sing ma...

[74357 rows x 2 columns]>

In [37]:
df.columns = ['type', 'posts']

In [38]:
df['posts'][0]

'say process model list like subscriber channel region act position since without hear resus help study sense external specialize movement dude thousand yes even get essential part road involve expressiveness like factor never probably le non think dedicate medium task take brain oh architectural would diagnose technology college precisely fi understand dominant dom security run make hard tribe lot stop opinion bad washinon solely best extremely sound since patreon back differently mean differ cia high discrimination make two si holistic though passage would make director continue else task enigma question yet whatever field probably mean part arbitrary hedonism ask post find broaden matter read lot sql essential cuure start language think month demand intp plz like like without exam help get system something einstein though would stream create personal sort relative would k would language k man flat example without typically learn read still teach problem split function mean long ment

In [39]:
df['type'][0]

'INTP'

In [40]:
df['type'].unique()

array(['INTP', 'INFJ', 'INTJ', 'ENTJ', 'ENTP', 'INFP', 'ISTP', 'ISFJ',
       'ENFP', 'ISFP', 'ISTJ', 'ENFJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'],
      dtype=object)

In [41]:
sum(df['type'] == 'ESTJ')

71

In [42]:
print(f"Total of {len(df['type'].unique())} types of classified MBTI posts")

Total of 16 types of classified MBTI posts


# Checking null values

In [43]:
df.isnull().sum()

type     0
posts    0
dtype: int64

# Checking the number of posts per type

In [44]:
df_bar_chart=df.groupby('type').count()


trace1 = go.Bar(x=df_bar_chart.index, y=df_bar_chart['posts'])

data = [trace1]
layout = go.Layout(title='MBTI # Classified Posts per Type')

fig = go.Figure(data=data, layout=layout)

fig.show()

# Recreate the model?

This machine learning model takes it's time to train data

To avoid waiting every time, We're going to use the feature dump/load from pickle

In [45]:
# Flag to re-create or not the machine learning model
recreate_model=False

In [46]:
# We'll save the model into a file:
filename = 'mbti_svm_v2.sav'

In [47]:
# If the model file doesn't exists
if not os.path.isfile(filename):
    recreate_model=True

# Model

The machine learning supervised model that we'll use here is a Classification kind, named Support Vector Machine

References:

 - https://en.wikipedia.org/wiki/Support-vector_machine

- https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

### Model Pipeline

We're going to need a pipelined model ir order to facilitate the entire process of CountVectorizer (TfidfVectorizer) and svm.LinearSVC model

To do that, we're going to use the Pipeline feature from sklearn.pipeline

References: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

# Split the data into train and test

In [76]:
X = df['posts'] # features
y = df['type']  # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
y_train1 = y_train.str[0]
y_train2 = y_train.str[1]
y_train3 = y_train.str[2]
y_train4 = y_train.str[3]

y_test1 = y_test.str[0]
y_test2 = y_test.str[1]
y_test3 = y_test.str[2]
y_test4 = y_test.str[3]

# Training the model, save it to disk and open to make predictions

In [78]:
# Check if need to recreate the model
if recreate_model:    
    
    # Creating an instance to vectorizer:
    vectorizer = TfidfVectorizer()
    
    # Training the vectorizer:
    X_train_tfidf = vectorizer.fit_transform(X_train)
    
    # Training the classifier:
    clf = LinearSVC()
    clf.fit(X_train_tfidf, y_train)
    
    # Pipelining the vectorizer and the classifier
    text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])
    text_clf.fit(X_train, y_train)
    
    # saving the model to disk
    pickle.dump(text_clf, open(filename, 'wb'))

# If there is no need to recreate the model, just open the file from the disk    
else:
    # loading the model from disk
    text_clf = pickle.load(open(filename, 'rb'))

In [79]:
vectorizer = TfidfVectorizer()
    # Training the vectorizer:
X_train_tfidf = vectorizer.fit_transform(X_train)
    

# 1
    # Training the classifier:
clf1 = LinearSVC()
clf1.fit(X_train_tfidf, y_train1)
    # Pipelining the vectorizer and the classifier
text_clf1 = Pipeline([('tfidf',TfidfVectorizer()),('clf1',LinearSVC())])
text_clf1.fit(X_train, y_train1)

# 2
    # Training the classifier:
clf2 = LinearSVC()
clf2.fit(X_train_tfidf, y_train1)
    # Pipelining the vectorizer and the classifier
text_clf2 = Pipeline([('tfidf',TfidfVectorizer()),('clf2',LinearSVC())])
text_clf2.fit(X_train, y_train2)

# 3
    # Training the classifier:
clf3 = LinearSVC()
clf3.fit(X_train_tfidf, y_train3)
    # Pipelining the vectorizer and the classifier
text_clf3 = Pipeline([('tfidf',TfidfVectorizer()),('clf3',LinearSVC())])
text_clf3.fit(X_train, y_train3)


# 4
    # Training the classifier:
clf4 = LinearSVC()
clf4.fit(X_train_tfidf, y_train1)
    # Pipelining the vectorizer and the classifier
text_clf4 = Pipeline([('tfidf',TfidfVectorizer()),('clf4',LinearSVC())])
text_clf4.fit(X_train, y_train4)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf4', LinearSVC())])

In [51]:
X_train_tfidf

<59485x152601 sparse matrix of type '<class 'numpy.float64'>'
	with 16845137 stored elements in Compressed Sparse Row format>

# Using the test data to make predictions and analyze the accurace of the model

In [81]:
predictions1 = text_clf1.predict(X_test)

In [82]:
predictions1

array(['E', 'I', 'I', ..., 'I', 'I', 'I'], dtype=object)

In [83]:
print(classification_report(y_test1, predictions1))

              precision    recall  f1-score   support

           E       0.80      0.69      0.74      3308
           I       0.92      0.95      0.93     11564

    accuracy                           0.89     14872
   macro avg       0.86      0.82      0.84     14872
weighted avg       0.89      0.89      0.89     14872



In [84]:
predictions2 = text_clf2.predict(X_test)

In [85]:
predictions2

array(['N', 'N', 'N', ..., 'N', 'N', 'N'], dtype=object)

In [86]:
print(classification_report(y_test2, predictions2))

              precision    recall  f1-score   support

           N       0.96      0.99      0.98     13720
           S       0.84      0.56      0.67      1152

    accuracy                           0.96     14872
   macro avg       0.90      0.78      0.82     14872
weighted avg       0.95      0.96      0.95     14872



In [87]:
predictions3 = text_clf3.predict(X_test)

In [88]:
predictions3

array(['T', 'F', 'F', ..., 'T', 'T', 'T'], dtype=object)

In [89]:
print(classification_report(y_test3, predictions3))

              precision    recall  f1-score   support

           F       0.89      0.86      0.87      5299
           T       0.92      0.94      0.93      9573

    accuracy                           0.91     14872
   macro avg       0.91      0.90      0.90     14872
weighted avg       0.91      0.91      0.91     14872



In [90]:
predictions4 = text_clf4.predict(X_test)

In [91]:
predictions4

array(['P', 'J', 'J', ..., 'J', 'P', 'P'], dtype=object)

In [92]:
print(classification_report(y_test4, predictions4))

              precision    recall  f1-score   support

           J       0.82      0.80      0.81      6425
           P       0.85      0.87      0.86      8447

    accuracy                           0.84     14872
   macro avg       0.84      0.84      0.84     14872
weighted avg       0.84      0.84      0.84     14872



In [95]:
predictions = predictions1 + predictions2 + predictions3 + predictions4

In [96]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        ENFJ       0.28      0.32      0.30       210
        ENFP       0.60      0.50      0.55       828
        ENTJ       0.59      0.60      0.60       430
        ENTP       0.76      0.64      0.69      1582
        ESFJ       0.33      0.11      0.17        27
        ESFP       0.31      0.24      0.27        51
        ESTJ       0.00      0.00      0.00        11
        ESTP       0.70      0.52      0.60       169
        INFJ       0.73      0.69      0.71      2232
        INFP       0.64      0.70      0.67      1720
        INTJ       0.73      0.75      0.74      3225
        INTP       0.69      0.81      0.75      3493
        ISFJ       0.39      0.21      0.27        92
        ISFP       0.51      0.32      0.39       139
        ISTJ       0.42      0.18      0.25       198
        ISTP       0.72      0.56      0.63       465

    accuracy                           0.69     14872
   macro avg       0.53   

In [97]:
print(f"Overall accuracy of the model: {round(metrics.accuracy_score(y_test, predictions),2)}")

Overall accuracy of the model: 0.69


# End

---

# Submit

In [98]:
test_df = pd.read_csv('G://내 드라이브/Github/2022-Data-ton/dataset/MBTI_test.csv', header=None)

In [99]:
test_df.columns = ['posts']

In [100]:
X = df['posts'] # features
y = df['type']  # labels

In [101]:
y1 = y.str[0]
y2 = y.str[1]
y3 = y.str[2]
y4 = y.str[3]

In [102]:
vectorizer = TfidfVectorizer()
    # Training the vectorizer:
X_tfidf = vectorizer.fit_transform(X)
    

# 1
    # Training the classifier:
clf1 = LinearSVC()
clf1.fit(X_tfidf, y1)
    # Pipelining the vectorizer and the classifier
text_clf1 = Pipeline([('tfidf',TfidfVectorizer()),('clf1',LinearSVC())])
text_clf1.fit(X, y1)

# 2
    # Training the classifier:
clf2 = LinearSVC()
clf2.fit(X_tfidf, y2)
    # Pipelining the vectorizer and the classifier
text_clf2 = Pipeline([('tfidf',TfidfVectorizer()),('clf2',LinearSVC())])
text_clf2.fit(X, y2)

# 3
    # Training the classifier:
clf3 = LinearSVC()
clf3.fit(X_tfidf, y3)
    # Pipelining the vectorizer and the classifier
text_clf3 = Pipeline([('tfidf',TfidfVectorizer()),('clf3',LinearSVC())])
text_clf3.fit(X, y3)


# 4
    # Training the classifier:
clf4 = LinearSVC()
clf4.fit(X_tfidf, y4)
    # Pipelining the vectorizer and the classifier
text_clf4 = Pipeline([('tfidf',TfidfVectorizer()),('clf4',LinearSVC())])
text_clf4.fit(X, y4)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf4', LinearSVC())])

In [103]:
test_df

Unnamed: 0,posts
0,get accept ya bite well stop important open lo...
1,offer rebel something war people friend block ...
2,soulmates contradiction easy basic recurrence ...
3,run nature q test let sound sarcastically irri...
4,hour fast fast suspend see strict wampum eight...
...,...
9332,increase thing liquor analyze partner consider...
9333,senior vote wide shame learn talk problem broo...
9334,want relate top understand think read generate...
9335,alot set intjs soooo alone ease cherish anothe...


In [107]:
predictions1 = text_clf1.predict(test_df['posts'])
predictions2 = text_clf2.predict(test_df['posts'])
predictions3 = text_clf3.predict(test_df['posts'])
predictions4 = text_clf4.predict(test_df['posts'])

In [108]:
predictions = predictions1 + predictions2 + predictions3 + predictions4

In [144]:
predictions = pd.DataFrame(predictions)

In [145]:
len(predictions)

9337

In [146]:
predictions.to_csv('R2_team15.csv', index=False, header=False)

In [111]:
R1_team15 = pd.read_csv('G://내 드라이브/Github/2022-Data-ton/R1_team15.csv', header=None)

In [112]:
print(classification_report(R1_team15, predictions))

              precision    recall  f1-score   support

        ENFJ       0.28      0.37      0.32       107
        ENFP       0.67      0.63      0.65       527
        ENTJ       0.58      0.66      0.61       226
        ENTP       0.81      0.65      0.72      1020
        ESFJ       0.20      0.17      0.18         6
        ESFP       0.25      0.32      0.28        22
        ESTJ       0.11      1.00      0.20         1
        ESTP       0.75      0.67      0.71       101
        INFJ       0.81      0.75      0.78      1381
        INFP       0.70      0.76      0.73      1112
        INTJ       0.81      0.79      0.80      2099
        INTP       0.75      0.83      0.79      2295
        ISFJ       0.24      0.26      0.25        35
        ISFP       0.42      0.49      0.45        49
        ISTJ       0.55      0.35      0.43        95
        ISTP       0.74      0.68      0.71       261

    accuracy                           0.75      9337
   macro avg       0.54   

In [114]:
print(f"Overall accuracy of the model: {round(metrics.accuracy_score(R1_team15, predictions),2)}")

Overall accuracy of the model: 0.75


In [138]:
print(classification_report(R1_team15[0].str[0], pd.Series(predictions).str[0]))

              precision    recall  f1-score   support

           E       0.84      0.77      0.81      2010
           I       0.94      0.96      0.95      7327

    accuracy                           0.92      9337
   macro avg       0.89      0.87      0.88      9337
weighted avg       0.92      0.92      0.92      9337



In [140]:
print(classification_report(R1_team15[0].str[0], pd.Series(predictions).str[0]))
print(classification_report(R1_team15[0].str[1], pd.Series(predictions).str[1]))
print(classification_report(R1_team15[0].str[2], pd.Series(predictions).str[2]))
print(classification_report(R1_team15[0].str[3], pd.Series(predictions).str[3]))

              precision    recall  f1-score   support

           E       0.84      0.77      0.81      2010
           I       0.94      0.96      0.95      7327

    accuracy                           0.92      9337
   macro avg       0.89      0.87      0.88      9337
weighted avg       0.92      0.92      0.92      9337

              precision    recall  f1-score   support

           N       0.98      0.99      0.99      8767
           S       0.80      0.74      0.77       570

    accuracy                           0.97      9337
   macro avg       0.89      0.87      0.88      9337
weighted avg       0.97      0.97      0.97      9337

              precision    recall  f1-score   support

           F       0.89      0.89      0.89      3239
           T       0.94      0.94      0.94      6098

    accuracy                           0.93      9337
   macro avg       0.92      0.92      0.92      9337
weighted avg       0.93      0.93      0.93      9337

              preci