# Importing libraries

In [7]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score

import plotly.offline as pyo
import plotly.graph_objs as go

import re

from nltk.stem import WordNetLemmatizer

from tqdm import tqdm

In [8]:
import warnings
warnings.filterwarnings(action='ignore') 

# 데이터 불러오기

In [9]:
df = pd.read_csv('G://내 드라이브/Github/Predictiong_MBTI_for_Internet_Users/MBTI 500.csv', encoding='ISO-8859-1')

In [10]:
df.columns = ['text', 'type']
df = df[['type', 'text']]

In [11]:
df.head()

Unnamed: 0,type,text
0,INTJ,know intj tool use interaction people excuse a...
1,INTJ,rap music ehh opp yeah know valid well know fa...
2,INTJ,preferably p hd low except wew lad video p min...
3,INTJ,drink like wish could drink red wine give head...
4,INTJ,space program ah bad deal meing freelance max ...


In [12]:
print(f"Total of {len(df['type'].unique())} types of classified MBTI posts")

Total of 16 types of classified MBTI posts


# 데이터 전처리 / Train, Validation, Test 분할

In [13]:
#function to clean the text data
def clear_text(data):
    data_length=[]
    lemmatizer=WordNetLemmatizer()
    cleaned_text=[]
    for sentence in tqdm(data.text):
        sentence=sentence.lower()
#         removing links from text data
        sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+',' ',sentence)
#         removing other symbols
        sentence=re.sub('[^0-9a-z]',' ',sentence)
        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text,data_length

df.text, _=clear_text(df)

100%|████████████████████████████████████████████████████████████████████████| 106067/106067 [00:26<00:00, 3964.93it/s]


In [14]:
df

Unnamed: 0,type,text
0,INTJ,know intj tool use interaction people excuse a...
1,INTJ,rap music ehh opp yeah know valid well know fa...
2,INTJ,preferably p hd low except wew lad video p min...
3,INTJ,drink like wish could drink red wine give head...
4,INTJ,space program ah bad deal meing freelance max ...
...,...,...
106062,INFP,stay frustrate world life want take long nap w...
106063,INFP,fizzle around time mention sure mistake thing ...
106064,INFP,schedule modify hey w intp strong wing underst...
106065,INFP,enfj since january busy schedule able spend li...


## Train : Valid : Test = 7:1:2

In [15]:
X = df['text'] # features
y = df['type']  # labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42)

In [37]:
len(df)

106067

In [38]:
len(X_train)+len(X_val)+len(X_test)

106067

In [39]:
print(len(X_train)/len(df))

print(len(X_val)/len(df))

print(len(X_test)/len(df))

0.6999915147972507
0.1000028284009164
0.2000056568018328


# Train

## Tf-idf벡터화

In [16]:
# Creating an instance to vectorizer:
vectorizer = TfidfVectorizer(sublinear_tf = True)
    
# Training the vectorizer:
X_train_tfidf = vectorizer.fit_transform(X_train)

In [17]:
X_train_tfidf

<74246x206769 sparse matrix of type '<class 'numpy.float64'>'
	with 24954105 stored elements in Compressed Sparse Row format>

In [22]:

# Training the classifier: SVM   
clf1 = LinearSVC()#(C = 0.475)
clf1.fit(X_train_tfidf, y_train)

LinearSVC()

In [18]:
from sklearn.linear_model import LogisticRegression

# Training the classifier: LogisticRegression   
clf2 = LogisticRegression()
clf2.fit(X_train_tfidf, y_train)

LogisticRegression()

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Training the classifier: LogisticRegression   
clf3 = RandomForestClassifier()
clf3.fit(X_train_tfidf, y_train)

RandomForestClassifier()

---

In [37]:
# Pipelining the vectorizer and the classifier
text_clf1 = Pipeline([('tfidf',TfidfVectorizer(sublinear_tf = True)),('clf',LinearSVC())])

In [19]:
# Pipelining the vectorizer and the classifier
text_clf2 = Pipeline([('tfidf',TfidfVectorizer(sublinear_tf = True)),('clf',LogisticRegression())])

In [25]:
# Pipelining the vectorizer and the classifier
text_clf3 = Pipeline([('tfidf',TfidfVectorizer(sublinear_tf = True)),('clf',RandomForestClassifier())])

In [38]:
text_clf1.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(sublinear_tf=True)),
                ('clf', LinearSVC())])

In [27]:
text_clf2.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(sublinear_tf=True)),
                ('clf', LogisticRegression())])

In [28]:
text_clf3.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(sublinear_tf=True)),
                ('clf', RandomForestClassifier())])

# Validation Set 사용하여 정확도 측정 (개별비교)

In [None]:
# 실행

In [None]:
# predictions = text_clf.predict(X_val)

In [None]:
# pd.DataFrame(predictions)[0].apply(list).sum()

In [None]:
# pd.DataFrame(y_val)['type'].apply(list).sum()

In [None]:
# print(classification_report(pd.DataFrame(predictions)[0].apply(list).sum(), pd.DataFrame(y_val)['type'].apply(list).sum()))

In [None]:
# print(f"Overall accuracy of the model: {round(metrics.accuracy_score(pd.DataFrame(predictions)[0].apply(list).sum(), pd.DataFrame(y_val)['type'].apply(list).sum()),2)}")

# Validation Set 사용하여 정확도 측정 (한번에비교)

In [39]:
predictions1 = text_clf1.predict(X_val)

In [29]:
predictions2 = text_clf2.predict(X_val)

In [30]:
predictions3 = text_clf3.predict(X_val)

In [40]:
print(classification_report(y_val, predictions1))

              precision    recall  f1-score   support

        ENFJ       0.87      0.67      0.76       153
        ENFP       0.81      0.78      0.79       596
        ENTJ       0.87      0.77      0.82       284
        ENTP       0.86      0.82      0.84      1127
        ESFJ       0.56      0.42      0.48        12
        ESFP       0.92      0.32      0.48        37
        ESTJ       0.97      0.74      0.84        46
        ESTP       0.97      0.91      0.94       194
        INFJ       0.83      0.87      0.85      1576
        INFP       0.82      0.82      0.82      1204
        INTJ       0.85      0.87      0.86      2282
        INTP       0.85      0.90      0.87      2487
        ISFJ       0.81      0.57      0.67        67
        ISFP       0.79      0.53      0.64        90
        ISTJ       0.82      0.57      0.67       120
        ISTP       0.87      0.82      0.84       332

    accuracy                           0.84     10607
   macro avg       0.84   

In [34]:
print(classification_report(y_val, predictions2))

              precision    recall  f1-score   support

        ENFJ       0.92      0.52      0.67       153
        ENFP       0.84      0.77      0.80       596
        ENTJ       0.89      0.69      0.78       284
        ENTP       0.87      0.83      0.85      1127
        ESFJ       1.00      0.08      0.15        12
        ESFP       1.00      0.05      0.10        37
        ESTJ       0.97      0.63      0.76        46
        ESTP       0.98      0.86      0.91       194
        INFJ       0.84      0.88      0.86      1576
        INFP       0.81      0.83      0.82      1204
        INTJ       0.82      0.88      0.85      2282
        INTP       0.83      0.92      0.87      2487
        ISFJ       0.95      0.27      0.42        67
        ISFP       0.75      0.42      0.54        90
        ISTJ       0.87      0.38      0.52       120
        ISTP       0.89      0.74      0.81       332

    accuracy                           0.84     10607
   macro avg       0.89   

In [32]:
print(classification_report(y_val, predictions3))

              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00       153
        ENFP       0.92      0.02      0.04       596
        ENTJ       1.00      0.13      0.24       284
        ENTP       0.79      0.19      0.31      1127
        ESFJ       0.00      0.00      0.00        12
        ESFP       0.00      0.00      0.00        37
        ESTJ       0.96      0.48      0.64        46
        ESTP       0.97      0.59      0.73       194
        INFJ       0.43      0.63      0.51      1576
        INFP       0.68      0.25      0.37      1204
        INTJ       0.50      0.64      0.56      2282
        INTP       0.45      0.80      0.58      2487
        ISFJ       0.00      0.00      0.00        67
        ISFP       0.00      0.00      0.00        90
        ISTJ       0.00      0.00      0.00       120
        ISTP       1.00      0.02      0.04       332

    accuracy                           0.49     10607
   macro avg       0.48   

In [51]:
print(f"Overall accuracy of the model: {round(metrics.accuracy_score(y_val, predictions1),4)}")

Overall accuracy of the model: 0.8441


In [52]:
print(f"Overall accuracy of the model: {round(metrics.accuracy_score(y_val, predictions2),4)}")

Overall accuracy of the model: 0.8383


In [53]:
print(f"Overall accuracy of the model: {round(metrics.accuracy_score(y_val, predictions3),4)}")

Overall accuracy of the model: 0.4871


# test Set 사용하여 정확도 측정

In [67]:
# 실행

In [42]:
predictions = text_clf1.predict(X_test)

In [43]:
predictions

array(['INTP', 'INTJ', 'INTP', ..., 'INFJ', 'ENTP', 'INFJ'], dtype=object)

In [44]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        ENFJ       0.85      0.61      0.71       319
        ENFP       0.83      0.79      0.81      1249
        ENTJ       0.90      0.79      0.84       577
        ENTP       0.86      0.83      0.85      2324
        ESFJ       1.00      0.42      0.60        33
        ESFP       0.85      0.44      0.58        75
        ESTJ       0.96      0.82      0.88       105
        ESTP       0.95      0.91      0.93       398
        INFJ       0.83      0.86      0.84      2954
        INFP       0.80      0.83      0.81      2391
        INTJ       0.84      0.88      0.86      4531
        INTP       0.85      0.89      0.87      5033
        ISFJ       0.88      0.53      0.66       132
        ISFP       0.79      0.58      0.67       161
        ISTJ       0.88      0.69      0.78       253
        ISTP       0.89      0.81      0.85       679

    accuracy                           0.84     21214
   macro avg       0.87   

In [46]:
print(f"Overall accuracy of the model: {round(metrics.accuracy_score(y_test, predictions),2)}")

Overall accuracy of the model: 0.84


In [47]:
predictions = text_clf2.predict(X_test)

In [48]:
predictions

array(['INTP', 'INTJ', 'INTP', ..., 'INFJ', 'ENTP', 'INFJ'], dtype=object)

In [49]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        ENFJ       0.84      0.55      0.66       319
        ENFP       0.86      0.78      0.82      1249
        ENTJ       0.92      0.73      0.81       577
        ENTP       0.85      0.82      0.83      2324
        ESFJ       0.00      0.00      0.00        33
        ESFP       1.00      0.13      0.24        75
        ESTJ       0.96      0.69      0.80       105
        ESTP       0.98      0.84      0.91       398
        INFJ       0.83      0.85      0.84      2954
        INFP       0.79      0.85      0.82      2391
        INTJ       0.82      0.89      0.85      4531
        INTP       0.83      0.90      0.86      5033
        ISFJ       0.91      0.23      0.36       132
        ISFP       0.76      0.41      0.53       161
        ISTJ       0.92      0.47      0.62       253
        ISTP       0.92      0.73      0.82       679

    accuracy                           0.83     21214
   macro avg       0.82   

In [50]:
print(f"Overall accuracy of the model: {round(metrics.accuracy_score(y_test, predictions),2)}")

Overall accuracy of the model: 0.83


---