In [1]:
from keras_preprocessing.text import text_to_word_sequence, Tokenizer
from nltk.tokenize import WordPunctTokenizer
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict

### File Path

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
file_path1 = '/content/drive/MyDrive/boaz/base_project/MBTI 500.csv'
file_path2 = '/content/drive/MyDrive/boaz/base_project/Aug_S.csv'
USER_SAMPLE = 1

### Data Load & Check

In [4]:
data = pd.read_csv(file_path1, encoding = 'UTF-8')

In [5]:
if USER_SAMPLE :
  data = data.groupby('type').sample(frac = 0.05) # type 별로 그룹으로 묶은 다음 무작위 표본 추출하기 (비율 유지하면서 표본 추출함) / frac는 추출할 표본 비율
  data.reset_index(drop = True, inplace = True)

### Preprocessing

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5302 entries, 0 to 5301
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   posts   5302 non-null   object
 1   type    5302 non-null   object
dtypes: object(2)
memory usage: 83.0+ KB


In [7]:
# Spliting Data by (E<>I), (S<>N), (F<>T), (P<>J)

data_E = data.copy()[data['type'].str.contains('E')]
data_E['type'] = 'E'

data_I = data.copy()[data['type'].str.contains('I')]
data_I['type'] = 'I'


data_S = data.copy()[data['type'].str.contains('S')]
data_S['type'] = 'S'

data_N = data.copy()[data['type'].str.contains('N')]
data_N['type'] = 'N'

data_T = data.copy()[data['type'].str.contains('T')]
data_T['type'] = 'T'

data_F = data.copy()[data['type'].str.contains('F')]
data_F['type'] = 'F'

data_P = data.copy()[data['type'].str.contains('P')]
data_P['type'] = 'P'

data_J = data.copy()[data['type'].str.contains('J')]
data_J['type'] = 'J'

In [8]:
data_E.head()

Unnamed: 0,posts,type
0,understand support wait hey enfj wanna know he...,E
1,really sadden sexualize young age thank seem f...,E
2,topic really interest teach u new thing push u...,E
3,amateur vintage radio enthusiast infps heavine...,E
4,set thing wish hobby like one intj friend alwa...,E


In [9]:
data_P.head()

Unnamed: 0,posts,type
77,heartbreaking heartwarming know happy find som...,P
78,last week really intense day mild one try sit ...,P
79,intention mind try censor forget speak let hap...,P
80,well way nature part get happen get blindside ...,P
81,still like type evil mastermind say though cou...,P


In [10]:
print(f"""data_E.shape: {data_E.shape} \n 
        data_I.shape: {data_I.shape} \n
        data_S.shape: {data_S.shape} \n
        data_N.shape: {data_N.shape} \n
        data_T.shape: {data_T.shape} \n
        data_F.shape: {data_F.shape} \n
        data_J.shape: {data_J.shape} \n
        data_P.shape: {data_P.shape} \n""")

data_E.shape: (1269, 2) 
 
        data_I.shape: (4033, 2) 

        data_S.shape: (459, 2) 

        data_N.shape: (4843, 2) 

        data_T.shape: (3459, 2) 

        data_F.shape: (1843, 2) 

        data_J.shape: (2221, 2) 

        data_P.shape: (3081, 2) 



In [11]:
data.head()

Unnamed: 0,posts,type
0,understand support wait hey enfj wanna know he...,ENFJ
1,really sadden sexualize young age thank seem f...,ENFJ
2,topic really interest teach u new thing push u...,ENFJ
3,amateur vintage radio enthusiast infps heavine...,ENFJ
4,set thing wish hobby like one intj friend alwa...,ENFJ


In [12]:
# Adding Augmented data

aug_data = pd.read_csv(file_path2, encoding = 'UTF-8')
data_S = pd.concat([data_S, aug_data])

In [13]:
data_S.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4590 entries, 1119 to 4130
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   posts   4590 non-null   object
 1   type    4590 non-null   object
dtypes: object(2)
memory usage: 107.6+ KB


In [14]:
pre_data = pd.concat([data_E, data_I, data_S, data_N, data_F, data_T, data_J, data_P], axis=0)

In [15]:
pre_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25339 entries, 0 to 5301
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   posts   25339 non-null  object
 1   type    25339 non-null  object
dtypes: object(2)
memory usage: 593.9+ KB


In [16]:
pre_data.head()

Unnamed: 0,posts,type
0,understand support wait hey enfj wanna know he...,E
1,really sadden sexualize young age thank seem f...,E
2,topic really interest teach u new thing push u...,E
3,amateur vintage radio enthusiast infps heavine...,E
4,set thing wish hobby like one intj friend alwa...,E


## Tokenizing

In [18]:
X, y = pre_data.drop('type', axis=1), pre_data['type']

In [None]:
tokenizer_top_words = Tokenizer(oov_token="<OOV>", split=' ')
tokenizer_top_words.fit_on_texts(X.iloc[:, 0])
tokenizer_top_words.index_word

### Top 3000 Words

In [20]:
X_tp_words = pd.DataFrame()
X_tp_words['word_embedding'] = X.apply(lambda v : tokenizer_top_words.texts_to_sequences([v['posts']]), axis = 1)
X_tp_words['word_embedding'] = X_tp_words.apply(lambda v: np.array(v['word_embedding']).reshape(-1).tolist(), axis = 1)

In [21]:
X_tp_words = X_tp_words.to_numpy().reshape(-1)
for i in range(len(X_tp_words)) :
    X_tp_words[i] = np.array(X_tp_words[i]).reshape(-1).tolist()

#### Modeling

In [22]:
# spliting data for validation
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_tp_words, y, test_size=0.2, stratify=y, random_state=42)

In [24]:
type(X_train), type(X_test), type(y_train), type(y_test)

(numpy.ndarray,
 numpy.ndarray,
 pandas.core.series.Series,
 pandas.core.series.Series)

### Naive Bayse

In [25]:
from sklearn.naive_bayes import GaussianNB # 가우시안 나이브 베이즈 (연속 데이터 적용 가능)
from sklearn.metrics import accuracy_score, classification_report

In [26]:
model = GaussianNB()

In [27]:
model.fit(X_train, y_train)

ValueError: ignored

#### Metric

메트릭은 accuracy, recall, precision, f1 모두 종합적으로 보고 평가

In [None]:
classification_report(y_true, y_pred)

### Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [29]:
rf_clf = RandomForestClassifier()

In [30]:
rf_clf.fit(X_train, y_train)

ValueError: ignored

In [None]:
pred = rf_clf.predict(X_test)

In [None]:
# 평가
classification_report(y_true, y_pred)