In [None]:
from keras_preprocessing.text import text_to_word_sequence, Tokenizer
from nltk.tokenize import WordPunctTokenizer
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict

### File Path

In [None]:
# Mount Google Drive at /content/drive in the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory on the mounted Google Drive that holds both input CSV files.
BASE_DIR = '/content/drive/MyDrive/boaz/base_project'
file_path1 = f'{BASE_DIR}/MBTI 500.csv'  # main dataset, read into `data`
file_path2 = f'{BASE_DIR}/Aug_S.csv'     # augmented rows, later appended to `data_S`
# When truthy, `data` is down-sampled per MBTI type right after loading.
USER_SAMPLE = True

### Data Load & Check

In [None]:
# Read the CSV at file_path1 (UTF-8) into the working DataFrame `data`.
data = pd.read_csv(file_path1, encoding = 'UTF-8')

In [None]:
# When sampling is enabled, keep a random 5% of the rows of every MBTI type.
# Sampling inside each `type` group keeps the class proportions of the full data;
# `frac` is the fraction of each group to draw.
# NOTE(review): `data` is rebound here, so re-running this cell samples the sample again.
if USER_SAMPLE:
    data = (
        data.groupby('type')
        .sample(frac=0.05, random_state=34)  # fixed seed so every run draws the same subset
        .reset_index(drop=True)
    )

### Preprocessing

In [None]:
# Print column dtypes, non-null counts and memory usage of `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5302 entries, 0 to 5301
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   posts   5302 non-null   object
 1   type    5302 non-null   object
dtypes: object(2)
memory usage: 83.0+ KB


In [None]:
# Splitting data by (E<>I), (S<>N), (F<>T), (P<>J)

def _rows_with_letter(frame, letter):
    """Return the rows of `frame` whose 'type' contains `letter`, relabelled to `letter`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a string column named 'type'.
    letter : str
        Single MBTI letter to look for (matched literally, not as a regex).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original row labels kept)
        whose 'type' column is overwritten with `letter`.
    """
    mask = frame['type'].str.contains(letter, regex=False)
    # Filter first, then copy: only the matching rows are duplicated, and the
    # assignment below acts on an independent frame rather than a slice of `frame`.
    subset = frame[mask].copy()
    subset['type'] = letter
    return subset


data_E = _rows_with_letter(data, 'E')
data_I = _rows_with_letter(data, 'I')

data_S = _rows_with_letter(data, 'S')
data_N = _rows_with_letter(data, 'N')

data_T = _rows_with_letter(data, 'T')
data_F = _rows_with_letter(data, 'F')

data_P = _rows_with_letter(data, 'P')
data_J = _rows_with_letter(data, 'J')

In [None]:
# Preview the first rows of the 'E' subset.
data_E.head()

Unnamed: 0,posts,type
0,point try albeit hyperbolically make random th...,E
1,already also risk codependent already affect r...,E
2,assumption feel understand thats intuition wor...,E
3,capable achieve goal first hesitate try think ...,E
4,really responsibility alone strongly suggest r...,E


In [None]:
# Preview the first rows of the 'P' subset.
data_P.head()

Unnamed: 0,posts,type
77,day make want prove others capable run head fi...,P
78,ziplines skydive adrenaline booze sex eeeeeeee...,P
79,show interest bug give drink water call number...,P
80,base kind thing idea would make sense context ...,P
81,heahy really much anything throw depression ti...,P


In [None]:
# Report the (rows, columns) shape of every per-letter subset, one per line.
# The previous triple-quoted f-string mixed literal newlines, "\n" escapes and
# source indentation, which produced ragged, double-spaced output.
subset_shapes = {
    'data_E': data_E.shape,
    'data_I': data_I.shape,
    'data_S': data_S.shape,
    'data_N': data_N.shape,
    'data_T': data_T.shape,
    'data_F': data_F.shape,
    'data_J': data_J.shape,
    'data_P': data_P.shape,
}
for subset_name, subset_shape in subset_shapes.items():
    print(f"{subset_name}.shape: {subset_shape}")

data_E.shape: (1269, 2) 
 
        data_I.shape: (4033, 2) 

        data_S.shape: (459, 2) 

        data_N.shape: (4843, 2) 

        data_T.shape: (3459, 2) 

        data_F.shape: (1843, 2) 

        data_J.shape: (2221, 2) 

        data_P.shape: (3081, 2) 



In [None]:
# Preview the first rows of `data`, which still carries the full four-letter type labels.
data.head()

Unnamed: 0,posts,type
0,point try albeit hyperbolically make random th...,ENFJ
1,already also risk codependent already affect r...,ENFJ
2,assumption feel understand thats intuition wor...,ENFJ
3,capable achieve goal first hesitate try think ...,ENFJ
4,really responsibility alone strongly suggest r...,ENFJ


In [None]:
# Adding augmented data: read the CSV at file_path2 and append its rows to data_S.
# NOTE(review): data_S is rebound to the concatenation, so running this cell twice
# appends aug_data twice. Row labels of both inputs are kept (no ignore_index).

aug_data = pd.read_csv(file_path2, encoding = 'UTF-8')
data_S = pd.concat([data_S, aug_data])

In [None]:
# Print dtypes, non-null counts and memory usage of data_S after the concatenation.
data_S.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4590 entries, 1119 to 4130
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   posts   4590 non-null   object
 1   type    4590 non-null   object
dtypes: object(2)
memory usage: 107.6+ KB


In [None]:
# Stack the eight per-letter subsets row-wise into one labelled frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of the inputs repeat, which makes label-based lookups ambiguous.
# NOTE(review): every four-letter type matches four of the subsets, so each
# original post appears here several times with different single-letter labels —
# confirm that a single 8-class target is what is intended.
pre_data = pd.concat(
    [data_E, data_I, data_S, data_N, data_F, data_T, data_J, data_P],
    axis=0,
    ignore_index=True,
)

In [None]:
# Print dtypes, non-null counts and memory usage of the stacked frame.
pre_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25339 entries, 0 to 5301
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   posts   25339 non-null  object
 1   type    25339 non-null  object
dtypes: object(2)
memory usage: 593.9+ KB


In [None]:
# Preview the first rows of the stacked frame.
pre_data.head()

Unnamed: 0,posts,type
0,point try albeit hyperbolically make random th...,E
1,already also risk codependent already affect r...,E
2,assumption feel understand thats intuition wor...,E
3,capable achieve goal first hesitate try think ...,E
4,really responsibility alone strongly suggest r...,E


## Tokenizing

### Top 3000 Words

In [None]:
# Features: every column except the label. Target: the single-letter 'type' column.
X = pre_data.drop(columns='type')
y = pre_data['type']

In [None]:
# Fit a Keras tokenizer on the first column of X (the post text).
# Words are split on single spaces; words outside the `num_words` cap are encoded
# as "<OOV>" when texts are converted to sequences.
# NOTE(review): per the Keras docs the cap keeps the `num_words - 1` most frequent
# indices, so slightly fewer than 3000 real words survive — confirm this is intended.
tokenizer_top_words = Tokenizer(num_words=3000, oov_token="<OOV>", split=' ')
tokenizer_top_words.fit_on_texts(X.iloc[:, 0])
# Preview only the first 20 index -> word entries; the full mapping covers the
# whole fitted vocabulary and would flood the cell output.
dict(list(tokenizer_top_words.index_word.items())[:20])

In [None]:
# Integer-encode every post with the tokenizer capped at 3000 words.
# Each post is passed as a one-element batch, so each cell holds a list that
# wraps a single id sequence: [[id, id, ...]].
X_tp_words = X.copy()
X_tp_words['tok_tw'] = X_tp_words['posts'].map(
    lambda post: tokenizer_top_words.texts_to_sequences([post])
)

In [None]:
# Turn each stored sequence into a column of single-element lists: [[id], [id], ...].
X_tp_words['tok_tw'] = X_tp_words['tok_tw'].map(
    lambda ids: np.array(ids).reshape(-1, 1).tolist()
)

In [None]:
# Per token: 1 when its id is greater than 1, otherwise 0.
# NOTE(review): id 1 looks like the tokenizer's "<OOV>" slot, which would make this
# an in-vocabulary flag — confirm.
X_tp_words['tok_tw_bool'] = X_tp_words['tok_tw'].map(
    lambda ids: [int(entry[0] > 1) for entry in ids]
)

### Words used more than 1000 times

In [None]:
# word -> count mapping from the fitted tokenizer, ordered from most to least frequent.
word_dict = OrderedDict(
    sorted(
        tokenizer_top_words.word_counts.items(),
        key=lambda pair: pair[1],  # sort on the count
        reverse=True,
    )
)

In [None]:
# Words counted at least 1000 times, in the iteration order of word_dict.
word_dict_top = [word for word, count in word_dict.items() if count >= 1000]
print(f'size is {len(word_dict_top)}')

size is 1714


In [None]:
# Split every post into word / punctuation tokens with NLTK's WordPunctTokenizer.
# One tokenizer instance is created and reused for all rows instead of building
# a new one per row.
X_freq_words = X.copy()
X_freq_words['tok_tw'] = X_freq_words['posts'].map(WordPunctTokenizer().tokenize)

In [None]:
# One flag per token of the post.
# NOTE(review): the previous expression tested each token for membership in its own
# token list, which is always true, so every flag is 1. The same result is produced
# here in linear time; if a check against a vocabulary was intended, it still needs
# to be supplied.
X_freq_words['tok_tw_bool'] = X_freq_words['tok_tw'].map(lambda tokens: [1] * len(tokens))
X_freq_words

Unnamed: 0,posts,tok_tw,tok_tw_bool
0,point try albeit hyperbolically make random th...,"[point, try, albeit, hyperbolically, make, ran...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,already also risk codependent already affect r...,"[already, also, risk, codependent, already, af...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,assumption feel understand thats intuition wor...,"[assumption, feel, understand, thats, intuitio...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,capable achieve goal first hesitate try think ...,"[capable, achieve, goal, first, hesitate, try,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,really responsibility alone strongly suggest r...,"[really, responsibility, alone, strongly, sugg...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
5297,talk teacher find hate honest open mind see th...,"[talk, teacher, find, hate, honest, open, mind...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5298,find something long time maybe quit without so...,"[find, something, long, time, maybe, quit, wit...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5299,commit suicide live world without conflict bor...,"[commit, suicide, live, world, without, confli...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5300,ou seem hang around r intp lot actually mutlir...,"[ou, seem, hang, around, r, intp, lot, actuall...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# Per token: 1 when the token is one of the frequent words in word_dict_top, else 0.
# A set gives constant-time membership tests; scanning the list for every token
# of every post is needlessly slow.
frequent_words = set(word_dict_top)
X_freq_words['tok_tw_in'] = X_freq_words['tok_tw'].map(
    lambda tokens: [1 if token in frequent_words else 0 for token in tokens]
)
X_freq_words

Unnamed: 0,posts,tok_tw,tok_tw_bool,tok_tw_in
0,point try albeit hyperbolically make random th...,"[point, try, albeit, hyperbolically, make, ran...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ..."
1,already also risk codependent already affect r...,"[already, also, risk, codependent, already, af...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
2,assumption feel understand thats intuition wor...,"[assumption, feel, understand, thats, intuitio...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, ..."
3,capable achieve goal first hesitate try think ...,"[capable, achieve, goal, first, hesitate, try,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,really responsibility alone strongly suggest r...,"[really, responsibility, alone, strongly, sugg...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, ..."
...,...,...,...,...
5297,talk teacher find hate honest open mind see th...,"[talk, teacher, find, hate, honest, open, mind...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5298,find something long time maybe quit without so...,"[find, something, long, time, maybe, quit, wit...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ..."
5299,commit suicide live world without conflict bor...,"[commit, suicide, live, world, without, confli...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, ..."
5300,ou seem hang around r intp lot actually mutlir...,"[ou, seem, hang, around, r, intp, lot, actuall...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# Highest word count, used to scale every count into [0, 1].
# max() does not depend on word_dict having been sorted beforehand.
mx = max(word_dict.values())
# Per token: its count relative to the most frequent word, rounded to 2 decimals.
# Tokens come from WordPunctTokenizer while the counts come from the Keras tokenizer,
# so a token may be missing from word_dict; such tokens get a ratio of 0 instead of
# raising KeyError.
# The result is computed from X_freq_words but stored on X_tp_words; both are copies
# of X and therefore share the same index.
X_tp_words['tok_tw_ratio'] = X_freq_words['tok_tw'].map(
    lambda tokens: [np.round(word_dict.get(token, 0) / mx, 2) for token in tokens]
)

### Dataframe Print

In [None]:
# Display the id-encoded frame (posts, tok_tw, tok_tw_bool, tok_tw_ratio).
X_tp_words

Unnamed: 0,posts,tok_tw,tok_tw_bool,tok_tw_ratio
0,point try albeit hyperbolically make random th...,"[[60], [25], [2968], [1], [7], [586], [298], [...","[1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...","[0.16, 0.31, 0.0, 0.0, 0.57, 0.02, 0.05, 0.0, ..."
1,already also risk codependent already affect r...,"[[205], [21], [857], [1], [205], [691], [766],...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.06, 0.33, 0.02, 0.0, 0.06, 0.02, 0.02, 0.13..."
2,assumption feel understand thats intuition wor...,"[[1042], [11], [51], [645], [696], [20], [1], ...","[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, ...","[0.01, 0.51, 0.19, 0.02, 0.02, 0.33, 0.0, 0.04..."
3,capable achieve goal first hesitate try think ...,"[[1049], [1021], [361], [64], [1], [25], [3], ...","[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.01, 0.01, 0.04, 0.15, 0.0, 0.31, 0.85, 0.05..."
4,really responsibility alone strongly suggest r...,"[[16], [1200], [217], [1077], [559], [58], [20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, ...","[0.46, 0.01, 0.06, 0.01, 0.03, 0.16, 0.07, 0.1..."
...,...,...,...,...
5297,talk teacher find hate honest open mind see th...,"[[40], [716], [28], [219], [445], [225], [92],...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.22, 0.02, 0.29, 0.06, 0.03, 0.06, 0.12, 0.3..."
5298,find something long time maybe quit without so...,"[[28], [22], [68], [13], [75], [1410], [132], ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.29, 0.32, 0.14, 0.5, 0.14, 0.01, 0.09, 0.32..."
5299,commit suicide live world without conflict bor...,"[[1080], [1860], [89], [86], [132], [747], [34...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, ...","[0.01, 0.01, 0.13, 0.13, 0.09, 0.02, 0.04, 0.5..."
5300,ou seem hang around r intp lot actually mutlir...,"[[2453], [49], [449], [71], [246], [80], [31],...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...","[0.0, 0.19, 0.03, 0.14, 0.06, 0.13, 0.27, 0.19..."


In [None]:
# Display the token-level frame (posts, tok_tw, tok_tw_bool, tok_tw_in).
X_freq_words

Unnamed: 0,posts,tok_tw,tok_tw_bool,tok_tw_in
0,point try albeit hyperbolically make random th...,"[point, try, albeit, hyperbolically, make, ran...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ..."
1,already also risk codependent already affect r...,"[already, also, risk, codependent, already, af...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
2,assumption feel understand thats intuition wor...,"[assumption, feel, understand, thats, intuitio...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, ..."
3,capable achieve goal first hesitate try think ...,"[capable, achieve, goal, first, hesitate, try,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,really responsibility alone strongly suggest r...,"[really, responsibility, alone, strongly, sugg...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, ..."
...,...,...,...,...
5297,talk teacher find hate honest open mind see th...,"[talk, teacher, find, hate, honest, open, mind...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5298,find something long time maybe quit without so...,"[find, something, long, time, maybe, quit, wit...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ..."
5299,commit suicide live world without conflict bor...,"[commit, suicide, live, world, without, confli...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, ..."
5300,ou seem hang around r intp lot actually mutlir...,"[ou, seem, hang, around, r, intp, lot, actuall...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ..."


## Modeling

In [None]:
# Splitting data for validation
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
# The feature passed on is the per-token frequency-ratio list of each post.
X_train, X_test, y_train, y_test = train_test_split(
    X_tp_words['tok_tw_ratio'],
    y,
    test_size=0.2,
    stratify=y,
    random_state=34,
)

In [None]:
# Show the container type of each split.
type(X_train), type(X_test), type(y_train), type(y_test)

(pandas.core.series.Series,
 pandas.core.series.Series,
 pandas.core.series.Series,
 pandas.core.series.Series)

### Naive Bayes

In [None]:
# NOTE(review): the original (Korean) comment on this import read "Gaussian naive Bayes
# (applicable to continuous data)", but the class imported is MultinomialNB — confirm
# which estimator was intended.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Multinomial naive Bayes classifier with smoothing parameter alpha=1.0.
model = MultinomialNB(alpha=1.0)

In [None]:
# Display the training features: one list of per-token ratios per post.
# The earlier `X_train.to_list()` call was removed: its result was discarded and it
# has no side effects.
X_train

479     [0.0, 0.03, 0.0, 0.0, 0.0, 0.03, 0.02, 0.02, 0...
936     [0.0, 0.07, 0.23, 0.01, 0.04, 0.54, 0.29, 0.0,...
493     [0.09, 0.13, 0.04, 0.01, 0.03, 0.0, 0.02, 0.78...
3499    [0.22, 0.26, 0.01, 0.0, 0.01, 0.02, 0.09, 0.0,...
2021    [0.1, 0.0, 0.01, 0.0, 0.01, 0.08, 0.51, 0.03, ...
                              ...                        
3671    [0.02, 0.0, 0.03, 0.85, 0.0, 0.04, 0.85, 0.1, ...
3436    [0.04, 0.29, 0.01, 0.0, 0.02, 0.23, 0.01, 0.12...
685     [0.27, 0.12, 0.17, 0.46, 0.02, 0.23, 0.13, 0.0...
308     [0.57, 0.85, 0.85, 0.49, 0.03, 0.14, 0.03, 0.0...
2045    [0.01, 0.01, 0.02, 0.0, 0.16, 0.0, 0.3, 0.01, ...
Name: tok_tw_ratio, Length: 20271, dtype: object

In [None]:
# X_train is a Series whose cells are Python lists of differing lengths, which
# scikit-learn cannot convert into a 2-D numeric array (the cause of the ValueError).
# Build a rectangular matrix instead: pandas pads the shorter rows with NaN, which is
# then replaced by 0.0 (a valid, non-negative value for MultinomialNB).
# NOTE(review): the columns are token positions, not a bag of words — confirm this is
# the representation wanted for naive Bayes.
X_train_matrix = pd.DataFrame(X_train.tolist()).fillna(0.0).to_numpy()
model.fit(X_train_matrix, y_train)

ValueError: ignored

In [None]:
# Evaluate the fitted naive Bayes model on the held-out split.
# `y_true` / `y_pred` were never defined; the true labels are y_test and the
# predictions have to be computed here. Requires `model` to be fitted.
# The test sequences are padded with 0.0 or truncated to the feature width the
# model was trained on.
X_test_matrix = (
    pd.DataFrame(X_test.tolist())
    .reindex(columns=range(model.n_features_in_))
    .fillna(0.0)
    .to_numpy()
)
y_pred = model.predict(X_test_matrix)
# classification_report returns a multi-line string; print it so the table renders.
print(classification_report(y_test, y_pred))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Random forest with default hyper-parameters; the seed makes the fitted forest
# reproducible and reuses the value passed to train_test_split.
rf_clf = RandomForestClassifier(random_state=34)

In [None]:
# X_train holds variable-length lists, which scikit-learn cannot fit on directly.
# Build a rectangular matrix first: shorter rows are padded with NaN by pandas and
# the padding is replaced with 0.0.
X_train_matrix = pd.DataFrame(X_train.tolist()).fillna(0.0).to_numpy()
rf_clf.fit(X_train_matrix, y_train)

In [None]:
# Hyper-parameter grid for the random forest. Empty candidate lists are rejected by
# GridSearchCV, so starting values are supplied here — tune as needed.
params = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10, 30],
}
# Number of cross-validation folds; `cv` was not defined anywhere before this cell.
cv = 3

rf_clf = RandomForestClassifier(n_jobs=-1, random_state=34)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=cv, n_jobs=-1)
# X_train holds variable-length lists; pad them with 0.0 into a rectangular matrix
# so the estimator receives a 2-D numeric array.
X_train_matrix = pd.DataFrame(X_train.tolist()).fillna(0.0).to_numpy()
grid_cv.fit(X_train_matrix, y_train)

In [None]:
# Evaluate the best random forest found by the grid search on the held-out split.
# `y_true` / `y_pred` were never defined; the true labels are y_test and the
# predictions have to be computed here. Requires `grid_cv` to be fitted.
# The test sequences are padded with 0.0 or truncated to the feature width used
# during training.
X_test_matrix = (
    pd.DataFrame(X_test.tolist())
    .reindex(columns=range(grid_cv.n_features_in_))
    .fillna(0.0)
    .to_numpy()
)
y_pred = grid_cv.predict(X_test_matrix)
# classification_report returns a multi-line string; print it so the table renders.
print(classification_report(y_test, y_pred))