In [None]:
!pip install pytorch_pretrained_bert
!pip install konlpy
!pip install --upgrade gensim
!pip install catboost

In [None]:
import pickle
from glob import glob

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from konlpy.tag import Kkma
from tqdm import tqdm
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.fasttext import FastText

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Load every pipe-delimited, cp949-encoded .txt table found in the data folder.
# glob() returns paths in arbitrary, filesystem-dependent order, so the list is
# sorted to make the file0, file1, ... numbering reproducible between runs.
files = sorted(glob('/content/drive/MyDrive/공모전/data/*.txt'))
for i, file in enumerate(files):
    # Each table is published as a notebook-level variable named file<i>,
    # because a later cell refers to file0 by name.
    globals()[f'file{i}'] = pd.read_table(file, sep='|', encoding='cp949')

In [None]:
# Build one text document per row of file0: missing fields become empty strings,
# the three text columns are joined with single spaces, and leading/trailing
# whitespace is trimmed. The result is a Series of strings.
documents = (
    file0[['text_obj', 'text_mthd', 'text_deal']]
    .fillna('')
    .apply(lambda row: ' '.join(row).strip(), axis=1)
)

In [None]:
# Shared Kkma tagger instance; extract_n looks it up at call time.
kkma = Kkma()


def extract_n(x):
    """Lazily yield the words of ``x`` whose Kkma POS tag starts with 'N'.

    ``x`` is passed unchanged to ``kkma.pos``, which is iterated as
    ``(word, tag)`` pairs; only the word is yielded. The name suggests the
    'N' prefix selects noun-like tags (presumably; confirm against the Kkma
    tag set).
    """
    yield from (word for word, pos in kkma.pos(x) if pos.startswith('N'))

In [None]:
# Fit a TF-IDF model over the documents, tokenising each one with extract_n.
tfidf = TfidfVectorizer(tokenizer=extract_n)
tdm = tfidf.fit_transform(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# list() keeps `words` a plain list, as the old method returned.
if hasattr(tfidf, 'get_feature_names_out'):
    words = list(tfidf.get_feature_names_out())
else:
    words = tfidf.get_feature_names()

In [None]:
# One row per vocabulary term ('단어') with the column sum of the TF-IDF matrix ('빈도').
# The sums are accumulated TF-IDF weights, not raw occurrence counts.
word_count = pd.DataFrame({'단어': words, '빈도': np.asarray(tdm.sum(axis=0)).ravel()})

In [None]:
# Persist the fitted vectoriser. Pickle stores the custom tokenizer by reference,
# so extract_n must be defined again before this file can be unpickled.
with open('/content/drive/MyDrive/공모전/models/tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
# Legacy artifact, kept for existing readers: np.save() on a SciPy sparse matrix
# writes a pickled 0-d object array, readable only with
# np.load(path, allow_pickle=True).item().
np.save('/content/drive/MyDrive/공모전/data/tdm_tfidf.npy', tdm)
# Native sparse format: no pickle involved; reload with scipy.sparse.load_npz.
sparse.save_npz('/content/drive/MyDrive/공모전/data/tdm_tfidf.npz', tdm)
# The term table is written with its default integer index as the first CSV column.
word_count.to_csv('/content/drive/MyDrive/공모전/data/word_count.csv')

In [None]:
# Fit a CatBoost classifier for the second label column on the raw features.
# NOTE(review): X_train and y_train2 are first assigned in later cells of this
# notebook (the train/validation split under "Read Data"), so this cell fails
# on a fresh kernel when the notebook is run top to bottom.
# task_type='GPU' asks CatBoost to train on a GPU.
catb2 = CatBoostClassifier(random_state=0, task_type='GPU')
catb2.fit(X_train, y_train2)

In [None]:
# Pickle the catb2 model fitted in the previous cell.
# This path sits directly under the project folder, not in the models/
# sub-folder used by the other dumps in this notebook.
with open('/content/drive/MyDrive/공모전/catb2.pkl', 'wb') as f:
    pickle.dump(catb2, f)

In [None]:
# Score catb2 on the validation split: plain accuracy and macro-averaged F1.
# NOTE(review): X_val and y_val2 are defined in later cells, so this cell also
# depends on out-of-order execution.
preds = catb2.predict(X_val)
print(f'accuracy: {accuracy_score(y_val2, preds)}')
print(f'f1_score: {f1_score(y_val2, preds, average="macro")}')

### Modeling

#### Read Data

In [None]:
# Feature matrix and label table read from CSV.
# The "_ft" suffix presumably refers to FastText-derived features (TODO confirm).
# y is used positionally below: its first three columns are the three targets.
X = pd.read_csv('/content/drive/MyDrive/공모전/data/X_ft.csv')
y = pd.read_csv('/content/drive/MyDrive/공모전/data/y.csv')

In [None]:
# 70/30 train/validation split with a fixed seed; no stratification is requested.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.3, random_state=0)

In [None]:
# Take the first three label columns by position, one Series per target level
# (suffix 1, 2, 3), for the training and validation splits alike.
y_train1, y_train2, y_train3 = (y_train.iloc[:, col] for col in range(3))
y_val1, y_val2, y_val3 = (y_val.iloc[:, col] for col in range(3))

In [None]:
def make_oh_row_digit1(x, n_classes=19):
    """Return a one-hot float vector with position ``x`` set to 1.

    Parameters
    ----------
    x : int
        Class index, expected to satisfy ``0 <= x < n_classes``.
    n_classes : int, default 19
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``n_classes``, all zeros except index ``x``.

    Raises
    ------
    IndexError
        If ``x`` is out of range. Negative integers are rejected explicitly,
        because NumPy would otherwise wrap them to the end of the vector and
        silently encode the wrong class.
    """
    if isinstance(x, (int, np.integer)) and x < 0:
        raise IndexError(f'class index {x} is negative; expected 0 <= x < {n_classes}')
    row_digit1 = np.zeros(n_classes)
    row_digit1[x] = 1
    return row_digit1
def oh_enc_digit1_preds(x, n_classes=19):
    """One-hot encode a sequence of digit_1 class indices into a DataFrame.

    The whole matrix is built in one vectorised assignment instead of one
    Python call per row.

    Parameters
    ----------
    x : iterable of int
        Class indices (for example a flattened prediction array or a label
        Series). Each must satisfy ``0 <= value < n_classes``.
    n_classes : int, default 19
        Number of one-hot columns.

    Returns
    -------
    pandas.DataFrame
        Float frame with one row per element of ``x``, columns
        ``digit1_0`` .. ``digit1_{n_classes - 1}`` and a fresh RangeIndex.
        Callers that need another index must assign it themselves.

    Raises
    ------
    IndexError
        If any value is not an integer or lies outside ``[0, n_classes)``.
        Negative values are rejected rather than wrapped to the last columns.
    """
    columns = [f'digit1_{i}' for i in range(n_classes)]
    codes = np.asarray(list(x)).ravel()
    if codes.size == 0:
        return pd.DataFrame(np.zeros((0, n_classes)), columns=columns)
    if not np.issubdtype(codes.dtype, np.integer):
        raise IndexError('digit_1 class indices must be integers')
    if codes.min() < 0 or codes.max() >= n_classes:
        raise IndexError(f'digit_1 class indices must lie in [0, {n_classes})')
    onehot = np.zeros((codes.size, n_classes))
    # One fancy-indexed assignment sets exactly one cell per row.
    onehot[np.arange(codes.size), codes] = 1
    return pd.DataFrame(onehot, columns=columns)

In [None]:
# Sorted distinct values of y_train2. The two digit_2 one-hot helpers defined
# below read this notebook-level list each time they are called.
labels = sorted(y_train2.unique())
def make_oh_row_digit2(x):
    """Return a one-hot mapping over the notebook-level ``labels`` for label ``x``.

    Keys are ``digit2_<label>`` in the order of ``labels``. Every value is 0
    except the key built from ``x``, which is 1. If ``x`` is not in ``labels``,
    its key is appended as an extra entry.
    """
    encoded = dict.fromkeys((f'digit2_{i}' for i in labels), 0)
    encoded[f'digit2_{x}'] = 1
    return encoded
def oh_enc_digit2_preds(x):
    """One-hot encode an iterable of digit_2 labels into a DataFrame.

    Each element goes through ``make_oh_row_digit2``. The frame is restricted
    to, and ordered by, the ``digit2_<label>`` columns derived from the
    notebook-level ``labels`` list. The index is a fresh RangeIndex.
    """
    column_names = [f'digit2_{i}' for i in labels]
    rows = [make_oh_row_digit2(value) for value in x]
    return pd.DataFrame(rows, columns=column_names)

In [None]:
# Restore previously fitted models from disk.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only load
# pickles that this project produced itself.
with open('/content/drive/MyDrive/공모전/models/catb1.pickle', 'rb') as f:
    catb1 = pickle.load(f)
# NOTE(review): later cells in this section rebind the name catb2 to a new
# classifier, which replaces the model loaded here.
with open('/content/drive/MyDrive/공모전/models/catb2_with_target_oh.pickle', 'rb') as f:
    catb2 = pickle.load(f)

In [None]:
# Paths of all pickled models. glob() order is arbitrary, so the list is sorted
# to keep any later positional use deterministic.
model_files = sorted(glob('/content/drive/MyDrive/공모전/models/*.pickle'))

#### Predict Digit_2

In [None]:
# Validation features: X_val plus the one-hot encoded digit_1 *predictions* of catb1.
preds1 = oh_enc_digit1_preds(catb1.predict(X_val).flatten())
preds1.index = X_val.index  # replace the fresh RangeIndex so concat aligns row by row

# Training features: X_train plus the one-hot encoded *true* digit_1 labels.
# get_dummies only emits columns for labels present in y_train1, while the
# validation encoder always emits the full fixed set. The training frame is
# therefore aligned to the validation columns (absent classes become all-zero
# columns), after checking that it has no column the encoder does not know.
digit1_train_oh = pd.get_dummies(y_train1, prefix='digit1')
unknown_digit1_cols = digit1_train_oh.columns.difference(preds1.columns)
if len(unknown_digit1_cols) > 0:
    raise ValueError(f'unexpected digit_1 one-hot columns: {list(unknown_digit1_cols)}')
digit1_train_oh = digit1_train_oh.reindex(columns=preds1.columns, fill_value=0)

X_train1 = pd.concat((X_train, digit1_train_oh), axis=1)
X_val1 = pd.concat((X_val, preds1), axis=1)

In [None]:
# Candidate classifiers for the second label level, all seeded with 0.
rf2 = RandomForestClassifier(random_state=0, n_jobs=-1)
xgb2 = XGBClassifier(random_state=0, n_jobs=-1)
# NOTE(review): this rebinds catb2 to an unfitted model, replacing the one loaded
# from pickle above. The CatBoost cell further down creates it once more.
catb2 = CatBoostClassifier(random_state=0)

In [None]:
# Random forest for the second label level. It is trained with the true digit_1
# one-hot features and evaluated with catb1's predicted digit_1 features.
rf2.fit(X_train1, y_train2)
rf2_preds = rf2.predict(X_val1)

print(f'accuracy: {accuracy_score(y_val2, rf2_preds)}')
print(f'f1_score: {f1_score(y_val2, rf2_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.8981333333333333
# f1_score: 0.6517356818968152

In [None]:
# Reference score: evaluate rf2 with the *true* digit_1 labels as features
# instead of catb1's predictions.
preds1_ = oh_enc_digit1_preds(y_val1)
preds1_.index = X_val.index
X_val1_ = pd.concat((X_val, preds1_), axis=1)

# Predict once and reuse the result for both metrics.
rf2_true_preds = rf2.predict(X_val1_)
print(f'accuracy: {accuracy_score(y_val2, rf2_true_preds)}')
print(f'f1_score: {f1_score(y_val2, rf2_true_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.93519
# f1_score: 0.7191561540656943

In [None]:
# XGBoost for the second label level, with the same train/validation feature
# frames as the random forest.
xgb2.fit(X_train1, y_train2)
xgb2_preds = xgb2.predict(X_val1)

print(f'accuracy: {accuracy_score(y_val2, xgb2_preds)}')
print(f'f1_score: {f1_score(y_val2, xgb2_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.8908533333333334
# f1_score: 0.6272603269039095

In [None]:
# Reference score: evaluate xgb2 with the *true* digit_1 labels as features.
preds1_ = oh_enc_digit1_preds(y_val1)
preds1_.index = X_val.index
X_val1_ = pd.concat((X_val, preds1_), axis=1)

# Predict once and reuse the result for both metrics.
xgb2_true_preds = xgb2.predict(X_val1_)
print(f'accuracy: {accuracy_score(y_val2, xgb2_true_preds)}')
print(f'f1_score: {f1_score(y_val2, xgb2_true_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.9276533333333333
# f1_score: 0.7033969202837673

In [None]:
# CatBoost for the second label level. The model is created again here, and
# unlike the other CatBoost cells in this notebook no task_type is passed.
catb2 = CatBoostClassifier(random_state=0)
catb2.fit(X_train1, y_train2)
catb2_preds = catb2.predict(X_val1)

print(f'accuracy: {accuracy_score(y_val2, catb2_preds)}')
print(f'f1_score: {f1_score(y_val2, catb2_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.8970733333333333
# f1_score: 0.658355770828779

In [None]:
# Reference score: evaluate catb2 with the *true* digit_1 labels as features.
preds1_ = oh_enc_digit1_preds(y_val1)
preds1_.index = X_val.index
X_val1_ = pd.concat((X_val, preds1_), axis=1)

# Predict once and reuse the result for both metrics.
catb2_true_preds = catb2.predict(X_val1_)
print(f'accuracy: {accuracy_score(y_val2, catb2_true_preds)}')
print(f'f1_score: {f1_score(y_val2, catb2_true_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.93505
# f1_score: 0.7345941408401843

In [None]:
# Persist the three second-level models trained with one-hot digit_1 features.
with open('/content/drive/MyDrive/공모전/models/rf2_with_target_oh.pickle', 'wb') as f:
    pickle.dump(rf2, f)
with open('/content/drive/MyDrive/공모전/models/xgb2_with_target_oh.pickle', 'wb') as f:
    pickle.dump(xgb2, f)
# This overwrites the same file that the model-loading cell reads catb2 from.
with open('/content/drive/MyDrive/공모전/models/catb2_with_target_oh.pickle', 'wb') as f:
    pickle.dump(catb2, f)

#### CatBoost with Categorical Feature

In [None]:
# Alternative encoding: append digit_1 as a single string column named 'digit_1'
# (used as a CatBoost categorical feature) instead of one-hot columns.
# Training uses the true labels; validation uses catb1's predictions,
# re-indexed to X_val so the concat aligns row by row.
X_train1_cf = pd.concat((X_train, pd.Series(y_train1, name='digit_1')), axis=1).astype({'digit_1':'str'})
X_val1_cf = pd.concat((X_val, pd.Series(catb1.predict(X_val).flatten(), name='digit_1', index=X_val.index)), axis=1).astype({'digit_1':'str'})

In [None]:
# CatBoost for the second label level, with 'digit_1' declared as a categorical
# feature and GPU training requested.
catb2_cf = CatBoostClassifier(random_state=0, cat_features=['digit_1'], task_type='GPU')
catb2_cf.fit(X_train1_cf, y_train2)
catb2_cf_preds = catb2_cf.predict(X_val1_cf)

print(f'accuracy: {accuracy_score(y_val2, catb2_cf_preds)}')
print(f'f1_score: {f1_score(y_val2, catb2_cf_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.8885633333333334
# f1_score: 0.641130529014949

In [None]:
# When the true digit_1 label is supplied instead of catb1's prediction.
# The label column is explicitly named 'digit_1' and cast to str, so the frame
# has the same layout and dtype as the frame catb2_cf was fitted on.
# NOTE: this rebinds X_val1_, which earlier cells used for the one-hot variant.
X_val1_ = pd.concat((X_val, y_val1.rename('digit_1')), axis=1).astype({'digit_1':'str'})

# Predict once and reuse the result for both metrics.
catb2_cf_true_preds = catb2_cf.predict(X_val1_)
print(f'accuracy: {accuracy_score(y_val2, catb2_cf_true_preds)}')
print(f'f1_score: {f1_score(y_val2, catb2_cf_true_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.9236966666666667
# f1_score: 0.7005592401024673

In [None]:
# Persist the CatBoost model that uses digit_1 as a categorical feature.
with open('/content/drive/MyDrive/공모전/models/catb2_with_target_categorical_feature.pickle', 'wb') as f:
    pickle.dump(catb2_cf, f)

#### Predict Digit_3

In [None]:
# Level-1 features: predicted digit_1 one-hot for validation, true digit_1
# one-hot for training.
preds1 = oh_enc_digit1_preds(catb1.predict(X_val).flatten())
preds1.index = X_val.index  # replace the fresh RangeIndex so concat aligns row by row

# get_dummies only emits columns for labels present in y_train1, while the
# validation encoder always emits the full fixed set. The training frame is
# therefore aligned to the validation columns (absent classes become all-zero
# columns), after checking that it has no column the encoder does not know.
digit1_train_oh = pd.get_dummies(y_train1, prefix='digit1')
unknown_digit1_cols = digit1_train_oh.columns.difference(preds1.columns)
if len(unknown_digit1_cols) > 0:
    raise ValueError(f'unexpected digit_1 one-hot columns: {list(unknown_digit1_cols)}')
digit1_train_oh = digit1_train_oh.reindex(columns=preds1.columns, fill_value=0)

X_train1 = pd.concat((X_train, digit1_train_oh), axis=1)
X_val1 = pd.concat((X_val, preds1), axis=1)

# Level-2 features: catb2's digit_2 predictions (made from the level-1 validation
# frame) for validation, true digit_2 one-hot for training. Both sides derive
# their column set from the training labels.
preds2 = oh_enc_digit2_preds(catb2.predict(X_val1).flatten())
preds2.index = X_val.index
X_train2 = pd.concat((X_train, digit1_train_oh, pd.get_dummies(y_train2, prefix='digit2')), axis=1)
X_val2 = pd.concat((X_val, preds1, preds2), axis=1)

In [None]:
# CatBoost for the third label level, using the raw features plus digit_1 and
# digit_2 one-hot columns (true labels in training, predictions in validation).
catb3 = CatBoostClassifier(random_state=0, task_type='GPU')
catb3.fit(X_train2, y_train3)
catb3_preds = catb3.predict(X_val2)

print(f'accuracy: {accuracy_score(y_val3, catb3_preds)}')
print(f'f1_score: {f1_score(y_val3, catb3_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.79738
# f1_score: 0.44937384426138416

In [None]:
# When the true digit_1 and digit_2 labels are supplied instead of predictions.
# NOTE: this rebinds preds1, preds2 and X_val2 to their true-label versions.
preds1 = oh_enc_digit1_preds(y_val1)
preds1.index = X_val.index
preds2 = oh_enc_digit2_preds(y_val2)
preds2.index = X_val.index

X_val2 = pd.concat((X_val, preds1, preds2), axis=1)

# Predict once and reuse the result for both metrics.
catb3_true_preds = catb3.predict(X_val2)
print(f'accuracy: {accuracy_score(y_val3, catb3_true_preds)}')
print(f'f1_score: {f1_score(y_val3, catb3_true_preds, average="macro")}')

In [None]:
# Variant that feeds only digit_2 to the third-level model (no digit_1 columns).
# preds2 is recomputed from catb2's predictions because the previous cell
# rebound it to the true labels.
preds2 = oh_enc_digit2_preds(catb2.predict(X_val1).flatten())
preds2.index = X_val.index
X_train2_ = pd.concat((X_train, pd.get_dummies(y_train2, prefix='digit2')), axis=1)
X_val2_ = pd.concat((X_val, preds2), axis=1)

In [None]:
# Third-level CatBoost trained on the raw features plus digit_2 one-hot only.
# NOTE: this rebinds catb3, discarding the model fitted with digit_1 + digit_2.
catb3 = CatBoostClassifier(random_state=0, task_type='GPU')
catb3.fit(X_train2_, y_train3)
catb3_preds = catb3.predict(X_val2_)

print(f'accuracy: {accuracy_score(y_val3, catb3_preds)}')
print(f'f1_score: {f1_score(y_val3, catb3_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.7903866666666667
# f1_score: 0.43760517559254125

In [None]:
# Baseline: third-level CatBoost on the raw features alone (no digit_1/digit_2
# columns). NOTE: this rebinds catb3 once more.
catb3 = CatBoostClassifier(random_state=0, task_type='GPU')
catb3.fit(X_train, y_train3)
catb3_preds = catb3.predict(X_val)

print(f'accuracy: {accuracy_score(y_val3, catb3_preds)}')
print(f'f1_score: {f1_score(y_val3, catb3_preds, average="macro")}')

# Recorded output of an earlier run:
# accuracy: 0.8038333333333333
# f1_score: 0.4812379699061563

#### Correlation

In [None]:
# Pairwise correlation matrix of the third-level training frame with the third
# label appended as the last column.
# NOTE(review): the label is handled as a plain number here. If digit_3 is a
# nominal class code, these coefficients have limited meaning; confirm intent.
corrs3 = pd.concat([X_train2, y_train3], axis=1).corr()

In [None]:
# The 10 entries with the smallest absolute correlation to digit_3.
# .abs() replaces the element-wise applymap call, which pandas 2.1 deprecates.
corrs3.iloc[:,-1:].abs().sort_values('digit_3',ascending=True)['digit_3'].iloc[:10]

In [None]:
# The 10 entries with the largest absolute correlation to digit_3. The list
# includes digit_3 itself, because the label column is part of the matrix.
# .abs() replaces the element-wise applymap call, which pandas 2.1 deprecates.
corrs3.iloc[:,-1:].abs().sort_values('digit_3',ascending=False)['digit_3'].iloc[:10]