In [None]:
!pip install pytorch_pretrained_bert
!pip install konlpy
!pip install --upgrade gensim
!pip install catboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import missingno as msno
from konlpy.tag import Kkma
from tqdm import tqdm
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.fasttext import FastText

In [None]:
# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# that file0, file1, ... are bound to the same files on every run.
files = sorted(glob('/content/drive/MyDrive/공모전/data/*.txt'))
for i, file in enumerate(files):
    # Each file is read as pipe-delimited, cp949-encoded text and exposed as a
    # module-level DataFrame named file<i> (later cells read `file0`).
    globals()[f'file{i}'] = pd.read_table(file, sep='|', encoding='cp949')

In [None]:
# Two-way lookup between the sorted unique digit_1 values and consecutive
# integer indices starting at 0.
idx2label = {idx: label for idx, label in enumerate(np.sort(file0['digit_1'].unique()))}
label2idx = {label: idx for idx, label in idx2label.items()}

### FastText

In [None]:
def groupby_digit(data, target_col, text_col):
    """Group the values of ``text_col`` by the label stored in ``target_col``.

    Args:
        data: DataFrame containing both columns.
        target_col: Name of the column holding the group label.
        text_col: Name of the column whose values are collected.

    Returns:
        A list of lists: one inner list per distinct label, ordered by the
        label's first appearance in ``data``; each inner list holds that
        label's ``text_col`` values in row order. Empty input yields ``[]``.

    Raises:
        KeyError: If either column is missing from ``data``.

    Note:
        Labels are used as dictionary keys, so missing labels should be
        filled before calling (NaN values do not compare equal to each other).
    """
    # A local dict keeps the buckets private to this call; nothing is written
    # to module globals, so no stale per-label lists survive between calls.
    groups = {}
    for label, text in zip(data[target_col], data[text_col]):
        groups.setdefault(label, []).append(text)
    # Dicts preserve insertion order, i.e. first-appearance order of labels.
    return list(groups.values())

In [None]:
def combine_texts(lst):
    """Join the strings in ``lst`` with single spaces and trim both ends."""
    separator = ' '
    return separator.join(lst).strip()

In [None]:
# One space-joined string per digit_3 group for each text column, with missing
# values replaced by '' before grouping.
# NOTE(review): the following cell rebinds res_obj3 / res_mthd3 / res_deal3, so
# these Series are not what the FastText models are trained on.
res_obj3, res_mthd3, res_deal3 = (
    pd.Series(map(combine_texts, groupby_digit(file0.fillna(''), 'digit_3', text_col)))
    for text_col in ('text_obj', 'text_mthd', 'text_deal')
)

In [None]:
# For each text column: a list of per-digit_3 lists of raw text entries, with
# missing values replaced by the '<unk>' placeholder before grouping.
res_obj3, res_mthd3, res_deal3 = (
    groupby_digit(file0.fillna('<unk>'), 'digit_3', text_col)
    for text_col in ('text_obj', 'text_mthd', 'text_deal')
)

In [None]:
# One FastText model per text column, each trained with sg=1 on the grouped
# entries built above.
# NOTE(review): no seed is passed, so the vectors may differ between runs.
ft_obj, ft_mthd, ft_deal = (
    FastText(corpus, sg=1) for corpus in (res_obj3, res_mthd3, res_deal3)
)

In [None]:
# Persist the three FastText models next to the data files.
for ft_model, out_path in (
    (ft_obj, '/content/drive/MyDrive/공모전/data/ft_obj.gensim'),
    (ft_mthd, '/content/drive/MyDrive/공모전/data/ft_mthd.gensim'),
    (ft_deal, '/content/drive/MyDrive/공모전/data/ft_deal.gensim'),
):
    ft_model.save(out_path)

In [None]:
def collect_embs(data, col, model=None):
    """Look up the embedding vector for every entry of one text column.

    Args:
        data: DataFrame containing ``col``.
        col: Column name. When ``model`` is omitted, the name with its first
            five characters removed selects the module-level model, e.g.
            ``'text_obj'`` -> ``ft_obj``.
        model: Optional object exposing ``.wv[token]``. Defaults to the
            module-level ``ft_<name>`` described above.

    Returns:
        A list with one vector per row of ``data``, in row order.

    Raises:
        KeyError: If ``model`` is omitted and no matching ``ft_<name>`` global
            exists, or if ``col`` is not a column of ``data``.
    """
    if model is None:
        # Resolve the model once instead of on every row.
        model = globals()[f'ft_{col[5:]}']
    vectors = model.wv
    # Missing entries are looked up under '<unk>', the same placeholder the
    # training corpus is filled with, rather than the empty string.
    return [vectors[w] for w in tqdm(data[col].fillna('<unk>'))]

In [None]:
# Replace missing text with the '<unk>' placeholder once and reuse the result
# for all three columns.
file0_unk = file0.fillna('<unk>')

# Column counts come from each model's vector size instead of a hard-coded 100,
# so the frames still build if the FastText dimensionality is changed.
text_obj = pd.DataFrame(
    collect_embs(file0_unk, 'text_obj'),
    columns=[f'text_obj{i}' for i in range(ft_obj.wv.vector_size)],
)
text_mthd = pd.DataFrame(
    collect_embs(file0_unk, 'text_mthd'),
    columns=[f'text_mthd{i}' for i in range(ft_mthd.wv.vector_size)],
)
text_deal = pd.DataFrame(
    collect_embs(file0_unk, 'text_deal'),
    columns=[f'text_deal{i}' for i in range(ft_deal.wv.vector_size)],
)

In [None]:
# Features: the three embedding frames side by side.
X = pd.concat([text_obj, text_mthd, text_deal], axis=1)
# Targets: .copy() gives `y` its own data, so writing to its columns later
# neither raises SettingWithCopyWarning nor depends on whether the selection
# is a view of file0.
y = file0[['digit_1', 'digit_2', 'digit_3']].copy()

In [None]:
# Replace digit_1 labels with their integer indices. assign() rebinds `y` to a
# new frame instead of writing into a column of a frame sliced from file0,
# which avoids the chained-assignment SettingWithCopyWarning.
# NOTE(review): not idempotent - re-running maps already-encoded values again.
y = y.assign(digit_1=y['digit_1'].map(lambda x: label2idx[x]))

In [None]:
# Write features and targets to CSV without the index column.
for frame, out_path in (
    (X, '/content/drive/MyDrive/공모전/data/X_ft.csv'),
    (y, '/content/drive/MyDrive/공모전/data/y.csv'),
):
    frame.to_csv(out_path, index=False)

### Read Data

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Reload the feature matrix and targets written by the FastText section.
X, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/공모전/data/X_ft.csv',
        '/content/drive/MyDrive/공모전/data/y.csv',
    )
)

In [None]:
# Hold out 30% of the rows for validation; the seed fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Macro-averaged F1 wrapped as a scorer object.
# NOTE(review): no visible cell uses my_score - confirm whether it is needed.
my_score = make_scorer(score_func=f1_score, average='macro')

In [None]:
# Split the target frames into one Series per column position (0, 1, 2).
y_train1, y_train2, y_train3 = (y_train.iloc[:, pos] for pos in range(3))
y_val1, y_val2, y_val3 = (y_val.iloc[:, pos] for pos in range(3))

### Modeling

#### digit_1

In [None]:
# Candidate classifiers for the first target. Estimators that accept a seed
# get 0; those that accept n_jobs here use -1.
lr1 = LogisticRegression(n_jobs=-1, random_state=0)
rf1 = RandomForestClassifier(n_jobs=-1, random_state=0)
ada1 = AdaBoostClassifier(random_state=0)
xgb1 = XGBClassifier(n_jobs=-1, random_state=0)
lgbm1 = LGBMClassifier(n_jobs=-1, random_state=0)
catb1 = CatBoostClassifier(random_state=0)
lda1 = LinearDiscriminantAnalysis()
gnb1 = GaussianNB()

In [None]:
# Fit logistic regression on the first target and score the validation split.
lr1_preds = lr1.fit(X_train, y_train1).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val1, lr1_preds)}',
    f'f1_score: {f1_score(y_val1, lr1_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.9256666666666666
# f1_score: 0.8049137596547475

In [None]:
# Fit the random forest on the first target and score the validation split.
rf1_preds = rf1.fit(X_train, y_train1).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val1, rf1_preds)}',
    f'f1_score: {f1_score(y_val1, rf1_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.95256
# f1_score: 0.8630866581460671

In [None]:
# Fit AdaBoost on the first target and score the validation split.
ada1_preds = ada1.fit(X_train, y_train1).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val1, ada1_preds)}',
    f'f1_score: {f1_score(y_val1, ada1_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.7771033333333334
# f1_score: 0.5354114883274402

In [None]:
# Fit XGBoost on the first target and score the validation split.
xgb1_preds = xgb1.fit(X_train, y_train1).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val1, xgb1_preds)}',
    f'f1_score: {f1_score(y_val1, xgb1_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.93393
# f1_score: 0.8271024754075927

In [None]:
# Fit LightGBM on the first target and score the validation split.
lgbm1_preds = lgbm1.fit(X_train, y_train1).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val1, lgbm1_preds)}',
    f'f1_score: {f1_score(y_val1, lgbm1_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.9266433333333334
# f1_score: 0.7430025000550744

In [None]:
# Fit CatBoost on the first target and score the validation split.
catb1_preds = catb1.fit(X_train, y_train1).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val1, catb1_preds)}',
    f'f1_score: {f1_score(y_val1, catb1_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.9515333333333333
# f1_score: 0.8758887064553805

In [None]:
# Fit linear discriminant analysis on the first target and score validation.
lda1_preds = lda1.fit(X_train, y_train1).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val1, lda1_preds)}',
    f'f1_score: {f1_score(y_val1, lda1_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.8859166666666667
# f1_score: 0.7637033003943432

In [None]:
# Fit Gaussian naive Bayes on the first target and score the validation split.
gnb1_preds = gnb1.fit(X_train, y_train1).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val1, gnb1_preds)}',
    f'f1_score: {f1_score(y_val1, gnb1_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.7804466666666666
# f1_score: 0.5995814601997584

In [None]:
models = ['lr','rf','lgbm','catb']

# save_model() writes XGBoost's own serialization, not a Python pickle, so it
# gets a .json path. The previous xgb1.pickle path was misleading and is also
# the path the pickle loop at the end of the notebook writes to, which
# replaced the native file.
xgb1.save_model('/content/drive/MyDrive/공모전/models/xgb1.json')
for model in models:
    # Pickle each listed estimator under <name>1.pickle; estimators are looked
    # up by variable name in the module globals.
    with open(f'/content/drive/MyDrive/공모전/models/{model}1.pickle', 'wb') as f:
        pickle.dump(globals()[f'{model}1'], f)

#### digit_2

In [None]:
# Candidate classifiers for the second target, configured like the first set.
lr2 = LogisticRegression(n_jobs=-1, random_state=0)
rf2 = RandomForestClassifier(n_jobs=-1, random_state=0)
ada2 = AdaBoostClassifier(random_state=0)
xgb2 = XGBClassifier(n_jobs=-1, random_state=0)
lgbm2 = LGBMClassifier(n_jobs=-1, random_state=0)
catb2 = CatBoostClassifier(random_state=0)
lda2 = LinearDiscriminantAnalysis()
gnb2 = GaussianNB()

In [None]:
# Fit logistic regression on the second target and score the validation split.
lr2_preds = lr2.fit(X_train, y_train2).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val2, lr2_preds)}',
    f'f1_score: {f1_score(y_val2, lr2_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.8707466666666667
# f1_score: 0.5421678201435408

In [None]:
# Fit the random forest on the second target and score the validation split.
rf2_preds = rf2.fit(X_train, y_train2).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val2, rf2_preds)}',
    f'f1_score: {f1_score(y_val2, rf2_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.9002166666666667
# f1_score: 0.6489528135644742

In [None]:
# Fit AdaBoost on the second target and score the validation split.
ada2_preds = ada2.fit(X_train, y_train2).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val2, ada2_preds)}',
    f'f1_score: {f1_score(y_val2, ada2_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.49741
# f1_score: 0.06977129313951395

In [None]:
# Fit XGBoost on the second target and score the validation split.
xgb2_preds = xgb2.fit(X_train, y_train2).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val2, xgb2_preds)}',
    f'f1_score: {f1_score(y_val2, xgb2_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.8798
# f1_score: 0.6148944834664218

In [None]:
# Fit LightGBM on the second target and score the validation split.
lgbm2_preds = lgbm2.fit(X_train, y_train2).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val2, lgbm2_preds)}',
    f'f1_score: {f1_score(y_val2, lgbm2_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.7139766666666667
# f1_score: 0.3331884141225541

In [None]:
# Fit CatBoost on the second target and score the validation split.
catb2_preds = catb2.fit(X_train, y_train2).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val2, catb2_preds)}',
    f'f1_score: {f1_score(y_val2, catb2_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.89928
# f1_score: 0.6675897628003784

In [None]:
# Fit linear discriminant analysis on the second target and score validation.
lda2_preds = lda2.fit(X_train, y_train2).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val2, lda2_preds)}',
    f'f1_score: {f1_score(y_val2, lda2_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.79108
# f1_score: 0.49471426451114486

In [None]:
# Fit Gaussian naive Bayes on the second target and score the validation split.
gnb2_preds = gnb2.fit(X_train, y_train2).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val2, gnb2_preds)}',
    f'f1_score: {f1_score(y_val2, gnb2_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.72446
# f1_score: 0.3918263958525736

In [None]:
# save_model() writes XGBoost's own serialization, not a Python pickle, so it
# gets a .json path. The previous xgb2.pickle path was misleading and is also
# the path the pickle loop at the end of the notebook writes to.
xgb2.save_model('/content/drive/MyDrive/공모전/models/xgb2.json')

In [None]:
# Pickle the selected second-target estimators under <name>2.pickle.
models = ['lr', 'rf']

for model in models:
    pickle_path = f'/content/drive/MyDrive/공모전/models/{model}2.pickle'
    with open(pickle_path, 'wb') as f:
        # Estimators are looked up by variable name in the module globals.
        pickle.dump(globals()[f'{model}2'], f)

#### digit_3

In [None]:
# Candidate classifiers for the third target. Unlike the earlier sets, the
# logistic regression here is created without n_jobs.
lr3 = LogisticRegression(random_state=0)
rf3 = RandomForestClassifier(n_jobs=-1, random_state=0)
ada3 = AdaBoostClassifier(random_state=0)
xgb3 = XGBClassifier(n_jobs=-1, random_state=0)
lgbm3 = LGBMClassifier(n_jobs=-1, random_state=0)
catb3 = CatBoostClassifier(random_state=0)
lda3 = LinearDiscriminantAnalysis()
gnb3 = GaussianNB()

In [None]:
# Fit logistic regression on the third target and score the validation split.
lr3_preds = lr3.fit(X_train, y_train3).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val3, lr3_preds)}',
    f'f1_score: {f1_score(y_val3, lr3_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.8089566666666667
# f1_score: 0.40824603622911876

In [None]:
# Replaces the rf3 created above with a single-job forest capped at depth 100.
# NOTE(review): presumably done to limit resource use - confirm the intent.
rf3 = RandomForestClassifier(
    max_depth=100,
    n_jobs=1,
    random_state=0,
)

In [None]:
# Fit the random forest on the third target and score the validation split.
rf3_preds = rf3.fit(X_train, y_train3).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val3, rf3_preds)}',
    f'f1_score: {f1_score(y_val3, rf3_preds, average="macro")}',
    sep='\n',
)

In [None]:
# Fit AdaBoost on the third target and score the validation split.
ada3_preds = ada3.fit(X_train, y_train3).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val3, ada3_preds)}',
    f'f1_score: {f1_score(y_val3, ada3_preds, average="macro")}',
    sep='\n',
)

In [None]:
# Fit XGBoost on the third target and score the validation split.
xgb3_preds = xgb3.fit(X_train, y_train3).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val3, xgb3_preds)}',
    f'f1_score: {f1_score(y_val3, xgb3_preds, average="macro")}',
    sep='\n',
)

In [None]:
# Fit LightGBM on the third target and score the validation split.
lgbm3_preds = lgbm3.fit(X_train, y_train3).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val3, lgbm3_preds)}',
    f'f1_score: {f1_score(y_val3, lgbm3_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.39244
# f1_score: 0.0780157271189942

In [None]:
# Fit CatBoost on the third target and score the validation split.
catb3_preds = catb3.fit(X_train, y_train3).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val3, catb3_preds)}',
    f'f1_score: {f1_score(y_val3, catb3_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.84743
# f1_score: 0.5403724886737582

In [None]:
# Fit linear discriminant analysis on the third target and score validation.
lda3_preds = lda3.fit(X_train, y_train3).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val3, lda3_preds)}',
    f'f1_score: {f1_score(y_val3, lda3_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.7047333333333333
# f1_score: 0.387297567084451

In [None]:
# Fit Gaussian naive Bayes on the third target and score the validation split.
gnb3_preds = gnb3.fit(X_train, y_train3).predict(X_val)

print(
    f'accuracy: {accuracy_score(y_val3, gnb3_preds)}',
    f'f1_score: {f1_score(y_val3, gnb3_preds, average="macro")}',
    sep='\n',
)

# accuracy: 0.6435633333333334
# f1_score: 0.30618791704900467

In [None]:
# Pickle every estimator for all three targets as <name><target>.pickle.
models = ['lr', 'rf', 'ada', 'xgb', 'lgbm', 'catb', 'lda', 'gnb']

for model in models:
    for i in range(1, 4):
        pickle_path = f'/content/drive/MyDrive/공모전/models/{model}{i}.pickle'
        with open(pickle_path, 'wb') as f:
            # Estimators are looked up by variable name in the module globals.
            pickle.dump(globals()[f'{model}{i}'], f)

In [None]:
# Pickle the third-target CatBoost model on its own.
# NOTE(review): the loop in the previous cell already writes this same file.
catb3_path = '/content/drive/MyDrive/공모전/models/catb3.pickle'
with open(catb3_path, 'wb') as f:
    pickle.dump(catb3, f)