In [None]:
!pip install konlpy
!pip install --upgrade gensim
!pip install catboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import missingno as msno
from konlpy.tag import Kkma
from tqdm import tqdm
import pickle
import datetime
import time
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score

import torch
from catboost import CatBoostClassifier
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
# glob() returns matches in arbitrary order, so sort the paths: that way
# file0, file1, ... are bound to the same files on every run.
files = sorted(glob('/content/drive/MyDrive/공모전/data/*.txt'))
if not files:
    # Without this check the first failure would be a NameError on `file0` further down.
    raise FileNotFoundError(
        'No .txt files found under /content/drive/MyDrive/공모전/data/ (is Drive mounted?)'
    )

# Each pipe-separated, cp949-encoded table is exposed as a module-level
# name file<i>; later cells read file0.
for i, file in enumerate(files):
    globals()[f'file{i}'] = pd.read_table(file, sep='|', encoding='cp949')

In [None]:
# Lookup tables for the three label columns of file0 (digit_1, digit_2, digit_3).
# label2id_* maps a raw label to a contiguous integer id assigned in sorted
# label order; id2label_* is the exact inverse of the matching label2id_*.
label2id_1 = {label: idx for idx, label in enumerate(sorted(file0.digit_1.unique()))}
id2label_1 = {idx: label for label, idx in label2id_1.items()}

label2id_2 = {label: idx for idx, label in enumerate(sorted(file0.digit_2.unique()))}
id2label_2 = {idx: label for label, idx in label2id_2.items()}

label2id_3 = {label: idx for idx, label in enumerate(sorted(file0.digit_3.unique()))}
id2label_3 = {idx: label for label, idx in label2id_3.items()}

In [None]:
# Overwrite each raw label column of file0 with its integer id.
# The dict is indexed directly (not passed to Series.map as a mapping), so a
# label missing from the table raises KeyError instead of silently becoming NaN.
file0['digit_1'] = file0['digit_1'].map(label2id_1.__getitem__)
file0['digit_2'] = file0['digit_2'].map(label2id_2.__getitem__)
file0['digit_3'] = file0['digit_3'].map(label2id_3.__getitem__)

In [None]:
# Single shared Kkma tagger instance (konlpy); extract_n below reads it as a global.
kkma = Kkma()
def extract_n(x):
    """Lazily yield the tokens of ``x`` whose POS tag starts with 'N'.

    ``x`` is passed to the module-level ``kkma`` tagger's ``pos`` method, which
    returns ``(word, tag)`` pairs. Words are yielded in the order the tagger
    reports them; nothing is tagged until the generator is first advanced.
    """
    tagged = kkma.pos(x)
    yield from (token for token, tag in tagged if tag.startswith('N'))

In [None]:
# Previously saved artifacts on Drive: the pickled `tfidf` object and the
# stored term-document matrix `tdm`.
# NOTE: pickle.load and np.load(allow_pickle=True) can execute code embedded in
# the file being read; only point these at files you created yourself.
TFIDF_PICKLE = '/content/drive/MyDrive/공모전/models/tfidf.pkl'
TDM_NPY = '/content/drive/MyDrive/공모전/data/tdm_tfidf.npy'

with open(TFIDF_PICKLE, 'rb') as f:
    tfidf = pickle.load(f)

# .tolist() unwraps the loaded array into plain Python object(s) — presumably
# a 0-d object array holding one sparse matrix; TODO confirm.
tdm = np.load(TDM_NPY, allow_pickle=True).tolist()

In [None]:
# csr_matrix is imported from the public scipy.sparse namespace; the
# scipy.sparse.csr submodule path is private and deprecated.
from scipy.sparse import csr_matrix

# Feature table stored as CSV, converted to CSR so it can be stacked with the
# TF-IDF matrix in the next cell.
X_ft = csr_matrix(pd.read_csv('/content/drive/MyDrive/공모전/data/X_ft.csv').values)

In [None]:
from scipy.sparse import hstack

# Column-wise concatenation of the CSV features and the TF-IDF matrix.
# X is deliberately kept as a SciPy CSR matrix:
#   * train_test_split can slice CSR rows, and CatBoost accepts SciPy sparse
#     input, so the cells below can consume X directly;
#   * torch.sparse_csr_tensor expects (crow_indices, col_indices, values), so it
#     cannot be called with a COO-style (indices, values) pair;
#   * the values keep their floating-point dtype - casting them to int32 would
#     truncate fractional TF-IDF weights.
# Convert to torch tensors only at the point where the DNN needs them.
X = hstack([X_ft, tdm]).tocsr()

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible. All three label columns are split together so they stay
# aligned with the feature rows.
targets = file0[['digit_1', 'digit_2', 'digit_3']]
X_train, X_test, y_train, y_test = train_test_split(
    X, targets, test_size=0.3, random_state=0
)

# One label Series per target column, in column order (digit_1, digit_2, digit_3).
y_train1, y_train2, y_train3 = (y_train[col] for col in targets.columns)
y_test1, y_test2, y_test3 = (y_test[col] for col in targets.columns)

### Modeling

#### CatBoost

In [None]:
# CatBoost classifier for the digit_1 target, trained on GPU with a fixed seed.
model1 = CatBoostClassifier(random_state=0, task_type="GPU")
model1.fit(X_train, y_train1)

# Score the held-out rows once and reuse the predictions for both metrics.
pred1 = model1.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test1, pred1)}')
print(f'F1 Score: {f1_score(y_test1, pred1, average="macro")}')

# Accuracy: 0.96237
# F1 Score: 0.9037510396802599

In [None]:
# CatBoost classifier for the digit_2 target, trained on GPU with a fixed seed.
model2 = CatBoostClassifier(random_state=0, task_type="GPU")
model2.fit(X_train, y_train2)

# Score the held-out rows once and reuse the predictions for both metrics.
pred2 = model2.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test2, pred2)}')
print(f'F1 Score: {f1_score(y_test2, pred2, average="macro")}')

# Accuracy: 0.8891266666666666
# F1 Score: 0.6632112365844249

In [None]:
# CatBoost classifier for the digit_3 target, trained on GPU with a fixed seed.
model3 = CatBoostClassifier(random_state=0, task_type="GPU")
model3.fit(X_train, y_train3)

# Score the held-out rows once and reuse the predictions for both metrics.
pred3 = model3.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test3, pred3)}')
print(f'F1 Score: {f1_score(y_test3, pred3, average="macro")}')

#### DNN

In [None]:
from torch.utils.data import Subset, TensorDataset


def _as_feature_tensor(features):
    """Return ``features`` as a float32 torch tensor.

    * torch tensors are cast to float32;
    * SciPy sparse matrices (anything exposing ``tocoo``) become sparse COO
      tensors, so they are never densified;
    * anything else is converted through numpy.
    """
    if isinstance(features, torch.Tensor):
        return features.float()
    if hasattr(features, 'tocoo'):
        coo = features.tocoo()
        indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
        values = torch.from_numpy(coo.data.astype(np.float32))
        return torch.sparse_coo_tensor(indices, values, size=coo.shape)
    return torch.as_tensor(np.asarray(features), dtype=torch.float32)


def _as_label_tensor(labels):
    """Return ``labels`` (pandas Series, array-like, or tensor) as a torch tensor."""
    if isinstance(labels, torch.Tensor):
        return labels
    return torch.as_tensor(np.asarray(labels))


# TensorDataset only accepts torch tensors, so convert the features once and
# wrap each label column with them.
features_train = _as_feature_tensor(X_train)
dataset1 = TensorDataset(features_train, _as_label_tensor(y_train1))
dataset2 = TensorDataset(features_train, _as_label_tensor(y_train2))
dataset3 = TensorDataset(features_train, _as_label_tensor(y_train3))

# Split row indices rather than the datasets themselves. The three datasets have
# the same length and the same seed, so one pair of splits serves all of them and
# picks the same rows as calling train_test_split on each dataset would.
# First 70/30 into fit/test, then 70/30 of the fit part into train/validation.
all_idx = np.arange(len(dataset1))
fit_idx, test_idx = train_test_split(all_idx, test_size=0.3, random_state=0)
train_idx, val_idx = train_test_split(fit_idx, test_size=0.3, random_state=0)
train_idx, val_idx, test_idx = train_idx.tolist(), val_idx.tolist(), test_idx.tolist()

# Subset indexes lazily, so no sample is copied until it is actually read.
# The split objects are Subsets rather than lists; both support len() and integer indexing.
# NOTE(review): if the features are a sparse tensor, per-sample row lookup may be
# slow - consider densifying per batch in a custom Dataset.
train_dataset1, val_dataset1, test_dataset1 = (
    Subset(dataset1, idx) for idx in (train_idx, val_idx, test_idx)
)
train_dataset2, val_dataset2, test_dataset2 = (
    Subset(dataset2, idx) for idx in (train_idx, val_idx, test_idx)
)
train_dataset3, val_dataset3, test_dataset3 = (
    Subset(dataset3, idx) for idx in (train_idx, val_idx, test_idx)
)

In [None]:
# Rebind the feature and label variables to torch tensors for the DNN.
# NOTE(review): X_val and y_val1 / y_val2 / y_val3 are not assigned in any cell
# visible above this one; on a fresh kernel this cell raises NameError unless
# they are created elsewhere - confirm where the validation split comes from.
# NOTE(review): X_train is passed to torch.FloatTensor as-is; presumably it has
# to be a dense array-like at this point - verify against the split cell.
X_train = torch.FloatTensor(X_train)
X_val = torch.FloatTensor(X_val)
# `.values` takes the underlying array of each pandas Series (y_train1..3 come
# from y_train.iloc[...] in the split cell). The same names are rebound to
# tensors, so re-running this cell is not expected to work - TODO confirm.
y_train1 = torch.tensor(y_train1.values)
y_train2 = torch.tensor(y_train2.values)
y_train3 = torch.tensor(y_train3.values)
y_val1 = torch.tensor(y_val1.values)
y_val2 = torch.tensor(y_val2.values)
y_val3 = torch.tensor(y_val3.values)