In [None]:
# Install third-party dependencies into the environment of the running kernel.
# %pip (unlike !pip) always targets the interpreter this notebook is executing in.
%pip install konlpy
%pip install --upgrade gensim
%pip install catboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import missingno as msno
from konlpy.tag import Kkma
from tqdm import tqdm
import pickle
import datetime
import time
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score

import torch
from catboost import CatBoostClassifier
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
# Read every pipe-delimited, cp949-encoded .txt file in the data folder and bind
# each resulting DataFrame to a module-level name file0, file1, ...
# NOTE(review): glob does not guarantee any ordering, so which file ends up as
# `file0` depends on the filesystem — confirm it is the labelled training file.
files = glob('/content/drive/MyDrive/공모전/data/*.txt')
for i, file in enumerate(files):
    globals()[f'file{i}'] = pd.read_csv(file, sep='|', encoding='cp949')

In [None]:
# For each of the three label columns build a label -> integer id lookup
# (ids follow the sorted order of the distinct labels) and its inverse.
label2id_1 = {label: idx for idx, label in enumerate(sorted(file0['digit_1'].unique()))}
id2label_1 = {idx: label for label, idx in label2id_1.items()}

label2id_2 = {label: idx for idx, label in enumerate(sorted(file0['digit_2'].unique()))}
id2label_2 = {idx: label for label, idx in label2id_2.items()}

label2id_3 = {label: idx for idx, label in enumerate(sorted(file0['digit_3'].unique()))}
id2label_3 = {idx: label for label, idx in label2id_3.items()}

In [None]:
# Replace the raw labels in file0 with their integer ids, column by column.
# A label missing from the lookup raises KeyError.
# NOTE: this overwrites the columns in place, so the cell is not idempotent —
# running it a second time looks the already-encoded ids up in the label dicts.
file0['digit_1'] = file0['digit_1'].map(label2id_1.__getitem__)
file0['digit_2'] = file0['digit_2'].map(label2id_2.__getitem__)
file0['digit_3'] = file0['digit_3'].map(label2id_3.__getitem__)

In [None]:
kkma = Kkma()
def extract_n(x):
    """Lazily yield the words of `x` whose Kkma POS tag starts with 'N'.

    `x` is passed unchanged to `kkma.pos`, which is expected to return
    (word, tag) pairs; tagging happens when the generator is first advanced.
    """
    yield from (word for word, tag in kkma.pos(x) if tag.startswith('N'))

In [None]:
# Load the previously saved artefacts from Drive: the pickled `tfidf` object,
# the stored term-document matrix, and the word-count table.
# SECURITY: pickle.load and np.load(allow_pickle=True) execute arbitrary code
# embedded in the file — only load files you created yourself or fully trust.
with open('/content/drive/MyDrive/공모전/models/tfidf.pkl', 'rb') as f:
    tfidf = pickle.load(f)
# .tolist() unwraps the loaded array into plain Python object(s); presumably the
# file holds a single pickled sparse matrix (see the later `tdm.A`) — TODO confirm.
tdm = np.load('/content/drive/MyDrive/공모전/data/tdm_tfidf.npy', allow_pickle=True).tolist()
word_count = pd.read_csv('/content/drive/MyDrive/공모전/data/word_count.csv')

In [None]:
# Densify the term-document matrix and wrap it in a float tensor.
# (.toarray() is the spelled-out form of the `.A` shorthand on SciPy sparse matrices.)
X = torch.FloatTensor(tdm.toarray())

# 70/30 train/test split, then a further 70/30 split of the training part into
# train/validation. The fixed random_state makes both splits repeatable.
X_train, X_test = train_test_split(X, test_size=.3, random_state=0)
X_train, X_val = train_test_split(X_train, test_size=.3, random_state=0)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# shuffle=True makes DataLoader build a RandomSampler internally; shuffle=False
# builds a SequentialSampler — the same samplers that were previously passed in.
train_dataloader = DataLoader(X_train, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(X_val, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(X_test, batch_size=batch_size, shuffle=False)

### AutoEncoder

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Show which accelerator is in use. get_device_name(0) fails when no CUDA device
# exists, so fall back to a plain 'cpu' label on CPU-only runtimes.
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')

In [None]:
class AutoEncoder(torch.nn.Module):
    """Symmetric fully-connected autoencoder.

    The encoder maps `input_dim` features through `hidden_dims` to a
    `latent_dim` code; the decoder mirrors those widths back to `input_dim`.
    ReLU follows every linear layer except the last one of each half.

    Args:
        input_dim: Width of the input (and of the reconstruction).
        hidden_dims: Widths of the encoder's hidden layers, in order.
        latent_dim: Width of the encoded representation.

    The defaults reproduce the original fixed 2000-1024-512-256 architecture,
    including the layer indices inside each `Sequential`.
    """

    def __init__(self, input_dim=2000, hidden_dims=(1024, 512), latent_dim=256):
        super(AutoEncoder, self).__init__()

        widths = [input_dim, *hidden_dims, latent_dim]
        # Encoder first, then decoder: keeps parameter creation order (and hence
        # seeded initialisation) the same as the hand-written layer list.
        self.encoder = self._mlp(widths)
        self.decoder = self._mlp(widths[::-1])

    @staticmethod
    def _mlp(widths):
        """Build Linear/ReLU layers for consecutive `widths`, without a trailing ReLU."""
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(torch.nn.Linear(n_in, n_out))
            layers.append(torch.nn.ReLU())
        return torch.nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Return the pair `(encoded, decoded)` for the input batch `x`."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [None]:
def train(model, train_dataloader, val_dataloader, optimizer, criterion, n_epochs):
    """Train `model` to reconstruct its input and print per-epoch losses.

    Each epoch runs one optimisation pass over `train_dataloader` and one
    gradient-free pass over `val_dataloader`, then prints the average train and
    validation loss and the elapsed time. Batches are moved to the module-level
    `device`; `model` must return an `(encoded, decoded)` pair.

    Args:
        model: Module called as `model(batch)`.
        train_dataloader: Iterable of training batches (tensors).
        val_dataloader: Iterable of validation batches (tensors).
        optimizer: Optimizer already bound to `model`'s parameters.
        criterion: Loss called as `criterion(decoded, batch)`.
        n_epochs: Number of epochs to run.
    """
    # A mean-reduced criterion already averages within the batch, so its value
    # must be weighted by the batch size before dividing by the sample count;
    # otherwise the reported loss is too small by roughly the batch size.
    mean_reduced = getattr(criterion, 'reduction', 'mean') == 'mean'

    for epoch in range(n_epochs):
        model.train()
        tr_loss = 0
        num_train_samples = 0

        time0 = time.time()
        for data in train_dataloader:
            data = data.to(device)
            optimizer.zero_grad()
            _, decoded = model(data)
            loss = criterion(decoded, data)

            batch_n = data.size(0)
            tr_loss += loss.item() * (batch_n if mean_reduced else 1)
            num_train_samples += batch_n

            loss.backward()
            optimizer.step()

        model.eval()
        eval_loss = 0
        num_eval_samples = 0
        with torch.no_grad():
            for data in val_dataloader:
                data = data.to(device)
                _, decoded = model(data)
                loss = criterion(decoded, data)

                batch_n = data.size(0)
                eval_loss += loss.item() * (batch_n if mean_reduced else 1)
                num_eval_samples += batch_n
        elapsed = str(datetime.timedelta(seconds = int(round(time.time() - time0))))

        # An empty loader yields nan instead of a ZeroDivisionError.
        train_avg = tr_loss / num_train_samples if num_train_samples else float('nan')
        val_avg = eval_loss / num_eval_samples if num_eval_samples else float('nan')
        print(f'Epoch: {epoch+1}\tTrain Loss: {train_avg:.10f}\t Validation Loss: {val_avg:.10f}\t time: {elapsed}')

#### Modeling

In [None]:
# Seed every random number generator in play so the run is repeatable.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
# Trade cuDNN autotuning for deterministic kernels.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Fresh autoencoder, Adam with a small learning rate, MSE reconstruction loss.
ae = AutoEncoder().to(device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(ae.parameters(), lr=1e-5)

train(ae, train_dataloader, val_dataloader, optimizer, criterion, n_epochs=200)

In [None]:
# Re-seed all random number generators, then keep training the existing `ae`
# with the existing `optimizer` and `criterion` for another 200 epochs.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

train(ae, train_dataloader, val_dataloader, optimizer, criterion, n_epochs=200)

In [None]:
class AutoEncoder(torch.nn.Module):
    """Symmetric fully-connected autoencoder (wider variant).

    The encoder maps `input_dim` features through `hidden_dims` to a
    `latent_dim` code; the decoder mirrors those widths back to `input_dim`.
    ReLU follows every linear layer except the last one of each half.

    Args:
        input_dim: Width of the input (and of the reconstruction).
        hidden_dims: Widths of the encoder's hidden layers, in order.
        latent_dim: Width of the encoded representation.

    The defaults reproduce the original fixed 2000-1536-1024-512 architecture,
    including the layer indices inside each `Sequential`.
    """

    def __init__(self, input_dim=2000, hidden_dims=(1536, 1024), latent_dim=512):
        super(AutoEncoder, self).__init__()

        widths = [input_dim, *hidden_dims, latent_dim]
        # Encoder first, then decoder: keeps parameter creation order (and hence
        # seeded initialisation) the same as the hand-written layer list.
        self.encoder = self._mlp(widths)
        self.decoder = self._mlp(widths[::-1])

    @staticmethod
    def _mlp(widths):
        """Build Linear/ReLU layers for consecutive `widths`, without a trailing ReLU."""
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(torch.nn.Linear(n_in, n_out))
            layers.append(torch.nn.ReLU())
        return torch.nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Return the pair `(encoded, decoded)` for the input batch `x`."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [None]:
# Seed every random number generator in play so the run is repeatable.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
# Trade cuDNN autotuning for deterministic kernels.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Instantiate whichever AutoEncoder class is currently defined and train it
# for 50 epochs with Adam and an MSE reconstruction loss.
ae1 = AutoEncoder().to(device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(ae1.parameters(), lr=1e-5)

train(ae1, train_dataloader, val_dataloader, optimizer, criterion, n_epochs=50)

In [None]:
# Persist the entire trained module object (not just its state_dict) to Drive.
# Because the whole object is pickled, loading it back requires the AutoEncoder
# class to be defined in the loading session.
torch.save(ae1, '/content/drive/MyDrive/공모전/models/ae1.pt')

#### Evaluate

In [None]:
def evaluate(model, test_dataloader, optimizer, criterion):
    """Print the average reconstruction loss of `model` over `test_dataloader`.

    Batches are moved to the module-level `device` and evaluated without
    gradient tracking; `model` must return an `(encoded, decoded)` pair.

    Args:
        model: Module called as `model(batch)`.
        test_dataloader: Iterable of evaluation batches (tensors).
        optimizer: Unused; kept so existing call sites keep working.
        criterion: Loss called as `criterion(decoded, batch)`.
    """
    model.eval()
    # A mean-reduced criterion already averages within the batch, so its value
    # must be weighted by the batch size before dividing by the sample count;
    # otherwise the reported loss is too small by roughly the batch size.
    mean_reduced = getattr(criterion, 'reduction', 'mean') == 'mean'

    eval_loss = 0
    num_eval_samples = 0
    with torch.no_grad():
        for data in test_dataloader:
            data = data.to(device)
            _, decoded = model(data)
            loss = criterion(decoded, data)

            batch_n = data.size(0)
            eval_loss += loss.item() * (batch_n if mean_reduced else 1)
            num_eval_samples += batch_n

    # An empty loader yields nan instead of a ZeroDivisionError.
    test_avg = eval_loss / num_eval_samples if num_eval_samples else float('nan')
    print(f'Test Loss: {test_avg:.10f}')

In [None]:
# Report the reconstruction loss of ae1 on the held-out test batches.
# evaluate() takes an optimizer argument, so one is created here to pass in.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(ae1.parameters(), lr=1e-5)

evaluate(model=ae1, test_dataloader=test_dataloader, optimizer=optimizer, criterion=criterion)

### Feature Extraction

In [None]:
# Reload the fully pickled module saved earlier.
# - weights_only=False: the file holds a whole nn.Module, which the safer
#   weights-only loading mode (the default in recent PyTorch releases) rejects.
# - map_location=device: lets a checkpoint saved on GPU load on a CPU-only runtime.
# SECURITY: this unpickles arbitrary objects — only load checkpoints you trust.
ae1 = torch.load('/content/drive/MyDrive/공모전/models/ae1.pt', map_location=device, weights_only=False)

In [None]:
def encoding(model, data):
    """Encode `data` with `model` and return the per-batch encoded tensors.

    Batches of 32 rows are moved to the module-level `device` and passed through
    `model` without gradient tracking; the first element of the model's output
    pair is collected.

    Args:
        model: Module returning an `(encoded, decoded)` pair.
        data: Indexable dataset (e.g. a 2-D tensor) of input rows.

    Returns:
        List of encoded batches, in the same row order as `data`, so the
        concatenated result lines up row-for-row with the source data.
    """
    model.eval()
    # Must be sequential: a random sampler would shuffle the rows and break the
    # correspondence between encoded features and their labels.
    dataloader = DataLoader(data, sampler = SequentialSampler(data), batch_size = 32)
    result = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            encoded, _ = model(batch.to(device))
            result.append(encoded)
    return result

In [None]:
# Encode the full matrix with ae1, stitch the batches together and bring the
# result back to host memory as a NumPy array.
X_ae = torch.cat(encoding(ae1, X)).cpu().numpy()

### Modeling

#### Only AE

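dim: 256 (note: as the cells are written, `X_ae` comes from `ae1`, whose encoder output is 512-dimensional; re-encode with `ae` to obtain 256-dimensional features for this section)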

In [None]:
# 70/30 split of the encoded features together with the three label columns.
# Rows are paired by position, so X_ae must be in the same row order as file0.
X_train, X_test, y_train, y_test = train_test_split(
    X_ae,
    file0[['digit_1','digit_2','digit_3']],
    test_size=.3,
    random_state=0,
)

# One target Series per label level.
y_train1, y_train2, y_train3 = (y_train[col] for col in ('digit_1', 'digit_2', 'digit_3'))
y_test1, y_test2, y_test3 = (y_test[col] for col in ('digit_1', 'digit_2', 'digit_3'))

In [None]:
# digit_1 classifier trained on the current X_train features.
model1 = CatBoostClassifier(random_state=0, task_type = "GPU")
model1.fit(X_train, y_train1)

# Predict once and reuse the result for both metrics (avoids a second inference pass).
pred1 = model1.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test1, pred1)}')
print(f'F1 Score: {f1_score(y_test1, pred1, average="macro")}')

# Accuracy: 0.24025
# F1 Score: 0.027918637193084447

In [None]:
# digit_2 classifier trained on the current X_train features.
model2 = CatBoostClassifier(random_state=0, task_type = "GPU")
model2.fit(X_train, y_train2)

# Predict once and reuse the result for both metrics (avoids a second inference pass).
pred2 = model2.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test2, pred2)}')
print(f'F1 Score: {f1_score(y_test2, pred2, average="macro")}')

# Accuracy: 0.16762
# F1 Score: 0.007288170278493933

In [None]:
# digit_3 classifier trained on the current X_train features.
model3 = CatBoostClassifier(random_state=0, task_type = "GPU")
model3.fit(X_train, y_train3)

# Predict once and reuse the result for both metrics (avoids a second inference pass).
pred3 = model3.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test3, pred3)}')
print(f'F1 Score: {f1_score(y_test3, pred3, average="macro")}')

dim: 512

In [None]:
# digit_1 classifier trained on the current X_train features.
model1 = CatBoostClassifier(random_state=0, task_type = "GPU")
model1.fit(X_train, y_train1)

# Predict once and reuse the result for both metrics (avoids a second inference pass).
pred1 = model1.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test1, pred1)}')
print(f'F1 Score: {f1_score(y_test1, pred1, average="macro")}')

# Accuracy: 0.24025
# F1 Score: 0.027918637193084447

In [None]:
# digit_2 classifier trained on the current X_train features.
model2 = CatBoostClassifier(random_state=0, task_type = "GPU")
model2.fit(X_train, y_train2)

# Predict once and reuse the result for both metrics (avoids a second inference pass).
pred2 = model2.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test2, pred2)}')
print(f'F1 Score: {f1_score(y_test2, pred2, average="macro")}')

# Accuracy: 0.16762
# F1 Score: 0.007288170278493933

In [None]:
# digit_3 classifier trained on the current X_train features.
model3 = CatBoostClassifier(random_state=0, task_type = "GPU")
model3.fit(X_train, y_train3)

# Predict once and reuse the result for both metrics (avoids a second inference pass).
pred3 = model3.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test3, pred3)}')
print(f'F1 Score: {f1_score(y_test3, pred3, average="macro")}')

### FastText + AE

In [None]:
# Load the saved FastText feature table and append the autoencoder features
# column-wise. Note this rebinds `X`, which earlier held the TF-IDF tensor.
# NOTE(review): if X_ft.csv was written with its index, read_csv will load that
# index as an extra feature column — confirm how the file was saved.
X_ft = pd.read_csv('/content/drive/MyDrive/공모전/data/X_ft.csv')
X = np.concatenate((X_ft.to_numpy(), X_ae), axis=1)

In [None]:
# 70/30 split of the combined features together with the three label columns.
# Rows are paired by position, so X must be in the same row order as file0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    file0[['digit_1','digit_2','digit_3']],
    test_size=.3,
    random_state=0,
)

# One target Series per label level.
y_train1, y_train2, y_train3 = (y_train[col] for col in ('digit_1', 'digit_2', 'digit_3'))
y_test1, y_test2, y_test3 = (y_test[col] for col in ('digit_1', 'digit_2', 'digit_3'))

In [None]:
# digit_1 classifier trained on the current X_train features.
model1 = CatBoostClassifier(random_state=0, task_type = "GPU")
model1.fit(X_train, y_train1)

# Predict once and reuse the result for both metrics (avoids a second inference pass).
pred1 = model1.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test1, pred1)}')
print(f'F1 Score: {f1_score(y_test1, pred1, average="macro")}')

# Accuracy: 0.95034
# F1 Score: 0.8689909445586299

In [None]:
# digit_2 classifier trained on the current X_train features.
model2 = CatBoostClassifier(random_state=0, task_type = "GPU")
model2.fit(X_train, y_train2)

# Predict once and reuse the result for both metrics (avoids a second inference pass).
pred2 = model2.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test2, pred2)}')
print(f'F1 Score: {f1_score(y_test2, pred2, average="macro")}')

# Accuracy: 0.8792533333333333
# F1 Score: 0.6304174031443878

In [None]:
# digit_3 classifier trained on the current X_train features.
model3 = CatBoostClassifier(random_state=0, task_type = "GPU")
model3.fit(X_train, y_train3)

# Predict once and reuse the result for both metrics (avoids a second inference pass).
pred3 = model3.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test3, pred3)}')
print(f'F1 Score: {f1_score(y_test3, pred3, average="macro")}')