In [None]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch

!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
plt.rc('font', family='malgun gothic')
plt.rc('axes', unicode_minus=False)
import seaborn as sns
import os
import re
import missingno as msno
import pickle
from glob import glob
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from sklearn.model_selection import train_test_split

#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
# Pick the compute device once: the GPU when PyTorch can see one, else the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Report which GPU was selected.
    print(torch.cuda.get_device_name(0))

In [None]:
# Load the BERT model and its vocabulary
bertmodel, vocab = get_pytorch_kobert_model()

In [None]:
# glob returns paths in arbitrary order; sort them so that file0, file1, ...
# are bound to the same files on every run and on every machine.
# NOTE(review): later cells expect file0 to be the labelled data (it must have
# digit_1/digit_2/digit_3 columns) and file1 the data to predict on -- confirm
# that this matches the sorted file names in the data directory.
files = sorted(glob('/content/drive/MyDrive/공모전/data/*.txt'))
for i, file in enumerate(files):
    # Each '|'-separated, cp949-encoded text file becomes a global DataFrame
    # named file<i>.
    globals()[f'file{i}'] = pd.read_table(file, sep='|', encoding='cp949')
# Industry classification code table, read without a header row so columns are
# addressed by position (0, 1, 2, ...).
code = pd.read_excel('/content/drive/MyDrive/공모전/data/한국표준산업분류(10차)_국문.xlsx', header = None)

In [None]:
# For each digit level, number the sorted unique labels of file0 (index -> label)
# and derive the reverse lookup (label -> index) by inverting that mapping.
idx2label_digit1 = dict(enumerate(sorted(file0['digit_1'].unique())))
label2idx_digit1 = {label: idx for idx, label in idx2label_digit1.items()}

idx2label_digit2 = dict(enumerate(sorted(file0['digit_2'].unique())))
label2idx_digit2 = {label: idx for idx, label in idx2label_digit2.items()}

idx2label_digit3 = dict(enumerate(sorted(file0['digit_3'].unique())))
label2idx_digit3 = {label: idx for idx, label in idx2label_digit3.items()}

In [None]:
# Replace the raw labels with their integer indices. The columns are
# overwritten in place, so re-running this cell would feed already-encoded
# values back into the label -> index lookups.
file0['digit_1'] = file0['digit_1'].map(label2idx_digit1.__getitem__)
file0['digit_2'] = file0['digit_2'].map(label2idx_digit2.__getitem__)
file0['digit_3'] = file0['digit_3'].map(label2idx_digit3.__getitem__)

In [None]:
# Missing values become empty strings because the text columns are
# concatenated into a single sentence below.
file0 = file0.fillna("")
file1 = file1.fillna("")

# Join the three text fields into one space-separated sentence and trim the
# whitespace left on both ends (e.g. when a field was empty).
file0["sen"] = (file0["text_obj"] + " " + file0["text_mthd"] + " " + file0["text_deal"]).str.strip()
file1["sen"] = (file1["text_obj"] + " " + file1["text_mthd"] + " " + file1["text_deal"]).str.strip()

# Build (code, name) lookup tables for digit_1, digit_2 and digit_3 from the
# classification sheet: rows from index label 3 onward whose code column is
# not null.
digit1_df = (
    code.loc[3:][code[0].notnull()][[0, 1]]
    .reset_index(drop=True)
    .rename(columns={0: "digit_1", 1: "digit_1_text"})
)
digit2_df = (
    code.loc[3:][code[2].notnull()][[2, 3]]
    .reset_index(drop=True)
    .rename(columns={2: "digit_2", 3: "digit_2_text"})
)
digit3_df = (
    code.loc[3:][code[4].notnull()][[4, 5]]
    .reset_index(drop=True)
    .rename(columns={4: "digit_3", 5: "digit_3_text"})
)

# Keep only the part of each digit_1 name that precedes the first "(",
# dropping the trailing symbols/numbers.
digit1_df["digit_1_text"] = digit1_df["digit_1_text"].map(lambda name: name.split("(")[0])

### Preprocessing

In [None]:
# Dataset class that prepares the inputs fed to BERT
class BERTDataset(Dataset):
    """Pre-tokenized dataset of BERT inputs with integer labels.

    Every record of ``dataset`` is transformed once, at construction time,
    with ``nlp.data.BERTSentenceTransform``; the transformed sentences and the
    labels are kept in two parallel lists.

    Args:
        dataset: Iterable of indexable records (it is iterated twice).
        sent_idx: Position of the sentence inside each record.
        label_idx: Position of the label inside each record.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform each sentence (wrapped in a one-element list).
        self.sentences = []
        for record in dataset:
            self.sentences.append(to_bert_input([record[sent_idx]]))

        # Second pass: store each label as a 32-bit integer.
        self.labels = []
        for record in dataset:
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        target = (self.labels[i],)
        return features + target

    def __len__(self):
        """Return the number of samples (one label per sample)."""
        return len(self.labels)

In [None]:
# Pair every sentence with its digit_1 label, then hold out 30% of the pairs
# for validation (fixed random_state so the split is reproducible).
data_digit1 = list(zip(file0['sen'], file0['digit_1']))
train_set, val_set = train_test_split(data_digit1, test_size=0.3, random_state=0)

In [None]:
# Setting parameters
# NOTE(review): warmup_ratio, num_epochs, max_grad_norm, log_interval and
# learning_rate are not referenced in the visible part of this notebook --
# presumably left over from training; confirm before removing.
max_len = 64  # passed to BERTDataset as the maximum sequence length
batch_size = 16  # batch size of the DataLoaders built below
warmup_ratio = 0.1
num_epochs = 2
max_grad_norm = 1
log_interval = 400
learning_rate =  5e-5

In [None]:
# Tokenize and build the datasets
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Positional arguments: sent_idx=0, label_idx=1, tokenizer, max_len, pad=True, pair=False
data_train = BERTDataset(train_set, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(val_set, 0, 1, tok, max_len, True, False)

# Sanity check: show the first encoded training sample
print(data_train[0])

In [None]:
# Batch the encoded datasets; no shuffling or sampler is requested here.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size)
val_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size)

### Model

In [None]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: Encoder module. It is called with the keyword arguments
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must
            return two values; the second one (the pooled output) is fed to
            the classifier.
        hidden_size: Size of the pooled output, i.e. the input width of the
            linear layer.
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output. ``None``
            or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=19,   # adjust to the number of target classes
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        # The dropout layer only exists when a non-zero rate was requested.
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1 on the first ``valid_length[i]`` positions.

        Args:
            token_ids: 2-D tensor; the mask has the same shape and device.
            valid_length: Per-row count of leading positions to mark.

        Returns:
            Float tensor shaped like ``token_ids`` holding 1.0 / 0.0.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier outputs (``num_classes`` values per row).

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row valid lengths used to build the attention mask.
            segment_ids: Segment ids, cast to long and passed as ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Fall back to the raw pooled output when dropout is disabled.
        # Previously `out` was bound only inside the dropout branch, so a model
        # built with dr_rate=None (the default) or 0 crashed with
        # UnboundLocalError on the first forward pass.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
# Load the three saved models (one per digit level) and move them to `device`.
# NOTE: torch.load unpickles arbitrary Python objects -- only load checkpoints
# that come from a trusted source.
# map_location remaps the stored tensors onto `device`, so checkpoints that
# were saved on a GPU can still be loaded when the notebook runs on the CPU.
model1 = torch.load('/content/drive/MyDrive/공모전/models/kobert_digit_1_model.pt', map_location=device).to(device)
model2 = torch.load('/content/drive/MyDrive/공모전/models/kobert_digit_2_model.pt', map_location=device).to(device)
model3 = torch.load('/content/drive/MyDrive/공모전/models/kobert_digit_3_model.pt', map_location=device).to(device)

### Predict

In [None]:
def predict(model, dataloader):
    """Return, per batch, the index of the highest-scoring class.

    The model is switched to eval mode and run without gradient tracking.
    Every batch must hold exactly three tensors; each is cast to long, moved
    to the module-level ``device`` and passed positionally to ``model``.

    Args:
        model: Callable module taking three positional tensors.
        dataloader: Iterable yielding batches of three tensors.

    Returns:
        List with one tensor per batch, each holding the argmax indices along
        dimension 1 with that dimension kept.
    """
    model.eval()
    batch_predictions = []
    with torch.no_grad():
        for batch in dataloader:
            first, second, third = [t.long().to(device) for t in batch]
            scores = model(first, second, third)
            # max() yields (values, indices); only the indices are kept.
            _, top_index = scores.max(1, keepdim=True)
            batch_predictions.append(top_index)
    return batch_predictions

In [None]:
batch_size = 16

# Keep only the transformed sentences of the validation dataset (labels are
# dropped); predict() unpacks every batch into exactly three tensors.
valid_set = val_dataloader.dataset.sentences
# SequentialSampler iterates the samples in their stored order, so the
# predictions line up with val_set.
valid_loader = DataLoader(
            valid_set,
            sampler = SequentialSampler(valid_set),
            batch_size = batch_size
        )

In [None]:
# Concatenate the per-batch (batch, 1) index tensors and drop only the trailing
# singleton dimension. A bare .squeeze() would also collapse the sample
# dimension when there is exactly one sample, yielding a 0-d tensor.
preds_digit1_val = torch.cat(predict(model1, valid_loader), dim=0).squeeze(1)
preds_digit2_val = torch.cat(predict(model2, valid_loader), dim=0).squeeze(1)
preds_digit3_val = torch.cat(predict(model3, valid_loader), dim=0).squeeze(1)

In [None]:
# Collect the predicted class indices of the three models, one column per digit level.
val_preds = pd.DataFrame({'digit_1':preds_digit1_val.tolist(), 'digit_2':preds_digit2_val.tolist(), 'digit_3':preds_digit3_val.tolist()})
# NOTE(review): the target path has no file extension and to_csv does not add
# one -- confirm whether '.csv' was intended before anything reads this file.
val_preds.to_csv('/content/drive/MyDrive/공모전/submissions/val_preds_kobert0409', index=False)

In [None]:
def predict_proba(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect its raw per-batch outputs.

    Despite the name, nothing here normalizes the outputs: each list element
    is exactly what ``model(...)`` returned for that batch. The model is put
    in eval mode and run without gradient tracking. Every batch must hold
    exactly three tensors; each is cast to long and moved to the module-level
    ``device`` before being passed positionally to ``model``.

    Args:
        model: Callable module taking three positional tensors.
        dataloader: Iterable yielding batches of three tensors.

    Returns:
        List with one model-output tensor per batch.
    """
    model.eval()
    batch_outputs = []
    with torch.no_grad():
        for batch in dataloader:
            first, second, third = [t.long().to(device) for t in batch]
            batch_outputs.append(model(first, second, third))
    return batch_outputs

In [None]:
batch_size = 16

# NOTE: this rebinds train_set -- previously the list of (sentence, label)
# pairs from train_test_split -- to the transformed sentences of the training
# dataset (labels dropped).
train_set = train_dataloader.dataset.sentences
# SequentialSampler iterates the samples in their stored order.
train_loader = DataLoader(
            train_set,
            sampler = SequentialSampler(train_set),
            batch_size = batch_size
        )

In [None]:
# Stack the per-batch model outputs for the training sentences into one NumPy
# array per model. predict_proba returns the model outputs as-is (it applies
# no softmax itself).
proba_digit1_train = torch.cat(predict_proba(model1, train_loader),axis=0).cpu().numpy()
proba_digit2_train = torch.cat(predict_proba(model2, train_loader),axis=0).cpu().numpy()
proba_digit3_train = torch.cat(predict_proba(model3, train_loader),axis=0).cpu().numpy()

In [None]:
# NOTE(review): this cell repeats, statement for statement, the earlier cell
# that already built valid_set / valid_loader; one of the two is sufficient.
batch_size = 16

# Transformed validation sentences only (labels dropped).
valid_set = val_dataloader.dataset.sentences
# SequentialSampler iterates the samples in their stored order.
valid_loader = DataLoader(
            valid_set,
            sampler = SequentialSampler(valid_set),
            batch_size = batch_size
        )

In [None]:
# Stack the per-batch model outputs for the validation sentences into one
# NumPy array per model. predict_proba returns the model outputs as-is (it
# applies no softmax itself).
proba_digit1_valid = torch.cat(predict_proba(model1, valid_loader),axis=0).cpu().numpy()
proba_digit2_valid = torch.cat(predict_proba(model2, valid_loader),axis=0).cpu().numpy()
proba_digit3_valid = torch.cat(predict_proba(model3, valid_loader),axis=0).cpu().numpy()

In [None]:
# Persist the training-set model outputs, one file per digit level.
# np.save appends '.npy' to a file name that does not already end with it.
np.save('/content/drive/MyDrive/공모전/submissions/train_proba1_kobert0409', proba_digit1_train)
np.save('/content/drive/MyDrive/공모전/submissions/train_proba2_kobert0409', proba_digit2_train)
np.save('/content/drive/MyDrive/공모전/submissions/train_proba3_kobert0409', proba_digit3_train)

In [None]:
# Persist the validation-set model outputs, one file per digit level.
# np.save appends '.npy' to a file name that does not already end with it.
# The digit_2 file name used to read 'kober0t0409' (stray '0'); it now follows
# the same '<split>_proba<k>_kobert0409' pattern as every other output file.
np.save('/content/drive/MyDrive/공모전/submissions/val_proba1_kobert0409', proba_digit1_valid)
np.save('/content/drive/MyDrive/공모전/submissions/val_proba2_kobert0409', proba_digit2_valid)
np.save('/content/drive/MyDrive/공모전/submissions/val_proba3_kobert0409', proba_digit3_valid)

### Submission

In [None]:
# Wrap every sentence of file1 with a placeholder label ('0') so the records
# have the [sentence, label] layout that BERTDataset indexes into.
test_list = [[q, '0'] for q in file1['sen']]

# Tokenize and build the dataset / loader
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

sub_set = BERTDataset(test_list, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                      max_len=max_len, pad=True, pair=False)
sub_dataloader = DataLoader(sub_set, batch_size=batch_size, num_workers=5)

In [None]:
batch_size = 16

# NOTE: rebinds sub_set from the BERTDataset to its list of transformed
# sentences (the placeholder labels are dropped).
sub_set = sub_dataloader.dataset.sentences

# Replace the previous sub_dataloader with one that iterates those sentences
# in their stored order.
sub_dataloader = DataLoader(
            sub_set,
            sampler = SequentialSampler(sub_set),
            batch_size = batch_size
        )

In [None]:
# NOTE(review): predict_proba is already defined earlier in this notebook with
# the same behaviour; this cell re-defines it and could be dropped.
def predict_proba(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect its raw per-batch outputs.

    Despite the name, nothing here normalizes the outputs: each list element
    is exactly what ``model(...)`` returned for that batch. The model is put
    in eval mode and run without gradient tracking. Every batch must hold
    exactly three tensors; each is cast to long and moved to the module-level
    ``device`` before being passed positionally to ``model``.

    Args:
        model: Callable module taking three positional tensors.
        dataloader: Iterable yielding batches of three tensors.

    Returns:
        List with one model-output tensor per batch.
    """
    model.eval()
    batch_outputs = []
    with torch.no_grad():
        for batch in dataloader:
            first, second, third = [t.long().to(device) for t in batch]
            batch_outputs.append(model(first, second, third))
    return batch_outputs

In [None]:
# Stack the per-batch model outputs for the submission sentences into one
# NumPy array per model. predict_proba returns the model outputs as-is (it
# applies no softmax itself).
proba_digit1_sub = torch.cat(predict_proba(model1, sub_dataloader),axis=0).cpu().numpy()
proba_digit2_sub = torch.cat(predict_proba(model2, sub_dataloader),axis=0).cpu().numpy()
proba_digit3_sub = torch.cat(predict_proba(model3, sub_dataloader),axis=0).cpu().numpy()

In [None]:
# Persist the submission-set model outputs, one file per digit level.
# np.save appends '.npy' to a file name that does not already end with it.
np.save('/content/drive/MyDrive/공모전/submissions/sub_proba1_kobert0409', proba_digit1_sub)
np.save('/content/drive/MyDrive/공모전/submissions/sub_proba2_kobert0409', proba_digit2_sub)
np.save('/content/drive/MyDrive/공모전/submissions/sub_proba3_kobert0409', proba_digit3_sub)