In [None]:
!pip install --upgrade gensim

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import missingno as msno
from tqdm import tqdm, trange
import pickle
import random
import time

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.fasttext import FastText

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score

import torch

In [None]:
# Read every pipe-delimited, cp949-encoded .txt file in the data folder and
# bind each frame to a notebook global named file0, file1, ... in glob order.
# NOTE(review): glob() does not promise a stable ordering, so which file ends
# up as file0 vs. file1 may differ between environments — confirm.
files = glob('/content/drive/MyDrive/공모전/data/*.txt')
for i, file in enumerate(files):
    # Later cells refer to the generated names directly (file0, file1).
    globals()[f'file{i}'] = pd.read_table(file, sep='|', encoding='cp949')

In [None]:
# Index <-> label lookup tables for the three label columns of file0.
# Indices follow the sorted order of the distinct labels; each reverse map is
# the exact inversion of its forward map.
idx2label_digit1 = {idx: label for idx, label in enumerate(sorted(file0['digit_1'].unique()))}
label2idx_digit1 = {label: idx for idx, label in idx2label_digit1.items()}
idx2label_digit2 = {idx: label for idx, label in enumerate(sorted(file0['digit_2'].unique()))}
label2idx_digit2 = {label: idx for idx, label in idx2label_digit2.items()}
idx2label_digit3 = {idx: label for idx, label in enumerate(sorted(file0['digit_3'].unique()))}
label2idx_digit3 = {label: idx for idx, label in idx2label_digit3.items()}

In [None]:
# Replace the raw labels in file0 with their integer indices, in place.
# An unknown label raises KeyError (plain dict lookup per element).
# NOTE(review): the columns are overwritten, so this cell is not meant to be
# run twice on the same frame.
file0['digit_1'] = file0['digit_1'].map(label2idx_digit1.__getitem__)
file0['digit_2'] = file0['digit_2'].map(label2idx_digit2.__getitem__)
file0['digit_3'] = file0['digit_3'].map(label2idx_digit3.__getitem__)

In [None]:
# Load the precomputed feature matrix and the label table from CSV.
# NOTE(review): the "_ft" suffix suggests FastText-derived features — confirm
# against whatever produced X_ft.csv.
X = pd.read_csv('/content/drive/MyDrive/공모전/data/X_ft.csv')
y = pd.read_csv('/content/drive/MyDrive/공모전/data/y.csv')

In [None]:
# Encode the digit_2 / digit_3 labels of y as integer indices (KeyError on an
# unknown label), then pull the first three columns out as separate targets.
# NOTE(review): the first column (y1) is not remapped here — presumably it is
# already index-encoded in y.csv; confirm.
y['digit_2'] = y['digit_2'].map(label2idx_digit2.__getitem__)
y['digit_3'] = y['digit_3'].map(label2idx_digit3.__getitem__)

y1, y2, y3 = (y.iloc[:, position] for position in range(3))

In [None]:
# Convert the feature frame to a float tensor and each target series to a
# tensor. The names are rebound from pandas objects to tensors.
X = torch.FloatTensor(X.values)
y1, y2, y3 = (torch.tensor(labels.values) for labels in (y1, y2, y3))

In [None]:
from torch.utils.data import TensorDataset

# Pair the shared feature tensor X with each of the three label tensors.
dataset1 = TensorDataset(X, y1)
dataset2 = TensorDataset(X, y2)
dataset3 = TensorDataset(X, y3)

# Hold out 30% of the samples as the test split. All three calls use the same
# test_size and random_state.
# NOTE(review): with identical arguments the three splits should pick the same
# rows — confirm.
train_dataset1, test_dataset1 = train_test_split(dataset1, test_size=0.3, random_state=0)
train_dataset2, test_dataset2 = train_test_split(dataset2, test_size=0.3, random_state=0)
train_dataset3, test_dataset3 = train_test_split(dataset3, test_size=0.3, random_state=0)

# Split the remaining training portion again: 30% of it becomes the validation
# split and train_dataset* is rebound to what is left.
train_dataset1, val_dataset1 = train_test_split(train_dataset1, test_size=0.3, random_state=0)
train_dataset2, val_dataset2 = train_test_split(train_dataset2, test_size=0.3, random_state=0)
train_dataset3, val_dataset3 = train_test_split(train_dataset3, test_size=0.3, random_state=0)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Mini-batch size shared by every loader built in this cell.
batch_size = 32

# Training loaders: samples are drawn in random order.
train_dataloader1 = DataLoader(train_dataset1, sampler=RandomSampler(train_dataset1), batch_size=batch_size)
train_dataloader2 = DataLoader(train_dataset2, sampler=RandomSampler(train_dataset2), batch_size=batch_size)
train_dataloader3 = DataLoader(train_dataset3, sampler=RandomSampler(train_dataset3), batch_size=batch_size)

# Validation loaders: order is irrelevant for evaluation, so read sequentially.
validation_dataloader1 = DataLoader(val_dataset1, sampler=SequentialSampler(val_dataset1), batch_size=batch_size)
validation_dataloader2 = DataLoader(val_dataset2, sampler=SequentialSampler(val_dataset2), batch_size=batch_size)
validation_dataloader3 = DataLoader(val_dataset3, sampler=SequentialSampler(val_dataset3), batch_size=batch_size)

# Test loaders: read the held-out test split sequentially.
# The sampler must be built from the test split itself. Building it from the
# (smaller) validation split would only ever visit the first len(val_dataset)
# test samples, so part of the test set would silently be skipped.
test_dataloader1 = DataLoader(
            test_dataset1, # The test samples.
            sampler = SequentialSampler(test_dataset1), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )
test_dataloader2 = DataLoader(
            test_dataset2, # The test samples.
            sampler = SequentialSampler(test_dataset2), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )
test_dataloader3 = DataLoader(
            test_dataset3, # The test samples.
            sampler = SequentialSampler(test_dataset3), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

### Modeling

In [None]:
# Run on the GPU when one is available (and report which one), else on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

In [None]:
class Net(torch.nn.Module):
    """Fully connected classifier: four hidden layers, one skip connection.

    Every hidden layer is Linear -> LayerNorm -> ReLU -> dropout. The output
    of the first hidden layer is added to the output of the third one before
    the fourth layer, so ``node1`` and ``node3`` have to be addable (equal in
    practice). The final layer returns raw scores; no softmax is applied.

    Args:
        n_cols: number of input features.
        n_target: number of output classes.
        node1, node2, node3, node4: widths of the four hidden layers.
        dropout_prob: dropout probability used after every hidden layer.
    """

    def __init__(self, n_cols, n_target, node1, node2, node3, node4, dropout_prob):
        super().__init__()
        nn = torch.nn
        # Linear layers are registered first, then the norms; attribute names
        # are part of the saved-model layout and must not change.
        self.fc1 = nn.Linear(n_cols, node1)
        self.fc2 = nn.Linear(node1, node2)
        self.fc3 = nn.Linear(node2, node3)
        self.fc4 = nn.Linear(node3, node4)
        self.fc5 = nn.Linear(node4, n_target)
        self.dropout_prob = dropout_prob
        self.layer_norm1 = nn.LayerNorm(node1)
        self.layer_norm2 = nn.LayerNorm(node2)
        self.layer_norm3 = nn.LayerNorm(node3)
        self.layer_norm4 = nn.LayerNorm(node4)

    def _hidden(self, inputs, linear, norm):
        """Apply one hidden layer: Linear -> LayerNorm -> ReLU -> dropout."""
        activated = torch.nn.functional.relu(norm(linear(inputs)))
        # Dropout is only active while the module is in training mode.
        return torch.nn.functional.dropout(activated, training=self.training, p=self.dropout_prob)

    def forward(self, x):
        skip = self._hidden(x, self.fc1, self.layer_norm1)
        hidden = self._hidden(skip, self.fc2, self.layer_norm2)
        hidden = self._hidden(hidden, self.fc3, self.layer_norm3)
        # Residual connection from the first hidden layer.
        hidden = hidden + skip
        hidden = self._hidden(hidden, self.fc4, self.layer_norm4)
        return self.fc5(hidden)

In [None]:
def weight_init(m):
    """Re-initialise the weight of a Linear module with Kaiming-uniform values.

    Intended for ``model.apply(weight_init)``: any module that is not a
    ``torch.nn.Linear`` is left untouched, and biases are not modified.
    """
    if not isinstance(m, torch.nn.Linear):
        return
    torch.nn.init.kaiming_uniform_(m.weight.data)

In [None]:
def train(model, train_dataset, val_dataset, num_epochs, optimizer, criterion, scheduler, log_interval=40, start_epoch=0, train_loss_set=None, valid_loss_set=None):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Relies on the notebook-level ``device`` and ``format_time`` being defined.

    Args:
        model: module to optimise; it is switched between train and eval mode.
        train_dataset: iterable yielding ``(X, y)`` training batches.
        val_dataset: iterable yielding ``(X, y)`` validation batches.
        num_epochs: number of epochs to run.
        optimizer: optimizer stepped once per training batch.
        criterion: loss called as ``criterion(output, y)``.
        scheduler: stepped once per epoch with the validation loss
            (``scheduler.step(loss)``).
        log_interval: print a progress line every this many batches
            (batch 0 is never logged).
        start_epoch: offset used for the epoch numbers that are printed.
        train_loss_set: optional list to append per-epoch training losses to;
            a new list is created when omitted.
        valid_loss_set: optional list to append per-epoch validation losses
            to; a new list is created when omitted.

    Returns:
        ``(model, train_loss_set, valid_loss_set)``.

    Raises:
        ZeroDivisionError: if either loader yields no samples.
    """
    # Create the history lists per call. A mutable default ([]) is evaluated
    # once at definition time and would be shared by every call that omits
    # the argument, so losses from earlier runs would leak into later ones.
    if train_loss_set is None:
        train_loss_set = []
    if valid_loss_set is None:
        valid_loss_set = []

    for epoch in range(start_epoch, start_epoch+num_epochs):

        t0 = time.time()
        total_train_loss = 0
        num_train_samples = 0

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch+1, start_epoch+num_epochs))
        print('Training...')

        for batch_idx, (X, y) in enumerate(train_dataset):
            X = X.to(device)
            y = y.to(device)
            # Validation below switches to eval mode, so re-enable training
            # behaviour (dropout) for every batch.
            model.train()
            optimizer.zero_grad()
            output_train = model(X)
            train_loss = criterion(output_train, y)
            train_loss.backward()
            optimizer.step()

            total_train_loss += train_loss.item()
            num_train_samples += y.size(0)

            # Accuracy of the current batch only (used for the progress line).
            prediction = output_train.max(1, keepdim=True)[1]
            total_train_accuracy = prediction.eq(y.view_as(prediction)).sum().item()
            avg_train_accuracy = total_train_accuracy / len(X)

            if batch_idx % log_interval == 0 and batch_idx != 0:
                train_time = format_time(time.time() - t0)

                print(f'Train Epoch: {epoch+1}\tTrain Loss: {train_loss},\tTrain Accuracy: {avg_train_accuracy},\tTime: {train_time}')

        # Sum of the per-batch criterion values divided by the sample count.
        # NOTE(review): with a mean-reducing criterion this is the mean loss
        # scaled down by roughly the batch size — confirm that is intended.
        epoch_train_loss = total_train_loss/num_train_samples
        train_loss_set.append(epoch_train_loss)

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        # Tracking variables
        eval_loss = 0
        num_eval_samples = 0
        total_eval_accuracy = 0

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Evaluate data for one epoch
        for batch in val_dataset:
            # Move the batch to the target device and unpack it
            batch = tuple(t.to(device) for t in batch)
            X, y = batch
            # No gradients are needed for validation
            with torch.no_grad():
                pred = model(X)
                loss = criterion(pred, y)
                eval_loss += loss.item()
                num_eval_samples += y.size(0)
                prediction = pred.max(1, keepdim=True)[1]
                total_eval_accuracy += prediction.eq(y.view_as(prediction)).sum().item()

        epoch_eval_loss = eval_loss/num_eval_samples
        valid_loss_set.append(epoch_eval_loss)

        validation_time = format_time(time.time() - t0)

        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / num_eval_samples
        print(f'Epoch: {epoch+1},\t Validation Loss: {epoch_eval_loss},\t Validation Accuracy: {avg_val_accuracy},\t Time: {validation_time}')
        print('')
        scheduler.step(epoch_eval_loss)

    return model, train_loss_set, valid_loss_set

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first; durations of a
    day or more are prefixed the way ``datetime.timedelta`` prints them
    (for example ``'1 day, 1:01:01'``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def evaluate(model, test_dataset, criterion):
    """Compute loss and accuracy of ``model`` over a loader of ``(X, y)`` batches.

    Relies on the notebook-level ``device``. The model is left in eval mode.

    Args:
        model: module to evaluate.
        test_dataset: iterable yielding ``(X, y)`` batches.
        criterion: loss called as ``criterion(pred, y)``.

    Returns:
        ``(loss, accuracy)``: the sum of the per-batch criterion values
        divided by the number of samples, and the fraction of samples whose
        highest-scoring class equals the target.

    Raises:
        ZeroDivisionError: if the loader yields no samples.
    """
    model.eval()

    eval_loss = 0
    num_eval_samples = 0
    total_eval_accuracy = 0

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for batch in test_dataset:
            # Move both tensors of the batch to the target device.
            X, y = (t.to(device) for t in batch)
            pred = model(X)
            eval_loss += criterion(pred, y).item()
            num_eval_samples += y.size(0)
            # Index of the highest score per row, kept as a column vector.
            prediction = pred.max(1, keepdim=True)[1]
            total_eval_accuracy += prediction.eq(y.view_as(prediction)).sum().item()

    # NOTE(review): with a mean-reducing criterion this is the mean loss
    # scaled down by roughly the batch size — confirm that is intended.
    epoch_eval_loss = eval_loss/num_eval_samples
    avg_val_accuracy = total_eval_accuracy / num_eval_samples

    return epoch_eval_loss, avg_val_accuracy

#### digit_1

In [None]:
# Seed every RNG *before* the model is created, so that the weight
# initialisation (not only the training loop) is reproducible across runs.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
random.seed(0)

# digit_1 classifier: one output per distinct digit_1 label in file0.
model1 = Net(n_cols=X.shape[1], n_target=file0.digit_1.nunique(), node1=1024, node2=2048, node3=1024, node4=512, dropout_prob=0.3).to(device)
model1.apply(weight_init)
optimizer1 = torch.optim.Adam(model1.parameters(), lr=1e-4)
criterion1 = torch.nn.CrossEntropyLoss()
# Lower the learning rate when the validation loss stops decreasing.
scheduler1 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer1, 'min')

model1, train_loss_set1, valid_loss_set1 = train(model1, train_dataloader1, validation_dataloader1, num_epochs=100, optimizer=optimizer1, criterion=criterion1, scheduler=scheduler1, log_interval=1000, train_loss_set=[], valid_loss_set=[])

In [None]:
# Learning curves of the digit_1 model: per-epoch training vs. validation
# loss, plotted against the epoch position in each list.
plt.plot(train_loss_set1, label='train_loss_set1')
plt.plot(valid_loss_set1, label='valid_loss_set1')
plt.legend()
plt.show()

In [None]:
# Loss and accuracy of the digit_1 model on the test loader.
evaluate(model1, test_dataloader1, criterion1)

# Output recorded from an earlier run, as (loss, accuracy):
# (0.005578965617064871, 0.954352380952381)

#### digit_2

In [None]:
# Seed every RNG *before* the model is created, so that the weight
# initialisation (not only the training loop) is reproducible across runs.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
random.seed(0)

# digit_2 classifier: one output per distinct digit_2 label in file0.
model2 = Net(n_cols=X.shape[1], n_target=file0.digit_2.nunique(), node1=1024, node2=2048, node3=1024, node4=512, dropout_prob=0.3).to(device)
model2.apply(weight_init)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=1e-4)
criterion2 = torch.nn.CrossEntropyLoss()
# Lower the learning rate when the validation loss stops decreasing.
scheduler2 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer2, 'min')

model2, train_loss_set2, valid_loss_set2 = train(model2, train_dataloader2, validation_dataloader2, num_epochs=100, optimizer=optimizer2, criterion=criterion2, scheduler=scheduler2, log_interval=1000, train_loss_set=[], valid_loss_set=[])

In [None]:
# Learning curves of the digit_2 model. The legend labels name the series
# that is actually plotted (they previously said "..._set1").
plt.plot(range(len(train_loss_set2)), train_loss_set2, label='train_loss_set2')
plt.plot(range(len(valid_loss_set2)), valid_loss_set2, label='valid_loss_set2')
plt.legend()
plt.show()

In [None]:
# Loss and accuracy of the digit_2 model on the test loader.
evaluate(model2, test_dataloader2, criterion2)

# Output recorded from an earlier run, as (loss, accuracy):
# (0.011537351459470976, 0.9050714285714285)

#### digit_3

In [None]:
# with skip-connection
# Seed every RNG *before* the model is created, so that the weight
# initialisation (not only the training loop) is reproducible across runs.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
random.seed(0)

# digit_3 classifier: one output per distinct digit_3 label in file0.
model3 = Net(n_cols=X.shape[1], n_target=file0.digit_3.nunique(), node1=1024, node2=2048, node3=1024, node4=512, dropout_prob=0.3).to(device)
model3.apply(weight_init)
optimizer3 = torch.optim.Adam(model3.parameters(), lr=1e-4)
criterion3 = torch.nn.CrossEntropyLoss()
# Lower the learning rate when the validation loss stops decreasing.
scheduler3 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer3, 'min')

model3, train_loss_set3, valid_loss_set3 = train(model3, train_dataloader3, validation_dataloader3, num_epochs=100, optimizer=optimizer3, criterion=criterion3, scheduler=scheduler3, log_interval=1000, train_loss_set=[], valid_loss_set=[])

In [None]:
# Learning curves of the digit_3 model. The legend labels name the series
# that is actually plotted (they previously said "..._set1").
plt.plot(range(len(train_loss_set3)), train_loss_set3, label='train_loss_set3')
plt.plot(range(len(valid_loss_set3)), valid_loss_set3, label='valid_loss_set3')
plt.legend()
plt.show()

In [None]:
# Loss and accuracy of the digit_3 model on the test loader.
evaluate(model3, test_dataloader3, criterion3)

# Output recorded from an earlier run, as (loss, accuracy):
# (0.0178538942031651, 0.8574809523809523)

In [None]:
# Persist each trained model as a whole module object (not just a state_dict).
# NOTE(review): loading these files back presumably requires the Net class to
# be defined in the loading session — confirm.
torch.save(model1, '/content/drive/MyDrive/공모전/models/dnn_ft1.pt')
torch.save(model2, '/content/drive/MyDrive/공모전/models/dnn_ft2.pt')
torch.save(model3, '/content/drive/MyDrive/공모전/models/dnn_ft3.pt')

### Predict

In [None]:
# Load the three saved FastText models, one per text column.
# collect_embs() finds them by name: column 'text_<x>' -> global 'ft_<x>'.
ft_obj = FastText.load('/content/drive/MyDrive/공모전/models/ft_obj.gensim')
ft_mthd = FastText.load('/content/drive/MyDrive/공모전/models/ft_mthd.gensim')
ft_deal = FastText.load('/content/drive/MyDrive/공모전/models/ft_deal.gensim')

In [None]:
def groupby_digit(data, target_col, text_col):
    """Group the values of ``text_col`` by the label found in ``target_col``.

    Args:
        data: DataFrame containing both columns.
        target_col: name of the label column to group by.
        text_col: name of the column whose values are collected.

    Returns:
        A list with one inner list per distinct label, in the order returned
        by ``data[target_col].unique()``; each inner list holds that label's
        ``text_col`` values in row order.

    Raises:
        IndexError: if either column name is not present in ``data``.
    """
    # Position of the first column with a matching name.
    target_idx = np.argwhere([name == target_col for name in data.columns])[0][0]
    text_idx = np.argwhere([name == text_col for name in data.columns])[0][0]

    labels = data[target_col].unique()
    # Buckets live in a local dict keyed by the label's string form. Storing
    # them as generated names in globals() polluted the notebook namespace
    # and could overwrite unrelated variables.
    buckets = {f'{label}': [] for label in labels}
    for row in data.values:
        buckets[f'{row[target_idx]}'].append(row[text_idx])
    return [buckets[f'{label}'] for label in labels]

In [None]:
def combine_texts(lst):
    """Join the strings in ``lst`` with single spaces and trim both ends."""
    joined = ' '.join(lst)
    return joined.strip()

In [None]:
# Group each text column of file1 by its digit_3 label; missing values are
# replaced by '<unk>' before grouping.
res_obj3, res_mthd3, res_deal3 = (
    groupby_digit(file1.fillna('<unk>'), 'digit_3', text_col)
    for text_col in ('text_obj', 'text_mthd', 'text_deal')
)

In [None]:
def collect_embs(data, col, model=None):
    """Look up one embedding per row of ``data[col]``.

    Args:
        data: DataFrame holding the text column; missing values are replaced
            by the empty string before the lookup.
        col: column name. When ``model`` is omitted it must look like
            ``'text_<name>'`` because the model is taken from the notebook
            global ``ft_<name>`` (the first five characters are dropped).
        model: optional object exposing ``.wv[word]``; pass it explicitly to
            avoid the lookup by global name.

    Returns:
        A list with ``model.wv[value]`` for every row, in row order.

    Raises:
        KeyError: if ``model`` is omitted and no matching global exists.
    """
    if model is None:
        # Backward-compatible default: 'text_obj' -> ft_obj, and so on.
        model = globals()[f'ft_{col[5:]}']
    vectors = model.wv  # resolved once instead of on every row
    return [vectors[w] for w in tqdm(data.fillna('')[col])]

In [None]:
# One embedding frame per text column of file1, with columns named
# '<column><i>' for i in 0..99.
# NOTE(review): the hard-coded 100 assumes the FastText models produce
# 100-dimensional vectors — confirm against the saved models.
text_obj = pd.DataFrame(collect_embs(file1.fillna('<unk>'), 'text_obj'), columns=[f'text_obj{i}' for i in range(100)])
text_mthd = pd.DataFrame(collect_embs(file1.fillna('<unk>'), 'text_mthd'), columns=[f'text_mthd{i}' for i in range(100)])
text_deal = pd.DataFrame(collect_embs(file1.fillna('<unk>'), 'text_deal'), columns=[f'text_deal{i}' for i in range(100)])

In [None]:
# Place the three embedding frames side by side (obj, mthd, deal) and convert
# the result to a float tensor; X_sub is rebound from DataFrame to tensor.
X_sub = pd.concat([text_obj, text_mthd, text_deal], axis=1)
X_sub = torch.FloatTensor(X_sub.values)

In [None]:
batch_size = 32

# Submission rows must be read in their original order. A random sampler
# would shuffle the predictions, and differently on every pass, so the outputs
# of the three models would match neither the input rows nor each other.
sub_dataloader = DataLoader(
            X_sub,
            shuffle = False,
            batch_size = batch_size
        )

In [None]:
# Reload the saved models onto the active device. map_location lets a model
# saved from a GPU session be opened on a CPU-only runtime as well.
# NOTE(review): these files hold whole pickled modules; recent PyTorch releases
# may require weights_only=False to load them — confirm for the installed
# version, and only load files you trust.
model1 = torch.load('/content/drive/MyDrive/공모전/models/dnn_ft1.pt', map_location=device).to(device)
model2 = torch.load('/content/drive/MyDrive/공모전/models/dnn_ft2.pt', map_location=device).to(device)
model3 = torch.load('/content/drive/MyDrive/공모전/models/dnn_ft3.pt', map_location=device).to(device)

In [None]:
def predict(model, dataloader):
    """Return the predicted class index for every batch of ``dataloader``.

    The model is switched to eval mode and run without gradients. Each batch
    (a feature tensor) is moved to the notebook-level ``device`` first.

    Returns:
        A list with one tensor per batch, shaped ``(batch, 1)``, holding the
        index of the highest-scoring class of each row.
    """
    model.eval()
    with torch.no_grad():
        return [model(batch.to(device)).max(1, keepdim=True)[1] for batch in dataloader]

In [None]:
def predict_proba(model, dataloader):
    """Return the raw model output for every batch of ``dataloader``.

    The model is switched to eval mode and run without gradients. Each batch
    (a feature tensor) is moved to the notebook-level ``device`` first. The
    outputs are returned exactly as the model produces them; no softmax or
    other normalisation is applied here.

    Returns:
        A list with one output tensor per batch.
    """
    model.eval()
    with torch.no_grad():
        return [model(batch.to(device)) for batch in dataloader]

In [None]:
# Stack the per-batch outputs of each model into one array for the submission
# rows. Tensors are converted with .numpy(); they have no .toarray() method.
preds_digit1 = torch.cat(predict_proba(model1, sub_dataloader), dim=0).squeeze().cpu().numpy()
preds_digit2 = torch.cat(predict_proba(model2, sub_dataloader), dim=0).squeeze().cpu().numpy()
preds_digit3 = torch.cat(predict_proba(model3, sub_dataloader), dim=0).squeeze().cpu().numpy()

In [None]:
# Write the submission-set model outputs to disk, one array per digit level.
# np.save appends '.npy' because the given names have no extension.
np.save('/content/drive/MyDrive/공모전/submissions/sub_proba1_mlp0409', preds_digit1)
np.save('/content/drive/MyDrive/공모전/submissions/sub_proba2_mlp0409', preds_digit2)
np.save('/content/drive/MyDrive/공모전/submissions/sub_proba3_mlp0409', preds_digit3)

In [None]:
# Keep only the feature tensor of every (features, label) pair in the test
# split. Stacking the tensors directly avoids building a ragged NumPy object
# array out of tensors of different shapes, which newer NumPy versions reject.
test_set = torch.stack([features for features, _ in test_dataloader2.dataset])

In [None]:
# Sequential (unshuffled) loader over the feature-only test samples.
batch_size = 32

test_loader = DataLoader(test_set, sampler=SequentialSampler(test_set), batch_size=batch_size)

In [None]:
# Predicted class indices of each model on the test features, concatenated
# over batches and squeezed.
preds_digit1_test, preds_digit2_test, preds_digit3_test = (
    torch.cat(predict(model, test_loader), axis=0).squeeze()
    for model in (model1, model2, model3)
)

In [None]:
# Collect the three models' test-split predictions in one table and write it
# out as CSV without the index column.
# NOTE(review): the file is named "val_preds" although it holds predictions
# for the test split, and it has no .csv extension — confirm this is intended.
test_preds = pd.DataFrame({'digit_1':preds_digit1_test.tolist(), 'digit_2':preds_digit2_test.tolist(), 'digit_3':preds_digit3_test.tolist()})
test_preds.to_csv('/content/drive/MyDrive/공모전/submissions/val_preds_mlp0409', index=False)

In [None]:
# Raw outputs of each model on the test features, as NumPy arrays.
proba_digit1_test, proba_digit2_test, proba_digit3_test = (
    torch.cat(predict_proba(model, test_loader), axis=0).cpu().numpy()
    for model in (model1, model2, model3)
)

In [None]:
# Write the test-split model outputs to disk, one array per digit level.
# np.save appends '.npy' because the given names have no extension.
# NOTE(review): the files are named "val_proba" although the arrays come from
# the test split — confirm this is intended.
np.save('/content/drive/MyDrive/공모전/submissions/val_proba1_mlp0409', proba_digit1_test)
np.save('/content/drive/MyDrive/공모전/submissions/val_proba2_mlp0409', proba_digit2_test)
np.save('/content/drive/MyDrive/공모전/submissions/val_proba3_mlp0409', proba_digit3_test)

In [None]:
from torch.utils.data import TensorDataset

# Rebuild the three datasets from X and the label tensors.
dataset1 = TensorDataset(X, y1)
dataset2 = TensorDataset(X, y2)
dataset3 = TensorDataset(X, y3)

# Split 70/30 into train and test with the same test_size and random_state as
# before. No validation split is carved out this time, so train_dataset* is
# rebound to the full 70% portion.
train_dataset1, test_dataset1 = train_test_split(dataset1, test_size=0.3, random_state=0)
train_dataset2, test_dataset2 = train_test_split(dataset2, test_size=0.3, random_state=0)
train_dataset3, test_dataset3 = train_test_split(dataset3, test_size=0.3, random_state=0)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Mini-batch size for the rebuilt training loaders.
batch_size = 32

# Training loaders over the current train_dataset* objects, in random order.
train_dataloader1 = DataLoader(train_dataset1, sampler=RandomSampler(train_dataset1), batch_size=batch_size)
train_dataloader2 = DataLoader(train_dataset2, sampler=RandomSampler(train_dataset2), batch_size=batch_size)
train_dataloader3 = DataLoader(train_dataset3, sampler=RandomSampler(train_dataset3), batch_size=batch_size)

In [None]:
# Keep only the feature tensor of every (features, label) pair in the training
# split. Stacking the tensors directly avoids building a ragged NumPy object
# array out of tensors of different shapes, which newer NumPy versions reject.
train_set = torch.stack([features for features, _ in train_dataloader2.dataset])

In [None]:
# Sequential (unshuffled) loader over the feature-only training samples.
batch_size = 32

train_loader = DataLoader(train_set, sampler=SequentialSampler(train_set), batch_size=batch_size)

In [None]:
# Raw outputs of each model on the training features, as NumPy arrays.
proba_digit1_train, proba_digit2_train, proba_digit3_train = (
    torch.cat(predict_proba(model, train_loader), axis=0).cpu().numpy()
    for model in (model1, model2, model3)
)

In [None]:
batch_size = 32

# Sequential (unshuffled) loader over the validation features.
# NOTE(review): val_set is not defined anywhere in the visible notebook; on a
# fresh kernel this cell raises NameError unless another cell creates it —
# confirm where it is supposed to come from.
valid_loader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [None]:
# Raw outputs of each model on the validation features, as NumPy arrays.
proba_digit1_val, proba_digit2_val, proba_digit3_val = (
    torch.cat(predict_proba(model, valid_loader), axis=0).cpu().numpy()
    for model in (model1, model2, model3)
)

In [None]:
# Append the validation outputs below the training outputs (row-wise).
# NOTE(review): the result is stored under the same name, so running this cell
# again appends the validation block a second time.
proba_digit1_train = np.concatenate((proba_digit1_train, proba_digit1_val))
proba_digit2_train = np.concatenate((proba_digit2_train, proba_digit2_val))
proba_digit3_train = np.concatenate((proba_digit3_train, proba_digit3_val))

In [None]:
# Write the combined training + validation model outputs to disk, one array
# per digit level. np.save appends '.npy' because the names have no extension.
np.save('/content/drive/MyDrive/공모전/submissions/train_proba1_mlp0409', proba_digit1_train)
np.save('/content/drive/MyDrive/공모전/submissions/train_proba2_mlp0409', proba_digit2_train)
np.save('/content/drive/MyDrive/공모전/submissions/train_proba3_mlp0409', proba_digit3_train)