In [None]:
# Colab setup cell: install the Hugging Face stack, SentencePiece and the
# KoBART package straight from its GitHub repository.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was developed against.
!pip install transformers
!pip install sentencepiece
!pip install git+https://github.com/SKT-AI/KoBART#egg=kobart

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import missingno as msno
from tqdm import tqdm, trange
import pickle
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import BartTokenizer, BartModel, BartForSequenceClassification
from kobart import get_pytorch_kobart_model, get_kobart_tokenizer

In [None]:
# Read every pipe-delimited, cp949-encoded .txt file in the data folder and
# bind each DataFrame to a module-level name: file0, file1, ...
# NOTE(review): glob() does not guarantee any ordering, so which file becomes
# file0 (used below as the labelled set) depends on the directory listing.
files = glob('/content/drive/MyDrive/공모전/data/*.txt')
for i, file in enumerate(files):
    globals()[f'file{i}'] = pd.read_csv(file, sep='|', encoding='cp949')

In [None]:
# For each digit column build index -> label (labels in sorted order) and the
# inverse label -> index lookup obtained by flipping that mapping.
idx2label_digit1 = dict(enumerate(sorted(file0.digit_1.unique())))
label2idx_digit1 = {label: idx for idx, label in idx2label_digit1.items()}

idx2label_digit2 = dict(enumerate(sorted(file0.digit_2.unique())))
label2idx_digit2 = {label: idx for idx, label in idx2label_digit2.items()}

idx2label_digit3 = dict(enumerate(sorted(file0.digit_3.unique())))
label2idx_digit3 = {label: idx for idx, label in idx2label_digit3.items()}

In [None]:
# Replace each raw label with its integer class index (KeyError on a label
# that is missing from the lookup).
# NOTE: this overwrites the columns in place, so re-running the cell would
# look the already-encoded indices up as if they were raw labels.
file0['digit_1'] = file0['digit_1'].map(label2idx_digit1.__getitem__)
file0['digit_2'] = file0['digit_2'].map(label2idx_digit2.__getitem__)
file0['digit_3'] = file0['digit_3'].map(label2idx_digit3.__getitem__)

In [None]:
# One input sentence per row: the three text fields joined by single spaces
# (missing fields become empty strings) with outer whitespace stripped.
sentences_raw = (
    file0[['text_obj', 'text_mthd', 'text_deal']]
    .fillna('')
    .apply(lambda row: ' '.join(row).strip(), axis=1)
)
# Encoded targets as an (n_rows, 3) array: columns are digit_1, digit_2, digit_3.
labels_raw = file0[['digit_1', 'digit_2', 'digit_3']].to_numpy()

In [None]:
# Hold out 30% of the labelled rows as the test split (fixed seed).
sentences, sentences_test, labels, labels_test = train_test_split(
    sentences_raw,
    labels_raw,
    test_size=0.3,
    random_state=0,
)

### KoBART

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Report which accelerator is in use. Querying the device name is only valid
# when CUDA is available, so guard it to keep the notebook runnable on CPU.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print('CUDA is not available; running on CPU.')

In [None]:
# Load the KoBART tokenizer and tokenize every training sentence so the first
# result can be inspected.
tokenizer = get_kobart_tokenizer()
tokenized_texts = [tokenizer.tokenize(text, return_tensors='pt') for text in sentences]
print("Tokenize the first sentence:")
print(tokenized_texts[0])

In [None]:
def tokenize_inputs(text_list, tokenizer, num_embeddings=120):
    """
    Convert texts into a fixed-width matrix of token ids.

    Each text is split into tokens with ``tokenizer.tokenize`` and mapped to
    ids with ``tokenizer.convert_tokens_to_ids``; no special tokens are added
    by this function. The id sequences are then cut or padded at the end to
    exactly ``num_embeddings`` entries via ``pad_sequences``.

    Args:
        text_list: iterable of input strings.
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
        num_embeddings: target sequence length after truncation/padding.

    Returns:
        The padded id matrix produced by ``pad_sequences`` (dtype "long"),
        one row per input text.
    """
    id_sequences = []
    for text in text_list:
        tokens = tokenizer.tokenize(text, return_tensors='pt')
        id_sequences.append(tokenizer.convert_tokens_to_ids(tokens))
    # "post" for both options: overly long sequences lose their tail and short
    # ones are filled at the end.
    return pad_sequences(
        id_sequences,
        maxlen=num_embeddings,
        dtype="long",
        truncating="post",
        padding="post",
    )

def create_attn_masks(input_ids):
    """
    Build attention masks for already-padded id sequences.

    Every position whose id is greater than 0 is marked 1.0 (attend) and every
    other position 0.0 (ignore), so ids equal to 0 are treated as padding.

    Args:
        input_ids: iterable of id sequences.

    Returns:
        A list with one list of floats per input sequence.
    """
    return [[float(token_id > 0) for token_id in sequence] for sequence in input_ids]

In [None]:
# Token ids and attention masks for the training sentences, as tensors.
input_ids = tokenize_inputs(sentences, tokenizer, num_embeddings=120)
attention_masks = create_attn_masks(input_ids)
input_ids = torch.from_numpy(input_ids)
attention_masks = torch.tensor(attention_masks)

# One target tensor per classification level (columns of `labels`).
labels1 = torch.tensor(labels[:,0])
labels2 = torch.tensor(labels[:,1])
labels3 = torch.tensor(labels[:,2])

# Show the second training sentence next to its token ids.
# `sentences` keeps the shuffled original index after train_test_split, so it
# must be read by position (.iloc) to line up with the positional `input_ids`;
# a label lookup (`sentences[1]`) may hit a different row or raise KeyError.
print('Original: ', sentences.iloc[1])
print('Token IDs:', input_ids[1])

In [None]:
from torch.utils.data import Subset, TensorDataset

# Combine the training inputs into one TensorDataset per classification level.
dataset1 = TensorDataset(input_ids, attention_masks, labels1)
dataset2 = TensorDataset(input_ids, attention_masks, labels2)
dataset3 = TensorDataset(input_ids, attention_masks, labels3)

# Split row indices once (70% train / 30% validation, fixed seed) and view each
# dataset through torch Subsets. Passing a TensorDataset itself to
# train_test_split makes scikit-learn build a Python list holding one tuple of
# tensors per sample, three times over. The shuffle depends only on the number
# of samples and the seed, so these indices give the same partition and order.
train_indices, val_indices = train_test_split(
    np.arange(len(dataset1)), test_size=0.3, random_state=0
)
train_indices, val_indices = train_indices.tolist(), val_indices.tolist()

train_dataset1, val_dataset1 = Subset(dataset1, train_indices), Subset(dataset1, val_indices)
train_dataset2, val_dataset2 = Subset(dataset2, train_indices), Subset(dataset2, val_indices)
train_dataset3, val_dataset3 = Subset(dataset3, train_indices), Subset(dataset3, val_indices)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Mini-batch size shared by all training and validation loaders.
batch_size = 32

# Training loaders draw samples in random order.
train_dataloader1, train_dataloader2, train_dataloader3 = (
    DataLoader(split, sampler=RandomSampler(split), batch_size=batch_size)
    for split in (train_dataset1, train_dataset2, train_dataset3)
)

# Validation loaders read samples sequentially; order does not matter there.
validation_dataloader1, validation_dataloader2, validation_dataloader3 = (
    DataLoader(split, sampler=SequentialSampler(split), batch_size=batch_size)
    for split in (val_dataset1, val_dataset2, val_dataset3)
)

In [None]:
class KoBARTClassification(torch.nn.Module):
    """
    Pretrained KoBART backbone with a single linear classification head.

    The backbone output is averaged over the sequence dimension and the
    resulting vector is projected to ``num_labels`` logits.
    """

    def __init__(self, num_labels):
        """
        Args:
            num_labels: number of output classes (width of the logits).
        """
        super().__init__()
        self.num_labels = num_labels
        self.kobart = BartModel.from_pretrained(get_pytorch_kobart_model())
        # Size the head from the backbone's configured hidden width instead of
        # assuming 768, so a differently sized checkpoint still fits.
        hidden_size = self.kobart.config.d_model
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

        torch.nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, attention_mask):
        """
        Args:
            input_ids: token ids passed to the backbone as ``input_ids``.
            attention_mask: mask passed to the backbone as ``attention_mask``.

        Returns:
            Unnormalised class scores from the linear head.
        """
        backbone_output = self.kobart(input_ids=input_ids,
                                      attention_mask=attention_mask)
        mean_last_hidden_state = self.pool_hidden_state(backbone_output)
        logits = self.classifier(mean_last_hidden_state)
        return logits

    def pool_hidden_state(self, last_hidden_state):
        """
        Pool the output vectors into a single mean vector.

        Args:
            last_hidden_state: backbone output; element ``[0]`` is taken as the
                hidden-state tensor and averaged over dimension 1.
                NOTE(review): the mean runs over every position, including
                padded ones - confirm this is intended.
        """
        hidden = last_hidden_state[0]
        return torch.mean(hidden, 1)

In [None]:
def train(model, num_epochs,
          optimizer, criterion,
          train_dataloader, valid_dataloader,
          model_save_path,
          train_loss_set=None, valid_loss_set=None,
          lowest_eval_loss=None, start_epoch=0,
          device="cpu"
          ):
    """
    Train the model and save the model with the lowest validation loss.

    Relies on ``time``, ``trange``, ``format_time`` and ``save_model`` being
    defined at module level when the function is called.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        num_epochs: number of epochs to run in this call.
        optimizer: optimizer stepped once per training batch.
        criterion: loss called as ``criterion(logits, labels)``.
        train_dataloader: yields ``(input_ids, attention_mask, labels)`` batches.
        valid_dataloader: same batch layout, used for validation.
        model_save_path: where ``save_model`` writes the best checkpoint.
        train_loss_set: optional list of earlier per-epoch training losses to
            extend; a new list is created when omitted.
        valid_loss_set: optional list of earlier per-epoch validation losses to
            extend; a new list is created when omitted.
        lowest_eval_loss: best validation loss so far, or None to save after
            the first epoch unconditionally.
        start_epoch: offset added to the epoch counter when resuming.
        device: device the model and every batch are moved to.

    Returns:
        ``(model, train_loss_set, valid_loss_set, training_stats)`` where
        ``training_stats`` holds one dict of metrics per epoch.
    """
    # Fresh lists per call: mutable default arguments would be shared between
    # separate training runs and mix their loss histories.
    if train_loss_set is None:
        train_loss_set = []
    if valid_loss_set is None:
        valid_loss_set = []

    # Per-epoch losses, validation accuracy and timings.
    training_stats = []
    # Measure the total training time for the whole run.
    total_t0 = time.time()

    model.to(device)

    # trange is a tqdm wrapper around the normal python range
    for i in trange(num_epochs, desc="Epoch"):
        # Epoch number when continuing training from a saved model.
        actual_epoch = start_epoch + i

        # ========================================
        #               Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(actual_epoch, num_epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Set our model to training mode (as opposed to evaluation mode)
        model.train()

        # Tracking variables
        tr_loss = 0
        num_train_samples = 0

        for step, batch in enumerate(train_dataloader):
            # Progress update every 100 batches; `loss` and
            # `avg_train_accuracy` hold the previous batch's values here.
            if step % 100 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.    Train Loss: {:}.    Train Accuracy: {:}.'.format(step, len(train_dataloader), elapsed, loss.item(), avg_train_accuracy))

            # Move the batch to the target device and unpack it.
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            logits = model(b_input_ids, b_input_mask)
            loss = criterion(logits, b_labels)
            # store train loss
            tr_loss += loss.item()
            num_train_samples += b_labels.size(0)
            # Backward pass and parameter update
            loss.backward()
            optimizer.step()
            # Accuracy of the current batch (used only for progress output).
            prediction = logits.max(1, keepdim=True)[1]
            total_train_accuracy = prediction.eq(b_labels.view_as(prediction)).sum().item()
            avg_train_accuracy = total_train_accuracy / len(b_input_ids)

        # NOTE(review): per-batch loss values are summed and then divided by
        # the number of samples; if `criterion` already averages over the batch
        # the reported figure is scaled down by roughly the batch size. Kept
        # unchanged so stored checkpoints' `lowest_eval_loss` stay comparable.
        epoch_train_loss = tr_loss/num_train_samples
        train_loss_set.append(epoch_train_loss)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(epoch_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================
        # After each training epoch, measure performance on the validation set.
        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        # Tracking variables
        eval_loss = 0
        num_eval_samples = 0
        total_eval_accuracy = 0

        for batch in valid_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # No gradients are needed for validation.
            with torch.no_grad():
                logits = model(b_input_ids, b_input_mask)
                loss = criterion(logits, b_labels)
                # store valid loss
                eval_loss += loss.item()
                num_eval_samples += b_labels.size(0)
                prediction = logits.max(1, keepdim=True)[1]
                total_eval_accuracy += prediction.eq(b_labels.view_as(prediction)).sum().item()

        epoch_eval_loss = eval_loss/num_eval_samples
        valid_loss_set.append(epoch_eval_loss)

        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / num_eval_samples
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(epoch_eval_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': actual_epoch,
                'Training Loss': epoch_train_loss,
                'Valid. Loss': epoch_eval_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

        # Checkpoint on the first epoch seen and whenever validation improves.
        if lowest_eval_loss is None or epoch_eval_loss < lowest_eval_loss:
            lowest_eval_loss = epoch_eval_loss
            save_model(model, model_save_path, actual_epoch,
                       lowest_eval_loss, train_loss_set, valid_loss_set)

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
    return model, train_loss_set, valid_loss_set, training_stats

In [None]:
import time
import datetime

def format_time(elapsed):
    """
    Render a duration given in seconds as the string form of a timedelta
    (for example ``'1:01:01'``), after rounding to whole seconds.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# function to save and load the model form a specific epoch
def save_model(model, save_path, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist):
    """
    Write a training checkpoint to ``save_path`` with ``torch.save``.

    The checkpoint is a dict with the keys ``epochs``, ``lowest_eval_loss``,
    ``state_dict``, ``train_loss_hist`` and ``valid_loss_hist``. When the model
    exposes a ``module`` attribute, that inner module's weights are stored.
    Returns None.
    """
    # Unwrap containers that expose the real network as `.module`.
    model_to_save = getattr(model, 'module', model)
    checkpoint = dict(
        epochs=epochs,
        lowest_eval_loss=lowest_eval_loss,
        state_dict=model_to_save.state_dict(),
        train_loss_hist=train_loss_hist,
        valid_loss_hist=valid_loss_hist,
    )
    torch.save(checkpoint, save_path)
    message = "Saving model at epoch {} with validation loss of {}"
    print(message.format(epochs, lowest_eval_loss))
  
def load_model(save_path):
    """
    Rebuild a KoBARTClassification model from a checkpoint written by
    ``save_model``.

    The number of labels is read from the saved classifier weight, so the
    caller does not need to know it. The model is returned on the CPU and in
    evaluation mode; move it with ``.to(device)`` and call ``.train()`` (as
    ``train`` does) before further training.

    Args:
        save_path: path of the checkpoint file.

    Returns:
        ``(model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist)``.
    """
    # map_location="cpu" lets checkpoints saved from a GPU run load on machines
    # without CUDA and avoids holding a second copy of the weights on the GPU.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    checkpoint = torch.load(save_path, map_location="cpu")
    model_state_dict = checkpoint['state_dict']
    num_labels = model_state_dict["classifier.weight"].size()[0]
    model = KoBARTClassification(num_labels=num_labels)
    model.load_state_dict(model_state_dict)
    # New modules start in training mode; switch to eval so inference on the
    # loaded model does not run with dropout enabled.
    model.eval()

    epochs = checkpoint["epochs"]
    lowest_eval_loss = checkpoint["lowest_eval_loss"]
    train_loss_hist = checkpoint["train_loss_hist"]
    valid_loss_hist = checkpoint["valid_loss_hist"]

    return model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist

#### Digit_1
- 2 epoch

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the
# digit_1 model is built.
torch.cuda.empty_cache()

In [None]:
num_epochs = 3

# Size the output layer from the full digit_1 label vocabulary rather than from
# the labels that happen to appear in the training split: a class missing from
# the split would otherwise shrink the head below the largest label index.
model1 = KoBARTClassification(num_labels=len(label2idx_digit1)).to(device)
optimizer1 = torch.optim.AdamW(model1.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  # eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                 weight_decay=0.01,
                )
criterion1 = torch.nn.CrossEntropyLoss()

In [None]:
# Seed every RNG in use and force deterministic cuDNN kernels.
# NOTE(review): model1 was already constructed in the previous cell, so its
# classifier initialisation is not covered by these seeds.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
random.seed(0)

model_save_path = output_model_file = '/content/drive/MyDrive/공모전/models/kobart1.pt'

# Train on the device selected earlier instead of a hard-coded "cuda", and pass
# fresh history lists so this run never shares loss history with another run.
model1, train_loss_set1, valid_loss_set1, training_stats1 = train(
    model=model1,
    num_epochs=num_epochs,
    optimizer=optimizer1,
    criterion=criterion1,
    train_dataloader=train_dataloader1,
    valid_dataloader=validation_dataloader1,
    model_save_path=model_save_path,
    train_loss_set=[],
    valid_loss_set=[],
    device=device,
)

#### Digit_2

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the
# digit_2 model is built.
torch.cuda.empty_cache()

In [None]:
num_epochs = 3

# Size the output layer from the full digit_2 label vocabulary rather than from
# the labels that happen to appear in the training split: a class missing from
# the split would otherwise shrink the head below the largest label index.
model2 = KoBARTClassification(num_labels=len(label2idx_digit2)).to(device)
optimizer2 = torch.optim.AdamW(model2.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  # eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                 weight_decay=0.01,
                )
criterion2 = torch.nn.CrossEntropyLoss()

In [None]:
# Seed every RNG in use and force deterministic cuDNN kernels.
# NOTE(review): model2 was already constructed in the previous cell, so its
# classifier initialisation is not covered by these seeds.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
random.seed(0)

model_save_path = output_model_file = '/content/drive/MyDrive/공모전/models/kobart2.pt'

# Train on the device selected earlier instead of a hard-coded "cuda", and pass
# fresh history lists so this run never shares loss history with another run.
model2, train_loss_set2, valid_loss_set2, training_stats2 = train(
    model=model2,
    num_epochs=num_epochs,
    optimizer=optimizer2,
    criterion=criterion2,
    train_dataloader=train_dataloader2,
    valid_dataloader=validation_dataloader2,
    model_save_path=model_save_path,
    train_loss_set=[],
    valid_loss_set=[],
    device=device,
)

```python
train_loss_set1 = [0.0046230769887563, 0.002806043499286586, 0.0022028916692519228]
train_loss_set2 = [0.009797783909549898, 0.006345497326163708, 0.005217890849337457]
valid_loss_set2 = [0.007217050700880853, 0.007007619538549555, 0.006949771315930412]
 ```

#### Digit_3

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the
# digit_3 model is built.
torch.cuda.empty_cache()

In [None]:
num_epochs = 5

# Size the output layer from the full digit_3 label vocabulary rather than from
# the labels that happen to appear in the training split: a class missing from
# the split would otherwise shrink the head below the largest label index.
model3 = KoBARTClassification(num_labels=len(label2idx_digit3)).to(device)
optimizer3 = torch.optim.AdamW(model3.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  # eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                 weight_decay=0.01,
                )
criterion3 = torch.nn.CrossEntropyLoss()

In [None]:
# Seed every RNG in use and force deterministic cuDNN kernels.
# NOTE(review): model3 was already constructed in the previous cell, so its
# classifier initialisation is not covered by these seeds.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
random.seed(0)
model_save_path = output_model_file = '/content/drive/MyDrive/공모전/models/kobart3.pt'

# Train on the device selected earlier instead of a hard-coded "cuda", and pass
# fresh history lists so this run never shares loss history with another run.
model3, train_loss_set3, valid_loss_set3, training_stats3 = train(
    model=model3,
    num_epochs=num_epochs,
    optimizer=optimizer3,
    criterion=criterion3,
    train_dataloader=train_dataloader3,
    valid_dataloader=validation_dataloader3,
    model_save_path=model_save_path,
    train_loss_set=[],
    valid_loss_set=[],
    device=device,
)

```python
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
random.seed(0)
```

#### Test

In [None]:
# Restore the best checkpoint of each classifier together with its saved
# epoch number, best validation loss and loss histories.
(model1, epochs1, lowest_eval_loss1,
 train_loss_hist1, valid_loss_hist1) = load_model('/content/drive/MyDrive/공모전/models/kobart1.pt')
(model2, epochs2, lowest_eval_loss2,
 train_loss_hist2, valid_loss_hist2) = load_model('/content/drive/MyDrive/공모전/models/kobart2.pt')
(model3, epochs3, lowest_eval_loss3,
 train_loss_hist3, valid_loss_hist3) = load_model('/content/drive/MyDrive/공모전/models/kobart3.pt')

In [None]:
# Token ids and attention masks for the held-out test sentences. The masks are
# derived from the padded id matrix before it is wrapped as a tensor.
input_ids_test = tokenize_inputs(sentences_test, tokenizer, num_embeddings=120)
attention_masks_test = torch.tensor(create_attn_masks(input_ids_test))
input_ids_test = torch.from_numpy(input_ids_test)

# One target tensor per classification level (columns 0, 1, 2 of labels_test).
labels1_test, labels2_test, labels3_test = (
    torch.tensor(labels_test[:, column]) for column in range(3)
)

In [None]:
from torch.utils.data import TensorDataset

# Pair the shared test inputs with each level's targets.
test_dataset1, test_dataset2, test_dataset3 = (
    TensorDataset(input_ids_test, attention_masks_test, targets)
    for targets in (labels1_test, labels2_test, labels3_test)
)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Mini-batch size used for evaluation.
batch_size = 32

# Test loaders read the samples sequentially.
test_dataloader1, test_dataloader2, test_dataloader3 = (
    DataLoader(split, sampler=SequentialSampler(split), batch_size=batch_size)
    for split in (test_dataset1, test_dataset2, test_dataset3)
)

In [None]:
# Evaluate the digit_1 classifier on the held-out test split.
model = model1.to(device)
# Make sure dropout is disabled while measuring: eval() is idempotent, so this
# is safe whatever mode the model was left in.
model.eval()
criterion = torch.nn.CrossEntropyLoss()

eval_loss = 0
num_eval_samples = 0
total_eval_accuracy=0
for batch in test_dataloader1:
    # Move the batch to the evaluation device and unpack it.
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    # No gradients are needed for evaluation.
    with torch.no_grad():
        logits = model(b_input_ids, b_input_mask)
        loss = criterion(logits, b_labels)
        eval_loss += loss.item()
        num_eval_samples += b_labels.size(0)
        # Predicted class = index of the largest logit, kept as shape (batch, 1).
        prediction1 = logits.max(1, keepdim=True)[1]
        total_eval_accuracy += prediction1.eq(b_labels.view_as(prediction1)).sum().item()
# NOTE(review): the per-batch (mean-reduced) losses are summed and divided by
# the sample count, so this figure is scaled down by roughly the batch size.
epoch_eval_loss = eval_loss/num_eval_samples
avg_val_accuracy = total_eval_accuracy / num_eval_samples
print(f'Loss: {epoch_eval_loss}\t Accuracy: {avg_val_accuracy}')
# Output recorded from an earlier run:
# Loss: 0.0030046332769961253	 Accuracy: 0.9745533333333334

In [None]:
# Evaluate the digit_2 classifier on the held-out test split.
model = model2.to(device)
# Make sure dropout is disabled while measuring: eval() is idempotent, so this
# is safe whatever mode the model was left in.
model.eval()
criterion = torch.nn.CrossEntropyLoss()

eval_loss = 0
num_eval_samples = 0
total_eval_accuracy=0
for batch in test_dataloader2:
    # Move the batch to the evaluation device and unpack it.
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    # No gradients are needed for evaluation.
    with torch.no_grad():
        logits = model(b_input_ids, b_input_mask)
        loss = criterion(logits, b_labels)
        eval_loss += loss.item()
        num_eval_samples += b_labels.size(0)
        # Predicted class = index of the largest logit, kept as shape (batch, 1).
        prediction2 = logits.max(1, keepdim=True)[1]
        total_eval_accuracy += prediction2.eq(b_labels.view_as(prediction2)).sum().item()
# NOTE(review): the per-batch (mean-reduced) losses are summed and divided by
# the sample count, so this figure is scaled down by roughly the batch size.
epoch_eval_loss = eval_loss/num_eval_samples
avg_val_accuracy = total_eval_accuracy / num_eval_samples
print(f'Loss: {epoch_eval_loss}\t Accuracy: {avg_val_accuracy}')
# Output recorded from an earlier run:
# Loss: 0.00708571706997153	 Accuracy: 0.9391

In [None]:
# Evaluate the digit_3 classifier on the held-out test split.
model = model3.to(device)
# Make sure dropout is disabled while measuring: eval() is idempotent, so this
# is safe whatever mode the model was left in.
model.eval()
criterion = torch.nn.CrossEntropyLoss()

eval_loss = 0
num_eval_samples = 0
total_eval_accuracy=0
for batch in test_dataloader3:
    # Move the batch to the evaluation device and unpack it.
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    # No gradients are needed for evaluation.
    with torch.no_grad():
        logits = model(b_input_ids, b_input_mask)
        loss = criterion(logits, b_labels)
        eval_loss += loss.item()
        num_eval_samples += b_labels.size(0)
        # Predicted class = index of the largest logit, kept as shape (batch, 1).
        prediction3 = logits.max(1, keepdim=True)[1]
        total_eval_accuracy += prediction3.eq(b_labels.view_as(prediction3)).sum().item()
# NOTE(review): the per-batch (mean-reduced) losses are summed and divided by
# the sample count, so this figure is scaled down by roughly the batch size.
epoch_eval_loss = eval_loss/num_eval_samples
avg_val_accuracy = total_eval_accuracy / num_eval_samples
print(f'Loss: {epoch_eval_loss}\t Accuracy: {avg_val_accuracy}')
# Output recorded from an earlier run:
# Loss: 0.010448561390587129	 Accuracy: 0.9145666666666666

#### Submission

In [None]:
# Build the submission sentences the same way as the training ones: join the
# three text fields with spaces, treating missing fields as empty strings.
sentences_sub = (
    file1[['text_obj', 'text_mthd', 'text_deal']]
    .fillna('')
    .apply(lambda row: ' '.join(row).strip(), axis=1)
)

In [None]:
# Token ids and attention masks for the submission sentences. The masks are
# derived from the padded id matrix before it is wrapped as a tensor.
input_ids_sub = tokenize_inputs(sentences_sub, tokenizer, num_embeddings=120)
attention_masks_sub = torch.tensor(create_attn_masks(input_ids_sub))
input_ids_sub = torch.from_numpy(input_ids_sub)

In [None]:
from torch.utils.data import TensorDataset

# Combine the submission inputs into a TensorDataset; there are no labels, so
# each item is an (input_ids, attention_mask) pair.
dataset_sub = TensorDataset(input_ids_sub, attention_masks_sub)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Mini-batch size used for inference.
batch_size = 32

# Read the submission samples in their original order so the predictions line
# up with the rows of the submission file.
sub_dataloader = DataLoader(
    dataset_sub,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_sub),
)

In [None]:
# Predict digit_1 class indices for every submission row.
model = model1.to(device)
# Make sure dropout is disabled for inference; eval() is idempotent.
model.eval()

predictions1 = []
for batch in sub_dataloader:
    # Move the batch to the inference device and unpack it (no labels here).
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(b_input_ids, b_input_mask)
        # Predicted class = index of the largest logit, shape (batch, 1).
        prediction1 = logits.max(1, keepdim=True)[1]
        # Keep results on the CPU so the pickled predictions can be loaded on a
        # machine without a GPU and do not accumulate in GPU memory.
        predictions1.append(prediction1.cpu())

In [None]:
# Predict digit_2 class indices for every submission row.
model = model2.to(device)
# Make sure dropout is disabled for inference; eval() is idempotent.
model.eval()

predictions2 = []
for batch in sub_dataloader:
    # Move the batch to the inference device and unpack it (no labels here).
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(b_input_ids, b_input_mask)
        # Predicted class = index of the largest logit, shape (batch, 1).
        prediction2 = logits.max(1, keepdim=True)[1]
        # Keep results on the CPU so the pickled predictions can be loaded on a
        # machine without a GPU and do not accumulate in GPU memory.
        predictions2.append(prediction2.cpu())

In [None]:
# Predict digit_3 class indices for every submission row.
model = model3.to(device)
# Make sure dropout is disabled for inference; eval() is idempotent.
model.eval()

predictions3 = []
for batch in sub_dataloader:
    # Move the batch to the inference device and unpack it (no labels here).
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(b_input_ids, b_input_mask)
        # Predicted class = index of the largest logit, shape (batch, 1).
        prediction3 = logits.max(1, keepdim=True)[1]
        # Keep results on the CPU so the pickled predictions can be loaded on a
        # machine without a GPU and do not accumulate in GPU memory.
        predictions3.append(prediction3.cpu())

In [None]:
# Persist the per-batch prediction tensors of each classifier to Drive.
with open('/content/drive/MyDrive/공모전/submissions/pred1_kobart.pkl', 'wb') as f:
    f.write(pickle.dumps(predictions1))
with open('/content/drive/MyDrive/공모전/submissions/pred2_kobart.pkl', 'wb') as f:
    f.write(pickle.dumps(predictions2))
with open('/content/drive/MyDrive/공모전/submissions/pred3_kobart.pkl', 'wb') as f:
    f.write(pickle.dumps(predictions3))

In [None]:
# Reload the stored predictions (lets the cells below run without repeating
# inference). NOTE: unpickling executes code from the file - only load files
# written by this notebook.
with open('/content/drive/MyDrive/공모전/submissions/pred1_kobart.pkl', 'rb') as f:
    predictions1 = pickle.loads(f.read())
with open('/content/drive/MyDrive/공모전/submissions/pred2_kobart.pkl', 'rb') as f:
    predictions2 = pickle.loads(f.read())
with open('/content/drive/MyDrive/공모전/submissions/pred3_kobart.pkl', 'rb') as f:
    predictions3 = pickle.loads(f.read())

In [None]:
# Concatenate the per-batch (batch, 1) tensors and flatten them into a plain
# list of class indices. reshape(-1) always yields a 1-D result, whereas a bare
# squeeze() collapses a single-row result to a scalar and breaks the list
# comprehension that follows.
predictions1 = torch.cat(predictions1).reshape(-1).tolist()
predictions2 = torch.cat(predictions2).reshape(-1).tolist()
predictions3 = torch.cat(predictions3).reshape(-1).tolist()

In [None]:
# Translate predicted class indices back into the original label values.
predictions1 = list(map(idx2label_digit1.__getitem__, predictions1))
predictions2 = list(map(idx2label_digit2.__getitem__, predictions2))
predictions3 = list(map(idx2label_digit3.__getitem__, predictions3))

In [None]:
# Copy the submission template and fill in the three predicted label columns.
sub = file1.assign(
    digit_1=predictions1,
    digit_2=predictions2,
    digit_3=predictions3,
)

In [None]:
# Write the filled-in submission table to Drive as CSV, without the row index.
sub.to_csv('/content/drive/MyDrive/공모전/submissions/sub_kobart_220329.csv', index=False)