In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import PreTrainedTokenizerFast, AutoTokenizer, AutoModelWithLMHead, GPT2ForSequenceClassification, GPT2LMHeadModel

warnings.filterwarnings('ignore')

In [2]:
tr = pd.read_csv('./data/train_data.csv', index_col='index')

In [3]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2")

model = GPT2ForSequenceClassification.from_pretrained("skt/kogpt2-base-v2")
model.score = torch.nn.Linear(768, 7)
model.cuda()

Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
      

In [4]:
class TrainDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]
        document, label = str(record['title']), int(record['topic_idx'])
        tokens = self.tokenizer.tokenize(document)
        encoder_input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(encoder_input_id)
        if len(encoder_input_id) < self.max_seq_len:
            while len(encoder_input_id) < self.max_seq_len:
                encoder_input_id += [tokenizer.convert_tokens_to_ids('<pad>')]
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(attention_mask, dtype=np.float),
                'labels': np.array(label, dtype=np.int_)}
    
class TestDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]
        document = str(record['title'])
        tokens = self.tokenizer.tokenize(document)
        encoder_input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(encoder_input_id)
        if len(encoder_input_id) < self.max_seq_len:
            while len(encoder_input_id) < self.max_seq_len:
                encoder_input_id += [tokenizer.convert_tokens_to_ids('<pad>')]
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(attention_mask, dtype=np.float)}

In [9]:
# train parameters
epochs = 5
batch_size = 32

In [10]:
# train loader
train_ds = TrainDataset(tr, tokenizer)
loader = DataLoader(train_ds, batch_size=batch_size, num_workers=0, shuffle=True)

In [11]:
# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 5, )
loss_fn = torch.nn.CrossEntropyLoss()


In [12]:
model.train()
for e in range(epochs):
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        ids, atts, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        ids = torch.tensor(ids).long().cuda()
        atts = torch.tensor(atts).long().cuda()
        labels = torch.tensor(labels).long().cuda()
        pred = model(ids, attention_mask=atts)
        loss = loss_fn(pred[0], labels)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
        optimizer.step()
        total_loss += loss.item()
        
    scheduler.step()
    print(e, total_loss)

0 170.22163397513214
1 103.4733246586693
2 51.332353011697705
3 19.89738322488006


KeyboardInterrupt: 

In [13]:
# test loader
te = pd.read_csv('./data/test_data.csv', index_col='index')

test_ds = TestDataset(te, tokenizer)
test_loader = DataLoader(test_ds, 8)

In [16]:
preds = []
outs = []
model.eval()

for b in tqdm(test_loader):
    ids, atts = b['input_ids'], b['attention_mask']
    ids = torch.tensor(ids).long().cuda()
    atts = torch.tensor(atts).long().cuda()
    pred = model(ids, attention_mask=atts)
    preds += list(np.argmax(pred[0].detach().cpu().numpy(), 1))
    for o in pred[0].detach().cpu().numpy():
        outs.append(o)
#     break

100%|██████████| 1142/1142 [00:18<00:00, 61.00it/s]


In [24]:
outs

array([[  3.5867152 ,  -7.719656  ,   6.0066886 , ...,  -3.5780704 ,
         -7.7010326 ,  -7.5868034 ],
       [ -3.0637186 ,  -5.1965814 ,   2.7567677 , ...,  -4.8779426 ,
         -5.0445423 ,  -5.4209704 ],
       [  7.493676  ,  -3.713879  ,   6.1584907 , ...,  -9.186032  ,
         -1.4970856 ,   7.8802547 ],
       ...,
       [ -0.02202135,  -6.619943  ,  13.58706   , ...,  -3.345173  ,
        -10.388258  ,  -7.218099  ],
       [  7.6317534 ,   0.97240216,  13.625787  , ...,  -7.6673274 ,
        -10.608603  ,  -9.644565  ],
       [ -1.5063169 ,  -4.287837  ,  15.181264  , ...,  -9.245955  ,
         -6.955384  ,   7.166752  ]], dtype=float32)

In [34]:
import numpy as np

def softmax(x):
    
    max = np.max(x,axis=1,keepdims=True) #returns max of each row and keeps same dims
    e_x = np.exp(x - max) #subtracts each row with its max value
    sum = np.sum(e_x,axis=1,keepdims=True) #returns sum of each row and keeps same dims
    f_x = e_x / sum 
    return f_x

sof = softmax(outs)
len(sof)

9131

In [35]:
kobert_y_df = pd.DataFrame(sof)

In [36]:
kobert_y_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,2.413045e-05,2.966679e-10,0.0002713604,0.9997045,1.866116e-08,3.022446e-10,3.38819e-10
1,2.413195e-10,2.859579e-11,8.135746e-08,0.9999999,3.932646e-11,3.329137e-11,2.284813e-11
2,0.3656058,4.961739e-06,0.09619432,8.253035e-07,2.085006e-08,4.55381e-05,0.5381484
3,0.1610013,1.462866e-06,2.738303e-05,2.436747e-07,2.882297e-06,2.019402e-11,0.8389668
4,4.890889e-10,7.645539e-11,1.902557e-07,0.9999998,7.234745e-11,9.95822e-12,7.670673e-12


In [37]:
kobert_y_df.to_csv("ensemble/kogpt2.csv")

In [20]:
sub = pd.read_csv('./data/sample_submission.csv', index_col='index')
sub['topic_idx'] = preds
sub.head(20)

Unnamed: 0_level_0,topic_idx
index,Unnamed: 1_level_1
45654,3
45655,3
45656,5
45657,0
45658,3
45659,0
45660,5
45661,3
45662,4
45663,4


In [21]:
sub.to_csv('./gpt.csv')