# Example2 | PyTorch

## 0. GPU Setting

In [99]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]= "0"

In [2]:
# !kaggle competitions download -c quora-insincere-questions-classification
# !unzip ./data/quora-insincere-questions-classification.zip
# !pip install sentencepiece

## 1. Data load

In [73]:
import pandas as pd

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [132]:
len(test)

375806

In [75]:
train = train.iloc[:30000]

## 2. Preprocessing

In [6]:
import re

def alpha_num(text):
    return re.sub(r'[^a-zA-z0-9\s]', '', text)

In [18]:
text_list = train['question_text'].str.lower().apply(alpha_num)
test_text_list = test['question_text'].str.lower().apply(alpha_num)

## 3. Train & validation split

In [21]:
import numpy as np

valid_percent = 0.2

data_len = len(train)
test_index = list(range(len(test)))

valid_index = np.random.choice(range(data_len), int(data_len*valid_percent), replace=False)
train_index = list(set(range(data_len)) - set(valid_index))

train_text_list = [text_list[i] for i in train_index]
valid_text_list = [text_list[i] for i in valid_index]
test_text_list = [test_text_list[i] for i in test_index]

train_label_list = [train['target'].tolist()[i] for i in train_index]
valid_label_list = [train['target'].tolist()[i] for i in valid_index]

## 4. Parsing

In [9]:
import os

save_path = './save'

if not os.path.exists(save_path):
    os.mkdir(save_path)

In [10]:
import sentencepiece as spm

vocab_size = 12000
pad_idx = 0
bos_idx = 1
eos_idx = 2
unk_idx = 3

# 1) Make Korean text to train vocab
with open(f'{save_path}/text.txt', 'w') as f:
    for text in train_text_list:
        f.write(f'{text}\n')


# 2) SentencePiece model training
spm.SentencePieceProcessor()
spm.SentencePieceTrainer.Train(
    f'--input={save_path}/text.txt --model_prefix={save_path}/m_text '
    f'--vocab_size={vocab_size} --character_coverage=0.9995 '
    f'--model_type=bpe --split_by_whitespace=true '
    f'--pad_id={pad_idx} --unk_id={unk_idx} '
    f'--bos_id={bos_idx} --eos_id={eos_idx}'
)

vocab_list = list()
with open(f'{save_path}/m_text.vocab') as f:
    for line in f:
        vocab_list.append(line[:-1].split('\t')[0])
word2id_spm = {w: i for i, w in enumerate(vocab_list)}

In [28]:
# SentencePiece model load
spm_ = spm.SentencePieceProcessor()
spm_.Load(f"{save_path}/m_text.model")

# Tokenizing
train_encoded_list = [[bos_idx] + spm_.EncodeAsIds(text) + [eos_idx] for text in train_text_list]
valid_encoded_list = [[bos_idx] + spm_.EncodeAsIds(text) + [eos_idx] for text in valid_text_list]

## 5. Custom dataset

In [52]:
import torch
from torch.utils.data.dataset import Dataset

class CustomDataset(Dataset):
    def __init__(self, src_list, trg_list, min_len=4, max_len=500):
        data = list()
        for src, trg in zip(src_list, trg_list):
            if min_len <= len(src) <= max_len:
                data.append((src, trg))

        self.data = data
        self.num_data = len(self.data)
    
    def __getitem__(self, index):
        src, trg = self.data[index]
        return src, trg

    def __len__(self):
        return self.num_data

class PadCollate:
    def __init__(self, pad_index=0, dim=0):
        self.dim = dim
        self.pad_index = pad_index

    def pad_collate(self, batch):
        def pad_tensor(vec, max_len, dim):
            pad_size = list(vec.shape)
            pad_size[dim] = max_len - vec.size(dim)
            return torch.cat([vec, torch.LongTensor(*pad_size).fill_(self.pad_index)], dim=dim)

        def pack_sentence(sentences):
            sentences_len = max(map(lambda x: len(x), sentences))
            sentences = [pad_tensor(torch.LongTensor(seq), sentences_len, self.dim) for seq in sentences]
            sentences = torch.cat(sentences)
            sentences = sentences.view(-1, sentences_len)
            return sentences

        src, trg = zip(*batch)
        return pack_sentence(src), torch.LongTensor(trg)

    def __call__(self, batch):
        return self.pad_collate(batch)

## 6. DataLoader

In [60]:
from torch.utils.data import DataLoader

batch_size = 8

dataset_dict = {
    'train': CustomDataset(train_encoded_list, train_label_list, min_len=4, max_len=500),
    'valid': CustomDataset(valid_encoded_list, valid_label_list, min_len=4, max_len=500),
}

dataloader_dict = {
    'train': DataLoader(dataset_dict['train'], collate_fn=PadCollate(), drop_last=True,
                        batch_size=batch_size, num_workers=4, shuffle=True, pin_memory=True),
    'valid': DataLoader(dataset_dict['valid'], collate_fn=PadCollate(), drop_last=True,
                        batch_size=batch_size, num_workers=4, shuffle=True, pin_memory=True),
}

print(f'Total number of trainingsets  iterations - {len(dataset_dict["train"])}, {len(dataloader_dict["train"])}')

Total number of trainingsets  iterations - 24000, 3000


## 7. Build model

In [14]:
from torch import nn
from torch.nn import functional as F

class CustomModel(nn.Module):
    def __init__(self, src_vocab_num, trg_num=2, pad_idx=0, bos_idx=1, eos_idx=2, 
                 max_len=500, d_model=512, d_embedding=256, dropout=0.1):

        super(CustomModel, self).__init__()

        self.src_embedding = nn.Embedding(src_vocab_num, d_embedding, padding_idx=pad_idx)
        self.linear1 = nn.Linear(d_embedding, d_model)
        self.linear2 = nn.Linear(d_model, trg_num)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):

        embedding_out = self.src_embedding(src)
        gap_out = embedding_out.mean(dim=1)
        linear_out = self.dropout(F.gelu(self.linear1(gap_out)))
        logit = self.linear2(linear_out)

        return logit

In [58]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CustomModel(vocab_size, trg_num=2, pad_idx=pad_idx, bos_idx=bos_idx, eos_idx=eos_idx,
                    max_len=500, d_model=512, d_embedding=256, dropout=0.1)
model.to(device)

CustomModel(
  (src_embedding): Embedding(12000, 256, padding_idx=0)
  (linear1): Linear(in_features=256, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

## 8. Optimizer setting

In [16]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

lr = 1e-2

optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, 
                              patience=len(dataloader_dict['train'])/1.5)
criterion = nn.CrossEntropyLoss()

## 9. Training

In [17]:
import time
from torch.nn.utils import clip_grad_norm_

num_epoch = 5

print_freq = 1500
best_val_loss = None

for e in range(num_epoch):
    start_time_e = time.time()
    print(f'Model Fitting: [{e+1}/{num_epoch}]')
    for phase in ['train', 'valid']:
        if phase == 'train':
            model.train()
            freq = 0
        elif phase == 'valid':
            model.eval()
            val_loss = 0
            val_acc = 0

        for i, (src, trg) in enumerate(dataloader_dict[phase]):
            # Optimizer setting
            optimizer.zero_grad()

            # Source, Target sentence setting
            src = src.to(device)
            trg = trg.to(device)

            # Model / Calculate loss
            with torch.set_grad_enabled(phase == 'train'):
                predicted_logit = model(src)
                # If phase train, then backward loss and step optimizer and scheduler
                if phase == 'train':
                    loss = criterion(predicted_logit, trg)
                    loss.backward()
                    clip_grad_norm_(model.parameters(), 5)
                    optimizer.step()
                    scheduler.step(loss)
                    # Print loss value only training
                    if freq == print_freq or freq == 0 or i == len(dataloader_dict['train']):
                        total_loss = loss.item()
                        _, predicted = predicted_logit.max(dim=1)
                        accuracy = sum(predicted == trg).item() / predicted.size(0)
                        print("[Epoch:%d][%d/%d] train_loss:%5.3f | Accuracy:%2.3f | lr:%1.6f | spend_time:%5.2fmin"
                                % (e+1, i, len(dataloader_dict['train']), total_loss, accuracy, 
                                optimizer.param_groups[0]['lr'], (time.time() - start_time_e) / 60))
                        freq = 0
                    freq += 1
                if phase == 'valid':
                    loss = F.cross_entropy(predicted_logit, trg)
                    val_loss += loss.item()
                    _, predicted = predicted_logit.max(dim=1)
                    val_acc += sum(predicted == trg).item() / predicted.size(0)
        # Finishing iteration
        if phase == 'valid':
            val_loss /= len(dataloader_dict['valid'])
            val_acc /= len(dataloader_dict['valid'])
            print("[Epoch:%d] val_loss:%5.3f | Accuracy:%5.2f | spend_time:%5.2fmin"
                    % (e+1, val_loss, val_acc, (time.time() - start_time_e) / 60))
            if not best_val_loss or val_loss < best_val_loss:
                print("[!] saving model...")
                torch.save(model.state_dict(), 
                            os.path.join(save_path, f'model_saved.pt'))
                best_val_loss = val_loss

Model Fitting: [1/5]
[Epoch:1][0/3000] train_loss:0.693 | Accuracy:0.625 | lr:0.010000 | spend_time: 0.03min
[Epoch:1][1500/3000] train_loss:0.340 | Accuracy:0.875 | lr:0.010000 | spend_time: 0.13min
[Epoch:1] val_loss:0.185 | Accuracy: 0.94 | spend_time: 0.28min
[!] saving model...
Model Fitting: [2/5]
[Epoch:2][0/3000] train_loss:0.056 | Accuracy:1.000 | lr:0.010000 | spend_time: 0.01min
[Epoch:2][1500/3000] train_loss:0.016 | Accuracy:1.000 | lr:0.010000 | spend_time: 0.12min
[Epoch:2] val_loss:0.184 | Accuracy: 0.94 | spend_time: 0.27min
[!] saving model...
Model Fitting: [3/5]
[Epoch:3][0/3000] train_loss:0.160 | Accuracy:1.000 | lr:0.010000 | spend_time: 0.01min
[Epoch:3][1500/3000] train_loss:0.644 | Accuracy:0.750 | lr:0.001000 | spend_time: 0.11min
[Epoch:3] val_loss:0.181 | Accuracy: 0.94 | spend_time: 0.27min
[!] saving model...
Model Fitting: [4/5]
[Epoch:4][0/3000] train_loss:0.109 | Accuracy:1.000 | lr:0.000100 | spend_time: 0.01min
[Epoch:4][1500/3000] train_loss:0.105 |

## 10. Submission

In [77]:
%%time

spm_ = spm.SentencePieceProcessor()
spm_.Load(f"{save_path}/m_text.model")

# Tokenizing
test_encoded_list = [[bos_idx] + spm_.EncodeAsIds(text) + [eos_idx] for text in test_text_list]

CPU times: user 17.1 s, sys: 0 ns, total: 17.1 s
Wall time: 18.6 s


In [153]:
class CustomtestDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.num_data = len(self.data)
    
    def __getitem__(self, index):
        src = self.data[index]
        return src

    def __len__(self):
        return self.num_data
    
class test_PadCollate:
    def __init__(self, pad_index=0, dim=0):
        self.dim = dim
        self.pad_index = pad_index

    def pad_collate(self, batch):
        def pad_tensor(vec, max_len, dim):
            pad_size = list(vec.shape)
            pad_size[dim] = max_len - vec.size(dim)
            return torch.cat([vec, torch.LongTensor(*pad_size).fill_(self.pad_index)], dim=dim)

        def pack_sentence(sentences):
            sentences_len = max(map(lambda x: len(x), sentences))
            sentences = [pad_tensor(torch.LongTensor(seq), sentences_len, self.dim) for seq in sentences]
            sentences = torch.cat(sentences)
            sentences = sentences.view(-1, sentences_len)
            return sentences

        src = batch
        return pack_sentence(src)

    def __call__(self, batch):
        return self.pad_collate(batch)

In [184]:
dataset_test_dict = {
    'test' : CustomtestDataset(test_encoded_list)
}

dataloader_test_dict = {
    'test' : DataLoader(dataset_test_dict['test'], collate_fn=test_PadCollate(), drop_last=False,
                       batch_size = 1024, num_workers=4, shuffle=False)
}

In [185]:
# drop

len(dataloader_test_dict['test'])

367

In [254]:
from tqdm import tqdm

eval_list = []
model.eval()
for src in dataloader_test_dict['test']:
    src = src.to(device)
    predicted_logit = model(src)
    _, prediction = predicted_logit.max(dim=1)
    eval_list.append(prediction.tolist())
    

In [264]:
flatten_list = sum(eval_list, [])

In [267]:
sample_submission['prediction'] = flatten_list

In [269]:
sample_submission.to_csv("submission.csv")

In [271]:
!kaggle competitions list

ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
contradictory-my-dear-watson                   2030-07-01 23:59:00  Getting Started     Prizes         97           False  
gan-getting-started                            2030-07-01 23:59:00  Getting Started     Prizes        202           False  
tpu-getting-started                            2030-06-03 23:59:00  Getting Started  Knowledge        371           False  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       2565           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge      19936           False  
house-prices-advanced-regression-techniques    2030-01-01 00:00:00  Getting Started  Knowledge       5258           False  
connectx