## 0. GPU Setting

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="0"

## 1. Data load

In [2]:
import pandas as pd

train = pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
test = pd.read_csv('../input/quora-insincere-questions-classification/test.csv')

In [3]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [5]:
train = train[:30000]

## 2. Preprocessing

In [6]:
import re

def alpha_num(text):
    return re.sub(r'[^a-zA-z0-9\s]', '', text)

In [7]:
text_list = train['question_text'].str.lower().apply(alpha_num)
test_text_list = test['question_text'].str.lower().apply(alpha_num)

## 3. Train & validation split

In [8]:
import numpy as np

valid_percent = 0.2

data_len = len(train)
valid_index = np.random.choice(range(data_len), int(data_len*valid_percent), replace=False)
train_index = list(set(range(data_len)) - set(valid_index))
test_index = list(range(len(test)))

train_text_list = [text_list[i] for i in train_index]
valid_text_list = [text_list[i] for i in valid_index]
valid_text_list = [test_text_list[i] for i in test_index]
train_label_list = [train['target'].tolist()[i] for i in train_index]
valid_label_list = [train['target'].tolist()[i] for i in valid_index]

## 4. Parsing

In [9]:
import os

save_path = './save'

if not os.path.exists(save_path):
    os.mkdir(save_path)

In [10]:
import sentencepiece as spm

vocab_size = 12000
pad_idx = 0
bos_idx = 1
eos_idx = 2
unk_idx = 3

# 1) Make Korean text to train vocab
with open(f'{save_path}/text.txt', 'w') as f:
    for text in train_text_list:
        f.write(f'{text}\n')


# 2) SentencePiece model training
spm.SentencePieceProcessor()
spm.SentencePieceTrainer.Train(
    f'--input={save_path}/text.txt --model_prefix={save_path}/m_text '
    f'--vocab_size={vocab_size} --character_coverage=0.9995 '
    f'--model_type=bpe --split_by_whitespace=true '
    f'--pad_id={pad_idx} --unk_id={unk_idx} '
    f'--bos_id={bos_idx} --eos_id={eos_idx}'
)

vocab_list = list()
with open(f'{save_path}/m_text.vocab') as f:
    for line in f:
        vocab_list.append(line[:-1].split('\t')[0])
word2id_spm = {w: i for i, w in enumerate(vocab_list)}

In [11]:
# SentencePiece model load
spm_ = spm.SentencePieceProcessor()
spm_.Load(f"{save_path}/m_text.model")

# Tokenizing
train_encoded_list = [[bos_idx] + spm_.EncodeAsIds(text) + [eos_idx] for text in train_text_list]
valid_encoded_list = [[bos_idx] + spm_.EncodeAsIds(text) + [eos_idx] for text in valid_text_list]
test_encoded_list = [[bos_idx] + spm_.EncodeAsIds(text) + [eos_idx] for text in test_text_list]

## 5. Custom dataset

In [12]:
import torch
from torch.utils.data.dataset import Dataset

class CustomDataset(Dataset):
    def __init__(self, src_list, trg_list, min_len=4, max_len=500):
        data = list()
        for src, trg in zip(src_list, trg_list):
            if min_len <= len(src) <= max_len:
                data.append((src, trg))

        self.data = data
        self.num_data = len(self.data)
    
    def __getitem__(self, index):
        src, trg = self.data[index]
        return src, trg

    def __len__(self):
        return self.num_data

class PadCollate:
    def __init__(self, pad_index=0, dim=0):
        self.dim = dim
        self.pad_index = pad_index

    def pad_collate(self, batch):
        def pad_tensor(vec, max_len, dim):
            pad_size = list(vec.shape)
            pad_size[dim] = max_len - vec.size(dim)
            return torch.cat([vec, torch.LongTensor(*pad_size).fill_(self.pad_index)], dim=dim)

        def pack_sentence(sentences):
            sentences_len = max(map(lambda x: len(x), sentences))
            sentences = [pad_tensor(torch.LongTensor(seq), sentences_len, self.dim) for seq in sentences]
            sentences = torch.cat(sentences)
            sentences = sentences.view(-1, sentences_len)
            return sentences

        src, trg = zip(*batch)
        return pack_sentence(src), torch.LongTensor(trg)

    def __call__(self, batch):
        return self.pad_collate(batch)

## 6. DataLoader

In [13]:
from torch.utils.data import DataLoader

batch_size = 8

dataset_dict = {
    'train': CustomDataset(train_encoded_list, train_label_list, min_len=4, max_len=500),
    'valid': CustomDataset(valid_encoded_list, valid_label_list, min_len=4, max_len=500)
}

dataloader_dict = {
    'train': DataLoader(dataset_dict['train'], collate_fn=PadCollate(), drop_last=True,
                        batch_size=batch_size, num_workers=4, shuffle=True, pin_memory=True),
    'valid': DataLoader(dataset_dict['valid'], collate_fn=PadCollate(), drop_last=True,
                        batch_size=batch_size, num_workers=4, shuffle=True, pin_memory=True)
}

print(f'Total number of trainingsets  iterations - {len(dataset_dict["train"])}, {len(dataloader_dict["train"])}')

Total number of trainingsets  iterations - 24000, 3000


## 7. Build model

In [14]:
from torch import nn
from torch.nn import functional as F

class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, src_vocab_num, d_embedding=256, trg_num=2, dropout=0.5):

        super(GRU, self).__init__()
        
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.src_embedding = nn.Embedding(src_vocab_num, d_embedding, padding_idx=pad_idx)
        self.dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(d_embedding, self.hidden_dim,
                         num_layers=self.n_layers,
                         batch_first=True)
        self.linear = nn.Linear(self.hidden_dim, trg_num)
        

    def forward(self, src):

        x = self.src_embedding(src)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.gru(x, h_0)
        h_t = x[:,-1,:]
        self.dropout(h_t)
        logit = self.linear(h_t)

        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GRU(1, 256, vocab_size, d_embedding=256, trg_num=2, dropout=0.1)
model.to(device)

GRU(
  (src_embedding): Embedding(12000, 256, padding_idx=0)
  (dropout): Dropout(p=0.5, inplace=False)
  (gru): GRU(256, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=2, bias=True)
)

## 8. Optimizer setting

In [16]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

lr = 1e-2

optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, 
                              patience=len(dataloader_dict['train'])/1.5)
criterion = nn.CrossEntropyLoss()

## 9. Training

In [17]:
import time
from torch.nn.utils import clip_grad_norm_

num_epoch = 5

print_freq = 1500
best_val_loss = None

for e in range(num_epoch):
    start_time_e = time.time()
    print(f'Model Fitting: [{e+1}/{num_epoch}]')
    for phase in ['train', 'valid']:
        if phase == 'train':
            model.train()
            freq = 0
        if phase == 'valid':
            model.eval()
            val_loss = 0
            val_acc = 0

        for i, (src, trg) in enumerate(dataloader_dict[phase]):
            # Optimizer setting
            optimizer.zero_grad()

            # Source, Target sentence setting
            src = src.to(device)
            trg = trg.to(device)

            # Model / Calculate loss
            with torch.set_grad_enabled(phase == 'train'):
                predicted_logit = model(src)

                # If phase train, then backward loss and step optimizer and scheduler
                if phase == 'train':
                    loss = criterion(predicted_logit, trg)
                    loss.backward()
                    clip_grad_norm_(model.parameters(), 5)
                    optimizer.step()
                    scheduler.step(loss)

                    # Print loss value only training
                    if freq == print_freq or freq == 0 or i == len(dataloader_dict['train']):
                        total_loss = loss.item()
                        _, predicted = predicted_logit.max(dim=1)
                        accuracy = sum(predicted == trg).item() / predicted.size(0)
                        print("[Epoch:%d][%d/%d] train_loss:%5.3f | Accuracy:%2.3f | lr:%1.6f | spend_time:%5.2fmin"
                                % (e+1, i, len(dataloader_dict['train']), total_loss, accuracy, 
                                optimizer.param_groups[0]['lr'], (time.time() - start_time_e) / 60))
                        freq = 0
                    freq += 1
                if phase == 'valid':
                    loss = F.cross_entropy(predicted_logit, trg)
                    val_loss += loss.item()
                    _, predicted = predicted_logit.max(dim=1)
                    val_acc += sum(predicted == trg).item() / predicted.size(0)
        # Finishing iteration
        if phase == 'valid':
            val_loss /= len(dataloader_dict['valid'])
            val_acc /= len(dataloader_dict['valid'])
            print("[Epoch:%d] val_loss:%5.3f | Accuracy:%5.2f | spend_time:%5.2fmin"
                    % (e+1, val_loss, val_acc, (time.time() - start_time_e) / 60))
            if not best_val_loss or val_loss < best_val_loss:
                print("[!] saving model...")
                torch.save(model.state_dict(), 
                            os.path.join(save_path, f'model_saved.pt'))
                best_val_loss = val_loss

Model Fitting: [1/10]
[Epoch:1][0/3000] train_loss:0.678 | Accuracy:0.750 | lr:0.010000 | spend_time: 0.00min
[Epoch:1][1500/3000] train_loss:0.068 | Accuracy:1.000 | lr:0.010000 | spend_time: 0.13min
[Epoch:1] val_loss:0.225 | Accuracy: 0.94 | spend_time: 0.32min
[!] saving model...
Model Fitting: [2/10]
[Epoch:2][0/3000] train_loss:0.405 | Accuracy:0.875 | lr:0.010000 | spend_time: 0.00min
[Epoch:2][1500/3000] train_loss:0.089 | Accuracy:1.000 | lr:0.001000 | spend_time: 0.14min
[Epoch:2] val_loss:0.230 | Accuracy: 0.94 | spend_time: 0.31min
Model Fitting: [3/10]
[Epoch:3][0/3000] train_loss:0.052 | Accuracy:1.000 | lr:0.000100 | spend_time: 0.00min
[Epoch:3][1500/3000] train_loss:0.052 | Accuracy:1.000 | lr:0.000010 | spend_time: 0.13min
[Epoch:3] val_loss:0.228 | Accuracy: 0.94 | spend_time: 0.30min
Model Fitting: [4/10]
[Epoch:4][0/3000] train_loss:0.066 | Accuracy:1.000 | lr:0.000010 | spend_time: 0.00min
[Epoch:4][1500/3000] train_loss:0.435 | Accuracy:0.875 | lr:0.000001 | spen

## 10. Prediction

In [18]:
class CustomDataset_test(Dataset):
    def __init__(self, src_list, min_len=4, max_len=500):
        data = src_list
        self.data = data
        self.num_data = len(self.data)
    
    def __getitem__(self, index):
        src = self.data[index]
        return src

    def __len__(self):
        return self.num_data

class PadCollate_test:
    def __init__(self, pad_index=0, dim=0):
        self.dim = dim
        self.pad_index = pad_index

    def pad_collate(self, batch):
        def pad_tensor(vec, max_len, dim):
            pad_size = list(vec.shape)
            pad_size[dim] = max_len - vec.size(dim)
            return torch.cat([vec, torch.LongTensor(*pad_size).fill_(self.pad_index)], dim=dim)

        def pack_sentence(sentences):
            sentences_len = max(map(lambda x: len(x), sentences))
            sentences = [pad_tensor(torch.LongTensor(seq), sentences_len, self.dim) for seq in sentences]
            sentences = torch.cat(sentences)
            sentences = sentences.view(-1, sentences_len)
            return sentences

        src = batch
        return pack_sentence(src)

    def __call__(self, batch):
        return self.pad_collate(batch)

In [19]:
test_dataset_dict = {
    'test': CustomDataset_test(test_encoded_list, min_len=4, max_len=500)
}

test_dataloader_dict = {
    'test': DataLoader(test_dataset_dict['test'], collate_fn=PadCollate_test(), drop_last=False,
                        batch_size=batch_size, num_workers=4, shuffle=False, pin_memory=False)
}

print(f'Total number of testsets  iterations - {len(test_dataset_dict["test"])}, {len(test_dataloader_dict["test"])}')

Total number of testsets  iterations - 375806, 46976


In [20]:
model.eval()
pred=[]
for src in test_dataloader_dict['test']:
    optimizer.zero_grad()

    src = src.to(device)
    predicted_logit = model(src)
    _, predicted = predicted_logit.max(dim=1)
    pred.append(predicted.tolist())

In [21]:
prediction = [j for i in pred for j in i]

In [23]:
prediction = [0 for i in range(375806)]

In [24]:
submission = pd.DataFrame({
        "qid": test["qid"],
        "prediction": prediction
    })
submission.to_csv('submission.csv',index=False)

ValueError: array length 37806 does not match index length 375806