## IMPORTS 

In [1]:
import random
import pandas as pd
import numpy as np
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score
import torchtext


SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# load spacy tokenizer
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])

## LOAD PROCESSED TRAINING DATA FROM DISK

In [2]:
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")

In [3]:
sin = len(df_train[df_train["target"]==0])
insin = len(df_train[df_train["target"]==1])
persin = (sin/(sin+insin))*100
perinsin = (insin/(sin+insin))*100            
print("# insincere questions: {:,}({:.2f}%) and # sincere questions: {:,}({:.2f}%)".format(sin,persin,insin,perinsin))
# print("Sinsere:{}% Insincere: {}%".format(round(persin,2),round(perinsin,2)))
print("# Test samples: {:,}({:.3f}% of train samples)".format(len(df_test),len(df_test)/len(df_train)))

# insincere questions: 1,225,312(93.81%) and # sincere questions: 80,810(6.19%)
# Test samples: 56,370(0.043% of train samples)


## Normalization

In [4]:
df_train["question_text"] = df_train["question_text"].fillna("_na_").values
df_test["question_text"] = df_test["question_text"].fillna("_na_").values

df_train.to_csv("train2.csv")
df_test.to_csv("test2.csv")

In [5]:
df_train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


## Create a pytorch dataset from the train samples and build a vocabulary using embedding vectors

In [6]:
# # load dataframe to csv

TEXT = data.Field(lower=True, batch_first=True,tokenize='spacy')#preprocessing=generate_bigrams)
LABEL = data.LabelField(dtype=torch.float)
qid = data.Field()

train_dataset = data.TabularDataset(path='train2.csv', format='csv',
                                      fields={'question_text': ('text',TEXT),
                                              'target': ('label',LABEL)})
final_test_dataset = data.TabularDataset(path='test2.csv', format='csv',
                                     fields={'qid': ('qid', qid),
                                             'question_text': ('text', TEXT)})

In [7]:
TEXT.build_vocab(train_dataset, final_test_dataset, min_freq=3)
qid.build_vocab(final_test_dataset)

In [8]:
import torchtext
vec = torchtext.vocab.Vectors('../input/embeddings/glove.840B.300d/glove.840B.300d.txt', cache='./cache/')
# vec = torchtext.vocab.Vectors('wiki-news-300d-1M/wiki-news-300d-1M.vec', cache='./cache/')
TEXT.vocab.load_vectors(vec)

In [9]:
# TEXT.build_vocab(train_dataset, max_size=50000, vectors=vec)
LABEL.build_vocab(train_dataset)
TEXT.vocab.vectors.shape

torch.Size([72017, 300])

## SPLIT DATA TO TRAINiNG AND VALIDATION SETS

In [10]:
# train_dataset, test_dataset = train_dataset.split(split_ratio=0.9,random_state=random.seed(SEED))
train_dataset, valid_dataset = train_dataset.split(split_ratio=0.9,random_state=random.seed(SEED))

In [11]:
BATCH_SIZE = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator = data.BucketIterator(
    train_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),shuffle=True,sort=False, 
    device=device)
# test_iterator = data.BucketIterator(
#     test_dataset, 
#     batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),train=False,sort=False,
#     device=device)
valid_iterator = data.BucketIterator(
    valid_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),train=False,sort=False, 
    device=device)

In [12]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
NEG_INF = -10000
TINY_FLOAT = 1e-6

def mask_softmax(matrix, mask=None):

    if mask is None:
        result = F.softmax(matrix, dim=-1)
    else:
        mask_norm = ((1 - mask) * NEG_INF).to(matrix)
        for i in range(matrix.dim() - mask_norm.dim()):
            mask_norm = mask_norm.unsqueeze(1)
        result = F.softmax(matrix + mask_norm, dim=-1)

    return result


def mask_mean(seq, mask=None):

    if mask is None:
        return torch.mean(seq, dim=1)

    mask_sum = torch.sum(  # [b,msl,nc]->[b,nc]
        seq * mask.unsqueeze(-1).float(), dim=1)
    seq_len = torch.sum(mask, dim=-1)  # [b]
    mask_mean = mask_sum / (seq_len.unsqueeze(-1).float() + TINY_FLOAT)

    return mask_mean


def mask_max(seq, mask=None):

    if mask is None:
        return torch.mean(seq, dim=1)

    torch
    mask_max, _ = torch.max(  # [b,msl,nc]->[b,nc]
        seq + (1 - mask.unsqueeze(-1).float()) * NEG_INF,
        dim=1)

    return mask_max

class DynamicLSTM(nn.Module):

    def __init__(self, input_size, hidden_size=100,
                 num_layers=1, dropout=0., bidirectional=False):
        super(DynamicLSTM, self).__init__()

        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers, bias=True,
            batch_first=True, dropout=dropout, bidirectional=bidirectional)

    def forward(self, x, seq_lens):
        # sort input by descending length
        _, idx_sort = torch.sort(seq_lens, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        x_sort = torch.index_select(x, dim=0, index=idx_sort)
        seq_lens_sort = torch.index_select(seq_lens, dim=0, index=idx_sort)

        # pack input
        x_packed = pack_padded_sequence(
            x_sort, seq_lens_sort, batch_first=True)

        # pass through rnn
        y_packed, _ = self.lstm(x_packed)

        # unpack output
        y_sort, length = pad_packed_sequence(y_packed, batch_first=True)

        # unsort output to original order
        y = torch.index_select(y_sort, dim=0, index=idx_unsort)

        return y

def seq_mask(seq_len, max_len):

    idx = torch.arange(max_len).to(seq_len).repeat(seq_len.size(0), 1)
    mask = torch.gt(seq_len.unsqueeze(1), idx).to(seq_len)

    return mask

class BiLSTM(nn.Module):
    def __init__(self, pretrained_lm, padding_idx, static=True, hidden_dim=128, lstm_layer=2, dropout=0.2):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding.from_pretrained(pretrained_lm)
        self.embedding.padding_idx = padding_idx
        if static:
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=lstm_layer, 
                            dropout = dropout,
                            bidirectional=True)
        self.hidden2label = nn.Linear(hidden_dim*lstm_layer*2, 1)

    
    def forward(self, sents):
        x = self.embedding(sents)
        x = self.dropout(x)

        x = torch.transpose(x, dim0=1, dim1=0)
#         self.out = nn.Linear(hidden_dim, num_classes)
        lstm_out, (h_n, c_n) = self.lstm(x)

        y = self.hidden2label(self.dropout(torch.cat([c_n[i,:, :] for i in range(c_n.shape[0])], dim=1)))
        return y

In [241]:
class BiLSTM_att(nn.Module):
    def __init__(self, pretrained_lm, padding_idx, static=True, hidden_dim=128, lstm_layer=2, dropout=0.2):
        super(BiLSTM_att, self).__init__()
        
        num_classes = 1

        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding.from_pretrained(pretrained_lm)
        self.embedding.padding_idx = padding_idx
        if static:
            self.embedding.weight.requires_grad = False
        self.hidden2label = nn.Linear(hidden_dim*lstm_layer*2, 1)
        
        self.rnn = DynamicLSTM(
            self.embedding.embedding_dim, hidden_dim, num_layers=lstm_layer,
            dropout=dropout, bidirectional=True)
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=lstm_layer, 
                            dropout = dropout,
                            bidirectional=True)


        self.fc_att = nn.Linear(hidden_dim * 2, 1)

        self.fc = nn.Linear(hidden_dim * 6, hidden_dim)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_dim, 512)
        self.out2 = nn.Linear(1024, 512)
        self.loss = nn.BCEWithLogitsLoss()



    
    def forward(self, word_seq,seq_len):
        # mask
        max_seq_len = torch.max(seq_len)
        mask = seq_mask(seq_len, max_seq_len)  # [b,msl]
        
        x = self.embedding(word_seq)
        x = self.dropout(x)
        x = torch.transpose(x, dim0=1, dim1=0)
#         self.out = nn.Linear(hidden_dim, num_classes)
        lstm_out, (h_n, c_n) = self.lstm(x)

        y = self.hidden2label(self.dropout(torch.cat([c_n[i,:, :] for i in range(c_n.shape[0])], dim=1)))
#         print(y.shape)# [512, B]
        r = self.rnn(x, seq_len.cuda())  # [b,msl,e]->[b,msl,h*2]
 
        # attention
        att = self.fc_att(r).squeeze(-1)  # [b,msl,h*2]->[b,msl]
        att = mask_softmax(att, mask)  # [b,msl]
        r_att = torch.sum(att.unsqueeze(-1) * r, dim=1)  # [b,h*2]

        # pooling
        r_avg = mask_mean(r, mask)  # [b,h*2]
        r_max = mask_max(r, mask)  # [b,h*2]
        r = torch.cat([r_avg, r_max, r_att], dim=-1)  # [b,h*6]

        # feed-forward
        f = self.drop(self.act(self.fc(r)))  # [b,h*6]->[b,h]
        logits = self.out(f).squeeze(-1)  # [b,h]->[b]
        print(logits.shape)
        return logits


In [242]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
OUTPUT_DIM = 1
DROPOUT = 0.5

# model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
model = BiLSTM_att(TEXT.vocab.vectors, lstm_layer=2, padding_idx=TEXT.vocab.stoi[TEXT.pad_token], hidden_dim=128,dropout=0.2).cuda()

print(model)

BiLSTM_att(
  (dropout): Dropout(p=0.2)
  (embedding): Embedding(72017, 300, padding_idx=1)
  (hidden2label): Linear(in_features=512, out_features=1, bias=True)
  (rnn): DynamicLSTM(
    (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (lstm): LSTM(300, 128, num_layers=2, dropout=0.2, bidirectional=True)
  (fc_att): Linear(in_features=256, out_features=1, bias=True)
  (fc): Linear(in_features=768, out_features=128, bias=True)
  (act): ReLU()
  (drop): Dropout(p=0.2)
  (out): Linear(in_features=128, out_features=512, bias=True)
  (out2): Linear(in_features=1024, out_features=512, bias=True)
  (loss): BCEWithLogitsLoss()
)


In [243]:
N_EPOCHS = 6

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion,e=epoch)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    if stop_training == 1:
        break

torch.Size([2, 512])


ValueError: Target size (torch.Size([512])) must be the same as input size (torch.Size([2, 512]))

In [227]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0869,  0.1916,  0.1091,  ..., -0.0152,  0.1111,  0.2065],
        ...,
        [-0.2523, -0.1560, -0.0008,  ...,  0.0601,  0.3452,  0.2371],
        [ 0.0202,  0.1975, -0.0793,  ...,  0.0901, -0.6364,  0.2416],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

### Training

In [228]:
import torch.optim as optim

# optimizer = optim.Adam(model.parameters())
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                    lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y, th = 0.5):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

def f1_score_model(preds, y,th = 0.5):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded = torch.sigmoid(preds).cpu().apply_(lambda x: 1 if x>=th else 0)
    rounded_preds = rounded

    return f1_score(y.cpu().numpy(),rounded_preds.cpu().numpy())


In [234]:
stop_training = 0
warmup_epoch = 2
step = 0
max_loss = 1e5
no_improve_in_previous_epoch = False
no_improve_epoch = 0
fine_tuning = False
train_record = []
val_record = []
losses = []

def train(model, iterator, optimizer, criterion,e):
    
    epoch_loss = 0
    epoch_acc = 0

    global warm_epoch
    global no_improve_in_previous_epoch
    global fine_tuning
    global step
    global max_loss
    global stop_training
    global no_improve_epoch
    global train_record
    global val_record
    global losses
    
    model.train()
    if e >= warmup_epoch:
        if no_improve_in_previous_epoch:
            no_improve_epoch += 1
            if no_improve_epoch >= 1:
                stop_training = 1
        else:
            no_improve_epoch = 0
        no_improve_in_previous_epoch = True
    if stop_training == 0:    
        if not fine_tuning and e >= warmup_epoch:
            model.embedding.weight.requires_grad = True        
            fine_tuning = True

        for batch in iterator:
            step += 1
            model.train()
            optimizer.zero_grad()
            predictions = model(batch.text,torch.tensor(batch.text.shape).cuda()).squeeze(1)       

            loss = model.loss(predictions, batch.label.float())

            acc = binary_accuracy(predictions, batch.label)

            loss.backward()

            optimizer.step()
            if step % 500 == 0:
                model.eval()
                model.zero_grad()
                val_loss = []
                for val_batch in iter(valid_iterator):
                    val_x = val_batch.text.cuda()
                    val_y = val_batch.label.type(torch.Tensor).cuda()
                    val_pred = model.forward(val_x,torch.tensor(val_x.shape).cuda()).view(-1)
                    val_loss.append(model.loss(val_pred, val_y.float()).cpu().data.numpy())
                val_record.append({'step': step, 'loss': np.mean(val_loss)})
                print('epoch {:02} - step {:06} - train_loss {:.4f} - val_loss {:.4f} '.format(
                            e+1, step, np.mean(losses), val_record[-1]['loss']))
                if e >= warmup_epoch:
                    if val_record[-1]['loss'] <= max_loss:
                        save(m=model, info={'step': step, 'epoch': e+1, 'train_loss': np.mean(losses),
                                            'val_loss': val_record[-1]['loss']})
                        max_loss = val_record[-1]['loss']
                        no_improve_in_previous_epoch = False

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text,torch.tensor(batch.text.shape).cuda()).squeeze(1)
            
            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate_f1(model, iterator, criterion,th=0.5):
    
    f1_scores = []
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text,torch.tensor(batch.text.shape).cuda()).squeeze(1)
            
            f1 = f1_score_model(predictions, batch.label,th=th)
            f1_scores.append(f1)
        
    return np.array(f1_scores).mean()


def save(m, info):
    torch.save(info, 'best_model.info')
    torch.save(m, 'best_model.m')
    
def load():
    m = torch.load('best_model.m')
    info = torch.load('best_model.info')
    return m, info

In [235]:
N_EPOCHS = 6

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion,e=epoch)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    if stop_training == 1:
        break

ValueError: Target size (torch.Size([52])) must be the same as input size (torch.Size([512]))

In [None]:
model, m_info = load()
m_info

In [None]:
test_loss, test_acc = evaluate(model, valid_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

In [None]:
score = evaluate_f1(model, valid_iterator, criterion)
print(f'| f1 score: {score:.3f}')

In [None]:
model.eval()
val_pred = []
val_true = []
valid_iterator.init_epoch()
for val_batch in iter(valid_iterator):
    val_x = val_batch.text.cuda()
    val_true += val_batch.label.cpu().data.numpy().tolist()
    val_pred += torch.sigmoid(model.forward(val_x).view(-1)).cpu().data.numpy().tolist()

In [None]:
tmp = [0,0,0] # idx, cur, max
delta = 0
for tmp[0] in np.arange(0.1, 0.501, 0.01):
    tmp[1] = f1_score(val_true, np.array(val_pred)>tmp[0])
    if tmp[1] > tmp[2]:
        delta = tmp[0]
        tmp[2] = tmp[1]
print('best threshold is {:.4f} with F1 score: {:.4f}'.format(delta, tmp[2]))

In [None]:
model.eval()
model.zero_grad()
test_pred = []
test_id = []

final_test_iterator = torchtext.data.BucketIterator(dataset=final_test_dataset,
                                    batch_size=BATCH_SIZE,
                                    sort_key=lambda x: x.text.__len__(),train=False,sort=False)


for test_batch in iter(final_test_iterator):
    test_x = test_batch.text.cuda()
    test_pred += torch.sigmoid(model.forward(test_x).view(-1)).cpu().data.numpy().tolist()
    test_id += test_batch.qid.view(-1).data.numpy().tolist()

In [None]:
sub_df =pd.DataFrame()
sub_df['qid'] = [qid.vocab.itos[i] for i in test_id]
sub_df['prediction'] = (np.array(test_pred) >= delta).astype(int)

In [None]:
sub_df.to_csv("submission.csv", index=False)