## IMPORTS 

In [51]:
import random
import pandas as pd
import numpy as np
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score
import torchtext


SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# load spacy tokenizer
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])

## LOAD PROCESSED TRAINING DATA FROM DISK

In [52]:
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")

In [54]:
sin = len(df_train[df_train["target"]==0])
insin = len(df_train[df_train["target"]==1])
persin = (sin/(sin+insin))*100
perinsin = (insin/(sin+insin))*100            
print("# insincere questions: {:,}({:.2f}%) and # sincere questions: {:,}({:.2f}%)".format(sin,persin,insin,perinsin))
# print("Sinsere:{}% Insincere: {}%".format(round(persin,2),round(perinsin,2)))
print("# Test samples: {:,}({:.3f}% of train samples)".format(len(df_test),len(df_test)/len(df_train)))

# insincere questions: 1,225,312(93.81%) and # sincere questions: 80,810(6.19%)
# Test samples: 56,370(0.043% of train samples)


## Normalization

In [None]:
df_train["question_text"] = df_train["question_text"].str.lower()
df_test["question_text"] = df_test["question_text"].str.lower()

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

morepuncts = "".join(puncts)

In [None]:
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in morepuncts:
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

def clean_numbers(x):

    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)


df_train["question_text"] = df_train["question_text"].fillna("_na_").values
df_test["question_text"] = df_test["question_text"].fillna("_na_").values

df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: x.split())
df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: x.split())

df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: clean_text(x))
df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: clean_text(x))

df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: clean_numbers(x))
df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: clean_numbers(x))

df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: replace_typical_misspell(x))

df_train.to_csv("train2.csv")
df_test.to_csv("test2.csv")

Progress: 100%|██████████| 1306122/1306122 [00:05<00:00, 239952.17it/s]
Progress: 100%|██████████| 56370/56370 [00:00<00:00, 719036.66it/s]
Progress: 100%|██████████| 1306122/1306122 [00:34<00:00, 37987.77it/s]
Progress: 100%|██████████| 56370/56370 [00:01<00:00, 38068.51it/s]
Progress: 100%|██████████| 1306122/1306122 [00:14<00:00, 88115.56it/s]
Progress: 100%|██████████| 56370/56370 [00:00<00:00, 86844.01it/s]
Progress: 100%|██████████| 1306122/1306122 [00:04<00:00, 318231.99it/s]
Progress: 100%|██████████| 56370/56370 [00:00<00:00, 306066.39it/s]


## Create a pytorch dataset from the train samples and build a vocabulary using embedding vectors

In [None]:
# # load dataframe to csv

TEXT = data.Field(lower=True, batch_first=True,tokenize='spacy')#preprocessing=generate_bigrams)
LABEL = data.LabelField(dtype=torch.float)
qid = data.Field()

train_dataset = data.TabularDataset(path='train2.csv', format='csv',
                                      fields={'question_text': ('text',TEXT),
                                              'target': ('label',LABEL)})
final_test_dataset = data.TabularDataset(path='test2.csv', format='csv',
                                     fields={'qid': ('qid', qid),
                                             'question_text': ('text', TEXT)})

In [None]:
TEXT.build_vocab(train_dataset, final_test_dataset, min_freq=3)
qid.build_vocab(final_test_dataset)

In [None]:
import torchtext
vec = torchtext.vocab.Vectors('../input/embeddings/glove.840B.300d/glove.840B.300d.txt', cache='./cache/')
# vec = torchtext.vocab.Vectors('wiki-news-300d-1M/wiki-news-300d-1M.vec', cache='./cache/')
TEXT.vocab.load_vectors(vec)

In [None]:
# TEXT.build_vocab(train_dataset, max_size=50000, vectors=vec)
LABEL.build_vocab(train_dataset)
TEXT.vocab.vectors.shape

## SPLIT DATA TO TRAINiNG AND VALIDATION SETS

In [None]:
train_dataset, test_dataset = train_dataset.split(split_ratio=0.9,random_state=random.seed(SEED))
train_dataset, valid_dataset = train_dataset.split(split_ratio=0.9,random_state=random.seed(SEED))

In [None]:
BATCH_SIZE = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator = data.BucketIterator(
    train_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),shuffle=True,sort=False, 
    device=device)
test_iterator = data.BucketIterator(
    test_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),train=False,sort=False,
    device=device)
valid_iterator = data.BucketIterator(
    valid_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),train=False,sort=False, 
    device=device)

In [None]:
# from https://discuss.pytorch.org/t/self-attention-on-words-and-masking/5671/4
class SelfAttention(nn.Module):
    def __init__(self, hidden_size, batch_first=False):
        super(SelfAttention, self).__init__()

        self.hidden_size = hidden_size
        self.batch_first = batch_first

        self.att_weights = nn.Parameter(torch.Tensor(1, hidden_size), requires_grad=True)

        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.att_weights:
            nn.init.uniform_(weight, -stdv, stdv)

    def get_mask(self):
        pass

    def forward(self, inputs, lengths):
        if self.batch_first:
            batch_size, max_len = inputs.size()[:2]
        else:
            max_len, batch_size = inputs.size()[:2]
            
        # apply attention layer
        weights = torch.bmm(inputs,
                            self.att_weights  # (1, hidden_size)
                            .permute(1, 0)  # (hidden_size, 1)
                            .unsqueeze(0)  # (1, hidden_size, 1)
                            .repeat(batch_size, 1, 1) # (batch_size, hidden_size, 1)
                            )
    
        attentions = torch.softmax(F.relu(weights.squeeze()), dim=-1)

        # create mask based on the sentence lengths
        mask = torch.ones(attentions.size(), requires_grad=True).cuda()
        for i, l in enumerate(lengths):  # skip the first sentence
            if l < max_len:
                mask[i, l:] = 0

        # apply mask and renormalize attention scores (weights)
        masked = attentions * mask
        _sums = masked.sum(-1).unsqueeze(-1)  # sums per row
        
        attentions = masked.div(_sums)

        # apply attention weights
        weighted = torch.mul(inputs, attentions.unsqueeze(-1).expand_as(inputs))

        # get the final fixed vector representations of the sentences
        representations = weighted.sum(1).squeeze()

        return representations, attentions

class BaselineLSTM(nn.Module):
    def __init__(self, embedding):
        super(BaselineLSTM, self).__init__()
                
        self.embedding = nn.Embedding.from_pretrained(embedding)
        
        self.lstm = nn.LSTM(input_size=300, hidden_size=128, num_layers=2, batch_first=True, bidirectional=True)
        
        self.attention = SelfAttention(128*2, batch_first=True)
        
        self.fc = nn.Linear(128*2, 1)
        self.logit = nn.Linear(1, 1)

    def forward(self,x, x_len):
        x = self.embedding(x)
        x = nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first=True)

        out, (hidden, _) = self.lstm(x)
        
        x, lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        
        x, _ = self.attention(x, lengths) 
        
        x = self.fc(x)
        x = self.logit(x).view(-1)
        
        return x

In [None]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
OUTPUT_DIM = 1
DROPOUT = 0.5

# model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
model = BaselineLSTM(pretrained_embedding).to(device)

optimizer = optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)

criterion = nn.BCEWithLogitsLoss()

get_n_params(model)

In [None]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

### Training

In [None]:
import torch.optim as optim

# optimizer = optim.Adam(model.parameters())
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                    lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y, th = 0.5):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

def f1_score_model(preds, y,th = 0.5):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded = torch.sigmoid(preds).cpu().apply_(lambda x: 1 if x>=th else 0)
    rounded_preds = rounded

    return f1_score(y.cpu().numpy(),rounded_preds.cpu().numpy())


In [None]:
def train(model, iterator, optimizer, criterion,e):
    
    epoch_loss = 0
    epoch_acc = 0
    
    step = 0
    max_loss = 1e5
    no_improve_epoch = 0
    no_improve_in_previous_epoch = False
    fine_tuning = False
    train_record = []
    val_record = []
    losses = []

    model.train()
    
    for batch in iterator:
        step += 1
        model.train()
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)       

        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        if step % 500 == 0:
            model.eval()
            model.zero_grad()
            val_loss = []
            for val_batch in iter(valid_iterator):
                val_x = val_batch.text.cuda()
                val_y = val_batch.label.type(torch.Tensor).cuda()
                val_pred = model.forward(val_x).view(-1)
                val_loss.append(criterion(val_pred, val_y).cpu().data.numpy())
            val_record.append({'step': step, 'loss': np.mean(val_loss)})
            print('epcoh {:02} - step {:06} - train_loss {:.4f} - val_loss {:.4f} '.format(
                        e, step, np.mean(losses), val_record[-1]['loss']))
            if e >= 2:
                if val_record[-1]['loss'] <= max_loss:
                    save(m=model, info={'step': step, 'epoch': e, 'train_loss': np.mean(losses),
                                        'val_loss': val_record[-1]['loss']})
                    max_loss = val_record[-1]['loss']
                    no_improve_in_previous_epoch = False

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate_f1(model, iterator, criterion,th=0.5):
    
    f1_scores = []
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            f1 = f1_score_model(predictions, batch.label,th=th)
            f1_scores.append(f1)
        
    return np.array(f1_scores).mean()


def save(m, info):
    torch.save(info, 'best_model.info')
    torch.save(m, 'best_model.m')
    
def load():
    m = torch.load('best_model.m')
    info = torch.load('best_model.info')
    return m, info

In [None]:
N_EPOCHS = 4

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion,e=epoch)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

In [None]:
model, m_info = load()
m_info

In [None]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

In [None]:
score = evaluate_f1(model, test_iterator, criterion)
print(f'| f1 score: {score:.3f}')

In [None]:
model.eval()
val_pred = []
val_true = []
test_iterator.init_epoch()
for val_batch in iter(test_iterator):
    val_x = val_batch.text.cuda()
    val_true += val_batch.label.cpu().data.numpy().tolist()
    val_pred += torch.sigmoid(model.forward(val_x).view(-1)).cpu().data.numpy().tolist()

In [None]:
tmp = [0,0,0] # idx, cur, max
delta = 0
for tmp[0] in np.arange(0.1, 0.501, 0.01):
    tmp[1] = f1_score(val_true, np.array(val_pred)>tmp[0])
    if tmp[1] > tmp[2]:
        delta = tmp[0]
        tmp[2] = tmp[1]
print('best threshold is {:.4f} with F1 score: {:.4f}'.format(delta, tmp[2]))

In [None]:
model.eval()
model.zero_grad()
test_pred = []
test_id = []

final_test_iterator = torchtext.data.BucketIterator(dataset=final_test_dataset,
                                    batch_size=BATCH_SIZE,
                                    sort_key=lambda x: x.text.__len__(),train=False,sort=False)


for test_batch in iter(final_test_iterator):
    test_x = test_batch.text.cuda()
    test_pred += torch.sigmoid(model.forward(test_x).view(-1)).cpu().data.numpy().tolist()
    test_id += test_batch.qid.view(-1).data.numpy().tolist()