In [None]:
FOLD = 0

import os
import random
import time
import math
import requests
import glob
import gc

import ast

import numpy as np
import pandas as pd

import mlcrate as mlc

import os

import cv2

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import fbeta_score, accuracy_score, precision_score, recall_score, f1_score

from skimage.transform import resize

from PIL import Image, ImageDraw

from tqdm import tqdm, tqdm_notebook

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import optim
from torch.optim import Optimizer
from torch.utils.data import Dataset, DataLoader
import torch.utils.checkpoint as checkpoint

import torchvision
from torchvision import transforms, utils

import torchtext
import torchtext.data as data

from torch.nn.utils.rnn import pad_sequence

import spacy
from spacy.lang.en import English

# Seed shared by the RNG setup below and by the KFold split later in the notebook.
SEED = 1337

# NOTE(review): not referenced anywhere in the visible cells.
NOTIFY_EACH_EPOCH = False

# NOTE(review): WORKERS is not referenced anywhere in the visible cells.
WORKERS = 0
# Batch size of the train/val iterators; train() also uses it to size its progress bars.
BATCH_SIZE = 512

# Number of KFold folds; FOLD (top of this cell) selects the one held out for validation.
N_SPLITS = 10

# Seed every RNG used here: python `random`, numpy, torch (CPU) and torch (CUDA).
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter start-up, so
# setting it here likely affects child processes only — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def get_n_params(model):
    """Return the total number of scalar values across all parameters of ``model``.

    Every tensor yielded by ``model.parameters()`` contributes its element
    count (the product of its dimensions); a model without parameters gives 0.
    """
    return sum(param.numel() for param in model.parameters())

# from https://github.com/floydhub/save-and-resume
def save_checkpoint(state):
    """Write ``state`` to ``./checkpoint-<epoch>.pt.tar`` with ``torch.save``.

    ``state`` must provide an ``"epoch"`` entry, which is the only part of it
    used to build the file name; the whole object is serialised as-is.
    """
    print (" Saving checkpoint")

    torch.save(state, f'./checkpoint-{state["epoch"]}.pt.tar')
    
def initialize(model, path=None, optimizer=None):
    """Restore ``model`` (and optionally ``optimizer``) from a saved checkpoint.

    Args:
        model: module whose ``load_state_dict`` receives ``checkpoint['model']``.
        path: checkpoint file to load. When ``None``, the file matching
            ``./checkpoint-*.pt.tar`` with the highest epoch number in its
            name is used.
        optimizer: optional optimizer; when given, its state is restored from
            ``checkpoint['optimizer']``.

    Returns:
        ``model`` when no optimizer is passed, otherwise the tuple
        ``(model, optimizer, epoch, train_iteration, val_iteration)`` where
        ``epoch`` is the stored epoch plus one. The two iteration counters
        default to 0 when the checkpoint does not contain them.

    Raises:
        FileNotFoundError: ``path`` is ``None`` and no checkpoint file exists.
    """
    if path is None:
        candidates = glob.glob('./checkpoint-*.pt.tar')
        if not candidates:
            raise FileNotFoundError('No ./checkpoint-*.pt.tar file found to initialize from')
        # File names look like ./checkpoint-<epoch>.pt.tar; pick the highest epoch.
        path = max(candidates, key=lambda name: int(name.split('checkpoint-')[1].split('.')[0]))

    # Load onto the CPU so a checkpoint written on a GPU machine can be read
    # anywhere; load_state_dict copies the values into the existing tensors of
    # the model / optimizer, wherever those live.
    state = torch.load(path, map_location='cpu')

    model.load_state_dict(state['model'])

    print(f' Loaded checkpoint {path} | Trained for {state["epoch"] + 1} epochs')

    if optimizer:
        optimizer.load_state_dict(state['optimizer'])

        epoch = state['epoch'] + 1
        # train() in this notebook stores only 'model', 'optimizer' and 'epoch',
        # so the iteration counters may legitimately be absent.
        train_iteration = state.get('train_iteration', 0)
        val_iteration = state.get('val_iteration', 0)

        return model, optimizer, epoch, train_iteration, val_iteration
    else:
        return model

In [None]:
sample_submission = pd.read_csv('../input/sample_submission.csv')
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [None]:
# From https://www.kaggle.com/spirosrap/bilstm-attention-kfold-clr-extra-features-capsule/notebook
def add_features(df):
    """Compute two hand-crafted ratio features for every question in ``df``.

    Args:
        df: DataFrame with a ``question_text`` column. Values are converted
            with ``str`` first, so non-string entries (e.g. NaN) are handled.
            ``df`` itself is not modified.

    Returns:
        ``float64`` array of shape ``(len(df), 2)``:
        column 0 is ``capitals / total_length`` (upper-case characters over
        characters), column 1 is ``num_unique_words / num_words`` for
        whitespace-separated words. A ratio whose denominator is zero (empty
        or whitespace-only text) is reported as 0.0 instead of raising or
        producing NaN.
    """
    texts = [str(question) for question in df['question_text']]
    words = [text.split() for text in texts]

    total_length = np.array([len(text) for text in texts], dtype=float)
    capitals = np.array([sum(1 for c in text if c.isupper()) for text in texts], dtype=float)
    num_words = np.array([len(tokens) for tokens in words], dtype=float)
    num_unique_words = np.array([len(set(tokens)) for tokens in words], dtype=float)

    # `where` skips zero denominators, leaving the pre-filled 0.0 in place.
    caps_vs_length = np.divide(capitals, total_length,
                               out=np.zeros_like(total_length), where=total_length > 0)
    words_vs_unique = np.divide(num_unique_words, num_words,
                                out=np.zeros_like(num_words), where=num_words > 0)

    return np.column_stack((caps_vs_length, words_vs_unique))

In [None]:
# Hold out fold number FOLD of an N_SPLITS-way shuffled KFold split as the validation set.
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

train_idx, val_idx = list(kfold.split(train))[FOLD]
x_train, x_val = train.iloc[train_idx], train.iloc[val_idx]
# x_train, x_val = train.loc[:1000], train.loc[1000:2000]

# Meta features: one row per question, in the same row order as the frame they come from.
x_train_meta = add_features(x_train)
x_val_meta = add_features(x_val)
test_meta = add_features(test)

# reset_index() without drop=True keeps the previous row labels in an 'index' column
# and gives each frame a fresh 0..n-1 index.
x_train = x_train.reset_index()
x_val = x_val.reset_index()
x_test = test.reset_index()

# to_csv() is called without index=False, so the fresh 0..n-1 index becomes the first CSV
# column. The torchtext 'id' field reads that column, and the train/test loops use it to
# look up the matching rows of the *_meta arrays.
x_train.to_csv('train.csv')
x_val.to_csv('val.csv')
x_test.to_csv('test.csv')

In [None]:
# Characters that tokenize() surrounds with spaces before running the spaCy tokenizer.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\','•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

In [None]:
nlp = English()


def tokenize(sentence):
    """Split ``sentence`` into a list of token strings.

    The input is converted with ``str``, every character listed in ``puncts``
    is padded with a space on each side, and the result is run through the
    module-level spaCy ``nlp`` object; the text of each token is returned.
    """
    text = str(sentence)
    for mark in puncts:
        text = text.replace(mark, f' {mark} ')

    return [tok.text for tok in nlp(text)]

In [None]:
%%time
# from http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext
# Raw numeric fields (no vocabulary): the row id and the binary target.
index_field = data.Field(sequential=False, use_vocab=False, batch_first=True)
# Text field: tokenized with tokenize(), lower-cased; include_lengths=True makes
# batch.question_text a (token ids, lengths) pair, which the loops below unpack.
question_field = data.Field(tokenize=tokenize, lower=True, batch_first=True, include_lengths=True)
target_field = data.Field(sequential=False, use_vocab=False, batch_first=True)

# Fields are matched to CSV columns by position (the header row is skipped below).
# The first column is the index written by to_csv(); columns mapped to None are
# not given a field.
train_fields = [
    ('id', index_field),
    ('index', None),
    ('qid', None),
    ('question_text', question_field),
    ('target', target_field)
]

# Same layout as train_fields, minus the target column.
test_fields = [
    ('id', index_field),
    ('index', None),
    ('qid', None),
    ('question_text', question_field)
]

train_dataset, val_dataset = data.TabularDataset.splits('./', train='train.csv', validation='val.csv', format='CSV', skip_header=True, fields=train_fields)
test_dataset = data.TabularDataset('./test.csv', format='CSV', skip_header=True, fields=test_fields)

# Vocabulary is built from the train and validation questions only, capped at 95000 entries.
question_field.build_vocab(train_dataset, val_dataset, max_size=95000)

# Train/val batches group questions of similar length and are sorted within each batch.
train_dataloader, val_dataloader = data.BucketIterator.splits((train_dataset, val_dataset), (BATCH_SIZE, BATCH_SIZE), sort_key=lambda x: len(x.question_text), sort_within_batch=True)
# Test batches (size 1024) are neither sorted nor shuffled; test() sorts each batch itself.
test_dataloader = data.BucketIterator(test_dataset, 1024, sort=False, shuffle=False)

print(f'Train Dataset: {len(train_dataset)}')
print(f'Val Dataset: {len(val_dataset)}')
print(f'Test Dataset: {len(test_dataset)}')

In [None]:
# Display the number of entries in the vocabulary that was just built.
len(question_field.vocab.itos)

In [None]:
# from https://discuss.pytorch.org/t/self-attention-on-words-and-masking/5671/4
class SelfAttention(nn.Module):
    """Length-masked attention pooling over a padded batch of sequences.

    Each time step is scored by a dot product with a learned vector of size
    ``hidden_size``; scores pass through ReLU and a softmax over time,
    positions at or beyond each sequence's length are zeroed, and the
    remaining weights are renormalised to sum to one.

    Args:
        hidden_size: size of the last dimension of the inputs.
        batch_first: if True inputs are ``(batch, time, hidden)``, otherwise
            ``(time, batch, hidden)``.
    """

    def __init__(self, hidden_size, batch_first=False):
        super(SelfAttention, self).__init__()

        self.hidden_size = hidden_size
        self.batch_first = batch_first

        self.att_weights = nn.Parameter(torch.Tensor(1, hidden_size), requires_grad=True)
        nn.init.xavier_uniform_(self.att_weights.data)

    def get_mask(self):
        """Unused placeholder kept for interface compatibility."""
        pass

    def forward(self, inputs, lengths):
        """Pool ``inputs`` over time using masked attention weights.

        Args:
            inputs: padded sequences, laid out according to ``batch_first``.
            lengths: number of valid time steps of each sequence (tensor or
                sequence of ints, any device).

        Returns:
            ``(representations, attentions)`` with shapes ``(batch, hidden)``
            and ``(batch, time)``. The batch dimension is kept even for a
            single-sequence batch.
        """
        if not self.batch_first:
            # Work batch-first internally so the scoring below is layout independent.
            inputs = inputs.transpose(0, 1)
        max_len = inputs.size(1)

        # (batch, time, hidden) @ (hidden, 1) -> (batch, time). Only the last
        # dimension is squeezed so a batch (or sequence) of size 1 survives.
        scores = torch.matmul(inputs, self.att_weights.t()).squeeze(-1)
        attentions = torch.softmax(F.relu(scores), dim=-1)

        # Build the length mask on the same device as the inputs (no CUDA
        # requirement): position t of row i is valid iff t < lengths[i].
        lengths = torch.as_tensor(lengths, device=inputs.device)
        positions = torch.arange(max_len, device=inputs.device).unsqueeze(0)
        mask = (positions < lengths.unsqueeze(1)).to(attentions.dtype)

        # apply mask and renormalize attention scores (weights)
        masked = attentions * mask
        attentions = masked / masked.sum(-1, keepdim=True)

        # weighted sum over time gives one fixed-size vector per sequence
        representations = (inputs * attentions.unsqueeze(-1)).sum(1)

        return representations, attentions

class net(nn.Module):
    """Two parallel BiLSTM encoders with attention/pooling heads and meta features.

    The embedded question is fed to two independent bidirectional LSTMs. One
    contributes an attention-pooled vector; the other contributes an
    attention-pooled vector plus average- and max-pooled vectors. These four
    vectors and the meta features are concatenated and mapped to one logit.

    Args:
        embedding: 2-D tensor of pretrained embeddings, one row per vocabulary
            entry (loaded with ``nn.Embedding.from_pretrained``).
        hidden_size: hidden size of each LSTM direction (default 128).
        meta_size: number of meta features per example (default 2).
    """

    def __init__(self, embedding, hidden_size=128, meta_size=2):
        super(net, self).__init__()

        # Take the LSTM input size from the embedding matrix instead of
        # hard-coding it; sub-modules are created in the original order so
        # seeded initialisation is unchanged for the default arguments.
        embedding_dim = embedding.size(1)
        self.feature_size = hidden_size * 2  # forward + backward directions
        self.meta_size = meta_size

        self.embedding = nn.Embedding.from_pretrained(embedding)

        self.skip_lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.skip_attention = SelfAttention(self.feature_size, batch_first=True)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.attention = SelfAttention(self.feature_size, batch_first=True)

        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.max_pool = nn.AdaptiveMaxPool1d(1)

        self.fc = nn.Linear(self.feature_size * 4 + meta_size, 1)
        self.logit = nn.Linear(1, 1)

    def forward(self, x, x_meta, x_len):
        """Return one logit per example.

        Args:
            x: token ids, ``(batch, time)``, sorted by decreasing length.
            x_meta: meta features, viewed as ``(batch, meta_size)``.
            x_len: valid length of every sequence in ``x``.

        Returns:
            1-D tensor of logits with one entry per example.
        """
        x = self.embedding(x)
        # pack_padded_sequence expects the lengths on the CPU.
        lengths = torch.as_tensor(x_len).cpu()
        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)

        skip_lstm, _ = self.skip_lstm(x)
        lstm, _ = self.lstm(x)

        lstm, lstm_lengths = nn.utils.rnn.pad_packed_sequence(lstm, batch_first=True)
        skip_lstm, skip_lstm_lengths = nn.utils.rnn.pad_packed_sequence(skip_lstm, batch_first=True)

        skip_attention, _ = self.skip_attention(skip_lstm, skip_lstm_lengths)
        attention, _ = self.attention(lstm, lstm_lengths)

        # Pooling layers operate on (batch, channels, time).
        channels_first = lstm.transpose(1, 2)
        avg_pool = self.avg_pool(channels_first)
        max_pool = self.max_pool(channels_first)

        x = torch.cat([
            skip_attention.view(-1, self.feature_size),
            attention.view(-1, self.feature_size),
            avg_pool.view(-1, self.feature_size),
            max_pool.view(-1, self.feature_size),
            x_meta.view(-1, self.meta_size)
        ], dim=1)

        x = self.fc(x)
        x = self.logit(x).view(-1)

        return x

In [None]:
def choose_threshold(val_preds, y_val):
    """Pick the decision threshold with the highest F1 on the validation set.

    Candidate thresholds run from 0.10 to 0.50 in steps of 0.01 (rounded to
    two decimals). Predictions strictly above a threshold count as positive.

    Returns:
        ``(best_threshold, best_val_f1)``; on ties the lowest threshold wins.
    """
    candidates = np.arange(0.1, 0.501, 0.01)

    scores = [
        f1_score(y_val, (val_preds > np.round(candidate, 2)).astype(int))
        for candidate in candidates
    ]

    winner = np.argmax(scores)
    return np.round(candidates[winner], 2), np.max(scores)

In [None]:
def test(net, question_field, test_dataloader):
    """Predict probabilities for ``test_dataloader`` with the latest saved checkpoint.

    A fresh ``net`` is built from ``question_field.vocab.vectors``, restored
    through ``initialize`` (highest-epoch checkpoint in the working directory)
    and run in eval mode. Meta features are looked up in the module-level
    ``test_meta`` array by each batch's ``id`` values.

    Note: defining this function rebinds the notebook-level name ``test``,
    which the data-loading cell used for the test DataFrame.

    Args:
        net: model class (called as ``net(embedding)``).
        question_field: torchtext field whose vocabulary holds the vectors.
        test_dataloader: iterator yielding batches with ``question_text``
            (a ``(tokens, lengths)`` pair) and ``id``.

    Returns:
        Array of shape ``(n_examples, 1)`` with sigmoid outputs, in the order
        the iterator yielded the examples.
    """
    model = net(question_field.vocab.vectors).to(device)
    model = initialize(model)

    preds = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm_notebook(test_dataloader):
            (questions, lengths), index = batch.question_text, batch.id

            # The model packs its input, which needs sequences in descending length order.
            order = torch.argsort(lengths, descending=True)

            questions = questions[order].to(device)
            lengths = lengths[order]

            # Convert the ids to a CPU numpy array explicitly: indexing a numpy
            # array directly with a tensor fails when the tensor lives on a GPU.
            rows = index[order].cpu().numpy()
            meta = torch.from_numpy(test_meta[rows]).float().to(device)

            out = model(questions, meta, lengths)

            # Undo the sort so outputs line up with the iterator's order again.
            restore = torch.argsort(order).to(out.device)
            out = torch.sigmoid(out[restore])

            preds.append(out.detach().cpu().numpy())

    if not preds:
        # Nothing to predict: keep the documented (n, 1) shape.
        return np.empty((0, 1), dtype=np.float32)

    preds = np.concatenate(preds, axis=0).reshape(-1, 1)

    return preds

In [None]:
def train(train_dataset, val_dataset, train_dataloader, val_dataloader, train_meta, val_meta, net, question_field, vectors, stoi):
    """Train ``net`` for 5 epochs, checkpointing every epoch that improves validation loss.

    Args:
        train_dataset, val_dataset: datasets, used only to size the progress bars.
        train_dataloader, val_dataloader: iterators yielding batches with
            ``question_text`` (a ``(tokens, lengths)`` pair), ``id`` and ``target``.
        train_meta, val_meta: numpy arrays of meta features, indexed by batch ``id``.
        net: model class (called as ``net(embedding)``).
        question_field: torchtext field whose vocabulary receives ``vectors``.
        vectors, stoi: embedding matrix and token-to-row mapping passed to
            ``vocab.set_vectors`` (dimension 300).

    Returns:
        ``(best_val_preds, y_val, best_epoch, best_train_loss, best_val_loss, message)``.
        ``best_val_preds`` are raw logits of shape ``(n_val, 1)`` from the
        best epoch and ``y_val`` the labels collected in that same epoch.
        Both are ``None`` if no epoch ever beat the initial sentinel loss
        (for example when the loss is NaN).

    Raises:
        ValueError: if either dataloader yields no batches.

    Side effects:
        Writes ``train_log.csv`` and one checkpoint file per improving epoch.
    """
    best_train_loss = 1e10
    best_val_loss = 1e10

    best_epoch = 0
    epochs = 5

    # Defined up front so the return statement is valid even when no epoch improves.
    best_val_preds = None
    best_y_val = None

    timer = mlc.time.Timer()
    logger = mlc.LinewiseCSVWriter('train_log.csv', header=['epoch', 'lr', 'train_loss', 'val_loss'])

    question_field.vocab.set_vectors(stoi, vectors, 300)

    model = net(question_field.vocab.vectors).to(device)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()

    # Round up so a final partial batch is counted in the progress bars.
    train_total = math.ceil(len(train_dataset) / BATCH_SIZE)
    val_total = math.ceil(len(val_dataset) / BATCH_SIZE)

    for epoch in range(epochs):
        print(f'\nStarting Epoch {epoch}')

        train_loss = 0
        val_loss = 0
        train_batches = 0
        val_batches = 0

        timer.add(epoch)

        model.train()
        for batch in tqdm_notebook(train_dataloader, total=train_total):
            (question, length), index, label = batch.question_text, batch.id, batch.target

            question = question.to(device)
            # Explicit CPU/numpy conversion: numpy cannot be indexed by a GPU tensor.
            meta = torch.from_numpy(train_meta[index.cpu().numpy()]).float().to(device)
            label = label.to(device).float()

            out = model(question, meta, length)

            loss = criterion(out, label)

            optimizer.zero_grad()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)

            optimizer.step()

            train_loss += loss.item()
            train_batches += 1

        y_val = []
        val_preds = []

        model.eval()
        with torch.no_grad():
            for batch in tqdm_notebook(val_dataloader, total=val_total):
                (question, length), index, label = batch.question_text, batch.id, batch.target

                question = question.to(device)
                meta = torch.from_numpy(val_meta[index.cpu().numpy()]).float().to(device)
                label = label.to(device).float()

                out = model(question, meta, length)

                val_loss += criterion(out, label).item()
                val_batches += 1

                y_val.append(label)
                val_preds.append(out)

        if train_batches == 0 or val_batches == 0:
            raise ValueError('train() needs non-empty train and validation dataloaders')

        # Losses are averaged per batch.
        train_loss /= train_batches
        val_loss /= val_batches

        y_val = torch.cat(y_val, dim=0).reshape(-1, 1)
        val_preds = torch.cat(val_preds, dim=0).reshape(-1, 1)

        logger.write([epoch, optimizer.param_groups[0]['lr'], train_loss, val_loss])

        print(f'{timer.fsince(epoch)} | End of Epoch {epoch} | Train Loss: {train_loss} | Val Loss: {val_loss}')

        if val_loss < best_val_loss:
            best_epoch = epoch

            best_train_loss = train_loss
            best_val_loss = val_loss

            # Keep predictions and labels from the same epoch so they stay aligned.
            best_val_preds = val_preds
            best_y_val = y_val

            save_checkpoint({
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch
            })

    message = f'Training Finished | Best Epoch {best_epoch} | Best Train Loss: {best_train_loss} | Best Val Loss: {best_val_loss}'

    return best_val_preds, best_y_val, best_epoch, best_train_loss, best_val_loss, message

In [None]:
# Load the GloVe 840B 300d embeddings from the competition's embeddings folder.
glove_vectors = torchtext.vocab.Vectors('../input/embeddings/glove.840B.300d/glove.840B.300d.txt')#, max_vectors=1000)

# Delete every file in ./.vector_cache/ once the vectors are in memory.
for file in os.listdir('./.vector_cache/'):
    os.remove(f'./.vector_cache/{file}')

In [None]:
# Load the Paragram 300d (sl999) embeddings from the competition's embeddings folder.
paragram_vectors = torchtext.vocab.Vectors('../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt')#, max_vectors=1000)

# Delete every file in ./.vector_cache/ once the vectors are in memory.
for file in os.listdir('./.vector_cache/'):
    os.remove(f'./.vector_cache/{file}')

In [None]:
%%time
# One 300-d row per vocabulary entry, holding the element-wise mean of the
# GloVe and Paragram vectors looked up for that word.
mean_vectors = torch.zeros((len(question_field.vocab.stoi), 300))

for word, i in tqdm_notebook(question_field.vocab.stoi.items(), total=len(question_field.vocab.stoi)):
    mean_vectors[i] = (glove_vectors[word] + paragram_vectors[word]) / 2

# The source embeddings are no longer needed once averaged.
del glove_vectors, paragram_vectors
gc.collect()

In [None]:
# First training run on the KFold train split, using the averaged embeddings.
val_preds, y_val, _, _, _, message = train(train_dataset, val_dataset, train_dataloader, val_dataloader, x_train_meta, x_val_meta, net, question_field, mean_vectors, question_field.vocab.stoi)

In [None]:
# del mean_vectors
# gc.collect()

In [None]:
# Plot the per-epoch losses that train() logged.
log = pd.read_csv('train_log.csv')
# Each curve is plotted explicitly against the epoch column and labelled;
# passing both series to a single plot() call draws the second one against an
# implicit 0..n-1 index and leaves the lines indistinguishable.
plt.plot(log['epoch'], log['train_loss'], label='train_loss')
plt.plot(log['epoch'], log['val_loss'], label='val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

# Pick the F1-optimal threshold on the validation fold, then predict the test
# set with the best checkpoint (the highest-numbered one on disk).
best_threshold, best_val_f1 = choose_threshold(torch.sigmoid(val_preds).cpu().numpy(), y_val.cpu().numpy())
preds = test(net, question_field, test_dataloader)

# Remove all saved model files
for file in os.listdir('./'):
    if file.endswith('.pt.tar'):
        os.remove(f'./{file}')

# The cache directory may not exist (for example after a manual cleanup), so
# only clear it when present.
if os.path.isdir('./.vector_cache/'):
    for file in os.listdir('./.vector_cache/'):
        os.remove(f'./.vector_cache/{file}')

In [None]:
# Summary of the first training run and the threshold / F1 chosen on the validation fold.
print(message)
print(best_threshold, best_val_f1)

In [None]:
# Binarise the test probabilities with the validation-selected threshold.
# Note: this rebinds `preds` from probabilities to 0/1 labels.
preds = (preds > best_threshold).astype(int)

sample_submission['prediction'] = preds
mlc.kaggle.save_sub(sample_submission, 'submission.csv')

sample_submission.head()

In [None]:
# Pseudo-labelling: use the binarised test predictions as targets for the test rows,
# then stack train, validation and test rows into one training frame.
x_test['target'] = preds
# The second reset_index() yields a 'level_0' column (an 'index' column already exists);
# it is dropped, and to_csv() writes the new 0..n-1 index as the first CSV column.
# NOTE(review): x_val is part of pseudo_df, and the pseudo training run below validates
# on val_dataset, so that run's validation loss and threshold are measured on rows the
# model was trained on — confirm this is intended.
pseudo_df = pd.concat([x_train, x_val, x_test]).reset_index().drop('level_0', axis=1)
pseudo_df.to_csv('x_pseudo.csv')

In [None]:
# Meta features stacked in the same train, val, test order used to build pseudo_df,
# so the ids read from x_pseudo.csv index the matching rows.
pseudo_meta = np.concatenate((x_train_meta, x_val_meta, test_meta))
pseudo_dataset = data.TabularDataset('./x_pseudo.csv', format='CSV', skip_header=True, fields=train_fields)
# Batch size 512, batches sorted by question length within each batch.
pseudo_dataloader = data.BucketIterator(pseudo_dataset, 512, sort_key=lambda x: len(x.question_text), sort_within_batch=True)

In [None]:
# Second training run: a fresh model trained on the pseudo-labelled frame and
# validated on the original validation fold. Rebinds `message`.
pseudo_val_preds, pseudo_y_val, _, _, _, message = train(pseudo_dataset, val_dataset, pseudo_dataloader, val_dataloader, pseudo_meta, x_val_meta, net, question_field, mean_vectors, question_field.vocab.stoi)

In [None]:
# Both train() calls have consumed the averaged embedding matrix; release it.
del mean_vectors
gc.collect()

In [None]:
# Plot the per-epoch losses logged by the pseudo-label training run.
log = pd.read_csv('train_log.csv')
# Each curve is plotted explicitly against the epoch column and labelled;
# passing both series to a single plot() call draws the second one against an
# implicit 0..n-1 index and leaves the lines indistinguishable.
plt.plot(log['epoch'], log['train_loss'], label='train_loss')
plt.plot(log['epoch'], log['val_loss'], label='val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

# Pick the F1-optimal threshold on the validation fold, then predict the test
# set with the best checkpoint of the pseudo-label run.
pseudo_best_threshold, pseudo_best_val_f1 = choose_threshold(torch.sigmoid(pseudo_val_preds).cpu().numpy(), pseudo_y_val.cpu().numpy())
pseudo_preds = test(net, question_field, test_dataloader)

# Remove all saved model files
for file in os.listdir('./'):
    if file.endswith('.pt.tar'):
        os.remove(f'./{file}')

# The cache directory may not exist (for example after a manual cleanup), so
# only clear it when present.
if os.path.isdir('./.vector_cache/'):
    for file in os.listdir('./.vector_cache/'):
        os.remove(f'./.vector_cache/{file}')

In [None]:
# Summary of the pseudo-label training run and its validation threshold / F1.
print(message)
print(pseudo_best_threshold, pseudo_best_val_f1)

In [None]:
# Binarise the pseudo-run probabilities and overwrite submission.csv, which the
# first run already wrote, with these predictions.
pseudo_preds = (pseudo_preds > pseudo_best_threshold).astype(int)

sample_submission['prediction'] = pseudo_preds
mlc.kaggle.save_sub(sample_submission, 'submission.csv')

sample_submission.head()