In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import gensim.models
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data
import copy
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.

In [None]:
# Seed every RNG the notebook draws from: NumPy (used by the train/validation
# split) and PyTorch (weight initialisation, dropout).
np.random.seed(481945)
torch.manual_seed(481945)

In [None]:
from sklearn.metrics import pairwise

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import metrics

In [None]:
!ls

In [None]:
from torch.autograd import Variable

from collections import OrderedDict
import numpy as np


def summary(model, input_size, batch_size=-1, device="cuda", input_type=torch.float32):
    """Print a Keras-style per-layer table for ``model``.

    An all-zero dummy batch of shape ``(1,) + input_size`` is pushed through
    the model while forward hooks record, for every sub-module except the
    root, ``nn.Sequential`` and ``nn.ModuleList`` containers, the output
    shape and the number of elements in its ``weight`` / ``bias`` attributes.

    Args:
        model: module to inspect. It must own at least one parameter, because
            the dummy input is created on the device of the first one.
        input_size: shape of a single sample, without the batch dimension.
        batch_size: value written into the batch dimension of the reported
            shapes and used in the memory estimates (``-1`` = unspecified).
        device: ``"cuda"`` or ``"cpu"`` (case-insensitive). Only validated;
            the dummy input always follows the model's own device.
        input_type: dtype of the dummy input (e.g. ``torch.long`` for a model
            that starts with an embedding lookup).

    Returns:
        None. The table is written to stdout.
    """

    def flatten_outputs(output):
        """Yield every leaf of a possibly nested list/tuple of outputs."""
        if isinstance(output, (list, tuple)):
            for item in output:
                yield from flatten_outputs(item)
        else:
            yield output

    def count_elements(shape):
        """Element count of one shape, or the total over a list of shapes."""
        if shape and isinstance(shape[0], list):
            return sum(abs(int(np.prod(s))) for s in shape)
        return abs(int(np.prod(shape)))

    def register_hook(module):

        def hook(module, input, output):
            class_name = str(module.__class__).split(".")[-1].split("'")[0]
            m_key = "%s-%i" % (class_name, len(layer_info) + 1)

            entry = OrderedDict()
            layer_info[m_key] = entry
            entry["input_shape"] = list(input[0].size())
            entry["input_shape"][0] = batch_size
            if isinstance(output, (list, tuple)):
                # Multi-output layers (e.g. recurrent ones) get one shape per tensor.
                entry["output_shape"] = [
                    [-1] + list(o.size())[1:] for o in flatten_outputs(output)
                ]
            else:
                entry["output_shape"] = list(output.size())
                entry["output_shape"][0] = batch_size

            # Plain Python ints keep the totals usable even when no layer
            # contributes a tensor-valued count.
            params = 0
            weight = getattr(module, "weight", None)
            if hasattr(weight, "size"):
                params += weight.numel()
                entry["trainable"] = weight.requires_grad
            bias = getattr(module, "bias", None)
            if hasattr(bias, "size"):
                params += bias.numel()
            entry["nb_params"] = params

        if (
            not isinstance(module, nn.Sequential)
            and not isinstance(module, nn.ModuleList)
            and not (module == model)
        ):
            hooks.append(module.register_forward_hook(hook))

    device = device.lower()
    assert device in [
        "cuda",
        "cpu",
    ], "Input device is not valid, please specify 'cuda' or 'cpu'"

    net_device = next(model.parameters()).device
    x = torch.zeros((1,) + tuple(input_size), dtype=input_type).to(net_device)

    # Per-layer records, filled in by the hooks during the forward pass.
    layer_info = OrderedDict()
    hooks = []

    model.apply(register_hook)
    try:
        model(x)
    finally:
        # Never leave the hooks attached, even if the forward pass fails.
        for h in hooks:
            h.remove()

    print("----------------------------------------------------------------")
    line_new = "{:>20}  {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
    print(line_new)
    print("================================================================")
    total_params = 0
    total_output = 0
    trainable_params = 0
    for layer, entry in layer_info.items():
        line_new = "{:>20}  {:>25} {:>15}".format(
            layer,
            str(entry["output_shape"]),
            "{0:,}".format(entry["nb_params"]),
        )
        total_params += entry["nb_params"]
        total_output += count_elements(entry["output_shape"])
        if entry.get("trainable", False):
            trainable_params += entry["nb_params"]
        print(line_new)

    # assume 4 bytes/number (float on cuda).
    total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.))
    total_output_size = abs(2. * total_output * 4. / (1024 ** 2.))  # x2 for gradients
    total_params_size = abs(total_params * 4. / (1024 ** 2.))
    total_size = total_params_size + total_output_size + total_input_size

    print("================================================================")
    print("Total params: {0:,}".format(total_params))
    print("Trainable params: {0:,}".format(trainable_params))
    print("Non-trainable params: {0:,}".format(total_params - trainable_params))
    print("----------------------------------------------------------------")
    print("Input size (MB): %0.2f" % total_input_size)
    print("Forward/backward pass size (MB): %0.2f" % total_output_size)
    print("Params size (MB): %0.2f" % total_params_size)
    print("Estimated Total Size (MB): %0.2f" % total_size)
    print("----------------------")

In [None]:
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Optimizer
import math

class CyclicLR(_LRScheduler):
    """Sets the learning rate of each parameter group according to
    cyclical learning rate policy (CLR). The policy cycles the learning
    rate between two boundaries with a constant frequency, as detailed in
    the paper `Cyclical Learning Rates for Training Neural Networks`_.
    The distance between the two boundaries can be scaled on a per-iteration
    or per-cycle basis.
    Cyclical learning rate policy changes the learning rate after every batch.
    `step` should be called once per training batch.
    To resume training, save the scheduler's `last_epoch` (which this class
    uses as the batch counter) and pass it as `last_batch_idx`.
    This class has three built-in policies, as put forth in the paper:
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each
        cycle iteration.
    This implementation was adapted from the github repo: `bckenstler/CLR`_
    Args:
        optimizer (Optimizer): Wrapped optimizer.
        base_lr (float or list): Initial learning rate which is the
            lower boundary in the cycle for each param group.
            Only applied when `last_batch_idx` is -1.
            Default: 0.001
        max_lr (float or list): Upper boundaries in the cycle for
            each parameter group. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore
            max_lr may not actually be reached depending on
            scaling function. Default: 0.006
        step_size_up (int): Number of training iterations in the
            increasing half of a cycle. Default: 2000
        step_size_down (int): Number of training iterations in the
            decreasing half of a cycle. If step_size_down is None,
            it is set to step_size_up. Default: None
        mode (str): One of {triangular, triangular2, exp_range}.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
            Default: 'triangular'
        gamma (float): Constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
            Default: 1.0
        scale_fn (function): Custom scaling policy defined by a single
            argument lambda function, where
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored
            Default: None
        scale_mode (str): {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on
            cycle number or cycle iterations (training
            iterations since start of cycle).
            Default: 'cycle'
        last_batch_idx (int): The index of the last batch. Default: -1
    Raises:
        TypeError: if `optimizer` is not an `Optimizer`.
        ValueError: if a list of learning rates does not have one entry per
            param group, or if `mode` is unknown and no `scale_fn` is given.
    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> scheduler = CyclicLR(optimizer)
        >>> data_loader = torch.utils.data.DataLoader(...)
        >>> for epoch in range(10):
        >>>     for batch in data_loader:
        >>>         scheduler.step()
        >>>         train_batch(...)
    .. _Cyclical Learning Rates for Training Neural Networks: https://arxiv.org/abs/1506.01186
    .. _bckenstler/CLR: https://github.com/bckenstler/CLR
    """

    def __init__(self,
                 optimizer,
                 base_lr=1e-3,
                 max_lr=6e-3,
                 step_size_up=2000,
                 step_size_down=None,
                 mode='triangular',
                 gamma=1.,
                 scale_fn=None,
                 scale_mode='cycle',
                 last_batch_idx=-1):

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        base_lrs = self._format_lr('base_lr', optimizer, base_lr)
        if last_batch_idx == -1:
            # Fresh run: the base class records these values as `base_lrs`.
            for base_lr, group in zip(base_lrs, optimizer.param_groups):
                group['lr'] = base_lr

        self.max_lrs = self._format_lr('max_lr', optimizer, max_lr)

        step_size_down = step_size_down or step_size_up
        self.total_size = float(step_size_up + step_size_down)
        self.step_ratio = float(step_size_up) / self.total_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        super(CyclicLR, self).__init__(optimizer, last_batch_idx)

    def _format_lr(self, name, optimizer, lr):
        """Return one plain Python float per param group.

        Floats (rather than tensors) keep the values stored in
        ``optimizer.param_groups`` at full double precision and printable as
        ordinary numbers.
        """
        if isinstance(lr, (list, tuple)):
            if len(lr) != len(optimizer.param_groups):
                raise ValueError("expected {} values for {}, got {}".format(
                    len(optimizer.param_groups), name, len(lr)))
            return [float(value) for value in lr]
        else:
            return [float(lr)] * len(optimizer.param_groups)

    def _triangular_scale_fn(self, x):
        """Constant amplitude for every cycle."""
        return 1.

    def _triangular2_scale_fn(self, x):
        """Halve the amplitude with every completed cycle (x is the cycle number)."""
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        """Scale the amplitude by gamma**x (x is the iteration count)."""
        return self.gamma**(x)

    def get_lr(self):
        """Calculates the learning rate at batch index. This function treats
        `self.last_epoch` as the last batch index.
        """
        cycle = math.floor(1 + self.last_epoch / self.total_size)
        # Position inside the current cycle, in [0, 1).
        x = 1 + self.last_epoch / self.total_size - cycle
        if x <= self.step_ratio:
            scale_factor = x / self.step_ratio
        else:
            scale_factor = (x - 1) / (self.step_ratio - 1)

        lrs = []
        for base_lr, max_lr in zip(self.base_lrs, self.max_lrs):
            base_height = (max_lr - base_lr) * scale_factor
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_epoch)
            lrs.append(lr)
        return lrs

In [None]:
quora_data = pd.read_csv('../input/train.csv')

In [None]:
quora_test_data = pd.read_csv('../input/test.csv')

In [None]:
sample = quora_data.sample(100)

In [None]:
for s in sample[sample.target == 1].question_text:
    print(s)

In [None]:
from nltk.tokenize import TweetTokenizer

In [None]:
print(nltk.tokenize.word_tokenize("Don't spoil the movie or I'll kill you"))

In [None]:
print(TweetTokenizer().tokenize("Don't spoil the movie or I'll kill you"))

In [None]:
def tokenize(questions):
    """Tokenize every text with NLTK's TweetTokenizer and lower-case the tokens.

    Args:
        questions: iterable of strings.

    Returns:
        A list with one list of lower-cased tokens per input text, in input
        order. A progress line is printed every 50000 texts (including the
        first one).
    """
    # One tokenizer instance serves every text; building it per text was
    # loop-invariant work.
    tokenizer = TweetTokenizer()
    tokenized_questions = []
    for iteration, text in enumerate(questions):
        if iteration % 50000 == 0:
            print(iteration, "texts tokenized")
        tokenized_questions.append([t.lower() for t in tokenizer.tokenize(text)])
    return tokenized_questions

In [None]:
dev_tokens = tokenize(quora_data.question_text)

In [None]:
test_tokens = tokenize(quora_test_data.question_text)

In [None]:
import torchtext

In [None]:
from collections import Counter
import itertools

In [None]:
print(dev_tokens[0])

In [None]:
# Hold out 10% of the tokenized training questions (and their targets) for validation.
# No random_state is passed here, so the split relies on NumPy's global RNG,
# which is seeded near the top of the notebook.
train_tokens, val_tokens, train_labels, val_labels = train_test_split(dev_tokens, quora_data.target, test_size=0.1)

In [None]:
word_counts = Counter(itertools.chain(*train_tokens))

In [None]:
print(len(word_counts))
print(word_counts.most_common(10))

In [None]:
from gensim.models.keyedvectors import KeyedVectors

In [None]:
# Load the GoogleNews word2vec vectors from their binary file.
# NOTE(review): the whole table is held in memory until the later `del gensim_vectors` cell.
gensim_vectors = KeyedVectors.load_word2vec_format('../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
def filter_vectors(gensim_vectors, words):
    """Copy out the vectors of the requested words that the model knows.

    Membership is tested on the vectors object itself instead of its
    ``.vocab`` attribute, so the function does not depend on that attribute
    being present.

    Args:
        gensim_vectors: object supporting ``word in obj`` and ``obj[word]``
            (e.g. gensim ``KeyedVectors``).
        words: iterable of candidate words.

    Returns:
        dict mapping each known word to an independent copy of its vector, so
        the source object can be deleted afterwards.
    """
    return {w: gensim_vectors[w].copy() for w in words if w in gensim_vectors}

In [None]:
filtered_vectors = filter_vectors(gensim_vectors, word_counts.keys())

In [None]:
print(len(filtered_vectors))

In [None]:
del gensim_vectors

In [None]:
min_occurences = 14
filtered_counts = {w:c for w,c in word_counts.items() if c >= min_occurences}
print(len(filtered_counts))

In [None]:
class VocabLike:
    """Bare container pairing an index->token list with a token->index mapping.

    Attributes are stored exactly as passed in; no copying or validation.
    """

    def __init__(self, itos, stoi):
        # itos: sequence indexed by token id; stoi: mapping from token to id.
        self.itos, self.stoi = itos, stoi

In [None]:
from collections import defaultdict

In [None]:
# Build the vocabulary. Ids are laid out as three consecutive ranges:
#   [specials | words without a pretrained vector | words with a pretrained vector]
# so that the pretrained block can later be copied into the embedding matrix as one slice.
specials = ['<unk>', '<pad>', '<eos>']
filtered_counts.update({w:0 for w in specials})
# vocab = torchtext.vocab.Vocab(filtered_counts,specials=specials, max_size=30000)
stoi = defaultdict(lambda:0) #map to unk
itos = [0] * len(filtered_counts)

trainable_words = {w for w in filtered_counts.keys() if w not in specials and w not in filtered_vectors}
pretrained_words = {w for w in filtered_counts.keys() if w not in specials and w in filtered_vectors}

# Iterating a set of strings yields a different order in every interpreter
# session (hash randomisation). Sorting pins the word -> id assignment so that
# saved weights stay valid after a kernel restart.
for i,w in enumerate(itertools.chain(specials, sorted(trainable_words), sorted(pretrained_words))):
    stoi[w] = i
    itos[i] = w

vocab = VocabLike(itos, stoi)

In [None]:
print(len(pretrained_words))

In [None]:
def extract_vectors(vocab, vec_dict, offset, total, vec_size):
    """Gather the vectors of ``total`` consecutive vocabulary entries.

    Row ``r`` of the result holds ``vec_dict[vocab.itos[offset + r]]``.

    Args:
        vocab: object exposing an ``itos`` sequence (id -> token).
        vec_dict: mapping token -> vector of length ``vec_size``.
        offset: id of the first vocabulary entry to read.
        total: number of entries to read.
        vec_size: dimensionality of each vector.

    Returns:
        float32 array of shape ``(total, vec_size)``.

    Raises:
        AssertionError: if one of the tokens has no entry in ``vec_dict``.
    """
    matrix = np.zeros((total, vec_size), dtype=np.float32)
    for row in range(total):
        token = vocab.itos[offset + row]
        assert token in vec_dict
        matrix[row] = vec_dict[token]
    return matrix

In [None]:
# Matrix of pretrained vectors, one row per word in `pretrained_words`.
# The offset skips the special tokens and the words without a pretrained
# vector, which occupy the lower vocabulary ids; 300 is the vector size
# passed to extract_vectors.
np_vectors = extract_vectors(vocab, filtered_vectors,
                             len(specials) + len(trainable_words), 
                             len(pretrained_words),
                             300)

In [None]:
from sklearn.metrics import pairwise
def nearest_neighbors(vocab, embeddings, word, topn, use_offset=False):
    """Print the ``topn`` entries most cosine-similar to ``word``.

    Args:
        vocab: object with ``stoi`` (token -> id) and ``itos`` (id -> token).
        embeddings: 2-D array with one embedding per row.
        word: query token; must be a key of ``vocab.stoi``.
        topn: maximum number of neighbours to print (the query itself is
            included in the ranking).
        use_offset: when True, row ``r`` of ``embeddings`` is taken to belong
            to vocabulary id ``r + len(specials) + len(trainable_words)``;
            both names are read from the notebook's global scope.

    Raises:
        AssertionError: if ``word`` is not in ``vocab.stoi``.
        IndexError: if ``word`` has no row in ``embeddings``. Without this
            check a negative row index would silently wrap around and report
            the neighbours of an unrelated word.
    """
    offset = len(specials) + len(trainable_words) if use_offset else 0
    assert word in vocab.stoi
    word_index = vocab.stoi[word] - offset
    if not 0 <= word_index < len(embeddings):
        raise IndexError(
            "'{}' maps to row {} which is outside the embedding matrix".format(word, word_index))
    sims = pairwise.cosine_similarity(embeddings[word_index].reshape(1,-1),embeddings).ravel()
    indices = np.argsort(sims)[::-1]
    print(word)
    # Slicing caps topn at the number of available rows.
    for idx in indices[:max(topn, 0)]:
        print(vocab.itos[idx + offset], sims[idx])

In [None]:
nearest_neighbors(vocab, np_vectors, 'we\'ll', 10, True)

In [None]:
print(np_vectors.shape)

In [None]:
import gc
gc.collect()

In [None]:
def to_word_indices(tokens, vocab, start_index, end_index):
    """Convert tokenized sentences into id lists framed by start/end markers.

    Args:
        tokens: iterable of sentences, each an iterable of tokens.
        vocab: object whose ``stoi`` mapping is indexed with every token.
        start_index: id placed in front of every sentence.
        end_index: id appended to every sentence.

    Returns:
        A list with one ``[start_index, id_1, ..., id_n, end_index]`` list
        per sentence.
    """
    encoded = []
    for sentence in tokens:
        ids = [start_index]
        ids.extend(vocab.stoi[word] for word in sentence)
        ids.append(end_index)
        encoded.append(ids)
    return encoded

In [None]:
class TokenToIdDataset(torch.utils.data.Dataset):
    """Dataset of (token id list, label) pairs with left-padded batching.

    Every item is ``[<sos> id, token ids..., <eos> id]`` plus its label. The
    ids are either converted up front (``precompute=True``), supplied already
    converted (``precomputed=True``) or looked up on each access.

    Args:
        tokens: sequence of tokenized texts (or id lists if ``precomputed``).
        labels: sequence of labels, indexed positionally.
        vocab: object with a ``stoi`` mapping; ``'<sos>'``, ``'<eos>'`` and
            ``'<pad>'`` are looked up in it on construction.
        max_size: fixed padded length of a batch, or ``-1`` to use the longest
            text of each batch.
        min_size: lower bound for the padded length when ``max_size == -1``.
        precompute: convert ``tokens`` to ids in the constructor.
        precomputed: ``tokens`` already holds id lists.
    """

    def __init__(self, tokens, labels, vocab, max_size=-1, min_size=10, precompute=False, precomputed=False): #TODO should I precompute word indices?
        self._start_index = vocab.stoi['<sos>']
        self._end_index = vocab.stoi['<eos>']
        self._pad_index = vocab.stoi['<pad>']

        if precompute and not precomputed:
            tokens = to_word_indices(tokens, vocab, self._start_index, self._end_index)

        self._precomputed = precompute or precomputed
        self._tokens = tokens
        self._labels = labels
        print(len(self._labels))
        self._vocab = vocab
        self._len = len(tokens)
        self._max_size = max_size
        self._min_size = min_size
        self._cache = []
        self._batch_size = -1

    def __len__(self):
        return self._len

    def compute_cache(self, batch_size):
        """Collate the whole dataset, in order, into batches of ``batch_size``.

        Raises:
            ValueError: if ``batch_size`` is not positive (a zero size would
                otherwise never advance through the data).
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be positive, got {}".format(batch_size))
        self._cache = [
            self.collate([self[j] for j in range(start, min(start + batch_size, len(self)))])
            for start in range(0, len(self), batch_size)
        ]
        self._batch_size = batch_size

    def get_cache(self, batch_size):
        """Return the cached batches, rebuilding them if the batch size changed."""
        if not self._cache or self._batch_size != batch_size:
            self.compute_cache(batch_size)
        return self._cache

    def collate(self, samples):
        """Turn ``(ids, label)`` pairs into a padded id tensor and a label tensor.

        Texts longer than the target length keep only their first ids; shorter
        ones are padded on the left with the ``<pad>`` id.

        Returns:
            ``(texts, labels)``: a long tensor of shape ``(len(samples), size)``
            and a float32 tensor of shape ``(len(samples),)``.
        """
        if self._max_size == -1:
            size = max(self._min_size, max(len(s[0]) for s in samples))
        else:
            size = self._max_size

        labels_tensor = torch.tensor([s[1] for s in samples], dtype=torch.float32)
        texts_tensor = torch.full((len(samples), size), self._pad_index, dtype=torch.long)
        for i, (text, _) in enumerate(samples):
            text = text[:size]
            # Right-align the text so that padding sits in front of it.
            texts_tensor[i, size - len(text):] = torch.tensor(text, dtype=torch.long)

        return texts_tensor, labels_tensor

    def __getitem__(self, index):
        text = self._tokens[index]
        if self._precomputed:
            indices = text
        else:
            indices = [self._start_index]
            indices.extend(self._vocab.stoi[tok] for tok in text)
            indices.append(self._end_index)
        return indices, self._labels[index]
            

In [None]:
# Build the training and validation datasets. precompute=True converts all
# tokens to ids once, in the constructor. Labels are passed as plain arrays
# (`.values`) so that the dataset indexes them by position.
min_length=10
train_dataset = TokenToIdDataset(train_tokens,train_labels.values, vocab,min_size=min_length, precompute=True)
val_dataset = TokenToIdDataset(val_tokens, val_labels.values, vocab,min_size=min_length, precompute=True)

In [None]:
import gc
gc.collect()

In [None]:
import tqdm
from tqdm import tqdm_notebook

In [None]:
class BestModel:
    """Checkpoint of the best model seen so far, kept on disk.

    Args:
        model_path: file that receives the model ``state_dict``.
        optimizer_path: file that receives the optimizer ``state_dict``.
        best_loss: loss associated with the stored checkpoint; the default is
            a large sentinel.
    """

    def __init__(self, model_path, optimizer_path, best_loss=10000):
        self.best_loss = best_loss
        self.model_path = model_path
        self.optimizer_path = optimizer_path

    def update(self, loss, model, optimizer=None):
        """Record ``loss`` and save the model state (and the optimizer state, if given)."""
        self.best_loss = loss
        torch.save(model.state_dict(), self.model_path)
        if optimizer is not None:
            torch.save(optimizer.state_dict(), self.optimizer_path)

    def load(self, model, optimizer=None):
        """Restore the saved state into ``model`` (and ``optimizer``) and switch the model to eval mode."""
        model.load_state_dict(torch.load(self.model_path))
        if optimizer is not None:
            optimizer.load_state_dict(torch.load(self.optimizer_path))
        model.eval()

    def copy(self, new_model_path, new_optimizer_path):
        """Duplicate the checkpoint files and return a ``BestModel`` for the copies.

        The optimizer file is optional in ``update``, so it is only copied
        when it actually exists.
        """
        from os.path import exists
        from shutil import copyfile
        copyfile(self.model_path, new_model_path)
        if exists(self.optimizer_path):
            copyfile(self.optimizer_path, new_optimizer_path)
        return BestModel(new_model_path, new_optimizer_path, self.best_loss)

In [None]:
# Scratch check of batched matrix multiplication: each of the 4 matrices
# (2 x 3) is multiplied by its own transpose (3 x 2).
bat = torch.tensor([
     [[1,2,3],[4,5,6]],
     [[11,12,13],[14,15,16]],
     [[5,9,2],[11,3,9]],
     [[-1, 2,10],[10,4,5]]
])
print(bat.shape) # 4 2 3 
print(torch.matmul(bat,torch.transpose(bat,1,2)))

In [None]:
def train_network(network, optimizer,
                  criterion,
                  train_loader, val_loader,
                  n_epochs, patience,
                  best_model, after_gradient=None, lr_scheduler=None, schedule_per_batch=False):
    """Train ``network`` with per-epoch validation and early stopping.

    Args:
        network: model to train; batches are moved to the device of its first
            parameter (CUDA if it has no parameters).
        optimizer: optimizer stepping the network's parameters.
        criterion: loss called as ``criterion(output, y.view(-1, 1))``.
        train_loader, val_loader: sized iterables yielding ``(X, y)`` tensor pairs.
        n_epochs: maximum number of epochs.
        patience: number of non-improving epochs tolerated; training stops on
            the next non-improving epoch after those are used up.
        best_model: object with a ``best_loss`` attribute and an
            ``update(loss, model, optimizer)`` method; called whenever the
            validation loss improves.
        after_gradient: optional ``callable(epoch, network)`` invoked between
            ``backward()`` and ``optimizer.step()``.
        lr_scheduler: optional scheduler; stepped at the start of every epoch,
            or before every batch when ``schedule_per_batch`` is True.
        schedule_per_batch: see ``lr_scheduler``.

    Returns:
        None. The best weights are only saved through ``best_model``; they are
        not loaded back into ``network``.
    """
    first_param = next(network.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    attempts_left = patience
    for epoch in range(n_epochs):
        # Training mode
        network.train()
        if lr_scheduler and not schedule_per_batch:
            lr_scheduler.step()
        # At least 1, otherwise the modulo below divides by zero for loaders
        # with fewer than 10 batches.
        print_every = max(1, len(train_loader) // 10)
        running_train_loss = 0.0
        batches_since_last_print = 0
        for i, batch in tqdm_notebook(enumerate(train_loader), total=len(train_loader)):
            if lr_scheduler and schedule_per_batch:
                lr_scheduler.step()
            X, y = batch
            X, y = X.to(device), y.to(device)

            optimizer.zero_grad()

            output = network(X)
            loss = criterion(output, y.view(-1,1))

            running_train_loss += loss.item()
            batches_since_last_print += 1

            if i % print_every == print_every - 1:
                learning_rate = next(iter(optimizer.param_groups))['lr']
                print("Epoch {}, batch {}, loss {}, lr {}".format(epoch + 1,
                                                                  i + 1,
                                                                  running_train_loss / batches_since_last_print,
                                                                  learning_rate))
                running_train_loss = 0.0
                batches_since_last_print = 0

            loss.backward()

            if after_gradient:
                after_gradient(epoch, network)

            optimizer.step()

        network.eval()
        with torch.no_grad():
            running_val_loss = 0.0
            for i, batch in tqdm_notebook(enumerate(val_loader),total=len(val_loader)):
                X, y = batch
                X, y = X.to(device), y.to(device)

                output = network(X)
                loss = criterion(output, y.view(-1,1))

                running_val_loss += loss.item()

            running_val_loss /= len(val_loader)
            print("Epoch {}, validation loss {}".format(epoch + 1, running_val_loss))

            if running_val_loss < best_model.best_loss:
                print('Improved from {} to {}'.format(best_model.best_loss, running_val_loss))
                best_model.update(running_val_loss, network, optimizer)
                attempts_left = patience
            elif attempts_left > 0:
                print('No improvement, attempts left = {}'.format(attempts_left))
                attempts_left -= 1
            else:
                print('Early stopping, best_weight are saved')
                break
            

In [None]:
class GenericAttention(nn.Module):
    """Attention layer assembled from pluggable scoring and projection parts.

    Args:
        scoring_function: callable mapping ``(query, keys)`` of shapes
            ``(B, Nq, D)`` and ``(B, Nk, D)`` to scores of shape ``(B, Nq, Nk)``.
        query_transform, key_transform, value_transform: callables applied to
            the respective input before scoring; each must return a 3-D tensor.
    """

    def __init__(self, scoring_function, query_transform, key_transform, value_transform):
        super().__init__()
        self.scoring_function = scoring_function
        self.query_transform = query_transform
        self.key_transform = key_transform
        self.value_transform = value_transform

    def forward(self, query, keys, values):
        """Attend from ``query`` over ``keys`` / ``values``.

        Returns:
            ``(alignments, attention_weights)`` of shapes ``(B, Nq, Dv)`` and
            ``(B, Nq, Nk)``; the weights sum to 1 over the key axis.
        """
        query = self.query_transform(query)
        keys = self.key_transform(keys)
        values = self.value_transform(values)
        batch_size, output_length, _ = query.size()
        _, keys_length, _ = keys.size()

        # B Nq Dq ^ B Nk Dk = B, Nq, Nk
        attention_scores = self.scoring_function(query, keys)

        at_b, at_q, at_k = attention_scores.size()

        assert at_b == batch_size
        assert at_q == output_length
        # The key axis of the scores must match the number of keys (the query
        # axis is already checked above).
        assert at_k == keys_length

        attention_weights = F.softmax(attention_scores, dim=2)

        # B Nq, Nk X B Nk Dv = B Nq Dv
        alignments = torch.bmm(attention_weights, values)

        return alignments, attention_weights
        

In [None]:
class DotProductAttentionScoring(nn.Module):
    """Dot-product attention scores divided by ``sqrt(scale)``."""

    def __init__(self, scale):
        super().__init__()
        # Stored already square-rooted; forward() divides by it directly.
        self.scale = np.sqrt(scale)

    def forward(self, query, keys):
        """Score every query position against every key position.

        Args:
            query: tensor of shape ``(B, Nq, D)``.
            keys: tensor of shape ``(B, Nk, D)``.

        Returns:
            Tensor of shape ``(B, Nq, Nk)``.
        """
        query_batch, _, _ = query.size()
        keys_batch, _, _ = keys.size()

        assert query_batch == keys_batch

        keys_t = torch.transpose(keys, 1, 2)  # B, D, Nk
        return torch.bmm(query, keys_t) / self.scale

In [None]:
class TimeInvariant(nn.Module):
    """Apply a module that expects 2-D input to every time step of a 3-D tensor.

    Args:
        inner: module mapping ``(batch * time, dim)`` to ``(batch * time, out_dim)``.
    """

    def __init__(self, inner):
        super().__init__()
        self.inner = inner

    def forward(self, data):
        """Map ``(batch, time, dim)`` to ``(batch, time, out_dim)``."""
        batch, time, dim = data.size()
        # reshape (unlike view) also accepts non-contiguous inputs, e.g. the
        # result of a transpose.
        result = self.inner(data.reshape(batch * time, dim))
        return result.reshape(batch, time, -1)
    

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a ``(B, N, D)`` tensor.

    Args:
        d_model: size of the last dimension of the inputs (even or odd).
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term  # pos, ceil(dim / 2)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)  # 1, pos, dim
        # A buffer follows the module across devices and is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]  # x B,N,D
    

In [None]:
class SelfAttentionNet(nn.Module):
    """Single-layer self-attention classifier over word embeddings.

    Pipeline of ``forward``: embedding lookup (300-d) -> channel dropout ->
    concatenation with a 50-d positional encoding -> one self-attention layer
    -> max over the sequence axis -> MLP producing one value per sample.

    Args:
        vocab: object with ``itos`` (its length sets the embedding table size)
            and ``stoi`` (used to find the ``'<pad>'`` id).
        embeddings: NumPy array copied into embedding rows ``num_trainable:``.
            # assumes shape (len(vocab.itos) - num_trainable, 300) — TODO confirm at call site
        num_trainable: first embedding row that receives a pretrained vector.
    """
    
    def __init__(self, vocab, embeddings, num_trainable):
        super().__init__()
        # 50-d positional encoding precomputed for 120 positions.
        self.pos_encoding = PositionalEncoding(50,120)
        self.num_trainable = num_trainable
        self.embedding = nn.Embedding(num_embeddings=len(vocab.itos), embedding_dim=300,padding_idx=vocab.stoi['<pad>'])
        # Rows below num_trainable keep their random initialisation.
        self.embedding.weight.data[num_trainable:].copy_(torch.from_numpy(embeddings))
#         self.drop_emb = nn.AlphaDropout(p=0.5)
        self.drop_emb = nn.Dropout2d(p=0.5)
        self.pretrained_projection = nn.Sequential(
            nn.Linear(300,300))
#             nn.ReLU(),
#             nn.Linear(300,300))


#         mlp_transform = TimeInvariant(
#             nn.Sequential(
#                 nn.Linear(300, 500),
# #                 nn.BatchNorm1d(500),
#                 nn.ReLU(),
#                 nn.Linear(500, 500)
#             )
#         )
        
        # Input width 350 = 300 (embedding) + 50 (positional encoding), see embed().
        key_transform = TimeInvariant(
            nn.Sequential(
                nn.Linear(350,300),
#                 nn.BatchNorm1d(300),
                nn.ReLU(),
                nn.Linear(300,300)
            ))
        query_transform = TimeInvariant(
            nn.Sequential(
                nn.Linear(350,300),
#                 nn.BatchNorm1d(300),
                nn.ReLU(),
                nn.Linear(300,300)
            ))
        value_transform = TimeInvariant(
            nn.Sequential(
                nn.Linear(350,300),
#                 nn.BatchNorm1d(300),
                nn.ReLU(),
                nn.Linear(300,300)
            ))
        # TODO: apply a mask to padding positions? (none is applied at the moment)
        attention_scoring = DotProductAttentionScoring(300.) # TODO: verify this scale
        self.attention1 = GenericAttention(attention_scoring, query_transform, key_transform,value_transform)
        
                
        self.fc = nn.Sequential(
            nn.Dropout(p=0.2),
#             nn.BatchNorm1d(300),
            nn.Linear(300,500),
            nn.BatchNorm1d(500),
            nn.ReLU(),
            nn.Linear(500, 1)
        )
        
    
    def zero_embedding_grad(self):
        """Zero the gradient of embedding rows ``num_trainable:`` (the pretrained block).

        Must be called after ``backward()``; nothing in this class calls it.
        """
        self.embedding.weight.grad[self.num_trainable:] = 0
        
    def set_emb_dropout(self, p):
        """Store ``p`` as ``self.emb_dropout``.

        NOTE(review): no active code in this class reads ``emb_dropout``; only a
        commented-out line refers to it.
        """
        self.emb_dropout = p
        
    def apply_spatial_dropout(self,dropout_layer, x):
        """Run ``dropout_layer`` on ``x`` (B, N, C) rearranged to (B, C, N, 1).

        Returns a tensor with the original (B, N, C) layout.
        """
        x = torch.transpose(x,1,2) # B C N
        x = x.view(x.shape[0], x.shape[1], x.shape[2], 1) # B C N 1
        x = dropout_layer(x)
        x = torch.squeeze(x,dim=3)
        
        x = torch.transpose(x,1,2) # B N C
        return x
    
    def get_attention_weights(self, x):
        """Return the attention weights that ``attention1`` produces for the id batch ``x``."""
        x = self.embed(x)
        x,weights = self.attention1(x, x, x) # B N C
        return weights
        
    def mixed_embedding(self, x):
        """Look up embeddings and pass a subset of them through ``pretrained_projection``.

        NOTE(review): the mask selects ids *below* ``num_trainable``, whereas
        ``__init__`` copies the pretrained vectors into rows *from*
        ``num_trainable`` on. Despite its name, ``pretrained_mask`` therefore
        appears to select the non-pretrained rows — confirm which direction is
        intended.
        """
        # x = B N
        e = self.embedding(x)
        # B N
        pretrained_mask = x < self.num_trainable
        pretrained = e.view(-1,300)[pretrained_mask.view(-1)]
        # In-place write into the flattened view of e.
        e.view(-1,300)[pretrained_mask.view(-1)] = self.pretrained_projection(pretrained)
        return e
        
    def embed(self, x):
        """Map ids (B, N) to features (B, N, 350): embeddings plus positional encoding."""
        x = self.mixed_embedding(x) # B, N, C
        x = self.apply_spatial_dropout(self.drop_emb, x)
        
        # Adding the encoding to zeros yields the raw positional encoding, which
        # is concatenated to (not summed with) the word embeddings.
        pe = torch.zeros(x.size(0), x.size(1), 50,device=x.device)
        pe = self.pos_encoding(pe)
        x = torch.cat((x,pe), dim=2)
        return x


#         x = torch.cat((x,pe), dim=2)
#         x = self.pos_encoding(x)
#         x = F.dropout(x, p=self.emb_dropout,training=self.training)

    def forward(self, x):
        """Return one value per sample, shape (B, 1), for the id batch ``x`` of shape (B, N)."""
#         print('x', x.size())
        x = self.embed(x) # B, N, C

        x,_ = self.attention1(x, x, x) # B N C
#         x = self.drop_attention1(x)
#         x = self.attention_result_transform(x)        
#         x = self.attention2(x, x, x)

        # Max-pool over the sequence axis.
        reduced,_ = torch.max(x, dim=1)
        return self.fc(reduced)
        
        
        

In [None]:
class Net(nn.Module):
    """Two stacked bidirectional GRUs with max-pooling over a 300-d embedding table.

    Embedding rows ``[num_trainable:]`` are initialised from ``embeddings``;
    the rows before them keep ``nn.Embedding``'s default initialisation.
    ``zero_embedding_grad`` can be called after ``backward()`` to stop the
    copied rows from being updated.

    Args:
        vocab: object exposing ``itos`` (its length sets the table size) and
            ``stoi`` (must map ``'<pad>'`` to the padding index).
        embeddings: NumPy array holding one 300-d vector per row for the table
            rows from ``num_trainable`` onward.
        num_trainable: index of the first row filled from ``embeddings``.
        normalize: if true, divide every row of ``embeddings`` by its L2 norm
            before copying it into the table.
    """

    def __init__(self, vocab, embeddings, num_trainable, normalize=False):
        super().__init__()
        # An earlier variant kept the pre-trained vectors in a separate frozen
        # nn.Embedding (open question at the time: random initialisation?).
        # The single table below replaced it.
        self.num_trainable = num_trainable

        self.embedding = nn.Embedding(
            num_embeddings=len(vocab.itos),
            embedding_dim=300,
            padding_idx=vocab.stoi['<pad>'],
        )
        if normalize:
            row_norms = np.linalg.norm(embeddings, axis=1).reshape(-1, 1)
            embeddings = embeddings / row_norms
        pretrained_rows = self.embedding.weight.data[num_trainable:]
        pretrained_rows.copy_(torch.from_numpy(embeddings))
        self.drop_emb = nn.Dropout2d(p=0.5)

        # Both GRUs are bidirectional with 150 hidden units, i.e. 300 output features.
        self.rnn1 = nn.GRU(input_size=300, hidden_size=150, num_layers=1, bidirectional=True)
        self.drop_rnn = nn.Dropout(p=0.2)
        self.rnn2 = nn.GRU(input_size=300, hidden_size=150, num_layers=1, bidirectional=True)
        self.fc = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(300, 1),
        )

    def _embed_time_major(self, x):
        """Embed ids ``x`` (B, N), apply channel-wise dropout, return (N, B, C)."""
        channels_first = self.embedding(x).transpose(1, 2).unsqueeze(3)  # B C N 1
        dropped = self.drop_emb(channels_first).squeeze(3)  # B C N
        return dropped.transpose(1, 2).transpose(0, 1)  # B N C -> N B C

    def embed(self, x):
        """Plain embedding lookup, without dropout."""
        return self.embedding(x)

    def zero_embedding_grad(self):
        """Zero the gradient of the embedding rows from ``num_trainable`` onward."""
        self.embedding.weight.grad[self.num_trainable:].zero_()

    def get_hidden(self, x):
        """Return the per-timestep outputs of the first GRU, shaped (N, B, 300)."""
        states, _ = self.rnn1(self._embed_time_major(x))
        return states

    def forward(self, x):
        """Return one logit per example, shape (B, 1), for token ids ``x`` (B, N)."""
        output, _ = self.rnn1(self._embed_time_major(x))  # N, B, 300
        output = self.drop_rnn(output)
        # Residual connection around the second recurrent layer.
        output = output + self.rnn2(output)[0]
        # Max over the time axis (axis 0 in this time-major layout) -> B, 300.
        pooled = output.max(dim=0)[0]
        return self.fc(pooled)

In [None]:
num_scratch = len(specials) + len(trainable_words)

In [None]:
def test_shape_1():
    """Smoke-test ``Net``: build it on the GPU and print a ``summary`` for ``min_length`` long-typed ids."""
    model = Net(vocab, np_vectors, num_scratch).cuda().eval()
    summary(model, (min_length,), input_type=torch.long)
    
def test_shape():
    """Smoke-test ``SelfAttentionNet`` in eval mode and print a ``summary`` for batch size 32."""
    model = SelfAttentionNet(vocab, np_vectors, num_scratch).cuda().eval()
    summary(model, (min_length,), batch_size=32, input_type=torch.long)
    
def test_shape2():
    """Print a ``summary`` of a fresh ``SelfAttentionNet`` left in training mode.

    Same as ``test_shape`` but without ``net.eval()``. A bare
    ``net.mixed_embedding()`` call used to sit here; it raised TypeError
    because the method requires a batch of token ids, so it was removed.
    ``forward`` reaches ``mixed_embedding`` through ``embed`` anyway, so it is
    presumably still exercised when ``summary`` runs the model.
    """
    net = SelfAttentionNet(vocab, np_vectors, num_scratch).cuda()
    summary(net, (min_length,), batch_size=32, input_type=torch.long)

test_shape_1()

In [None]:
net = Net(vocab, np_vectors, num_scratch).cuda()
best_model = BestModel('best_model', 'best_optimizer')

In [None]:
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.])).cuda()
optimizer = torch.optim.Adam(net.parameters(), lr=0.0003)

In [None]:
# scheduler = CyclicLR(optimizer,base_lr=0.0001,max_lr=0.006,step_size_up=9184)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=6,gamma=0.2)

In [None]:
def truncate_batch(batch, max_length=100):
    """Clip the second axis of a collated ``(x, y)`` batch to ``max_length`` positions.

    Args:
        batch: pair ``(x, y)``. ``x`` must expose ``.shape`` and support
            ``x[:, :k]`` slicing; ``y`` is passed through untouched.
        max_length: maximum number of positions kept along axis 1. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        ``(x, y)`` where ``x`` keeps its first ``max_length`` positions. When
        ``x`` is already short enough the very same object is returned.
    """
    x, y = batch
    if x.shape[1] > max_length:
        x = x[:, :max_length]
    return x, y

In [None]:
# Both loaders collate samples with the dataset's own `collate` and then pass the
# batch through `truncate_batch`. Only the training loader is shuffled; later cells
# line validation predictions up with `val_labels` / `val_tokens` by position.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=256,
                                           collate_fn=lambda samples: truncate_batch(train_dataset.collate(samples)), shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=256,collate_fn=lambda samples: truncate_batch(val_dataset.collate(samples)))
# train_loader = train_dataset.get_cache(256)
# val_loader = val_dataset.get_cache(256)

In [None]:
print(128 * 112 * 112 * 300 * 4 / 1024 / 1024)

In [None]:
train_network(net, optimizer,criterion,train_loader,val_loader,16, 3,best_model, # lr_scheduler=scheduler,
              after_gradient=lambda epoch, network: network.zero_embedding_grad())
best_model.load(net, optimizer)

# train_network(net, optimizer,criterion,train_loader,val_loader,16, 4,best_model,lr_scheduler=scheduler)

In [None]:
train_network(net, optimizer,criterion,train_loader,val_loader,10, 3,best_model, # lr_scheduler=scheduler,
              after_gradient=lambda epoch, network: network.zero_embedding_grad())
best_model.load(net, optimizer)


In [None]:
best_model.load(net, optimizer)

In [None]:
best_model_with_fixed_embeddings = best_model.copy('best_model_fixed', 'best_optimizer_fixed')

In [None]:
best_model_with_fixed_embeddings.load(net, optimizer)

In [None]:
# Fine-tuning stage: restore the checkpoint, then overwrite the learning rate of
# every optimizer param group. The lr is set after load() on purpose -- presumably
# load() also restores optimizer state (TODO confirm against BestModel).
best_model_with_fixed_embeddings.load(net, optimizer)
for g in optimizer.param_groups:
    g['lr'] = 0.00001
# The after_gradient callback zeroes the gradients of embedding rows >= num_trainable.
train_network(net, optimizer,criterion,train_loader,val_loader,10, 3,best_model_with_fixed_embeddings, # lr_scheduler=scheduler,
              after_gradient=lambda epoch, network: network.zero_embedding_grad())
# Roll back to the checkpoint held by this tracker.
best_model_with_fixed_embeddings.load(net, optimizer)

In [None]:
# Next stage: copy the previous checkpoint tracker under new names and continue
# from it with lr = 3e-5.
best_model2 = best_model_with_fixed_embeddings.copy('best_model_low', 'best_optimizer_low')
best_model2.load(net, optimizer)
for g in optimizer.param_groups:
    g['lr'] = 0.00003
# NOTE(review): no after_gradient callback is passed here, so presumably every
# embedding row is updated in this stage -- confirm train_network's default.
train_network(net, optimizer,criterion,train_loader,val_loader,10, 3,best_model2)
best_model2.load(net, optimizer)

In [None]:
best_model2.load(net, optimizer)

In [None]:
# # net.set_emb_dropout(0.5)
train_network(net, optimizer,criterion,train_loader,val_loader,10, 3,best_model)
best_model.load(net, optimizer)

In [None]:
# for g in optimizer.param_groups:
#     g['lr'] = 0.0001

In [None]:
best_model.load(net, optimizer)

In [None]:
# for g in optimizer.param_groups:
#     g['lr'] = 0.00003

In [None]:
# train_network(net, optimizer,criterion,train_loader,val_loader,16, 3,best_model)

In [None]:
def predict_loader(network, loader):
    """Return sigmoid probabilities for every example served by ``loader``.

    The network is switched to eval mode and run without gradient tracking on
    the GPU. Targets yielded by the loader are ignored.

    Returns:
        1-D NumPy array of probabilities, in the order the loader yields them.
    """
    network.eval()
    probabilities = []
    with torch.no_grad():
        for inputs, _ in tqdm_notebook(loader, total=len(loader)):
            logits = network(inputs.cuda()).view(-1)
            probabilities += torch.sigmoid(logits).tolist()

    return np.array(probabilities)

In [None]:
network_pred = predict_loader(net, val_loader)

In [None]:
print(net.embedding.weight.requires_grad)

In [None]:
prc = metrics.precision_recall_curve(val_labels.values, network_pred, pos_label=1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.xlabel('precision')
plt.ylabel('recall')
plt.plot(prc[0], prc[1])
plt.show()

In [None]:
exact_predictions = np.round(network_pred)
print(metrics.classification_report(val_labels.values, exact_predictions))

In [None]:
def try_different_thresholds(y_true, network_pred):
    """Compute the F1 score at every decision threshold 0.00, 0.01, ..., 0.99.

    A prediction counts as positive when its probability is strictly greater
    than the threshold.

    Returns:
        ``(thresholds, scores)``: two NumPy arrays of equal length.
    """
    thresholds = np.arange(0, 1, 0.01)
    scores = np.array([
        metrics.f1_score(y_true, (network_pred > cutoff).astype(np.int8))
        for cutoff in thresholds
    ])
    return thresholds, scores

In [None]:
thresholds, f1_scores = try_different_thresholds(val_labels.values, network_pred)

Результат со spatial dropout

In [None]:
max_f1_index = np.argmax(f1_scores)
best_threshold = thresholds[max_f1_index]
print(best_threshold)
print(f1_scores[max_f1_index])

In [None]:
def thresholded_predictions(probs, threshold):
    """Turn probabilities into 0/1 labels (``int8``): 1 where ``probs`` is strictly above ``threshold``."""
    is_positive = probs > threshold
    return is_positive.astype(np.int8)

In [None]:
exact_predictions = thresholded_predictions(network_pred, best_threshold)
print(metrics.classification_report(val_labels.values, exact_predictions))

In [None]:
print(metrics.confusion_matrix(val_labels.values, exact_predictions))

In [None]:
def test_weights(network):
    """Return, on the CPU, the attention weights for the first validation batch.

    Along the way this prints the ids and tokens of the example at index 10
    and the size of the weight tensor.

    NOTE(review): ``network`` must provide ``get_attention_weights``; the
    ``Net`` class in this notebook does not define it.
    """
    network.eval()
    with torch.no_grad():
        first_batch = next(iter(val_loader))
        xx, yy = first_batch
        xx, yy = xx.cuda(), yy.cuda()
        example = xx[10]
        print(example, [vocab.itos[w] for w in example])
        weights = network.get_attention_weights(xx)
        print(weights.size())
        return weights.cpu()
    


In [None]:
def show_attention(sent_index, attn, sentences):
    """Plot the attention matrix of one sentence as a grey-scale heat map.

    Args:
        sent_index: position of the sentence in both ``attn`` and ``sentences``.
        attn: tensor of attention matrices, converted with ``.numpy()``.
        sentences: token lists; tokens missing from ``filtered_counts`` are
            labelled with an ``(unk)`` suffix, and ``'</S>'`` is appended.
    """
    matrix = attn.numpy()[sent_index]
    labels = [w if w in filtered_counts else w + '(unk)' for w in sentences[sent_index]]
    labels = labels + ['</S>']
    size = len(labels)
    tick_positions = np.arange(0, size)

    fig = plt.figure(figsize=(18, 18))
    ax = fig.add_subplot(111)
    # Only the trailing size x size corner is shown -- assumes sequences are
    # padded on the left so the real tokens sit at the end (TODO confirm).
    window = matrix[-size:, -size:]
    print(window[0])
    image = ax.matshow(window, vmin=window.min(), vmax=window.max(), cmap=plt.cm.Greys)
    fig.colorbar(image)
    for set_ticks, set_labels in ((ax.set_xticks, ax.set_xticklabels),
                                  (ax.set_yticks, ax.set_yticklabels)):
        set_ticks(tick_positions)
        set_labels(labels)
    plt.show()

    
    

In [None]:
print(val_tokens[9])

In [None]:
# NOTE(review): as far as this section shows, `net` is an instance of `Net`, which
# defines no `get_attention_weights`; `test_weights(net)` would then raise
# AttributeError. Rebind `net` to a SelfAttentionNet before running this cell.
attn = test_weights(net)
# 256 matches the validation loader's batch size, so index 15 refers to the same
# example in `attn` and in `val_tokens` (the validation loader is not shuffled).
show_attention(15,attn,val_tokens[:256])

In [None]:
def print_examples(questions, labels, y_pred, sample_size=1000, per_group=10):
    """Print example questions for each confusion-matrix cell (TP, FP, TN, FN).

    A random subset of the data is drawn without replacement (using NumPy's
    global RNG) and up to ``per_group`` questions per cell are printed.

    Args:
        questions: sequence of question texts.
        labels: ground-truth labels, 1 marking the positive class.
        y_pred: hard predictions, 1 marking a positive prediction.
        sample_size: number of examples to draw; clamped to the number of
            available examples, because ``np.random.choice(..., replace=False)``
            raises ValueError when asked for more items than exist.
        per_group: maximum number of questions printed per cell.
    """
    assert len(questions) == len(labels) == len(y_pred)
    # Positional fancy indexing below needs arrays, not lists or pandas Series.
    questions, labels, y_pred = np.asarray(questions), np.asarray(labels), np.asarray(y_pred)

    n_sampled = min(sample_size, len(questions))
    sample = np.random.choice(len(questions), size=n_sampled, replace=False)
    questions, labels, y_pred = questions[sample], labels[sample], y_pred[sample]

    positive_pred = y_pred == 1
    negative_pred = ~positive_pred
    positives = labels == 1
    negatives = ~positives

    groups = (
        ('TP', positive_pred & positives),
        ('FP', positive_pred & negatives),
        ('TN', negative_pred & negatives),
        ('FN', negative_pred & positives),
    )
    for name, mask in groups:
        print(name, ":")
        for q in questions[mask][:per_group]:
            print("\t", q)
        print()
    
    

In [None]:
# NOTE(review): `iloc` is positional while `val_labels.index` holds index labels;
# this only selects the right rows if `quora_data` has the default 0..n-1 index.
# Verify, or switch to `.loc`.
val_questions = quora_data.iloc[list(val_labels.index)].question_text.values

In [None]:
print_examples(val_questions, val_labels.values, exact_predictions)

In [None]:
print(net.embedding.weight.data)

In [None]:
tuned_embeddings = net.embedding.weight.data.cpu().numpy()

In [None]:
print(tuned_embeddings.shape)

In [None]:
from sklearn import decomposition

In [None]:
def get_trajectory(batch):
    """Project the first GRU's per-timestep states of ``batch`` onto 2 PCA axes.

    Uses the global ``net``. The network is put into eval mode first so that
    dropout does not perturb the hidden states or the probabilities.

    Args:
        batch: tensor of token ids accepted by ``net`` (moved to the GPU here).

    Returns:
        ``(points, probs)``: ``points`` is a NumPy array shaped
        (batch, timesteps, 2) and ``probs`` holds the sigmoid output of the
        network, one value per example.
    """
    X = batch.cuda()
    net.eval()
    with torch.no_grad():
        states = net.get_hidden(X)  # timesteps, batch, hidden
        # torch.sigmoid replaces the deprecated F.sigmoid.
        probs = torch.sigmoid(net(X)).view(-1).cpu().numpy()
    print(states.shape)
    # Read the feature width from the tensor: the hard-coded 192 that used to
    # be here did not match Net's 2 * 150 GRU outputs and broke the final reshape.
    timesteps, batch_size, hidden_size = states.shape
    states = torch.transpose(states, 0, 1)  # example, t, h
    pca = decomposition.PCA(2)
    pca_points = pca.fit_transform(states.cpu().reshape(-1, hidden_size).numpy())
    print(pca.explained_variance_)
    return pca_points.reshape(batch_size, timesteps, 2), probs

In [None]:
def display(pca_points, probs, tokens, grid: tuple, offset=0):
    """Draw hidden-state trajectories for a grid of examples.

    Each panel shows one example's 2-D trajectory as arrows between
    consecutive timesteps, coloured ``'indianred'`` when the example's
    probability exceeds the global ``best_threshold`` and black otherwise.
    Tokens are listed to the right of the panel and their index is written at
    the matching trajectory point.

    Args:
        pca_points: array shaped (examples, timesteps, 2).
        probs: per-example probabilities.
        tokens: per-example token lists.
        grid: ``(rows, cols)`` of the subplot grid.
        offset: index of the first example to draw.
    """
    n_rows, n_cols = grid
    total = n_rows * n_cols
    # figsize is (width, height): width scales with columns, height with rows.
    fig = plt.figure(figsize=(n_cols * 8, n_rows * 8))
    # Limits are taken over all points so every panel shares the same scale.
    xmin, ymin = pca_points.reshape(-1, 2).min(axis=0)
    xmax, ymax = pca_points.reshape(-1, 2).max(axis=0)
    pca_points = pca_points[offset:offset + total]
    tokens = tokens[offset:offset + total]
    probs = probs[offset:offset + total]

    color_seq = ['red', 'green', 'blue', 'gray', 'darkviolet', 'olive']
    # The slices may hold fewer examples than the grid has cells.
    n_panels = min(total, len(pca_points), len(tokens), len(probs))
    for i in range(n_panels):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        x = pca_points[i, :, 0]
        y = pca_points[i, :, 1]
        classified = probs[i] > best_threshold
        ax.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1],
                  scale_units='xy', angles='xy', scale=1,
                  color=('indianred' if classified else 'black'))
        ax.set_xlim(xmin - 0.5, xmax + 1.5)
        ax.set_ylim(ymin - 0.5, ymax + 0.5)
        xtext = xmax + 2
        ytext = ymax - 0.5
        color_cycle = itertools.cycle(color_seq)
        # Trailing len(tokens)+1 points -- assumes sequences are padded on the
        # left and carry one extra end marker (TODO confirm).
        tail = len(tokens[i]) + 1
        sx, sy = x[-tail:], y[-tail:]
        # zip stops early if the sequence was truncated below the token count.
        for j, (token, xx, yy) in enumerate(zip(tokens[i], sx, sy)):
            c = next(color_cycle)
            ax.annotate(str(j + 1) + '.' + token, (xx, yy), (xtext, ytext), color=c)
            ax.annotate(str(j + 1), (xx, yy), color=c)
            ytext -= 0.5

    plt.show()

In [None]:
batch, _ = next(iter(val_loader))
pca, probs = get_trajectory(batch)
toks = val_tokens[:len(batch)]


In [None]:
import matplotlib.colors

In [None]:
cdict = {'red':  ((0.0, 0.0, 0.0),
                 (1/6., 0.0, 0.0),
                 (1/2., 0.8, 1.0),
                 (5/6., 1.0, 1.0),
                 (1.0, 0.4, 1.0)),

             'green':  ((0.0, 0.0, 0.4),
                 (1/6., 1.0, 1.0),
                 (1/2., 1.0, 0.8),
                 (5/6., 0.0, 0.0),
                 (1.0, 0.0, 0.0)),

             'blue': ((0.0, 0.0, 0.0),
                 (1/6., 0.0, 0.0),
                 (1/2., 0.9, 0.9),
                 (5/6., 0.0, 0.0),
                 (1.0, 0.0, 0.0))

    }

cmap=matplotlib.colors.LinearSegmentedColormap('rg',cdict, N=256)

In [None]:
plt.scatter(pca[:,-1,0],pca[:,-1,1],c=probs, cmap=cmap)
plt.show()

In [None]:
display(pca, probs, toks, (2,2), offset=132)

In [None]:
sel_word = 'iq'
nearest_neighbors(vocab, np_vectors, sel_word, 10, True)
print('\nafter_training\n')
nearest_neighbors(vocab, tuned_embeddings, sel_word, 10, False)

In [None]:
def ngram_neighbors(conv_layer, vocab, vectors, idx):
    """Print the 10 words most cosine-similar to each row of one transposed filter.

    Filter ``idx`` is taken from ``conv_layer.weight`` and transposed so that
    each row can be compared with ``vectors`` -- assumes a weight layout of
    (filters, embedding_dim, width), so that rows are kernel positions
    (TODO confirm).
    """
    kernel = conv_layer.weight.data[idx].cpu().numpy().T
    sims = pairwise.cosine_similarity(kernel, vectors)
    # Column indices sorted by descending similarity, one row per kernel row.
    ranking = np.argsort(sims, axis=1)[:, ::-1]
    for position, ordered in enumerate(ranking):
        print(position)
        for rank in range(10):
            word_index = ordered[rank]
            print(rank + 1, vocab.itos[word_index], sims[position, word_index])
    

In [None]:
# ngram_neighbors(net.conv_block_width3[0], vocab,tuned_embeddings,0)
# ngram_neighbors(net.conv_block_width3[0], vocab,tuned_embeddings,200)

In [None]:
test_dataset = TokenToIdDataset(test_tokens,np.broadcast_to(np.zeros(1),shape=(len(test_tokens,))),vocab,min_size=25, precompute=True)

In [None]:
test_loader = torch.utils.data.DataLoader(test_dataset,batch_size=256,collate_fn=test_dataset.collate)

In [None]:
network_test_pred = predict_loader(net, test_loader)

In [None]:
exact_test_predictions = thresholded_predictions(network_test_pred, best_threshold) #Use different thresholds

In [None]:
submission = pd.DataFrame({'qid': quora_test_data.qid, 'prediction': exact_test_predictions})

In [None]:
submission.head(5)

In [None]:
print(quora_test_data.head(5))

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
with open('submission.csv') as f:
    for line in itertools.islice(f,0,5):
        print(line)

In [None]:
exact_test_predictions = thresholded_predictions(network_test_pred, 0.5)
submission = pd.DataFrame({'qid': quora_test_data.qid, 'prediction': exact_test_predictions})
submission.to_csv('submission_05.csv', index=False)