In [1]:
import time
import random
import os
from IPython.display import display
import numpy as np
import pandas as pd
from scipy import stats
import warnings

import torch
import torch.nn as nn
import torch.utils.data

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore", message="F-score is ill-defined and being set to 0.0 due to no predicted samples.")
%matplotlib inline

# Load the raw train/test CSVs from the relative ../input directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
# Print the (rows, columns) shape of each frame as a quick sanity check.
print('Train data dimension: ', train_df.shape)
print('Test data dimension: ', test_df.shape)

Using TensorFlow backend.


Train data dimension:  (1306122, 3)
Test data dimension:  (56370, 2)


In [2]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, PyTorch (CPU and the current CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


# Seed once at definition time so the cells below start from a known state.
seed_everything()

def threshold_search(y_true, y_proba):
    """Grid-search the decision threshold that maximises F1.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in ascending order; a later
    threshold only wins when its F1 is strictly greater than the best so far.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Predicted scores, compared against each threshold with ``>``.

    Returns:
        dict with keys ``'threshold'`` and ``'f1'`` (both 0 if no threshold
        ever scores above zero).
    """
    best = {'threshold': 0, 'f1': 0}
    for step in tqdm(range(100), disable=True):
        candidate = step * 0.01
        candidate_f1 = f1_score(y_true=y_true, y_pred=y_proba > candidate)
        if candidate_f1 > best['f1']:
            best = {'threshold': candidate, 'f1': candidate_f1}
    return best

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x), element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [3]:
embed_size = 300  # width of each embedding vector (column count of the embedding matrices)
max_features = 95000  # vocabulary cap: passed to Tokenizer(num_words=...) and used as embedding row count
maxlen = 70  # fixed sequence length that pad_sequences pads/truncates every question to

In [4]:
import re

# Characters that clean_text() surrounds with spaces so each becomes its own token.
# Every entry is a single character; besides ASCII punctuation the list holds
# typographic symbols and a few accented letters (presumably mojibake remnants
# such as 'Â' / 'Ã' — TODO confirm that splitting them out is intended).
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_numbers(x):
    """Mask every run of two or more ASCII digits with '#' characters.

    A run of 2, 3 or 4 digits becomes the same number of '#'; any run of five
    or more digits collapses to exactly '#####'. Single digits are untouched.

    Args:
        x: Input string.

    Returns:
        The string with digit runs masked.
    """
    def _mask(run):
        # Cap at five so arbitrarily long numbers share one token.
        return '#' * min(len(run.group(0)), 5)

    return re.sub('[0-9]{2,}', _mask, x)

# Replacement table used by replace_typical_misspell(): contraction -> expansion,
# British -> American spelling, and common typos / fused words -> corrected form.
# NOTE(review): both preprocessing loops lowercase the text before applying this
# table, so keys containing capitals ("I'd", "Qoura", "doI", ...) presumably never
# match there — confirm whether that is intended.
mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": 
"we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def clean_text(x):
    """Pad every character listed in ``puncts`` with a space on each side.

    Args:
        x: Any value; it is converted with ``str()`` first.

    Returns:
        The string with each listed character rewritten as ``' <char> '``.
    """
    text = str(x)
    for mark in puncts:
        # str.replace is a no-op when the mark is absent, so no membership test is needed.
        text = text.replace(mark, f' {mark} ')
    return text

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Build the lookup table and its compiled pattern once, at definition time.
mispellings, mispellings_re = _get_mispell(mispell_dict)


def replace_typical_misspell(text):
    """Replace every match of ``mispellings_re`` in ``text`` with its mapped value.

    Args:
        text: Input string.

    Returns:
        The string with each matched key swapped for ``mispellings[key]``.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

In [5]:
# Keep independent copies of the raw frames for the second ("_a") preprocessing
# variant; the following cell overwrites question_text on train_df / test_df.
train_df_a = train_df.copy()
test_df_a = test_df.copy()

In [6]:
%%time
# Variant 1: lowercase -> pad punctuation -> mask numbers -> expand misspellings.
# NOTE(review): clean_text() has already spaced out apostrophes at this point, so
# contraction keys such as "don't" presumably no longer match in the last step;
# the "_a" variant in the next cell runs the replacement first.
for df in [train_df, test_df]:
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the df[...] selection, which stops updating the frame under pandas
    # Copy-on-Write.
    df["question_text"] = df["question_text"].fillna("_##_")
    df["question_text"] = df["question_text"].apply(lambda x: x.lower())
    df["question_text"] = df["question_text"].apply(clean_text)
    df["question_text"] = df["question_text"].apply(clean_numbers)
    df["question_text"] = df["question_text"].apply(replace_typical_misspell)

x_train = train_df["question_text"].values
x_test = test_df["question_text"].values

# filters='' keeps the punctuation tokens produced by clean_text; the vocabulary
# is fitted on the training questions only.
tokenizer = Tokenizer(num_words=max_features, filters='')
tokenizer.fit_on_texts(list(x_train))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

print ('Pre Padding...')
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

y_train = train_df['target'].values

Pre Padding...
CPU times: user 1min 48s, sys: 624 ms, total: 1min 48s
Wall time: 1min 48s


In [7]:
%%time
# Variant "a": lowercase -> expand misspellings -> pad punctuation -> mask numbers.
# Unlike variant 1, the misspelling table is applied while apostrophes are still
# attached to their words.
for df in [train_df_a, test_df_a]:
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the df[...] selection, which stops updating the frame under pandas
    # Copy-on-Write.
    df["question_text"] = df["question_text"].fillna("_##_")
    df["question_text"] = df["question_text"].apply(lambda x: x.lower())
    df["question_text"] = df["question_text"].apply(replace_typical_misspell)
    df["question_text"] = df["question_text"].apply(clean_text)
    df["question_text"] = df["question_text"].apply(clean_numbers)

x_train_a = train_df_a["question_text"].values
x_test_a = test_df_a["question_text"].values

# Separate tokenizer: this variant yields a different token stream, so it gets
# its own vocabulary, fitted on the training questions only.
tokenizer_a = Tokenizer(num_words=max_features, filters='')
tokenizer_a.fit_on_texts(list(x_train_a))
x_train_a = tokenizer_a.texts_to_sequences(x_train_a)
x_test_a = tokenizer_a.texts_to_sequences(x_test_a)

print ('Pre Padding...')
x_train_a = pad_sequences(x_train_a, maxlen=maxlen)
x_test_a = pad_sequences(x_test_a, maxlen=maxlen)

Pre Padding...
CPU times: user 1min 49s, sys: 464 ms, total: 1min 49s
Wall time: 1min 49s


In [8]:
def load_glove(word_index, max_words=max_features, embed_size=embed_size):
    """Build an embedding matrix for ``word_index`` from the GloVe 840B text file.

    Every row starts as a draw from N(emb_mean, emb_std) using the hard-coded
    statistics below; rows whose word appears in the file are overwritten with
    the file's vector.

    Args:
        word_index: Mapping word -> integer row index.
        max_words: Number of rows; words with index >= max_words are skipped.
        embed_size: Number of columns. Only the first ``embed_size`` values of
            each file vector are used, and shorter vectors are ignored.

    Returns:
        np.ndarray of shape (max_words, embed_size).
    """
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    emb_mean, emb_std = -0.005838498938828707, 0.4878219664096832

    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embed_size))
    with open(EMBEDDING_FILE, 'r', encoding="utf8") as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                continue
            i = word_index[word]
            if i >= max_words:
                continue
            # Use the embed_size parameter instead of a literal 300 so the vector
            # width always agrees with the matrix allocated above.
            embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:embed_size]
            if len(embedding_vector) == embed_size:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

def load_para(word_index, max_words=max_features, embed_size=embed_size):
    """Build an embedding matrix for ``word_index`` from the Paragram SL999 text file.

    Every row starts as a draw from N(emb_mean, emb_std) using the hard-coded
    statistics below; rows whose word appears in the file are overwritten with
    the file's vector. Undecodable bytes in the file are ignored.

    Args:
        word_index: Mapping word -> integer row index.
        max_words: Number of rows; words with index >= max_words are skipped.
        embed_size: Number of columns. Only the first ``embed_size`` values of
            each file vector are used, and shorter vectors are ignored.

    Returns:
        np.ndarray of shape (max_words, embed_size).
    """
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    emb_mean, emb_std = -0.005324783269315958, 0.4934646189212799

    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embed_size))
    with open(EMBEDDING_FILE, 'r', encoding="utf8", errors='ignore') as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                continue
            i = word_index[word]
            if i >= max_words:
                continue
            # Use the embed_size parameter instead of a literal 300 so the vector
            # width always agrees with the matrix allocated above.
            embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:embed_size]
            if len(embedding_vector) == embed_size:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

def load_fasttext_fast(word_index, max_words=max_features, embed_size=embed_size):
    """Build an embedding matrix for ``word_index`` by streaming the fastText .vec file.

    Every row starts as a draw from N(emb_mean, emb_std) using the hard-coded
    statistics below; rows whose word appears in the file are overwritten with
    the file's vector. Lines with fewer than ``embed_size`` values are skipped
    by the length check. Undecodable bytes in the file are ignored.

    Args:
        word_index: Mapping word -> integer row index.
        max_words: Number of rows; words with index >= max_words are skipped.
        embed_size: Number of columns. Only the first ``embed_size`` values of
            each file vector are used.

    Returns:
        np.ndarray of shape (max_words, embed_size).
    """
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    emb_mean, emb_std = -0.0033469985, 0.109855495

    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embed_size))
    with open(EMBEDDING_FILE, 'r', encoding="utf8", errors='ignore') as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                continue
            i = word_index[word]
            if i >= max_words:
                continue
            # Use the embed_size parameter instead of a literal 300 so the vector
            # width always agrees with the matrix allocated above.
            embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:embed_size]
            if len(embedding_vector) == embed_size:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix
  
def load_fasttext_slow(word_index):
    """Build an embedding matrix for ``word_index`` from the fastText .vec file.

    Unlike ``load_fasttext_fast`` this reads the whole file into a dict first
    and derives the random-initialisation mean/std from the loaded vectors.

    Args:
        word_index: Mapping word -> integer row index (row 0 is never assigned
            by a 1-based index, so it keeps its random initialisation).

    Returns:
        np.ndarray of shape (nb_words, dim) where
        ``nb_words = min(max_features, len(word_index) + 1)`` and ``dim`` is the
        vector width found in the file.
    """
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the handle is closed; encoding/errors mirror
    # load_fasttext_fast. Lines of 100 characters or fewer are dropped.
    with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as f:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in f if len(o) > 100)

    # np.stack needs a real sequence; a dict view is rejected by recent NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # +1 because the highest index equals len(word_index) when indices are
    # 1-based; without it a vocabulary smaller than max_features could index
    # one row past the end of the matrix.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [9]:
# Re-seed so the random initialisation of rows without a pretrained vector is reproducible.
seed_everything()
# Built from the "_a" tokenizer's vocabulary, so it pairs with the x_*_a sequences.
fasttext_embeddings = load_fasttext_slow(tokenizer_a.word_index)

# Display the matrix shape as the cell output.
np.shape(fasttext_embeddings)



(95000, 300)

In [10]:
from torch.autograd import Variable

class GaussianNoise(nn.Module):
    """Add element-wise Gaussian noise to the input while training.

    In eval mode the input is returned unchanged.

    Args:
        std: Standard deviation of the noise.
        mean: Mean of the noise.
    """
    ## Made by yours truly: the Legend Soham
    def __init__(self, std=0.1, mean=0):
        super().__init__()
        self.std = std
        self.mean = mean

    def forward(self, x):
        if not self.training:
            return x
        # Sample on x's own device/dtype instead of forcing .cuda(), so the layer
        # also works on CPU; empty_like tensors are outside the autograd graph,
        # which makes the deprecated Variable wrapper unnecessary.
        noise = torch.empty_like(x).normal_(self.mean, self.std)
        return x + noise
        
class Attention(nn.Module):
    """Attention pooling over the step axis of a fixed-length sequence.

    Each step is scored with one learned vector (plus an optional per-step
    bias), the scores pass through tanh and exp, are normalised across steps,
    and the input is summed over steps using those weights.

    Args:
        feature_dim: Size of the last axis of the input.
        step_dim: Number of steps (size of axis 1 of the input).
        bias: Whether to learn a per-step bias of length ``step_dim``.
    """
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Pool ``x`` of shape (batch, step_dim, feature_dim) to (batch, feature_dim).

        Args:
            x: Input tensor.
            mask: Optional tensor multiplied into the unnormalised weights;
                it must broadcast against (batch, step_dim).
        """
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        # Score every step: flatten to (batch * step_dim, feature_dim), project
        # to a scalar, then restore (batch, step_dim).
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim),
            self.weight
        ).view(-1, step_dim)

        if self.bias:
            eij = eij + self.b

        eij = torch.tanh(eij)
        a = torch.exp(eij)

        if mask is not None:
            a = a * mask

        # The epsilon belongs in the denominator: a fully masked row then yields
        # zero weights instead of 0/0 = NaN.
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [11]:
import torch as t
import torch.nn.functional as F

embedding_dim = embed_size  # alias of the embedding width
hidden_size = 60  # module-level RNN width; read by NeuralNet2, which defines no local hidden_size
gru_len = hidden_size  # Capsule's default input_dim_capsule is gru_len * 2

Routings = 4  # default number of routing iterations in Capsule
Num_capsule = 5  # default number of output capsules
Dim_capsule = 5  # default size of each output capsule
# NOTE(review): dropout_p, rate_drop_dense, LR and num_classes are not referenced
# in the visible part of the notebook — confirm they are still needed.
dropout_p = 0.25
rate_drop_dense = 0.28
LR = 0.001
T_epsilon = 1e-7  # added under the square root in Capsule.squash to avoid dividing by zero
num_classes = 30

class Capsule(nn.Module):
    """Capsule layer with iterative routing over the input steps.

    Args:
        input_dim_capsule: Size of the last axis of the input.
        num_capsule: Number of output capsules.
        dim_capsule: Size of each output capsule.
        routings: Number of routing iterations.
        kernel_size: Stored on the instance; not used by ``forward``.
        share_weights: Use one projection matrix for every input step. Only
            this mode is implemented in ``forward``.
        activation: ``'default'`` selects ``squash``; anything else selects an
            in-place ReLU.
    """
    def __init__(self, input_dim_capsule=gru_len * 2, num_capsule=Num_capsule, dim_capsule=Dim_capsule,
                 routings=Routings, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)

        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = self.squash
        else:
            self.activation = nn.ReLU(inplace=True)

        if self.share_weights:
            self.W = nn.Parameter(
                nn.init.xavier_normal_(t.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))
        else:
            # NOTE(review): BATCH_SIZE is not defined in the visible part of the
            # notebook (only lowercase batch_size is) — confirm before using
            # share_weights=False.
            self.W = nn.Parameter(
                t.randn(BATCH_SIZE, input_dim_capsule, self.num_capsule * self.dim_capsule))

    def forward(self, x):
        """Map (batch, steps, input_dim_capsule) to (batch, num_capsule, dim_capsule).

        Raises:
            NotImplementedError: If the layer was built with share_weights=False.
        """
        if not self.share_weights:
            # Previously this printed 'add later' and then failed with a NameError
            # on the undefined u_hat_vecs; fail explicitly instead.
            raise NotImplementedError('Capsule.forward only supports share_weights=True')

        u_hat_vecs = t.matmul(x, self.W)

        batch_size = x.size(0)
        input_num_capsule = x.size(1)
        u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                      self.num_capsule, self.dim_capsule))
        u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 3)  # (batch_size,num_capsule,input_num_capsule,dim_capsule)
        b = t.zeros_like(u_hat_vecs[:, :, :, 0])  # (batch_size,num_capsule,input_num_capsule)

        for i in range(self.routings):
            # Softmax across output capsules (dim 1); equivalent to the earlier
            # permute -> softmax(dim=2) -> permute sequence.
            c = F.softmax(b, dim=1)
            outputs = self.activation(t.einsum('bij,bijk->bik', (c, u_hat_vecs)))  # batch matrix multiplication
            # outputs shape (batch_size, num_capsule, dim_capsule)
            if i < self.routings - 1:
                b = t.einsum('bik,bijk->bij', (outputs, u_hat_vecs))  # batch matrix multiplication
        return outputs  # (batch_size, num_capsule, dim_capsule)

    def squash(self, x, axis=-1):
        """Scale ``x`` to (almost) unit L2 norm along ``axis``."""
        s_squared_norm = (x ** 2).sum(axis, keepdim=True)
        scale = t.sqrt(s_squared_norm + T_epsilon)
        return x / scale

In [12]:
class CyclicLR(object):
    """Cyclical learning-rate scheduler, stepped once per batch.

    The rate moves between ``base_lr`` and ``max_lr`` in a triangular wave
    whose half-period is ``step_size`` batches. ``step(loss)`` additionally
    shrinks both bounds by ``factor`` (not below ``min_lr``) whenever the
    reported loss is higher than the previously reported one.

    Args:
        optimizer: A ``torch.optim.Optimizer``.
        base_lr: Lower bound; a scalar or one value per param group.
        max_lr: Upper bound; a scalar or one value per param group.
        step_size: Number of batches in half a cycle.
        factor: Multiplier applied to both bounds by ``step`` on a worse loss.
        min_lr: Floor for the bounds when they are shrunk.
        mode: ``'triangular'``, ``'triangular2'`` or ``'exp_range'``; ignored
            when ``scale_fn`` is given.
        gamma: Base of the exponential decay used by ``'exp_range'``.
        scale_fn: Optional custom amplitude scaling function.
        scale_mode: ``'cycle'`` or ``'iterations'``; what ``scale_fn`` receives.
        last_batch_iteration: Index of the last processed batch (-1 = fresh).

    Raises:
        TypeError: If ``optimizer`` is not a ``torch.optim.Optimizer``.
        ValueError: On a bound list of the wrong length, or an invalid ``mode``
            without a ``scale_fn``.
    """
    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, factor=0.6, min_lr=1e-4, mode='triangular2', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        if not isinstance(optimizer, torch.optim.Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(base_lr, (list, tuple)):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} base_lr, got {}".format(
                    len(optimizer.param_groups), len(base_lr)))
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, (list, tuple)):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        # Write the initial rate into the optimizer, then rewind the counter so
        # the first batch_step() call lands on that same iteration.
        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

        self.last_loss = np.inf
        self.min_lr = min_lr
        self.factor = factor

    def batch_step(self, batch_iteration=None):
        """Advance to ``batch_iteration`` (default: next one) and update the optimizer's rates."""
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def step(self, loss):
        """Shrink both bounds when ``loss`` is worse than the last reported loss."""
        if loss > self.last_loss:
            self.base_lrs = [max(lr * self.factor, self.min_lr) for lr in self.base_lrs]
            self.max_lrs = [max(lr * self.factor, self.min_lr) for lr in self.max_lrs]
        # Remember this loss; previously last_loss stayed at +inf forever, so the
        # comparison above could never be true.
        self.last_loss = loss

    def _triangular_scale_fn(self, x):
        return 1.

    def _triangular2_scale_fn(self, x):
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma**(x)

    def get_lr(self):
        """Return the learning rate of every param group at the current iteration."""
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        # x runs 1 -> 0 -> 1 within a cycle; (1 - x) is the triangular wave height.
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        for base_lr, max_lr in zip(self.base_lrs, self.max_lrs):
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs

In [13]:
def train_model_mean(model, x_train, y_train, validate=True):
    """Train ``model`` with mean-reduced BCE loss, then predict on ``test_loader``.

    Reads the module-level ``batch_size``, ``n_epochs`` and ``test_loader``.

    Args:
        model: Network called as ``model(x_batch)`` and returning logits.
        x_train: Input tensor for training.
        y_train: Target tensor for training.
        validate: Unused; kept for interface compatibility.

    Returns:
        1-D np.ndarray of sigmoid probabilities, one per ``test_loader`` row.
    """
    optimizer = torch.optim.Adam(model.parameters())

    step_size = 300
    scheduler = CyclicLR(optimizer, base_lr=0.001, max_lr=0.003,
                         step_size=step_size, mode='exp_range', gamma=0.99994)

    train = torch.utils.data.TensorDataset(x_train, y_train)
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)

    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean').cuda()

    for epoch in range(n_epochs):
        model.train()
        for x_batch, y_batch in train_loader:
            y_pred = model(x_batch)
            # The scheduler sets the learning rate used by this batch's update.
            scheduler.batch_step()

            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Switch to eval mode unconditionally (also when n_epochs == 0) and skip
    # autograd bookkeeping while predicting.
    model.eval()
    test_preds = np.zeros((len(test_loader.dataset)))
    with torch.no_grad():
        for i, (x_batch,) in enumerate(test_loader):
            y_pred = model(x_batch)
            test_preds[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    return test_preds


def train_model_sum(model, x_train, y_train, validate=True):
    """Train ``model`` with sum-reduced BCE loss, then predict on ``test_loader``.

    Differs from ``train_model_mean`` in three ways: the loss is summed over
    the batch, only parameters with ``requires_grad`` go to Adam (started at
    ``max_lr``), and the model is called with the batch wrapped in a list.
    Reads the module-level ``batch_size``, ``n_epochs`` and ``test_loader``.

    Args:
        model: Network called as ``model([x_batch])`` and returning logits.
        x_train: Input tensor for training.
        y_train: Target tensor for training.
        validate: Unused; kept for interface compatibility.

    Returns:
        1-D np.ndarray of sigmoid probabilities, one per ``test_loader`` row.
    """
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum').cuda()

    step_size = 300
    base_lr, max_lr = 0.001, 0.003
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                 lr=max_lr)

    scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,
                         step_size=step_size, mode='exp_range', gamma=0.99994)

    train = torch.utils.data.TensorDataset(x_train, y_train)
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)

    for epoch in range(n_epochs):
        model.train()
        for x_batch, y_batch in train_loader:
            y_pred = model([x_batch])
            # The scheduler sets the learning rate used by this batch's update.
            scheduler.batch_step()

            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Switch to eval mode unconditionally (also when n_epochs == 0) and skip
    # autograd bookkeeping while predicting.
    model.eval()
    test_preds = np.zeros((len(test_loader.dataset)))
    with torch.no_grad():
        for i, (x_batch,) in enumerate(test_loader):
            y_pred = model([x_batch])
            test_preds[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    return test_preds


def train_model_mean_a(model, x_train_a, y_train, validate=True):
    """Train ``model`` with mean-reduced BCE loss, then predict on ``test_loader_a``.

    Same procedure as ``train_model_mean`` but for the "_a" preprocessing
    variant. Reads the module-level ``batch_size``, ``n_epochs`` and
    ``test_loader_a``.

    Args:
        model: Network called as ``model(x_batch)`` and returning logits.
        x_train_a: Input tensor for training ("_a" variant).
        y_train: Target tensor for training.
        validate: Unused; kept for interface compatibility.

    Returns:
        1-D np.ndarray of sigmoid probabilities, one per ``test_loader_a`` row.
    """
    optimizer = torch.optim.Adam(model.parameters())

    step_size = 300
    scheduler = CyclicLR(optimizer, base_lr=0.001, max_lr=0.003,
                         step_size=step_size, mode='exp_range', gamma=0.99994)

    train = torch.utils.data.TensorDataset(x_train_a, y_train)
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)

    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean').cuda()

    for epoch in range(n_epochs):
        model.train()
        for x_batch, y_batch in train_loader:
            y_pred = model(x_batch)
            # The scheduler sets the learning rate used by this batch's update.
            scheduler.batch_step()

            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Switch to eval mode unconditionally (also when n_epochs == 0) and skip
    # autograd bookkeeping while predicting.
    model.eval()
    test_preds = np.zeros((len(test_loader_a.dataset)))
    with torch.no_grad():
        for i, (x_batch,) in enumerate(test_loader_a):
            y_pred = model(x_batch)
            test_preds[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    return test_preds

In [14]:
class NeuralNet1(nn.Module):
    """BiLSTM -> BiGRU classifier with attention and pooling heads.

    The frozen embedding is initialised from the module-level
    ``embedding_matrix``. Attention over the LSTM output, attention over the
    GRU output, and mean/max pooling of the GRU output are concatenated and fed
    through a 16-unit dense layer to a single logit.
    """
    def __init__(self):
        super(NeuralNet1, self).__init__()

        hidden_size = 128

        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False

        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size*2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm_attention = Attention(hidden_size*2, maxlen)
        self.gru_attention = Attention(hidden_size*2, maxlen)

        # Four concatenated features, each 2 * hidden_size wide (= 1024 here).
        self.linear = nn.Linear(hidden_size * 8, 16)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(16, 1)

    def forward(self, x):
        """Return one logit per row of the token-id batch ``x``."""
        h_embedding = self.embedding(x)
        # squeeze(0) removes only the axis added by unsqueeze; a bare squeeze()
        # would also drop the batch axis when the batch holds a single row.
        # NOTE(review): with the extra leading axis Dropout2d sees the batch
        # axis as its channel axis — confirm this is the intended dropout.
        h_embedding = self.embedding_dropout(torch.unsqueeze(h_embedding, 0)).squeeze(0)

        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)

        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)

        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)

        conc = torch.cat((h_lstm_atten, h_gru_atten, avg_pool, max_pool), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)

        return out
    
class NeuralNet2(nn.Module):
    """BiLSTM -> BiGRU classifier with attention, pooling and a capsule head.

    The frozen embedding is initialised from the module-level
    ``embedding_matrix``; the RNN width comes from the module-level
    ``hidden_size``. ``forward`` expects its input wrapped in a sequence and
    reads ``x[0]``.
    """
    def __init__(self):
        super(NeuralNet2, self).__init__()

        fc_layer = 16
        fc_layer1 = 16

        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.GaussianNoise = GaussianNoise(std=0.1, mean=0)

        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm_attention = Attention(hidden_size * 2, maxlen)
        self.gru_attention = Attention(hidden_size * 2, maxlen)
        self.caps_layer = Capsule()
        self.lincaps = nn.Linear(Num_capsule * Dim_capsule, 1)

        # Four features of width 2 * hidden_size plus the 1-unit capsule summary.
        self.linear = nn.Linear(hidden_size*8+1, fc_layer1)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(16, momentum=0.5)
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(fc_layer, 1)

    def forward(self, x):
        """Return one logit per row of the token-id batch ``x[0]``."""
        h_embedding = self.embedding(x[0])
        # squeeze(0) removes only the axis added by unsqueeze; a bare squeeze()
        # would also drop the batch axis when the batch holds a single row.
        # NOTE(review): with the extra leading axis Dropout2d sees the batch
        # axis as its channel axis — confirm this is the intended dropout.
        h_embedding = self.embedding_dropout(torch.unsqueeze(h_embedding, 0)).squeeze(0)
        h_embedding = self.GaussianNoise(h_embedding)

        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)

        # Capsule head: flatten the capsules and reduce them to a single unit.
        content3 = self.caps_layer(h_gru)
        content3 = self.dropout(content3)
        batch_size = content3.size(0)
        content3 = content3.view(batch_size, -1)
        content3 = self.relu(self.lincaps(content3))

        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)

        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)

        conc = torch.cat((h_lstm_atten, h_gru_atten, content3, avg_pool, max_pool), 1)
        conc = self.relu(self.linear(conc))
        conc = self.bn(conc)
        conc = self.dropout(conc)

        out = self.out(conc)

        return out

class NeuralNet3(nn.Module):
    """BiLSTM -> BiGRU classifier on frozen fastText embeddings.

    Mean- and max-pooled GRU outputs are concatenated and passed through a
    16-unit dense layer to a single logit. ``dropout3`` and
    ``embedding_dropout3`` are created but not applied in ``forward``.
    """
    def __init__(self):
        super().__init__()

        hidden_size = 60
        fc_layer1 = 16
        fc_layer = 16

        # Layers are created in the same order as before so that seeded weight
        # initialisation is unchanged.
        self.dropout3 = nn.Dropout(0.1)
        self.embedding_dropout3 = nn.Dropout2d(0.1)
        self.embedding3 = nn.Embedding(max_features, embed_size)
        pretrained = torch.tensor(fasttext_embeddings, dtype=torch.float32)
        self.embedding3.weight = nn.Parameter(pretrained)
        self.embedding3.weight.requires_grad = False
        self.lstm3 = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru3 = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)
        self.relu3 = nn.ReLU()
        self.linear3 = nn.Linear(240, fc_layer1)
        self.out3 = nn.Linear(fc_layer, 1)

    def forward(self, x):
        """Return one logit per row of the token-id batch ``x``."""
        embedded = self.embedding3(x)
        lstm_out, _ = self.lstm3(embedded)
        gru_out, _ = self.gru3(lstm_out)
        mean_pooled = gru_out.mean(1)
        max_pooled = gru_out.max(1)[0]
        pooled = torch.cat((mean_pooled, max_pooled), 1)
        hidden = self.relu3(self.linear3(pooled))
        return self.out3(hidden)

In [15]:
class NeuralNet4(nn.Module):
    """BiLSTM -> BiGRU classifier on frozen fastText embeddings, with dense dropout.

    Mean- and max-pooled GRU outputs are concatenated, passed through a 16-unit
    dense layer with dropout, and mapped to a single logit. ``GaussianNoise``,
    ``embedding_dropout3`` and ``lstm_attention3`` are created but not applied
    in ``forward``.
    """
    def __init__(self):
        super().__init__()

        hidden_size = 60
        fc_layer1 = 16
        fc_layer = 16

        # Layers are created in the same order as before so that seeded weight
        # initialisation is unchanged.
        self.GaussianNoise = GaussianNoise(std=0.1, mean=0)

        self.dropout3 = nn.Dropout(0.1)
        self.embedding_dropout3 = nn.Dropout2d(0.1)
        self.embedding3 = nn.Embedding(max_features, embed_size)
        pretrained = torch.tensor(fasttext_embeddings, dtype=torch.float32)
        self.embedding3.weight = nn.Parameter(pretrained)
        self.embedding3.weight.requires_grad = False
        self.lstm3 = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru3 = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)
        self.lstm_attention3 = Attention(hidden_size * 2, maxlen)
        self.relu3 = nn.ReLU()
        self.linear3 = nn.Linear(240, fc_layer1)  # 481-60, 801-100
        self.out3 = nn.Linear(fc_layer, 1)

    def forward(self, x):
        """Return one logit per row of the token-id batch ``x``."""
        embedded = self.embedding3(x)
        lstm_out, _ = self.lstm3(embedded)
        gru_out, _ = self.gru3(lstm_out)
        mean_pooled = gru_out.mean(1)
        max_pooled = gru_out.max(1)[0]
        pooled = torch.cat((mean_pooled, max_pooled), 1)
        hidden = self.relu3(self.linear3(pooled))
        hidden = self.dropout3(hidden)
        return self.out3(hidden)
    
class NeuralNet5(nn.Module):
    """BiLSTM -> BiGRU classifier over frozen ``embedding_matrix`` with noise layers.

    ``forward`` applies dropout and ``GaussianNoise`` to the embeddings, runs
    the LSTM and then the GRU, mean- and max-pools BOTH the LSTM and the GRU
    outputs over dim 1, concatenates the four pooled vectors, and feeds them
    through a ReLU layer, dropout, ``GaussianNoise`` again and a final linear
    layer. The raw output is returned (no sigmoid is applied inside the model).
    """

    def __init__(self):
        super(NeuralNet5, self).__init__()

        hidden_size = 80
        # Single width for the hidden FC layer: linear2's output size, out2's
        # input size and bn2's feature count.
        fc_size = 16
        # forward() concatenates four poolings (mean/max of the LSTM output and
        # mean/max of the GRU output), each 2 * hidden_size wide -> 640 for
        # hidden_size = 80.
        pooled_size = 4 * (2 * hidden_size)

        self.embedding2 = nn.Embedding(max_features, embed_size)
        # Replace the freshly initialised weights with the pretrained matrix
        # and exclude them from gradient updates.
        self.embedding2.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding2.weight.requires_grad = False
        self.GaussianNoise2 = GaussianNoise(std=0.1,mean=0)

        self.embedding_dropout2 = nn.Dropout2d(0.1)
        self.lstm2 = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru2 = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        self.relu2 = nn.ReLU()
        # NOTE(review): bn2 is never called in forward(); kept so the set of
        # registered submodules is unchanged.
        self.bn2 = nn.BatchNorm1d(fc_size, momentum=0.5)
        self.dropout2 = nn.Dropout(0.1)
        self.linear2 = nn.Linear(pooled_size, fc_size)
        self.out2 = nn.Linear(fc_size, 1)

    def forward(self, x):
        """Return one raw output per input row for the token-id tensor ``x``."""
        h_embedding2 = self.embedding2(x)
        # Add a leading axis for Dropout2d and remove exactly that axis again.
        # A bare squeeze() would also drop the batch axis whenever a batch
        # holds a single row, which breaks the batch_first RNNs below.
        # NOTE(review): assuming x is (batch, seq), the unsqueezed input is
        # (1, batch, seq, embed), so Dropout2d's channel axis is the batch axis
        # and whole samples are zeroed, not embedding channels. Behaviour is
        # left as is; confirm whether that was intended.
        h_embedding2 = torch.squeeze(self.embedding_dropout2(torch.unsqueeze(h_embedding2, 0)), 0)
        h_embedding2 = self.GaussianNoise2(h_embedding2)

        # Stacked recurrent layers: the GRU consumes the LSTM outputs.
        h_lstm2, _ = self.lstm2(h_embedding2)
        h_gru2, _ = self.gru2(h_lstm2)

        # Pool both recurrent outputs over dim 1 (sequence axis).
        avg_pool12 = torch.mean(h_lstm2, 1)
        max_pool12, _ = torch.max(h_lstm2, 1)
        avg_pool2 = torch.mean(h_gru2, 1)
        max_pool2, _ = torch.max(h_gru2, 1)

        conc2 = torch.cat((avg_pool12, max_pool12, avg_pool2, max_pool2), 1)
        conc2 = self.relu2(self.linear2(conc2))
        conc2 = self.dropout2(conc2)
        conc2 = self.GaussianNoise2(conc2)
        out = self.out2(conc2)

        return out

In [16]:
batch_size = 512

# Hold out a stratified 10% of the training rows. x_train and x_train_a are
# split in the same call so their rows stay aligned with each other and with
# the targets.
y_train = train_df['target'].values
X_train, X_test, X_train_a, X_test_a, y_train, y_test = train_test_split(
    x_train, x_train_a, y_train,
    test_size=0.1, stratify=y_train, random_state=10
)

print(X_train.shape, X_train_a.shape, X_test.shape, sep='\n')

# Targets become column vectors of shape (n, 1).
y_train, y_test = y_train.reshape(-1, 1), y_test.reshape(-1, 1)


def _to_cuda(array, dtype):
    """Copy ``array`` into a tensor of ``dtype`` and move it to the GPU."""
    return torch.tensor(array, dtype=dtype).cuda()


def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader using the notebook-wide ``batch_size``."""
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# NOTE: X_test / y_test are the held-out validation split; the lower-case
# x_test / x_test_a further down are the competition test inputs.
x_train_tensor = _to_cuda(X_train, torch.long)
y_train_tensor = _to_cuda(y_train, torch.float32)
x_val_tensor = _to_cuda(X_test, torch.long)
y_val_tensor = _to_cuda(y_test, torch.float32)

x_train_a_tensor = _to_cuda(X_train_a, torch.long)
x_val_a_tensor = _to_cuda(X_test_a, torch.long)

train = torch.utils.data.TensorDataset(x_train_tensor, y_train_tensor)
valid = torch.utils.data.TensorDataset(x_val_tensor, y_val_tensor)

# Only the training loader shuffles; validation and test keep row order.
train_loader = _make_loader(train, True)
valid_loader = _make_loader(valid, False)

x_test_cuda = _to_cuda(x_test, torch.long)
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = _make_loader(test, False)

x_test_a_cuda = _to_cuda(x_test_a, torch.long)
test_a = torch.utils.data.TensorDataset(x_test_a_cuda)
test_loader_a = _make_loader(test_a, False)

(1175509, 70)
(1175509, 70)
(130613, 70)


In [17]:
# Collects one test-prediction array per trained model, in training order.
test_preds_all = list()

In [18]:
## ashish_ndo

# n_epochs and seed are notebook-level globals; they are not passed to the
# training helper explicitly (presumably it reads them - verify).
n_epochs, seed = 5, 6017
seed_everything(seed)
model = NeuralNet3().cuda()

# This model trains on the "_a" variant of the training inputs.
test_preds_fold = train_model_mean_a(model, x_train_a_tensor, y_train_tensor, validate=True)
test_preds_all.append(test_preds_fold)

# Drop the names this cell was the last visible user of.
del x_train_a_tensor
del x_val_a_tensor
del fasttext_embeddings

In [19]:
# Reset every RNG to seed_everything's default seed before rebuilding the matrix.
seed_everything()
# The previous cell deleted fasttext_embeddings; this recreates the name that
# NeuralNet4 reads when it is constructed.
fasttext_embeddings = load_fasttext_fast(tokenizer.word_index, max_features)

# Displayed as the cell output: the shape of the rebuilt matrix.
np.shape(fasttext_embeddings)

(95000, 300)

In [20]:
# fasttext

# n_epochs and seed are notebook-level globals; they are not passed to the
# training helper explicitly (presumably it reads them - verify).
n_epochs, seed = 5, 6017
seed_everything(seed)
model = NeuralNet4().cuda()

test_preds_fold = train_model_mean(model, x_train_tensor, y_train_tensor, validate=True)
test_preds_all.append(test_preds_fold)

# NeuralNet4 copied the matrix into its own tensor at construction, so the
# array itself is no longer needed.
del fasttext_embeddings

In [21]:
# Reset every RNG to seed_everything's default seed before loading.
# NOTE(review): the loaders are defined outside this view; if they draw random
# values, the order of the two calls below affects the result - verify.
seed_everything()
glove_embeddings = load_glove(tokenizer.word_index, max_features)
paragram_embeddings = load_para(tokenizer.word_index, max_features)

# Element-wise average of the two matrices; NeuralNet5 reads embedding_matrix
# when it is constructed.
embedding_matrix = np.mean([glove_embeddings, paragram_embeddings], axis=0)
# Displayed as the cell output: the shape of the averaged matrix.
np.shape(embedding_matrix)

(95000, 300)

In [22]:
## 128

# n_epochs and seed are notebook-level globals; they are not passed to the
# training helper explicitly (presumably it reads them - verify).
n_epochs, seed = 5, 1029
seed_everything(seed)
model = NeuralNet1().cuda()

test_preds_fold = train_model_mean(model, x_train_tensor, y_train_tensor, validate=True)
test_preds_all.append(test_preds_fold)

In [23]:
## me0.7

# n_epochs and seed are notebook-level globals; they are not passed to the
# training helper explicitly (presumably it reads them - verify).
n_epochs, seed = 6, 1029
seed_everything(seed)
model = NeuralNet2().cuda()

# Unlike the other runs in this section, this one uses train_model_sum.
test_preds_fold = train_model_sum(model, x_train_tensor, y_train_tensor, validate=True)
test_preds_all.append(test_preds_fold)

In [24]:
# noattn

# batch_size, n_epochs and seed are notebook-level globals; none of them is
# passed to the training helper explicitly (presumably it reads them - verify).
batch_size = 512
n_epochs, seed = 6, 6017
seed_everything(seed)
model = NeuralNet5().cuda()

test_preds_fold = train_model_mean(model, x_train_tensor, y_train_tensor, validate=True)
test_preds_all.append(test_preds_fold)

In [25]:
from IPython.display import FileLink

# NOTE(review): start is not read by any of the cells visible after this one.
start = time.time()

# Combine the per-model prediction arrays into one array and write it to disk
# so the raw predictions can be downloaded.
saved_path = 'test_predicts.npy'
tests_preds = np.asarray(test_preds_all)
np.save(saved_path, tests_preds)

# Displayed as the cell output: a link to the saved file.
FileLink(saved_path)

In [26]:
# Blend weights are positional: they follow the order in which the models were
# appended to test_preds_all.
# NN3 - ashish_ndo
# NN4 - fasttext
# NN1 - 128
# NN2 - me0.7
# NN5 - noattn

# NOTE(review): these weights sum to about 1.024, not 1, so the blended score
# is slightly scaled up before it is thresholded.
coeff_arr_old = [0.19944742, 0.11513072, 0.21428879, 0.29930394, 0.19598909]
# coeff_arr_new = [0.19598909, 0.29930394, 0.11513072, 0.21428879, 0.19944742]

# Fail loudly if a training cell was skipped or run twice: with a mismatched
# count the weights would otherwise be paired with the wrong models.
assert len(test_preds_all) == len(coeff_arr_old), (
    "expected %d prediction arrays, got %d" % (len(coeff_arr_old), len(test_preds_all))
)

# Weighted sum of the per-model test predictions, one score per test row.
test_preds = np.zeros((len(test_loader.dataset)))
for t, coeff in zip(test_preds_all, coeff_arr_old):
    test_preds += t * coeff

In [27]:
# Cut-off applied to the blended scores; the literal is kept at full float
# precision so the comparison is unchanged.
threshold = 0.35000000000000003

# One row per test qid with a 0/1 prediction column.
submission = test_df[['qid']].assign(
    prediction=(test_preds > threshold).astype('int')
)
submission.to_csv('submission.csv', index=False)

# Displayed as the cell output: the fraction of test rows predicted positive.
submission['prediction'].mean()

0.06797942167819762