### Load Required Libraries 

In [1]:
import time
import random
import pandas as pd
import numpy as np
import gc
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm_notebook, tnrange
from tqdm import tqdm

tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score
import torchtext
import os 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from torch.optim.optimizer import Optimizer
from unidecode import unidecode

Using TensorFlow backend.


In [2]:
def seed_everything(seed=1024):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print("Train shape : ",train.shape)
print("Test shape : ",test.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [4]:
total = pd.concat([train, test])
total.shape

(1362492, 3)

In [5]:
def build_vocab(sentences, verbose =  True):
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [6]:
sentences = total["question_text"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

Progress: 100%|██████████| 1362492/1362492 [00:03<00:00, 352799.70it/s]
100%|██████████| 1362492/1362492 [00:03<00:00, 440624.48it/s]

{'How': 273144, 'did': 34918, 'Quebec': 102, 'nationalists': 97, 'see': 9397}





In [7]:
max_features = 120000
max_len = 72

### GOOGLE NEWS

In [9]:
from gensim.models import KeyedVectors

news_path = 'GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [8]:
import operator 

def check_coverage(vocab, embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:
            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [11]:
oov = check_coverage(vocab, embeddings_index)

100%|██████████| 522569/522569 [00:00<00:00, 547433.66it/s]

Found embeddings for 24.05% of vocab
Found embeddings for  78.75% of all text





In [12]:
oov[:10]

[('to', 420476),
 ('a', 419837),
 ('of', 345145),
 ('and', 262815),
 ('India?', 17082),
 ('it?', 13436),
 ('do?', 9112),
 ('life?', 8074),
 ('you?', 6553),
 ('me?', 6485)]

In [13]:
'?' in embeddings_index

False

In [14]:
'&' in embeddings_index

True

### GLOVE

In [9]:
new_path = 'glove.840B.300d.txt'
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')[:300]
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(new_path))

In [10]:
oov = check_coverage(vocab, embeddings_index)

100%|██████████| 522569/522569 [00:00<00:00, 942041.56it/s] 

Found embeddings for 32.91% of vocab
Found embeddings for  88.16% of all text





In [11]:
oov[:10]

[('India?', 17082),
 ('it?', 13436),
 ("What's", 12985),
 ('do?', 9112),
 ('life?', 8074),
 ('you?', 6553),
 ('me?', 6485),
 ('them?', 6421),
 ('time?', 5994),
 ('world?', 5632)]

In [12]:
'?' in embeddings_index

True

In [13]:
'&' in embeddings_index

True

### FAST TEXT

In [21]:
new_path = 'wiki-news-300d-1M.vec'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(new_path) if len(o)>100)

In [22]:
oov = check_coverage(vocab, embeddings_index)

100%|██████████| 522569/522569 [00:00<00:00, 975716.41it/s] 

Found embeddings for 29.77% of vocab
Found embeddings for  87.66% of all text





In [23]:
oov[:10]

[('India?', 17082),
 ("don't", 15642),
 ('it?', 13436),
 ("I'm", 13344),
 ("What's", 12985),
 ('do?', 9112),
 ('life?', 8074),
 ("can't", 7375),
 ('you?', 6553),
 ('me?', 6485)]

In [24]:
'?' in embeddings_index

True

In [25]:
'&' in embeddings_index

True

### PARA

In [26]:
new_path = 'paragram_300_sl999.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(new_path, encoding="utf8", errors='ignore') if len(o)>100)

In [27]:
oov = check_coverage(vocab, embeddings_index)

100%|██████████| 522569/522569 [00:00<00:00, 977781.35it/s] 


Found embeddings for 19.42% of vocab
Found embeddings for  72.21% of all text


In [28]:
oov[:10]

[('What', 436013),
 ('I', 319441),
 ('How', 273144),
 ('Why', 148582),
 ('Is', 113627),
 ('Can', 54992),
 ('Which', 49357),
 ('Do', 41756),
 ('If', 35896),
 ('Are', 30442)]

### PREPROCESS

In [7]:
def clean_text(x):
    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

In [8]:
total["question_text"] = total["question_text"].progress_apply(lambda x: clean_text(x))
sentences = total["question_text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

Progress: 100%|██████████| 1362492/1362492 [00:07<00:00, 176414.59it/s]
100%|██████████| 1362492/1362492 [00:02<00:00, 493915.93it/s]


In [9]:
new_path = 'glove.840B.300d.txt'
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')[:300]
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(new_path))

In [10]:
oov = check_coverage(vocab, embeddings_index)

100%|██████████| 259819/259819 [00:00<00:00, 916039.41it/s] 

Found embeddings for 70.96% of vocab
Found embeddings for  99.44% of all text





In [13]:
oov[:20]

[('Quorans', 885),
 ('Brexit', 510),
 ('cryptocurrencies', 506),
 ('Redmi', 394),
 ('OnePlus', 130),
 ('UCEED', 126),
 ('GDPR', 110),
 ('Blockchain', 110),
 ('demonetisation', 109),
 ('Coinbase', 105),
 ('BNBR', 104),
 ('Machedo', 103),
 ('Adityanath', 101),
 ('Boruto', 96),
 ('ethereum', 94),
 ('DCEU', 93),
 ('IIEST', 90),
 ('SJWs', 86),
 ('Qoura', 81),
 ('LNMIIT', 72)]

In [14]:
mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

In [15]:
total["question_text"] = total["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
sentences = total["question_text"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)

Progress: 100%|██████████| 1362492/1362492 [00:25<00:00, 53723.31it/s]
Progress: 100%|██████████| 1362492/1362492 [00:03<00:00, 423386.95it/s]
100%|██████████| 1362492/1362492 [00:02<00:00, 508714.25it/s]


In [16]:
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 259726/259726 [00:00<00:00, 974089.26it/s] 

Found embeddings for 70.96% of vocab
Found embeddings for  99.44% of all text





### PREPARING DATA

In [65]:
from sklearn.preprocessing import StandardScaler

def add_features(df):
    
    df['question_text'] = df['question_text'].progress_apply(lambda x:str(x))
    df['total_length'] = df['question_text'].progress_apply(len)
    df['capitals'] = df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.progress_apply(lambda row: float(row['capitals'])/(float(row['total_length']) + 0.0000001),
                                axis=1)
    df['num_words'] = df.question_text.str.count('\S+')
    df['num_unique_words'] = df['question_text'].progress_apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / (df['num_words'] + 0.0000001)
    return df

def load_and_prec():
    
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)
    
    # lower
    train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
    test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

    # Clean the text
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_text(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))
    
    # Clean speelings
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: replace_typical_misspell(x))
    
    ## fill up the missing values
    train_X = train_df["question_text"].fillna("_##_").values
    test_X = test_df["question_text"].fillna("_##_").values
    
    train = add_features(train_df)
    test = add_features(test_df)

    features = train[['caps_vs_length', 'words_vs_unique']].fillna(0)
    test_features = test[['caps_vs_length', 'words_vs_unique']].fillna(0)

    ss = StandardScaler()
    ss.fit(np.vstack((features, test_features)))
    features = ss.transform(features)
    test_features = ss.transform(test_features)

    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words = max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=max_len)
    test_X = pad_sequences(test_X, maxlen=max_len)

    ## Get the target values
    train_y = train_df['target'].values 
    
    #shuffling the data
    np.random.seed(1024)
    trn_idx = np.random.permutation(len(train_X))

    train_X = train_X[trn_idx]
    train_y = train_y[trn_idx]
    
    return train_X, test_X, train_y, features, test_features, tokenizer.word_index

In [66]:
x_train, x_test, y_train, features, test_features, word_index = load_and_prec()

Train shape :  (1306122, 3)
Test shape :  (56370, 2)



Progress:   0%|          | 0/1306122 [00:00<?, ?it/s][A
Progress:   1%|          | 14346/1306122 [00:00<00:09, 143457.06it/s][A
Progress:   2%|▏         | 30733/1306122 [00:00<00:08, 149025.68it/s][A
Progress:   4%|▎         | 47806/1306122 [00:00<00:08, 154933.00it/s][A
Progress:   5%|▍         | 65268/1306122 [00:00<00:07, 160355.67it/s][A
Progress:   6%|▋         | 82716/1306122 [00:00<00:07, 164346.37it/s][A
Progress:   8%|▊         | 100270/1306122 [00:00<00:07, 167551.23it/s][A
Progress:   9%|▉         | 117600/1306122 [00:00<00:07, 169235.22it/s][A
Progress:  10%|█         | 134921/1306122 [00:00<00:06, 170406.50it/s][A
Progress:  12%|█▏        | 152596/1306122 [00:00<00:06, 172258.14it/s][A
Progress:  13%|█▎        | 170269/1306122 [00:01<00:06, 173574.71it/s][A
Progress:  14%|█▍        | 187715/1306122 [00:01<00:06, 173839.03it/s][A
Progress:  16%|█▌        | 204979/1306122 [00:01<00:06, 173475.38it/s][A
Progress:  17%|█▋        | 222143/1306122 [00:01<00:06, 172

Progress:  56%|█████▌    | 732960/1306122 [00:14<00:11, 50526.00it/s][A
Progress:  57%|█████▋    | 738014/1306122 [00:14<00:11, 50384.75it/s][A
Progress:  57%|█████▋    | 743104/1306122 [00:14<00:11, 50537.02it/s][A
Progress:  57%|█████▋    | 748252/1306122 [00:14<00:10, 50815.75it/s][A
Progress:  58%|█████▊    | 753335/1306122 [00:14<00:10, 50800.82it/s][A
Progress:  58%|█████▊    | 758428/1306122 [00:15<00:10, 50838.72it/s][A
Progress:  58%|█████▊    | 763513/1306122 [00:15<00:10, 50628.03it/s][A
Progress:  59%|█████▉    | 768634/1306122 [00:15<00:10, 50800.02it/s][A
Progress:  59%|█████▉    | 773799/1306122 [00:15<00:10, 51050.16it/s][A
Progress:  60%|█████▉    | 778989/1306122 [00:15<00:10, 51299.29it/s][A
Progress:  60%|██████    | 784120/1306122 [00:15<00:10, 51137.80it/s][A
Progress:  60%|██████    | 789235/1306122 [00:15<00:10, 50865.78it/s][A
Progress:  61%|██████    | 794323/1306122 [00:15<00:10, 50725.58it/s][A
Progress:  61%|██████    | 799397/1306122 [00:15<00

Progress:  14%|█▍        | 182572/1306122 [00:03<00:21, 51202.43it/s][A
Progress:  14%|█▍        | 188766/1306122 [00:03<00:20, 54010.67it/s][A
Progress:  15%|█▍        | 194947/1306122 [00:03<00:19, 55997.09it/s][A
Progress:  15%|█▌        | 201111/1306122 [00:03<00:19, 57544.14it/s][A
Progress:  16%|█▌        | 207272/1306122 [00:03<00:18, 58431.21it/s][A
Progress:  16%|█▋        | 213403/1306122 [00:03<00:18, 59089.29it/s][A
Progress:  17%|█▋        | 219613/1306122 [00:03<00:18, 59959.14it/s][A
Progress:  17%|█▋        | 225890/1306122 [00:03<00:17, 60773.35it/s][A
Progress:  18%|█▊        | 232072/1306122 [00:03<00:17, 61055.31it/s][A
Progress:  18%|█▊        | 238251/1306122 [00:04<00:17, 61110.57it/s][A
Progress:  19%|█▊        | 244414/1306122 [00:04<00:17, 61010.73it/s][A
Progress:  19%|█▉        | 250552/1306122 [00:04<00:17, 60995.33it/s][A
Progress:  20%|█▉        | 256677/1306122 [00:04<00:17, 60794.35it/s][A
Progress:  20%|██        | 262959/1306122 [00:04<00

Progress:   0%|          | 0/56370 [00:00<?, ?it/s][A
Progress: 100%|██████████| 56370/56370 [00:00<00:00, 1128175.73it/s][A
Progress:   0%|          | 0/56370 [00:00<?, ?it/s][A
Progress:  42%|████▏     | 23755/56370 [00:00<00:00, 237539.47it/s][A
Progress:  78%|███████▊  | 44058/56370 [00:00<00:00, 226013.06it/s][A
Progress: 100%|██████████| 56370/56370 [00:00<00:00, 210125.42it/s][A
Progress:   0%|          | 0/56370 [00:00<?, ?it/s][A
Progress:   9%|▉         | 5201/56370 [00:00<00:00, 51997.90it/s][A
Progress:  20%|█▉        | 11181/56370 [00:00<00:00, 54115.65it/s][A
Progress:  31%|███       | 17205/56370 [00:00<00:00, 55815.92it/s][A
Progress:  41%|████▏     | 23311/56370 [00:00<00:00, 57289.96it/s][A
Progress:  53%|█████▎    | 29623/56370 [00:00<00:00, 58922.57it/s][A
Progress:  64%|██████▍   | 35978/56370 [00:00<00:00, 60238.27it/s][A
Progress:  75%|███████▍  | 42088/56370 [00:00<00:00, 60491.76it/s][A
Progress:  85%|████████▌ | 47983/56370 [00:00<00:00, 60018.84

In [30]:
np.save("x_train",x_train)
np.save("x_test",x_test)
np.save("y_train",y_train)
np.save("word_index.npy",word_index)

In [33]:
def load_glove(word_index):
    EMBEDDING_FILE = 'glove.840B.300d.txt'
    
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    
    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = -0.005838499, 0.48782197
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    
    # Why random embedding for OOV? what if use mean?
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    
    #embedding_matrix = np.random.normal(emb_mean, 0, (nb_words, embed_size)) # std 0
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

In [67]:
seed_everything()
glove_embeddings = load_glove(word_index)

In [68]:
np.shape(glove_embeddings)

(120000, 300)

In [69]:
splits = list(StratifiedKFold(n_splits=3, shuffle=True, random_state=1024).split(x_train, y_train))
splits[:3]

[(array([      0,       1,       2, ..., 1306116, 1306118, 1306119]),
  array([      7,      11,      12, ..., 1306117, 1306120, 1306121])),
 (array([      0,       1,       3, ..., 1306118, 1306120, 1306121]),
  array([      2,       4,       5, ..., 1306115, 1306116, 1306119])),
 (array([      2,       4,       5, ..., 1306119, 1306120, 1306121]),
  array([      0,       1,       3, ..., 1306101, 1306103, 1306118]))]

In [40]:
# code inspired from: https://github.com/anandsaha/pytorch.cyclic.learning.rate/blob/master/cls.py
class CyclicLR(object):
    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} base_lr, got {}".format(
                    len(optimizer.param_groups), len(base_lr)))
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def _triangular_scale_fn(self, x):
        return 1.

    def _triangular2_scale_fn(self, x):
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma**(x)

    def get_lr(self):
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs

In [41]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F

embedding_dim = 300
embedding_path = '../save/embedding_matrix.npy'  # or False, not use pre-trained-matrix
use_pretrained_embedding = True

hidden_size = 60
gru_len = hidden_size

Routings = 4 #5
Num_capsule = 5
Dim_capsule = 5#16
dropout_p = 0.25
rate_drop_dense = 0.28
LR = 0.001
T_epsilon = 1e-7
num_classes = 30


class Embed_Layer(nn.Module):
    def __init__(self, embedding_matrix=None, vocab_size=None, embedding_dim=300):
        super(Embed_Layer, self).__init__()
        self.encoder = nn.Embedding(vocab_size + 1, embedding_dim)
        if use_pretrained_embedding:
            self.encoder.weight.data.copy_(t.from_numpy(embedding_matrix)) 

    def forward(self, x, dropout_p=0.25):
        return nn.Dropout(p=dropout_p)(self.encoder(x))


class GRU_Layer(nn.Module):
    def __init__(self):
        super(GRU_Layer, self).__init__()
        self.gru = nn.GRU(input_size=300,
                          hidden_size=gru_len,
                          bidirectional=True)
    
    def init_weights(self):
        ih = (param.data for name, param in self.named_parameters() if 'weight_ih' in name)
        hh = (param.data for name, param in self.named_parameters() if 'weight_hh' in name)
        b = (param.data for name, param in self.named_parameters() if 'bias' in name)
        for k in ih:
            nn.init.xavier_uniform_(k)
        for k in hh:
            nn.init.orthogonal_(k)
        for k in b:
            nn.init.constant_(k, 0)

    def forward(self, x):
        return self.gru(x)


# core caps_layer with squash func
class Caps_Layer(nn.Module):
    def __init__(self, input_dim_capsule=gru_len * 2, num_capsule=Num_capsule, dim_capsule=Dim_capsule, \
                 routings=Routings, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Caps_Layer, self).__init__(**kwargs)

        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size 
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = self.squash
        else:
            self.activation = nn.ReLU(inplace=True)

        if self.share_weights:
            self.W = nn.Parameter(
                nn.init.xavier_normal_(t.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))
        else:
            self.W = nn.Parameter(
                t.randn(BATCH_SIZE, input_dim_capsule, self.num_capsule * self.dim_capsule))

    def forward(self, x):

        if self.share_weights:
            u_hat_vecs = t.matmul(x, self.W)
        else:
            print('add later')

        batch_size = x.size(0)
        input_num_capsule = x.size(1)
        u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                      self.num_capsule, self.dim_capsule))
        u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 3)  
        b = t.zeros_like(u_hat_vecs[:, :, :, 0])

        for i in range(self.routings):
            b = b.permute(0, 2, 1)
            c = F.softmax(b, dim=2)
            c = c.permute(0, 2, 1)
            b = b.permute(0, 2, 1)
            outputs = self.activation(t.einsum('bij,bijk->bik', (c, u_hat_vecs)))  # batch matrix multiplication
            # outputs shape (batch_size, num_capsule, dim_capsule)
            if i < self.routings - 1:
                b = t.einsum('bik,bijk->bij', (outputs, u_hat_vecs))  # batch matrix multiplication
        return outputs  # (batch_size, num_capsule, dim_capsule)

    # text version of squash, slight different from original one
    def squash(self, x, axis=-1):
        s_squared_norm = (x ** 2).sum(axis, keepdim=True)
        scale = t.sqrt(s_squared_norm + T_epsilon)
        return x / scale
    
class Capsule_Main(nn.Module):
    def __init__(self, embedding_matrix=None, vocab_size=None):
        super(Capsule_Main, self).__init__()
        self.embed_layer = Embed_Layer(embedding_matrix, vocab_size)
        self.gru_layer = GRU_Layer()
        self.gru_layer.init_weights()
        self.caps_layer = Caps_Layer()
        self.dense_layer = Dense_Layer()

    def forward(self, content):
        content1 = self.embed_layer(content)
        content2, _ = self.gru_layer(content1)  
        content3 = self.caps_layer(content2)
        output = self.dense_layer(content3)
        return output

In [56]:
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / torch.sum(a, 1, keepdim=True) + 1e-10

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)
    
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        
        fc_layer = 16
        fc_layer1 = 16

        self.embedding = nn.Embedding(max_features, 300)
        self.embedding.weight = nn.Parameter(torch.tensor(glove_embeddings, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(300, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm2 = nn.LSTM(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm_attention = Attention(hidden_size * 2, max_len)
        self.gru_attention = Attention(hidden_size * 2, max_len)
        self.bn = nn.BatchNorm1d(16, momentum=0.5)
        self.linear = nn.Linear(hidden_size*8+3, fc_layer1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(fc_layer**2,fc_layer)
        self.out = nn.Linear(fc_layer, 1)
        self.lincaps = nn.Linear(Num_capsule * Dim_capsule, 1)
        self.caps_layer = Caps_Layer()
    
    def forward(self, x):
        
        #Capsule(num_capsule=10, dim_capsule=10, routings=4, share_weights=True)(x)
        h_embedding = self.embedding(x[0])
        h_embedding = torch.squeeze(self.embedding_dropout(torch.unsqueeze(h_embedding, 0)))
        
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)

        ##Capsule Layer        
        content3 = self.caps_layer(h_gru)
        content3 = self.dropout(content3)
        batch_size = content3.size(0)
        content3 = content3.view(batch_size, -1)
        content3 = self.relu(self.lincaps(content3))

        ##Attention Layer
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        
        # global average pooling
        avg_pool = torch.mean(h_gru, 1)
        # global max pooling
        max_pool, _ = torch.max(h_gru, 1)
        
        f = torch.tensor(x[1], dtype=torch.float).cuda()

                #[512,160]
        conc = torch.cat((h_lstm_atten, h_gru_atten,content3, avg_pool, max_pool,f), 1)
        conc = self.relu(self.linear(conc))
        conc = self.bn(conc)
        conc = self.dropout(conc)

        out = self.out(conc)
        
        return out

In [43]:
class MyDataset(Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __getitem__(self, index):
        data, target = self.dataset[index]
        return data, target, index
    def __len__(self):
        return len(self.dataset)

In [73]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

train_preds = np.zeros((len(x_train)))
test_preds = np.zeros((len(x_test)))

# always call this before training for deterministic results
seed_everything()

x_test_cuda = torch.tensor(x_test, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=64, shuffle=False)

avg_losses_f = []
avg_val_losses_f = []

for i, (train_idx, valid_idx) in enumerate(splits):    
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    features = np.array(features)

    x_train_fold = torch.tensor(x_train[train_idx.astype(int)], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(y_train[train_idx.astype(int), np.newaxis], dtype=torch.float32).cuda()
    
    kfold_X_features = features[train_idx.astype(int)]
    kfold_X_valid_features = features[valid_idx.astype(int)]
    x_val_fold = torch.tensor(x_train[valid_idx.astype(int)], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(y_train[valid_idx.astype(int), np.newaxis], dtype=torch.float32).cuda()
    
    # model = BiLSTM(lstm_layer=2,hidden_dim=40,dropout=DROPOUT).cuda()
    model = NeuralNet()

    # make sure everything in the model is running on the GPU
    model.cuda()

    # define binary cross entropy loss
    # note that the model returns logit to take advantage of the log-sum-exp trick 
    # for numerical stability in the loss
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum')

    step_size = 300
    base_lr, max_lr = 0.001, 0.003   
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=max_lr)

    scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr, step_size=step_size, mode='exp_range', gamma=0.99994)

    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
    
    train = MyDataset(train)
    valid = MyDataset(valid)

    ##No need to shuffle the data again here. Shuffling happens when splitting for kfolds.
    train_loader = torch.utils.data.DataLoader(train, batch_size=64, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=64, shuffle=False)

    print(f'Fold {i + 1}')
    for epoch in range(6):
        # set train mode of the model. This enables operations which are only applied during training like dropout
        start_time = time.time()
        model.train()

        avg_loss = 0.  
        for i, (x_batch, y_batch, index) in enumerate(train_loader):
            # Forward pass: compute predicted y by passing x to the model.
            ################################################################################################            
            f = kfold_X_features[index]
            y_pred = model([x_batch, f])
            ################################################################################################

            ################################################################################################
            if scheduler:
                scheduler.batch_step()
            ################################################################################################


            # Compute and print loss.
            loss = loss_fn(y_pred, y_batch)

            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the Tensors it will update (which are the learnable weights
            # of the model)
            optimizer.zero_grad()

            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()

            # Calling the step function on an Optimizer makes an update to its parameters
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
            
        # set evaluation mode of the model. This disabled operations which are only applied during training like dropout
        model.eval()
        
        # predict all the samples in y_val_fold batch per batch
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        test_preds_fold = np.zeros((len(x_test)))
        
        avg_val_loss = 0.
        for i, (x_batch, y_batch, index) in enumerate(valid_loader):
            f = kfold_X_valid_features[index]
            y_pred = model([x_batch, f]).detach()
            
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            valid_preds_fold[i * 64:(i+1) * 64] = sigmoid(y_pred.cpu().numpy())[:, 0]
        
        elapsed_time = time.time() - start_time 
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
            epoch + 1, 6, avg_loss, avg_val_loss, elapsed_time))
    avg_losses_f.append(avg_loss)
    avg_val_losses_f.append(avg_val_loss) 
    # predict all samples in the test set batch per batch
    for i, (x_batch,) in enumerate(test_loader):
        f = test_features[i * 64:(i+1) * 64]
        y_pred = model([x_batch, f]).detach()

        test_preds_fold[i * 64:(i+1) * 64] = sigmoid(y_pred.cpu().numpy())[:, 0]
        
    train_preds[valid_idx] = valid_preds_fold
    test_preds += test_preds_fold / len(splits)

print('All \t loss={:.4f} \t val_loss={:.4f} \t '.format(np.average(avg_losses_f),np.average(avg_val_losses_f)))

Fold 1


  


Epoch 1/6 	 loss=8.5056 	 val_loss=6.6232 	 time=245.40s
Epoch 2/6 	 loss=7.3858 	 val_loss=6.4139 	 time=254.25s
Epoch 3/6 	 loss=6.9557 	 val_loss=6.3291 	 time=247.63s
Epoch 4/6 	 loss=6.5649 	 val_loss=6.5784 	 time=246.74s
Epoch 5/6 	 loss=6.1975 	 val_loss=6.5116 	 time=246.22s
Epoch 6/6 	 loss=5.9040 	 val_loss=6.9627 	 time=245.78s
Fold 2
Epoch 1/6 	 loss=8.3089 	 val_loss=7.0705 	 time=250.05s
Epoch 2/6 	 loss=7.3184 	 val_loss=6.5017 	 time=245.79s
Epoch 3/6 	 loss=6.8889 	 val_loss=6.6818 	 time=245.70s
Epoch 4/6 	 loss=6.5237 	 val_loss=6.8291 	 time=246.65s
Epoch 5/6 	 loss=6.1840 	 val_loss=7.2919 	 time=245.77s
Epoch 6/6 	 loss=5.8462 	 val_loss=6.9873 	 time=245.79s
Fold 3
Epoch 1/6 	 loss=8.3334 	 val_loss=6.6513 	 time=245.49s
Epoch 2/6 	 loss=7.3470 	 val_loss=6.5372 	 time=245.66s
Epoch 3/6 	 loss=6.8998 	 val_loss=6.6104 	 time=247.41s
Epoch 4/6 	 loss=6.5283 	 val_loss=6.5940 	 time=248.40s
Epoch 5/6 	 loss=6.2000 	 val_loss=6.9909 	 time=248.06s
Epoch 6/6 	 loss=

In [74]:
def bestThresshold(y_train,train_preds):
    tmp = [0,0,0] # idx, cur, max
    delta = 0
    for tmp[0] in tqdm(np.arange(0.1, 0.501, 0.01)):
        tmp[1] = f1_score(y_train, np.array(train_preds)>tmp[0])
        if tmp[1] > tmp[2]:
            delta = tmp[0]
            tmp[2] = tmp[1]
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(delta, tmp[2]))
    return delta
delta = bestThresshold(y_train,train_preds)


  0%|          | 0/41 [00:00<?, ?it/s][A
  2%|▏         | 1/41 [00:00<00:06,  5.94it/s][A
  5%|▍         | 2/41 [00:00<00:06,  5.97it/s][A
  7%|▋         | 3/41 [00:00<00:06,  6.01it/s][A
 10%|▉         | 4/41 [00:00<00:06,  6.06it/s][A
 12%|█▏        | 5/41 [00:00<00:05,  6.12it/s][A
 15%|█▍        | 6/41 [00:00<00:05,  6.16it/s][A
 17%|█▋        | 7/41 [00:01<00:05,  6.15it/s][A
 20%|█▉        | 8/41 [00:01<00:05,  6.01it/s][A
 22%|██▏       | 9/41 [00:01<00:05,  5.98it/s][A
 24%|██▍       | 10/41 [00:01<00:05,  6.04it/s][A
 27%|██▋       | 11/41 [00:01<00:04,  6.07it/s][A
 29%|██▉       | 12/41 [00:01<00:04,  6.12it/s][A
 32%|███▏      | 13/41 [00:02<00:04,  6.10it/s][A
 34%|███▍      | 14/41 [00:02<00:04,  6.14it/s][A
 37%|███▋      | 15/41 [00:02<00:04,  6.18it/s][A
 39%|███▉      | 16/41 [00:02<00:04,  6.18it/s][A
 41%|████▏     | 17/41 [00:02<00:03,  6.19it/s][A
 44%|████▍     | 18/41 [00:02<00:03,  6.25it/s][A
 46%|████▋     | 19/41 [00:03<00:03,  6.28it/s]

best threshold is 0.2700 with F1 score: 0.6683


In [77]:
df_test = pd.read_csv('test.csv')

In [78]:
submission = df_test[['qid']].copy()
submission['prediction'] = (test_preds > delta).astype(int)
submission.to_csv('submission.csv', index=False)