## IMPORTS 

In [165]:
import random
import pandas as pd
import numpy as np
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score
import torchtext


SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# load spacy tokenizer
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])

### Ensure determinism in the results

In [None]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

## LOAD PROCESSED TRAINING DATA FROM DISK

In [4]:
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")
df = pd.concat([train ,test])

In [5]:
sin = len(df_train[df_train["target"]==0])
insin = len(df_train[df_train["target"]==1])
persin = (sin/(sin+insin))*100
perinsin = (insin/(sin+insin))*100            
print("# insincere questions: {:,}({:.2f}%) and # sincere questions: {:,}({:.2f}%)".format(sin,persin,insin,perinsin))
# print("Sinsere:{}% Insincere: {}%".format(round(persin,2),round(perinsin,2)))
print("# Test samples: {:,}({:.3f}% of train samples)".format(len(df_test),len(df_test)/len(df_train)))

# insincere questions: 1,225,312(93.81%) and # sincere questions: 80,810(6.19%)
# Test samples: 56,370(0.043% of train samples)


## Normalization

In [6]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

def known_contractions(embed):
    known = []
    for contract in contraction_mapping:
        if contract in embed:
            known.append(contract)
    return known
def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text
def correct_spelling(x, dic):
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x
def unknown_punct(embed, punct):
    unknown = ''
    for p in punct:
        if p not in embed:
            unknown += p
            unknown += ' '
    return unknown
def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # Other special characters that I have to deal with in last
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text
def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")    

In [9]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

# vocab = build_vocab(df['question_text'])
# add_lower(embed_glove, vocab,embed_dict)

In [10]:
df_train["question_text"] = df_train["question_text"].fillna("_na_").values
df_test["question_text"] = df_test["question_text"].fillna("_na_").values

# df_train['question_text'] = df_train['question_text'].progress_apply(lambda x: x.lower())
df_train['question_text'] = df_train['question_text'].progress_apply(lambda x: clean_contractions(x, contraction_mapping))
# df_train['question_text'] =df_train['question_text'].progress_apply(lambda x: clean_special_chars(x, punct, punct_mapping))
df_train['question_text'] = df_train['question_text'].progress_apply(lambda x: correct_spelling(x, mispell_dict))

# df_test['question_text'] = df_test['question_text'].progress_apply(lambda x: x.lower())
df_test['question_text'] = df_test['question_text'].progress_apply(lambda x: clean_contractions(x, contraction_mapping))
# df_test['question_text'] =df_test['question_text'].progress_apply(lambda x: clean_special_chars(x, punct, punct_mapping))
df_test['question_text'] = df_test['question_text'].progress_apply(lambda x: correct_spelling(x, mispell_dict))

df_train.to_csv("train2.csv")
df_test.to_csv("test2.csv")

Progress: 100%|██████████| 1306122/1306122 [00:03<00:00, 356391.13it/s]
Progress: 100%|██████████| 1306122/1306122 [00:08<00:00, 150148.08it/s]
Progress: 100%|██████████| 56370/56370 [00:00<00:00, 346537.62it/s]
Progress: 100%|██████████| 56370/56370 [00:00<00:00, 148390.75it/s]


In [11]:
def load_glove(word_index):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.005838499,0.48782197
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix 
    
def load_fasttext(word_index):    
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

def load_para(word_index):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.0053247833,0.49346462
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix

## Create a pytorch dataset from the train samples and build a vocabulary using embedding vectors

In [12]:
# # load dataframe to csv

TEXT = data.Field(lower=True, batch_first=True,tokenize='spacy')#preprocessing=generate_bigrams)
LABEL = data.LabelField(dtype=torch.float)
qid = data.Field()

train_dataset = data.TabularDataset(path='train2.csv', format='csv',
                                      fields={'question_text': ('text',TEXT),
                                              'target': ('label',LABEL)})
final_test_dataset = data.TabularDataset(path='test2.csv', format='csv',
                                     fields={'qid': ('qid', qid),
                                             'question_text': ('text', TEXT)})

In [13]:
TEXT.build_vocab(train_dataset, final_test_dataset, min_freq=3)
qid.build_vocab(final_test_dataset)

In [14]:
TEXT.vocab.load_vectors(embed)

In [15]:
# TEXT.build_vocab(train_dataset, max_size=50000, vectors=vec)
LABEL.build_vocab(train_dataset)
TEXT.vocab.vectors.shape

torch.Size([71892, 300])

## SPLIT DATA TO TRAINiNG AND VALIDATION SETS

In [16]:
# train_dataset, test_dataset = train_dataset.split(split_ratio=0.9,random_state=random.seed(SEED))
train_dataset, valid_dataset = train_dataset.split(split_ratio=0.9,random_state=random.seed(SEED))

In [17]:
BATCH_SIZE = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator = data.BucketIterator(
    train_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),shuffle=True,sort=False, 
    device=device)
# test_iterator = data.BucketIterator(
#     test_dataset, 
#     batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),train=False,sort=False,
#     device=device)
valid_iterator = data.BucketIterator(
    valid_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),train=False,sort=False, 
    device=device)

In [52]:
#Simple attention layer taken from https://github.com/mttk/rnn-classifier/blob/master/model.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

RNNS = ['LSTM', 'GRU']

class Encoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, nlayers=1, dropout=0.,
                   bidirectional=True, rnn_type='GRU'):
        super(Encoder, self).__init__()
        self.bidirectional = bidirectional
        assert rnn_type in RNNS, 'Use one of the following: {}'.format(str(RNNS))
        rnn_cell = getattr(nn, rnn_type) # fetch constructor from torch.nn, cleaner than if
        self.rnn = rnn_cell(embedding_dim, hidden_dim, nlayers, 
                            dropout=dropout, bidirectional=bidirectional)

    def forward(self, input, hidden=None):
        return self.rnn(input, hidden)


class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, keys, values):
        # Query = [BxQ] [512, 256]
        # Keys = [TxBxK] [50, 512, 256]
        # Values = [TxBxV]
        # Outputs = a:[TxB], lin_comb:[BxV]

        # Here we assume q_dim == k_dim (dot product attention)
        query_dim = query.shape[1]
        scale = 1. / math.sqrt(query_dim)

        query = query.unsqueeze(1)  # [BxQ] -> [Bx1xQ]
        keys = keys.transpose(0,1).transpose(1,2) # [TxBxK] -> [BxKxT]
        energy = torch.bmm(query, keys) # [Bx1xQ]x[BxKxT] -> [Bx1xT]
        energy = F.softmax(energy.mul_(scale), dim=2) # scale, normalize

        values = values.transpose(0,1) # [TxBxV] -> [BxTxV]
        linear_combination = torch.bmm(energy, values).squeeze(1) #[Bx1xT]x[BxTxV] -> [BxV]
        return energy, linear_combination

class Classifier(nn.Module):
    def __init__(self, embedding, encoder, attention, hidden_dim, num_classes):
        super(Classifier, self).__init__()
        self.embedding = embedding
        self.encoder = encoder
        self.attention = attention
        self.decoder = nn.Linear(hidden_dim, num_classes)

        size = 0
        for p in self.parameters():
            size += p.nelement()
        print('Total param size: {}'.format(size))


    def forward(self, input):
        outputs, hidden = self.encoder(self.embedding(input))
        if isinstance(hidden, tuple): # LSTM
            hidden = hidden[1] # take the cell state

        if self.encoder.bidirectional: # need to concat the last 2 hidden layers
            hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        else:
            hidden = hidden[-1]

        # max across T?
        # Other options (work worse on a few tests):
        # linear_combination, _ = torch.max(outputs, 0)
        # linear_combination = torch.mean(outputs, 0)

        energy, linear_combination = self.attention(hidden, outputs, outputs) 
        logits = self.decoder(linear_combination)
        return logits, energy

In [53]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs,embedding_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        x = x.permute(1, 0)
                
        #x = [batch size, sent len]
        
        embedded = self.embedding(x)

        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        
        cat = self.dropout(torch.cat(pooled, dim=1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [161]:
class BiLSTM(nn.Module):
    def __init__(self, pretrained_lm, padding_idx, static=True, hidden_dim=128, lstm_layer=2, dropout=0.2):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding.from_pretrained(pretrained_lm)
        self.embedding.padding_idx = padding_idx
        if static:
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=lstm_layer, 
                            dropout = dropout,
                            bidirectional=True)
        self.hidden2label = nn.Linear(hidden_dim*lstm_layer*2, 1)
        self.decoder = nn.Linear(64, 1) 
        self.fc = nn.Linear(hidden_dim*2,64)
        self.act = nn.ReLU() ## ADDED ACTIVATION FUNCTIONS
        self.sigmoid = nn.Sigmoid() ## ADDED ACTIVATION FUNCTIONS
        self.attention = Attention()

        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=256, kernel_size=(fs,self.embedding.embedding_dim)) for fs in [3,4,5]])


    def forward(self, sents):
        x = self.embedding(sents)
        x = self.dropout(x)
        x = torch.transpose(x, dim0=1, dim1=0)
#         self.out = nn.Linear(hidden_dim, num_classes)


        ##########################################################################
        embedded = x.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        ##########################################################################
        
        lstm_out, (h_n, c_n) = self.lstm(x)

        ######## Attention Layer #################################################
        hidden = (c_n)
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)

        energy, linear_combination = self.attention(hidden, lstm_out, lstm_out)

        linear_combination = self.act(self.fc(linear_combination)) ## ADDED ACTIVATION FUNCTIONS

        logits = self.decoder(self.dropout(linear_combination))
        
        ##########################################################################

        y = self.hidden2label(self.dropout(torch.cat([c_n[i,:, :] for i in range(c_n.shape[0])], dim=1)))
        return logits    


In [162]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
OUTPUT_DIM = 1
DROPOUT = 0.2

# model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
model = BiLSTM(TEXT.vocab.vectors, lstm_layer=2, padding_idx=TEXT.vocab.stoi[TEXT.pad_token], hidden_dim=128,dropout=DROPOUT).cuda()
print(model)

BiLSTM(
  (dropout): Dropout(p=0.2)
  (embedding): Embedding(71892, 300, padding_idx=1)
  (lstm): LSTM(300, 128, num_layers=2, dropout=0.2, bidirectional=True)
  (hidden2label): Linear(in_features=512, out_features=1, bias=True)
  (decoder): Linear(in_features=64, out_features=1, bias=True)
  (fc): Linear(in_features=256, out_features=64, bias=True)
  (act): ReLU()
  (sigmoid): Sigmoid()
  (attention): Attention()
  (convs): ModuleList(
    (0): Conv2d(1, 256, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 256, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 256, kernel_size=(5, 300), stride=(1, 1))
  )
)


In [163]:
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion,e=epoch)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    if stop_training == 1:
        break

RuntimeError: invalid argument 0: Tensors must have same number of dimensions: got 2 and 3 at /opt/conda/conda-bld/pytorch_1544176307774/work/aten/src/THC/generic/THCTensorMath.cu:74

### Training

In [22]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0869,  0.1916,  0.1091,  ..., -0.0152,  0.1111,  0.2065],
        ...,
        [-0.2523, -0.1560, -0.0008,  ...,  0.0601,  0.3452,  0.2371],
        [ 0.0202,  0.1975, -0.0793,  ...,  0.0901, -0.6364,  0.2416],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

In [23]:
import torch.optim as optim

# optimizer = optim.Adam(model.parameters())
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                    lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y, th = 0.5):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

def f1_score_model(preds, y,th = 0.5):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded = torch.sigmoid(preds).cpu().apply_(lambda x: 1 if x>=th else 0)
    rounded_preds = rounded

    return f1_score(y.cpu().numpy(),rounded_preds.cpu().numpy())


In [24]:
stop_training = 0
warmup_epoch = 2
step = 0
max_loss = 1e5
no_improve_in_previous_epoch = False
no_improve_epoch = 0
fine_tuning = False
train_record = []
val_record = []
losses = []

def train(model, iterator, optimizer, criterion,e):
    
    epoch_loss = 0
    epoch_acc = 0

    global warm_epoch
    global no_improve_in_previous_epoch
    global fine_tuning
    global step
    global max_loss
    global stop_training
    global no_improve_epoch
    global train_record
    global val_record
    global losses
    
    model.train()
    if e >= warmup_epoch:
        if no_improve_in_previous_epoch:
            no_improve_epoch += 1
            if no_improve_epoch >= 1:
                stop_training = 1
        else:
            no_improve_epoch = 0
        no_improve_in_previous_epoch = True
    if stop_training == 0:    
        if not fine_tuning and e >= warmup_epoch:
            model.embedding.weight.requires_grad = True        
            fine_tuning = True

        for batch in iterator:
            step += 1
            model.train()
            optimizer.zero_grad()
            predictions = model(batch.text).squeeze(1)       

            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            loss.backward()

            optimizer.step()
            if step % 500 == 0:
                model.eval()
                model.zero_grad()
                val_loss = []
                for val_batch in iter(valid_iterator):
                    val_x = val_batch.text.cuda()
                    val_y = val_batch.label.type(torch.Tensor).cuda()
                    val_pred = model.forward(val_x).view(-1)
                    val_loss.append(criterion(val_pred, val_y).cpu().data.numpy())
                val_record.append({'step': step, 'loss': np.mean(val_loss)})
                print('epoch {:02} - step {:06} - train_loss {:.4f} - val_loss {:.4f} '.format(
                            e+1, step, np.mean(losses), val_record[-1]['loss']))
                if e >= warmup_epoch:
                    if val_record[-1]['loss'] <= max_loss:
                        save(m=model, info={'step': step, 'epoch': e+1, 'train_loss': np.mean(losses),
                                            'val_loss': val_record[-1]['loss']})
                        max_loss = val_record[-1]['loss']
                        no_improve_in_previous_epoch = False

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate_f1(model, iterator, criterion,th=0.5):
    
    f1_scores = []
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            f1 = f1_score_model(predictions, batch.label,th=th)
            f1_scores.append(f1)
        
    return np.array(f1_scores).mean()


def save(m, info):
    torch.save(info, 'best_model.info')
    torch.save(m, 'best_model.m')
    
def load():
    m = torch.load('best_model.m')
    info = torch.load('best_model.info')
    return m, info

In [37]:
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion,e=epoch)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    if stop_training == 1:
        break

AttributeError: 'BiLSTM' object has no attribute 'convs'

In [26]:
model, m_info = load()
m_info

{'step': 11000, 'epoch': 5, 'train_loss': nan, 'val_loss': 0.09714787}

In [27]:
test_loss, test_acc = evaluate(model, valid_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.097 | Test Acc: 96.13% |


In [28]:
score = evaluate_f1(model, valid_iterator, criterion)
print(f'| f1 score: {score:.3f}')

| f1 score: 0.652


In [29]:
model.eval()
val_pred = []
val_true = []
valid_iterator.init_epoch()
for val_batch in iter(valid_iterator):
    val_x = val_batch.text.cuda()
    val_true += val_batch.label.cpu().data.numpy().tolist()
    val_pred += torch.sigmoid(model.forward(val_x).view(-1)).cpu().data.numpy().tolist()

In [34]:
tmp = [0,0,0] # idx, cur, max
delta = 0
for tmp[0] in np.arange(0.1, 0.501, 0.01):
    tmp[1] = f1_score(val_true, np.array(val_pred)>tmp[0])
    if tmp[1] > tmp[2]:
        delta = tmp[0]
        tmp[2] = tmp[1]
print('best threshold is {:.4f} with F1 score: {:.4f}'.format(delta, tmp[2]))

best threshold is 0.3200 with F1 score: 0.6930


In [31]:
model.eval()
model.zero_grad()
test_pred = []
test_id = []

final_test_iterator = torchtext.data.BucketIterator(dataset=final_test_dataset,
                                    batch_size=BATCH_SIZE,
                                    sort_key=lambda x: x.text.__len__(),train=False,sort=False)


for test_batch in iter(final_test_iterator):
    test_x = test_batch.text.cuda()
    test_pred += torch.sigmoid(model.forward(test_x).view(-1)).cpu().data.numpy().tolist()
    test_id += test_batch.qid.view(-1).data.numpy().tolist()

In [32]:
sub_df =pd.DataFrame()
sub_df['qid'] = [qid.vocab.itos[i] for i in test_id]
sub_df['prediction'] = (np.array(test_pred) >= delta).astype(int)

In [33]:
sub_df.to_csv("submission.csv", index=False)