## IMPORTS 

In [1]:
import time
import random
import pandas as pd
import numpy as np
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score
import torchtext
import os 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

Using TensorFlow backend.


### Ensure determinism in the results

In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

### Code for Loading Embeddings

In [3]:
def load_glove(word_index):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.005838499,0.48782197
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix 
    
def load_fasttext(word_index):    
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

def load_para(word_index):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.0053247833,0.49346462
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix

## LOAD PROCESSED TRAINING DATA FROM DISK

In [4]:
embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70 # max number of words in a question to use

In [5]:
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")
df = pd.concat([df_train ,df_test],sort=True)

In [6]:
sin = len(df_train[df_train["target"]==0])
insin = len(df_train[df_train["target"]==1])
persin = (sin/(sin+insin))*100
perinsin = (insin/(sin+insin))*100            
print("# insincere questions: {:,}({:.2f}%) and # sincere questions: {:,}({:.2f}%)".format(sin,persin,insin,perinsin))
# print("Sinsere:{}% Insincere: {}%".format(round(persin,2),round(perinsin,2)))
print("# Test samples: {:,}({:.3f}% of train samples)".format(len(df_test),len(df_test)/len(df_train)))

# insincere questions: 1,225,312(93.81%) and # sincere questions: 80,810(6.19%)
# Test samples: 56,370(0.043% of train samples)


## Normalization

In [7]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

def known_contractions(embed):
    known = []
    for contract in contraction_mapping:
        if contract in embed:
            known.append(contract)
    return known
def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text
def correct_spelling(x, dic):
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x
def unknown_punct(embed, punct):
    unknown = ''
    for p in punct:
        if p not in embed:
            unknown += p
            unknown += ' '
    return unknown
def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # Other special characters that I have to deal with in last
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text
def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")    

In [8]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

# vocab = build_vocab(df['question_text'])
# add_lower(embed_glove, vocab,embed_dict)

In [9]:
# df_train['question_text'] = df_train['question_text'].progress_apply(lambda x: x.lower())
df_train['question_text'] = df_train['question_text'].progress_apply(lambda x: clean_contractions(x, contraction_mapping))
# df_train['question_text'] =df_train['question_text'].progress_apply(lambda x: clean_special_chars(x, punct, punct_mapping))
df_train['question_text'] = df_train['question_text'].progress_apply(lambda x: correct_spelling(x, mispell_dict))

# df_test['question_text'] = df_test['question_text'].progress_apply(lambda x: x.lower())
df_test['question_text'] = df_test['question_text'].progress_apply(lambda x: clean_contractions(x, contraction_mapping))
# df_test['question_text'] =df_test['question_text'].progress_apply(lambda x: clean_special_chars(x, punct, punct_mapping))
df_test['question_text'] = df_test['question_text'].progress_apply(lambda x: correct_spelling(x, mispell_dict))

df_train.to_csv("train2.csv")
df_test.to_csv("test2.csv")

Progress: 100%|██████████| 1306122/1306122 [00:03<00:00, 331932.84it/s]
Progress: 100%|██████████| 1306122/1306122 [00:09<00:00, 133382.73it/s]
Progress: 100%|██████████| 56370/56370 [00:00<00:00, 317529.25it/s]
Progress: 100%|██████████| 56370/56370 [00:00<00:00, 131969.02it/s]


In [10]:
# fill up the missing values
x_train = df_train["question_text"].fillna("_##_").values
x_test = df_test["question_text"].fillna("_##_").values

# Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_train))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# Pad the sentences 
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

# Get the target values
y_train = df_train['target'].values

In [11]:
# missing entries in the embedding are set using np.random.normal so we have to seed here too
seed_everything()

glove_embeddings = load_glove(tokenizer.word_index)
paragram_embeddings = load_para(tokenizer.word_index)

embedding_matrix = np.mean([glove_embeddings, paragram_embeddings], axis=0)
np.shape(embedding_matrix)

(95000, 300)

In [12]:
splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=10).split(x_train, y_train))

## Create a pytorch dataset from the train samples and build a vocabulary using embedding vectors

## SPLIT DATA TO TRAINiNG AND VALIDATION SETS

In [13]:
#Simple attention layer taken from https://github.com/mttk/rnn-classifier/blob/master/model.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

RNNS = ['LSTM', 'GRU']

class Encoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, nlayers=1, dropout=0.,
                   bidirectional=True, rnn_type='GRU'):
        super(Encoder, self).__init__()
        self.bidirectional = bidirectional
        assert rnn_type in RNNS, 'Use one of the following: {}'.format(str(RNNS))
        rnn_cell = getattr(nn, rnn_type) # fetch constructor from torch.nn, cleaner than if
        self.rnn = rnn_cell(embedding_dim, hidden_dim, nlayers, 
                            dropout=dropout, bidirectional=bidirectional)

    def forward(self, input, hidden=None):
        return self.rnn(input, hidden)


class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, keys, values):
        # Query = [BxQ] [512, 256]
        # Keys = [TxBxK] [50, 512, 256]
        # Values = [TxBxV]
        # Outputs = a:[TxB], lin_comb:[BxV]

        # Here we assume q_dim == k_dim (dot product attention)
        query_dim = query.shape[1]
        scale = 1. / math.sqrt(query_dim)

        query = query.unsqueeze(1)  # [BxQ] -> [Bx1xQ]
        keys = keys.transpose(0,1).transpose(1,2) # [TxBxK] -> [BxKxT]
        energy = torch.bmm(query, keys) # [Bx1xQ]x[BxKxT] -> [Bx1xT]
        energy = F.softmax(energy.mul_(scale), dim=2) # scale, normalize

        values = values.transpose(0,1) # [TxBxV] -> [BxTxV]
        linear_combination = torch.bmm(energy, values).squeeze(1) #[Bx1xT]x[BxTxV] -> [BxV]
        return energy, linear_combination

class Classifier(nn.Module):
    def __init__(self, embedding, encoder, attention, hidden_dim, num_classes):
        super(Classifier, self).__init__()
        self.embedding = embedding
        self.encoder = encoder
        self.attention = attention
        self.decoder = nn.Linear(hidden_dim, num_classes)

        size = 0
        for p in self.parameters():
            size += p.nelement()
        print('Total param size: {}'.format(size))


    def forward(self, input):
        outputs, hidden = self.encoder(self.embedding(input))
        if isinstance(hidden, tuple): # LSTM
            hidden = hidden[1] # take the cell state

        if self.encoder.bidirectional: # need to concat the last 2 hidden layers
            hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        else:
            hidden = hidden[-1]

        # max across T?
        # Other options (work worse on a few tests):
        # linear_combination, _ = torch.max(outputs, 0)
        # linear_combination = torch.mean(outputs, 0)

        energy, linear_combination = self.attention(hidden, outputs, outputs) 
        logits = self.decoder(linear_combination)
        return logits, energy

In [14]:
class BiLSTM(nn.Module):
    def __init__(self, static=True, hidden_dim=128, lstm_layer=2, dropout=0.2):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding(max_features, embed_size)       
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        if static:
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=lstm_layer, 
                            dropout = dropout,
                            bidirectional=True)
        self.hidden2label = nn.Linear(hidden_dim*lstm_layer*2, 1)
        self.decoder = nn.Linear(64, 1) 
        self.fc = nn.Linear(hidden_dim*2,64)
        self.act = nn.ReLU() ## ADDED ACTIVATION FUNCTIONS
        self.sigmoid = nn.Sigmoid() ## ADDED ACTIVATION FUNCTIONS
        self.attention = Attention()

        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=256, kernel_size=(fs,self.embedding.embedding_dim)) for fs in [3,4,5]])


    def forward(self, sents):
        x = self.embedding(sents)
        x = self.dropout(x)
        x = torch.transpose(x, dim0=1, dim1=0)
#         self.out = nn.Linear(hidden_dim, num_classes)
        
        lstm_out, (h_n, c_n) = self.lstm(x)

        ######## Attention Layer #################################################
        hidden = (c_n)
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)

        energy, linear_combination = self.attention(hidden, lstm_out, lstm_out)

        linear_combination = self.act(self.fc(linear_combination)) ## ADDED ACTIVATION FUNCTIONS

        logits = self.decoder(self.dropout(linear_combination))
        
        ##########################################################################

        y = self.hidden2label(self.dropout(torch.cat([c_n[i,:, :] for i in range(c_n.shape[0])], dim=1)))
        return logits    


In [15]:
# INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
OUTPUT_DIM = 1
DROPOUT = 0.2

# model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
model = BiLSTM(lstm_layer=2,hidden_dim=128,dropout=DROPOUT).cuda()
print(model)

BiLSTM(
  (dropout): Dropout(p=0.2)
  (embedding): Embedding(95000, 300)
  (lstm): LSTM(300, 128, num_layers=2, dropout=0.2, bidirectional=True)
  (hidden2label): Linear(in_features=512, out_features=1, bias=True)
  (decoder): Linear(in_features=64, out_features=1, bias=True)
  (fc): Linear(in_features=256, out_features=64, bias=True)
  (act): ReLU()
  (sigmoid): Sigmoid()
  (attention): Attention()
  (convs): ModuleList(
    (0): Conv2d(1, 256, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 256, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 256, kernel_size=(5, 300), stride=(1, 1))
  )
)


### Training

In [16]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

batch_size = 512 # how many samples to process at once
n_epochs = 6 # how many times to iterate over all samples

# matrix for the out-of-fold predictions
train_preds = np.zeros((len(df_train)))
# matrix for the predictions on the test set
test_preds = np.zeros((len(df_test)))

# always call this before training for deterministic results
seed_everything()

x_test_cuda = torch.tensor(x_test, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)


for i, (train_idx, valid_idx) in enumerate(splits):    
    # split data in train / validation according to the KFold indeces
    # also, convert them to a torch tensor and store them on the GPU (done with .cuda())
    x_train_fold = torch.tensor(x_train[train_idx], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(y_train[train_idx, np.newaxis], dtype=torch.float32).cuda()
    x_val_fold = torch.tensor(x_train[valid_idx], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(y_train[valid_idx, np.newaxis], dtype=torch.float32).cuda()
    
#     model = NeuralNet()
    # make sure everything in the model is running on the GPU
    model.cuda()

    # define binary cross entropy loss
    # note that the model returns logit to take advantage of the log-sum-exp trick 
    # for numerical stability in the loss
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum')
    optimizer = torch.optim.Adam(model.parameters())

    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    
    print(f'Fold {i + 1}')
    
    for epoch in range(n_epochs):
        # set train mode of the model. This enables operations which are only applied during training like dropout
        start_time = time.time()
        model.train()
        avg_loss = 0.  
        for x_batch, y_batch in tqdm(train_loader, disable=True):
            # Forward pass: compute predicted y by passing x to the model.
            y_pred = model(x_batch)

            # Compute and print loss.
            loss = loss_fn(y_pred, y_batch)

            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the Tensors it will update (which are the learnable weights
            # of the model)
            optimizer.zero_grad()

            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()

            # Calling the step function on an Optimizer makes an update to its parameters
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
            
        # set evaluation mode of the model. This disabled operations which are only applied during training like dropout
        model.eval()
        
        # predict all the samples in y_val_fold batch per batch
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        test_preds_fold = np.zeros((len(df_test)))
        
        avg_val_loss = 0.
        for i, (x_batch, y_batch) in enumerate(valid_loader):
            y_pred = model(x_batch).detach()
            
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
        
        elapsed_time = time.time() - start_time 
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
            epoch + 1, n_epochs, avg_loss, avg_val_loss, elapsed_time))
        
    # predict all samples in the test set batch per batch
    for i, (x_batch,) in enumerate(test_loader):
        y_pred = model(x_batch).detach()

        test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    train_preds[valid_idx] = valid_preds_fold
    test_preds += test_preds_fold / len(splits)

Fold 1
Epoch 1/6 	 loss=63.4793 	 val_loss=54.4104 	 time=103.01s
Epoch 2/6 	 loss=54.3286 	 val_loss=51.3267 	 time=103.76s
Epoch 3/6 	 loss=51.0151 	 val_loss=50.4801 	 time=103.90s
Epoch 4/6 	 loss=48.4252 	 val_loss=49.6024 	 time=102.91s
Epoch 5/6 	 loss=46.0402 	 val_loss=49.9151 	 time=103.59s
Epoch 6/6 	 loss=43.7896 	 val_loss=51.0488 	 time=103.51s
Fold 2
Epoch 1/6 	 loss=44.3568 	 val_loss=38.8962 	 time=103.39s
Epoch 2/6 	 loss=42.1722 	 val_loss=40.0732 	 time=103.95s
Epoch 3/6 	 loss=40.0349 	 val_loss=41.1722 	 time=103.51s
Epoch 4/6 	 loss=37.9609 	 val_loss=42.2255 	 time=103.62s
Epoch 5/6 	 loss=36.3688 	 val_loss=42.9929 	 time=103.52s
Epoch 6/6 	 loss=34.7184 	 val_loss=44.8318 	 time=103.65s
Fold 3
Epoch 1/6 	 loss=37.3979 	 val_loss=27.4783 	 time=103.68s
Epoch 2/6 	 loss=35.2980 	 val_loss=29.1788 	 time=103.42s
Epoch 3/6 	 loss=33.8385 	 val_loss=31.0196 	 time=103.83s
Epoch 4/6 	 loss=32.3626 	 val_loss=32.1350 	 time=103.39s
Epoch 5/6 	 loss=31.2325 	 val_loss

In [17]:
tmp = [0,0,0] # idx, cur, max
delta = 0
for tmp[0] in tqdm(np.arange(0.1, 0.501, 0.01)):
    tmp[1] = f1_score(y_train, np.array(train_preds)>tmp[0])
    if tmp[1] > tmp[2]:
        delta = tmp[0]
        tmp[2] = tmp[1]
print('best threshold is {:.4f} with F1 score: {:.4f}'.format(delta, tmp[2]))

100%|██████████| 41/41 [00:06<00:00,  6.77it/s]

best threshold is 0.3900 with F1 score: 0.7705





In [21]:
submission = df_test[['qid']].copy()
submission['prediction'] = (test_preds > delta).astype(int)
submission.to_csv('submission.csv', index=False)

In [22]:
!head submission.csv

qid,prediction
00014894849d00ba98a9,0
000156468431f09b3cae,0
000227734433360e1aae,0
0005e06fbe3045bd2a92,0
00068a0f7f41f50fc399,0
000a2d30e3ffd70c070d,0
000b67672ec9622ff761,0
000b7fb1146d712c1105,0
000d665a8ddc426a1907,0
