In [None]:
import re
import time
import gc
import random
import os
import math

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, precision_recall_fscore_support

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F

In [None]:
def seed_torch(seed=1130, cudnn_deterministic=False):
    """Seed every random number generator this notebook draws from.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus the current CUDA device), and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed shared by all generators.
        cudnn_deterministic: Value written to
            ``torch.backends.cudnn.deterministic``. The default ``False``
            keeps the setting this notebook has always used; pass ``True``
            to request deterministic cuDNN kernels.
    """
    random.seed(seed)
    # Exported for any interpreter started later; the hash seed of the
    # already-running interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = bool(cudnn_deterministic)

# One seed for the whole notebook: it is also reused for the training-set
# shuffle in load_and_prec and as random_state of the StratifiedKFold split.
SEED = 1130
seed_torch(SEED)

In [None]:
embed_size = 300 # dimensionality of each pretrained word vector
max_features = 196534 # vocabulary cap, passed to the Tokenizer as num_words
maxlen = 72 # every question is padded/truncated to this many tokens

log_interval = 250  # not referenced by any cell visible in this notebook
batch_size = 512  # mini-batch size of every DataLoader; also given to the LR schedulers

In [None]:
import psutil
from multiprocessing import Pool

# Both values are read as module-level globals by df_parallelize_run.
num_partitions = 20  # number of row-wise chunks the dataframe is split into
num_cores = psutil.cpu_count()  # worker-process count, as reported by psutil

print('number of cores:', num_cores)
def df_parallelize_run(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    The frame is cut into ``num_partitions`` contiguous chunks, each chunk is
    sent to a pool of ``num_cores`` processes, and the returned frames are
    concatenated in chunk order.

    Args:
        df: DataFrame to process.
        func: Picklable callable taking a DataFrame and returning a DataFrame.

    Returns:
        The concatenation of ``func(chunk)`` over all chunks.
    """
    # Split row positions instead of passing the frame to np.array_split:
    # the chunk sizes are the same, and this avoids the DataFrame.swapaxes
    # path that recent pandas releases warn about.
    positions = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[chunk] for chunk in positions]

    pool = Pool(num_cores)
    try:
        processed = pool.map(func, df_split)
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()
    return pd.concat(processed)

In [None]:
# Characters treated as punctuation / noise. This full list is only used to
# build puncts_str below; the name `puncts` is rebound to a five-item list
# right after. Note that the final entry appears twice.
puncts = ['!', '?', '.', ',', '-', "'", '"', ':', ')', '(', '|', ';', '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', '÷', 'π',
 'है', 'है',           
         ]

# Everything after the first five characters of the joined list, i.e. all
# entries except '!', '?', '.', ',' and '-' (the ones clean_text pads with
# spaces). load_and_prec hands this string to the Tokenizer as `filters`.
puncts_str = ''.join(puncts)[5:]

# The five marks that are kept as separate tokens.
puncts = list('!?.,-')
def clean_text(x):
    """Pad every occurrence of a mark from ``puncts`` with one space on each side.

    ``x`` is converted with ``str`` first, so non-string input is accepted.
    """
    spaced = str(x)
    for mark in puncts:
        # Splitting on the mark and re-joining with the padded mark is the
        # same as replacing each occurrence.
        spaced = f' {mark} '.join(spaced.split(mark))
    return spaced

def clean_quotation(x):
    """Strip possessive endings and expand two common contraction suffixes.

    Applied in this order on ``str(x)``: ``'s`` and ``s'`` are removed,
    ``n't`` becomes `` not`` and ``'ve`` becomes `` have``.
    """
    rewrites = (
        ("'s", ""),
        ("s'", ""),
        ("n't", " not"),
        ("'ve", " have"),
    )
    text = str(x)
    for fragment, substitute in rewrites:
        text = text.replace(fragment, substitute)
    return text

In [None]:
# Replacement table: contractions, British spellings, frequent typos and a few
# brand/coined terms mapped to a normalised form. _get_mispell compiles the
# keys into a single regular expression.
# NOTE(review): preprocess() lowercases the text before this table is applied,
# so keys containing capitals ('Qoura', 'Whta', 'doI', 'theBest', 'Etherium')
# cannot match on that path.
mispell_dict = {
    "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
    "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", 
    "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
    "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center',
    'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
    'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize',
    'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 
    'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many',
    'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation',
    'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 
    'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 
    'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 
    'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization', 
    'quoran': 'quora', 'quorans': 'quora', 'brexit': 'british exit', 'quoras':'quora', 'fortnite': 'video game',
    'pubg': 'video game', 'redmi': 'cell phone', 'jinping': 'chinese president', 'lyft': 'uber', 'θ': 'theta', 'σ': 'sigma'
}

def _get_mispell(mispell_dict):
    """Compile one regular expression that matches any key of ``mispell_dict``.

    Args:
        mispell_dict: Mapping from the text to find to its replacement.

    Returns:
        ``(mispell_dict, pattern)`` where ``pattern`` is a compiled regex with
        a single capturing group; ``match.group(0)`` is always a key of the
        mapping.
    """
    if not mispell_dict:
        # An empty alternation would match the empty string at every
        # position; use a pattern that can never match instead.
        return mispell_dict, re.compile('(?!)')
    # Regex alternation takes the first alternative that matches, so longer
    # keys must come first: otherwise "i'd" shadows "i'd've" and "quoran"
    # shadows "quorans". sorted() is stable, so equal-length keys keep their
    # dictionary order.
    ordered_keys = sorted(mispell_dict, key=len, reverse=True)
    # Keys are literal text, not patterns.
    alternation = '|'.join(re.escape(key) for key in ordered_keys)
    mispell_re = re.compile('(%s)' % alternation)
    return mispell_dict, mispell_re

# Build the lookup table and its regex once, at cell execution time.
mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    """Substitute every match of ``mispellings_re`` in ``text`` with its entry in ``mispellings``."""
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

In [None]:
def preprocess(text):
    """
    Normalise one question string.

    Steps, in order: lowercase, pad the five basic punctuation marks with
    spaces (clean_text), apply the replacement table
    (replace_typical_misspell), then strip/expand apostrophe suffixes
    (clean_quotation).
    """
    lowered = text.lower()
    padded = clean_text(lowered)
    respelled = replace_typical_misspell(padded)
    return clean_quotation(respelled)

def text_clean_wrapper(df):
    """Run ``preprocess`` over the ``question_text`` column of ``df``.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    cleaned_questions = df["question_text"].apply(preprocess)
    df["question_text"] = cleaned_questions
    return df

In [None]:
def load_and_prec():
    """Load the CSVs, clean and tokenize the questions, and shuffle the training set.

    Reads ``../input/train.csv`` and ``../input/test.csv``, cleans the
    ``question_text`` column in parallel, fits a Keras ``Tokenizer`` on the
    training questions only, converts both sets to padded id sequences of
    length ``maxlen`` and shuffles the training rows with a permutation
    seeded by ``SEED``.

    Returns:
        ``(train_X, test_X, train_y, word_index)``: the padded training and
        test sequences, the (shuffled) training targets, and the tokenizer's
        word-to-id mapping.
    """
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)

    ## fill up the missing values
    # This has to happen before cleaning: preprocess() calls .lower() on every
    # cell and would raise on a NaN. The placeholder contains nothing the
    # cleaning steps rewrite, so it reaches the tokenizer unchanged.
    train_df["question_text"] = train_df["question_text"].fillna("_##_")
    test_df["question_text"] = test_df["question_text"].fillna("_##_")

    train_df = df_parallelize_run(train_df, text_clean_wrapper)
    test_df = df_parallelize_run(test_df, text_clean_wrapper)

    train_X = train_df["question_text"].values
    test_X = test_df["question_text"].values

    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features, filters=puncts_str)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values
    
    #shuffling the data
    np.random.seed(SEED)
    trn_idx = np.random.permutation(len(train_X))

    train_X = train_X[trn_idx]
    train_y = train_y[trn_idx]
    
    return train_X, test_X, train_y, tokenizer.word_index

In [None]:
def _embedding_rows(word_index):
    """Number of embedding rows needed for the ids the tokenizer can emit.

    Keras ``Tokenizer`` ids start at 1 (0 is the padding id) and only ids
    below ``num_words`` (here ``max_features``) are emitted, so the largest
    id that can appear is ``min(max_features - 1, len(word_index))``.
    """
    return min(max_features, len(word_index) + 1)


def _read_embedding_file(path, word_index, emb_mean, emb_std, n_rows, dim, **open_kwargs):
    """Build an ``(n_rows, dim)`` matrix from a text embedding file.

    The matrix starts as draws from ``Normal(emb_mean, emb_std)``; every line
    of the file whose first token is a key of ``word_index`` with id below
    ``n_rows`` and that carries at least ``dim`` values overwrites the row at
    that id. Lines that do not qualify (including a ``count dim`` header) are
    skipped.

    Args:
        path: Embedding file, one ``word v1 v2 ...`` entry per line.
        word_index: Mapping from word to integer id.
        emb_mean, emb_std: Parameters of the random initialisation.
        n_rows: Number of rows of the returned matrix.
        dim: Number of values kept per word.
        **open_kwargs: Extra keyword arguments for ``open`` (e.g. ``errors``).
    """
    embedding_matrix = np.random.normal(emb_mean, emb_std, (n_rows, dim))
    with open(path, 'r', encoding="utf8", **open_kwargs) as f:
        for line in f:
            word, _, vec = line.partition(' ')
            i = word_index.get(word)
            if i is None or i >= n_rows:
                continue
            # split() without an argument drops the trailing newline and any
            # repeated or trailing spaces before the float conversion.
            embedding_vector = np.asarray(vec.split(), dtype='float32')[:dim]
            if len(embedding_vector) == dim:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix


def load_glove(word_index):
    """Embedding matrix for ``word_index`` from the GloVe 840B 300d vectors."""
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    emb_mean, emb_std = -0.005838499, 0.48782197

    return _read_embedding_file(EMBEDDING_FILE, word_index, emb_mean, emb_std,
                                _embedding_rows(word_index), embed_size)

def load_para(word_index):
    """Embedding matrix for ``word_index`` from the paragram_300_sl999 vectors.

    The file is opened with ``errors='ignore'`` so undecodable bytes are dropped.
    """
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    emb_mean, emb_std = -0.0053247833, 0.49346462

    return _read_embedding_file(EMBEDDING_FILE, word_index, emb_mean, emb_std,
                                _embedding_rows(word_index), embed_size,
                                errors='ignore')

def load_fasttext(word_index):
    """Embedding matrix for ``word_index`` from the wiki-news-300d-1M vectors."""
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    emb_mean, emb_std = -0.0033469985, 0.109855495

    return _read_embedding_file(EMBEDDING_FILE, word_index, emb_mean, emb_std,
                                _embedding_rows(word_index), embed_size)

In [None]:
from tqdm import tqdm
tqdm.pandas()

# Time the whole preparation stage: text cleaning, tokenization and the three
# embedding loads.
start_time = time.time()

train_X, test_X, train_y, word_index = load_and_prec()
embedding_matrix_1 = load_glove(word_index)      # GloVe
embedding_matrix_2 = load_para(word_index)       # paragram
embedding_matrix_3 = load_fasttext(word_index)   # fastText wiki-news

total_time = (time.time() - start_time) / 60
print("Took {:.2f} minutes".format(total_time))


# Earlier experiments that merged the matrices here; the per-model merge is
# now done where the models are built in the training loop.
# embedding_matrix = np.concatenate((embedding_matrix_1, embedding_matrix_2), axis=1)
# embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_2], axis=0)
print(np.shape(embedding_matrix_1))

In [None]:
class RNNModel(nn.Module):
    """Bidirectional, batch-first recurrent layer chosen by name from ``torch.nn``.

    Args:
        rnn_type: Attribute name looked up on ``torch.nn`` (the notebook uses
            ``"LSTM"`` and ``"GRU"``).
        input_size: Feature size of each input step.
        hidden_size: Hidden size per direction.
    """

    def __init__(self, rnn_type, input_size, hidden_size):
        super().__init__()
        layer_cls = getattr(nn, rnn_type)
        self.rnn = layer_cls(input_size, hidden_size, bidirectional=True, batch_first=True)

    def init_weights(self):
        """Re-initialise the recurrent layer in place.

        Input-to-hidden weights get Xavier-uniform values, hidden-to-hidden
        weights get orthogonal values and every bias is set to zero, in that
        order.
        """
        def tensors_named(tag):
            # Raw tensors of all parameters whose name contains `tag`.
            return [param.data for name, param in self.named_parameters() if tag in name]

        for weight in tensors_named('weight_ih'):
            nn.init.xavier_uniform_(weight)
        for weight in tensors_named('weight_hh'):
            nn.init.orthogonal_(weight)
        for bias in tensors_named('bias'):
            nn.init.constant_(bias, 0)

    def forward(self, x):
        """Return whatever the wrapped recurrent layer returns for ``x``."""
        return self.rnn(x)
    
class LSTM_TextCNN(nn.Module):
    """Frozen embeddings -> BiLSTM -> four parallel 1-D convolutions -> one logit.

    The convolutions use kernel widths 1 to 4 over the BiLSTM output; each
    branch is ReLU-activated, batch-normalised and max-pooled over time, and
    the four pooled vectors are concatenated and passed through a small
    dense head.

    Args:
        hidden_size: Hidden size per LSTM direction and channel count of each
            convolution.
        embedding_matrix: 2-D array of pretrained vectors, one row per token
            id. The lookup table is sized from this array.
        embed_size: Width of ``embedding_matrix``; also the LSTM input size.
        embedding_dropout: Probability of zeroing an embedding channel.
        initialization: When true, re-initialise the output layer.

    Raises:
        ValueError: If ``embedding_matrix`` is not 2-D with ``embed_size`` columns.
    """
    def __init__(self, hidden_size, embedding_matrix, embed_size, embedding_dropout=0.5, initialization=True):
        super(LSTM_TextCNN, self).__init__()

        weights = torch.tensor(embedding_matrix, dtype=torch.float32)
        if weights.dim() != 2 or weights.size(1) != embed_size:
            raise ValueError(
                'embedding_matrix must be 2-D with {} columns, got shape {}'.format(
                    embed_size, tuple(weights.shape)))
        # Size the table from the matrix itself rather than from a notebook
        # global. The weights are frozen and replaced wholesale, so row 0 is
        # whatever the matrix holds.
        self.embedding = nn.Embedding(weights.size(0), embed_size, padding_idx=0)
        self.embedding.weight = nn.Parameter(weights, requires_grad=False)

        self.embedding_dropout = nn.Dropout2d(embedding_dropout)

        self.lstm = RNNModel("LSTM", embed_size, hidden_size)
        self.lstm.init_weights()

        self.conv1 = nn.Conv1d(hidden_size*2, hidden_size, 1, bias=False)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.conv2 = nn.Conv1d(hidden_size*2, hidden_size, 2, bias=False)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.conv3 = nn.Conv1d(hidden_size*2, hidden_size, 3, bias=False)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        self.conv4 = nn.Conv1d(hidden_size*2, hidden_size, 4, bias=False)
        self.bn4 = nn.BatchNorm1d(hidden_size)
        self.relu = nn.ReLU(inplace=True)

        self.merge = nn.Linear(hidden_size*4, hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.bn_merge = nn.BatchNorm1d(hidden_size)
        self.out = nn.Linear(hidden_size, 1)

        if initialization:
            self.init_weights()

    def init_weights(self):
        """Zero the output bias and draw the output weights with Kaiming-normal."""
        self.out.bias.data.fill_(0)
        nn.init.kaiming_normal_(self.out.weight)

    def _spatial_dropout(self, embed):
        """Drop whole embedding channels of a ``(batch, seq, embed)`` tensor.

        ``Dropout2d`` zeroes entire channels, i.e. dim 1 of an ``(N, C, H, W)``
        input, so the embedding axis is moved to dim 1 first. (Unsqueezing at
        dim 0 instead would make the batch axis the channel axis and blank
        out whole samples.)
        """
        channels_first = embed.permute(0, 2, 1).unsqueeze(-1)  # (batch, embed, seq, 1)
        dropped = self.embedding_dropout(channels_first)
        return dropped.squeeze(-1).permute(0, 2, 1).contiguous()

    def forward(self, x):
        """Return one logit per row of ``x``.

        Args:
            x: Long tensor of token ids, presumably ``(batch, seq_len)`` as
                produced by ``pad_sequences`` (confirm against the caller).

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        embed = self.embedding(x)
        # Feed the dropped-out embeddings to the LSTM (previously the dropout
        # result was computed and then discarded).
        embed = self._spatial_dropout(embed)
        out, _ = self.lstm(embed)

        # Conv1d expects (batch, channels, seq).
        out = out.transpose(1, 2)

        out1 = self.bn1(self.relu(self.conv1(out)))
        out1 = F.adaptive_max_pool1d(out1, 1)

        out2 = self.bn2(self.relu(self.conv2(out)))
        out2 = F.adaptive_max_pool1d(out2, 1)

        out3 = self.bn3(self.relu(self.conv3(out)))
        out3 = F.adaptive_max_pool1d(out3, 1)

        out4 = self.bn4(self.relu(self.conv4(out)))
        out4 = F.adaptive_max_pool1d(out4, 1)

        # Remove only the pooled length-1 axis so a batch of one stays 2-D.
        conc = torch.cat((out1, out2, out3, out4), 1).squeeze(-1)
        conc = self.relu(self.merge(conc))
        conc = self.bn_merge(self.dropout(conc))
        out = self.out(conc)

        return out
    
class LSTM_GRU(nn.Module):
    """Frozen embeddings -> BiLSTM -> BiGRU -> mean/max pooling -> one logit.

    Args:
        hidden_size: Hidden size per direction of both recurrent layers.
        embedding_matrix: 2-D array of pretrained vectors, one row per token
            id. The lookup table is sized from this array.
        embed_size: Width of ``embedding_matrix``; also the LSTM input size.
        embedding_dropout: Probability of zeroing an embedding channel.

    Raises:
        ValueError: If ``embedding_matrix`` is not 2-D with ``embed_size`` columns.
    """
    def __init__(self, hidden_size, embedding_matrix, embed_size, embedding_dropout=0.1):
        super(LSTM_GRU, self).__init__()

        self.hidden_size = hidden_size
        self.embedding_matrix = embedding_matrix

        weights = torch.tensor(embedding_matrix, dtype=torch.float32)
        if weights.dim() != 2 or weights.size(1) != embed_size:
            raise ValueError(
                'embedding_matrix must be 2-D with {} columns, got shape {}'.format(
                    embed_size, tuple(weights.shape)))
        # Size the table from the matrix itself rather than from a notebook global.
        self.embedding = nn.Embedding(weights.size(0), embed_size)
        self.embedding.weight = nn.Parameter(weights, requires_grad=False)

        self.embedding_dropout = nn.Dropout2d(embedding_dropout)
        self.lstm = RNNModel("LSTM", embed_size, hidden_size)
        self.lstm.init_weights()
        self.gru = RNNModel("GRU", hidden_size*2, hidden_size)
        self.gru.init_weights()

        self.linear = nn.Linear(4*hidden_size, 16)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(0.0)
        self.bn = nn.BatchNorm1d(16)
        self.out = nn.Linear(16, 1)

    def _spatial_dropout(self, embed):
        """Drop whole embedding channels of a ``(batch, seq, embed)`` tensor.

        ``Dropout2d`` zeroes entire channels, i.e. dim 1 of an ``(N, C, H, W)``
        input, so the embedding axis is moved to dim 1 first. Unsqueezing at
        dim 0 would make the batch axis the channel axis (blanking out whole
        samples), and the matching bare ``squeeze`` would also collapse a
        batch of one.
        """
        channels_first = embed.permute(0, 2, 1).unsqueeze(-1)  # (batch, embed, seq, 1)
        dropped = self.embedding_dropout(channels_first)
        return dropped.squeeze(-1).permute(0, 2, 1).contiguous()

    def forward(self, x):
        """Return one logit per row of ``x``.

        Args:
            x: Long tensor of token ids, presumably ``(batch, seq_len)`` as
                produced by ``pad_sequences`` (confirm against the caller).

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        h_embedding = self.embedding(x)
        h_embedding = self._spatial_dropout(h_embedding)

        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)

        # Pool over the time axis (dim 1 of the batch-first GRU output).
        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)

        conc = torch.cat((avg_pool, max_pool), 1)
        conc = self.relu(self.linear(conc))
        conc = self.bn(self.dropout(conc))
        out = self.out(conc)

        return out

In [None]:
# Four stratified, shuffled folds, materialised once as a list of
# (train_idx, valid_idx) pairs so every later cell iterates the same split.
splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED).split(train_X, train_y))

In [None]:
def sigmoid(x):
    """Logistic function ``1 / (1 + e**-x)``, element-wise for NumPy input."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
def threshold_search_submit(y_proba, ratio):
    """Find the cut-off whose predicted-positive rate is closest to ``ratio``.

    Thresholds 0.000, 0.001, ..., 0.999 are tried in increasing order; on a
    tie the smallest threshold wins.

    Args:
        y_proba: Array of predicted probabilities.
        ratio: Target share of predictions that should exceed the threshold.

    Returns:
        Dict with keys ``'threshold'``, ``'dist'`` (absolute gap between the
        achieved share and ``ratio``) and ``'ratio'``.
    """
    best_threshold = 0
    # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
    best_score = np.inf
    for threshold in tqdm([i * 0.001 for i in range(1000)]):
        score = np.abs((y_proba > threshold).mean() - ratio)
        if score < best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'dist': best_score, 'ratio': ratio}
    return search_result

import matplotlib.pyplot as plt

def threshold_search(y_proba, y_true):
    """Find the F1-maximising cut-off on a 0.01 grid and plot the F1 curve.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in increasing order; on a tie
    the smallest threshold wins.

    Args:
        y_proba: Array of predicted probabilities.
        y_true: Array of binary ground-truth labels.

    Returns:
        Dict with keys ``'threshold'`` and ``'f1'``.
    """
    # Built once and shared by the search and the plot.
    thresholds = [i * 0.01 for i in range(100)]
    best_threshold = 0
    best_score = 0
    scores = []
    for threshold in tqdm(thresholds):
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold)
        scores.append(score)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    
    plt.figure(figsize=(12,9))
    plt.plot(thresholds, scores)
    plt.plot(best_threshold, best_score, "xr", label="Best threshold")
    plt.xlabel("threshold")
    plt.ylabel("F1")
    # Without a legend the label on the marker above is never shown.
    plt.legend()
    plt.show()
    return search_result

In [None]:
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler

class CosineLRWithRestarts():
    """Cosine-annealed learning rate with warm restarts and normalised weight decay.

    On every call to ``batch_step`` the learning rate and weight decay of each
    parameter group are rewritten from their initial values, scaled by a
    cosine factor of the position inside the current restart period.
    https://arxiv.org/abs/1711.05101

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        batch_size: minibatch size
        epoch_size: training samples per epoch
        restart_period: epoch count in the first restart period
        t_mult: factor the restart period is multiplied by at each restart
        last_epoch: -1 for a fresh run; otherwise every param group must
            already carry ``initial_lr``
        eta_threshold: number of restarts after which the eta range shrinks
        verbose: print a message at each restart

    Example:
        >>> scheduler = CosineLRWithRestarts(optimizer, 32, 1024, restart_period=5, t_mult=1.2)
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>         ...
        >>>         optimizer.zero_grad()
        >>>         loss.backward()
        >>>         optimizer.step()
        >>>         scheduler.batch_step()
        >>>     validate(...)
    """

    def __init__(self, optimizer, batch_size, epoch_size, restart_period=100,
                 t_mult=2, last_epoch=-1, eta_threshold=1000, verbose=False):
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        groups = optimizer.param_groups
        if last_epoch == -1:
            # Fresh run: remember the starting learning rate of every group.
            for group in groups:
                group.setdefault('initial_lr', group['lr'])
        else:
            # Resuming: the starting learning rates must already be recorded.
            for position, group in enumerate(groups):
                if 'initial_lr' not in group:
                    raise KeyError("param 'initial_lr' is not specified "
                                   "in param_groups[{}] when resuming an"
                                   " optimizer".format(position))
        self.base_lrs = [group['initial_lr'] for group in groups]
        self.base_weight_decays = [group['weight_decay'] for group in groups]

        self.last_epoch = last_epoch
        self.batch_size = batch_size
        self.epoch_size = epoch_size
        self.eta_threshold = eta_threshold
        self.t_mult = t_mult
        self.verbose = verbose
        self.restart_period = restart_period
        self.restarts = 0
        self.t_epoch = -1

    def _schedule_eta(self):
        """
        Return ``(eta_min, eta_max)``: ``(0, 1)`` until the restart count
        exceeds ``eta_threshold``, then narrowed by 0.09 per extra restart
        at both ends.
        """
        surplus = self.restarts - self.eta_threshold
        if surplus <= 0:
            return 0, 1
        shrink = surplus * 0.09
        return (0 + shrink, 1 - shrink)

    def get_lr(self, t_cur):
        """Return an iterator of ``(lr, weight_decay)`` pairs for position ``t_cur``.

        Also performs the restart bookkeeping once ``t_epoch`` has reached
        the restart period; the values returned by that call are still
        computed with the old period.
        """
        eta_min, eta_max = self._schedule_eta()

        cosine = math.cos(math.pi * (t_cur / self.restart_period))
        eta_t = (eta_min + 0.5 * (eta_max - eta_min) * (1. + cosine))

        samples_in_period = self.epoch_size * self.restart_period
        weight_decay_norm_multi = math.sqrt(self.batch_size / samples_in_period)

        lrs = [initial * eta_t for initial in self.base_lrs]
        weight_decays = [initial * eta_t * weight_decay_norm_multi
                         for initial in self.base_weight_decays]

        period_elapsed = self.t_epoch % self.restart_period < self.t_epoch
        if period_elapsed:
            if self.verbose:
                print("Restart at epoch {}".format(self.last_epoch))
            self.restart_period *= self.t_mult
            self.restarts += 1
            self.t_epoch = 0

        return zip(lrs, weight_decays)

    def _set_batch_size(self):
        """Prepare the per-batch fractions of an epoch consumed by ``batch_step``."""
        full_batches, leftover = divmod(self.epoch_size, self.batch_size)
        # One value for the call made by step() plus one per mini-batch.
        batches_in_epoch = full_batches + (2 if leftover > 0 else 1)
        self.batch_increment = iter(torch.linspace(0, 1, batches_in_epoch))

    def step(self):
        """Advance to the next epoch and apply its first learning-rate value."""
        self.last_epoch += 1
        self.t_epoch += 1
        self._set_batch_size()
        self.batch_step()

    def batch_step(self):
        """Write the next learning rate and weight decay into every param group."""
        t_cur = self.t_epoch + next(self.batch_increment)
        updates = self.get_lr(t_cur)
        for group, (lr, weight_decay) in zip(self.optimizer.param_groups, updates):
            group['lr'] = lr
            group['weight_decay'] = weight_decay

In [None]:
def f1_loss(logits, labels):
    """Soft F1 loss: one minus the mean per-row soft F1 score.

    Sigmoid probabilities stand in for hard predictions. All sums run over
    dim 1, so each row is scored on its own and the row scores are averaged;
    with a single-column input every sample therefore forms its own row.

    Args:
        logits: 2-D tensor of raw scores.
        labels: Tensor of the same shape holding 0/1 targets.

    Returns:
        Scalar tensor ``1 - mean(row F1)``.
    """
    eps = 1e-6
    beta_sq = 1 * 1
    n_rows = logits.size()[0]

    probs = torch.sigmoid(logits)
    prob_mass = torch.sum(probs, 1) + eps
    label_mass = torch.sum(labels, 1) + eps
    overlap = torch.sum(labels * probs, 1)

    precision = overlap / prob_mass
    recall = overlap / label_mass
    row_f = (1 + beta_sq) * precision * recall / (beta_sq * precision + recall + eps)
    return 1 - row_f.sum() / n_rows

class FocalLoss(nn.Module):
    """Binary focal loss on logits.

    The element-wise binary cross-entropy is scaled by
    ``exp(gamma * logsigmoid(-logit * (2 * target - 1)))``. A 2-D result is
    summed over dim 1 before the mean is taken.
    """

    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma

    def forward(self, logit, target):
        # Cross-entropy on logits, written with the max-shift so the
        # exponentials stay bounded.
        shift = (-logit).clamp(min=0)
        log_term = ((-shift).exp() + (-logit - shift).exp()).log()
        cross_entropy = logit - logit * target + shift + log_term

        signed_logit = -logit * (target * 2.0 - 1.0)
        modulator = (F.logsigmoid(signed_logit) * self.gamma).exp()
        focal = modulator * cross_entropy

        if focal.dim() == 2:
            focal = focal.sum(dim=1)
        return focal.mean()

def lovasz_grad(gt_sorted):
    """
    Gradient of the Lovasz extension with respect to sorted errors
    (Alg. 1 in the paper).

    ``gt_sorted`` holds the ground-truth labels in the order of the sorted
    errors; the result is the first difference of the running Jaccard loss.
    """
    count = len(gt_sorted)
    positives = gt_sorted.sum()
    remaining_hits = positives - gt_sorted.float().cumsum(0)
    running_union = positives + (1 - gt_sorted).float().cumsum(0)
    jaccard = 1. - remaining_hits / running_union
    if count > 1: # a single element has nothing to difference against
        steps = jaccard[1:count] - jaccard[0:-1]
        jaccard = torch.cat((jaccard[:1], steps))
    return jaccard

def lovasz_hinge_flat(logits, labels):
    """
    Binary Lovasz hinge loss, using ``elu(error) + 1`` in place of
    ``relu(error)``.

      logits: tensor of raw scores (any real value); flattened to [P]
      labels: tensor of binary ground-truth labels (0 or 1); flattened to [P]

    Returns a scalar tensor. With no elements the result is a zero that is
    still connected to ``logits``.
    """
    # Flatten instead of squeeze(): squeeze() turns a single element into a
    # 0-d tensor, on which len() raises.
    logits = logits.reshape(-1)
    labels = labels.reshape(-1)

    if len(labels) == 0:
        # nothing to score, the gradients should be 0
        return logits.sum() * 0.
    signs = 2. * labels.float() - 1.
    # Plain tensors throughout: the old code wrapped these in Variable, which
    # was never imported (NameError on the first call) and has been
    # unnecessary since tensors and Variables were merged.
    errors = 1. - logits * signs
    errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
    gt_sorted = labels[perm]
    grad = lovasz_grad(gt_sorted)
    # loss = torch.dot(F.relu(errors_sorted), grad)
    loss = torch.dot(F.elu(errors_sorted) + 1, grad)
    return loss

In [None]:
train_epochs = 4  # epochs trained per fold
# Out-of-fold predictions for the training rows, and the test predictions
# averaged over folds; both are filled by the training loop.
train_preds = np.zeros((len(train_X)))
test_preds = np.zeros((len(test_X)))

# The whole test set is moved to the GPU once and reused by every fold
# (requires CUDA).
x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

In [None]:
def _ensemble_predict(models, loader, n_samples):
    """Average the sigmoid outputs of ``models`` over every batch of ``loader``.

    The first tensor of each batch is used as model input. Runs under
    ``torch.no_grad()`` so no autograd graph is kept during inference; the
    caller is responsible for putting the models in eval mode.
    """
    preds = np.zeros(n_samples)
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            x_batch = batch[0]
            probs = [sigmoid(model(x_batch).cpu().numpy())[:, 0] for model in models]
            start = batch_idx * batch_size
            preds[start:start + len(x_batch)] = sum(probs) / len(probs)
    return preds


# Train a fresh pair of models on every fold; collect out-of-fold predictions
# for the training rows and fold-averaged predictions for the test rows.
for fold, (train_idx, valid_idx) in enumerate(splits):
    x_train_fold = torch.tensor(train_X[train_idx], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32).cuda()
    x_val_fold = torch.tensor(train_X[valid_idx], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32).cuda()
    
    # model1 sees GloVe and fastText side by side (600-d); model2 sees the
    # element-wise mean of GloVe and paragram (300-d).
    model1 = LSTM_TextCNN(64, np.concatenate((embedding_matrix_1, embedding_matrix_3), axis=1), embed_size*2, embedding_dropout=0.5)
    model2 = LSTM_GRU(64, np.mean([embedding_matrix_1, embedding_matrix_2], axis=0), embed_size, embedding_dropout=0.1)
    
    model1.cuda()
    model2.cuda()
    
    loss_fn1 = torch.nn.BCEWithLogitsLoss()
    loss_fn2 = f1_loss
    
    optimizer1 = torch.optim.Adam(model1.parameters(), lr=0.0035)
    scheduler1 = CosineLRWithRestarts(optimizer1, batch_size, len(x_train_fold), restart_period=4, t_mult=1, verbose=True)
    optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.0035)
    scheduler2 = CosineLRWithRestarts(optimizer2, batch_size, len(x_train_fold), restart_period=4, t_mult=1, verbose=True)
    
    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    
    print(f'Fold {fold + 1}')
    
    for epoch in range(train_epochs):
        start_time = time.time()
        scheduler1.step()
        scheduler2.step()
        
        model1.train()
        model2.train()
        
        for x_batch, y_batch in train_loader:
            y_pred1 = model1(x_batch)
            y_pred2 = model2(x_batch)
            
            # Each model is trained on binary cross-entropy plus the soft F1 loss.
            loss1 = loss_fn1(y_pred1, y_batch) + loss_fn2(y_pred1, y_batch)
            loss2 = loss_fn1(y_pred2, y_batch) + loss_fn2(y_pred2, y_batch)
            
            optimizer1.zero_grad()
            optimizer2.zero_grad()
            
            loss1.backward()
            loss2.backward()
            
            optimizer1.step()
            optimizer2.step()
            
            scheduler1.batch_step()
            scheduler2.batch_step()
            
        model1.eval()
        model2.eval()
        
        valid_preds_fold = _ensemble_predict((model1, model2), valid_loader, x_val_fold.size(0))
        
        search_result = threshold_search(valid_preds_fold, train_y[valid_idx])
        valid_yhat_fold = (valid_preds_fold > search_result['threshold']).astype(int)
        valid_precision, valid_recall, valid_fscore, _ = precision_recall_fscore_support(train_y[valid_idx], valid_yhat_fold, average='binary')
        
        elapsed_time = time.time() - start_time 
        print('Epoch {}/{} | val_precision={:.5f} | val_recall={:.5f} | val_f1={:.5f} | threshold={:.2f} | time={:.2f}s'.format(
            epoch, train_epochs - 1, valid_precision, valid_recall, valid_fscore, search_result['threshold'], elapsed_time))
        
    # Test predictions come from the models as they are after the last epoch.
    model1.eval()
    model2.eval()
    test_preds_fold = _ensemble_predict((model1, model2), test_loader, len(test_X))
    
    train_preds[valid_idx] = valid_preds_fold
    test_preds += test_preds_fold / len(splits)

In [None]:
# Pick the F1-maximising threshold on the stitched out-of-fold predictions and
# report precision / recall / F1 at that threshold.
search_result = threshold_search(train_preds, train_y)
train_yhat = (train_preds > search_result['threshold']).astype(int)
train_precision, train_recall, train_fscore, _ = precision_recall_fscore_support(train_y, train_yhat, average='binary')
print('oof_precision={:.5f} | oof_recall={:.5f} | oof_f1={:.5f} | threshold={:.2f}'.format(train_precision, train_recall, train_fscore, search_result['threshold']))

In [None]:
# Target positive rate: the average of the out-of-fold positive rate and the
# test positive rate, both taken at the out-of-fold threshold.
ratio = (train_yhat.mean() + (test_preds > search_result['threshold']).mean()) / 2

# Re-pick the test threshold on a 0.001 grid so that the share of predicted
# positives is as close as possible to `ratio`. This overwrites search_result.
search_result = threshold_search_submit(test_preds, ratio)
print(search_result)

In [None]:
sub = pd.read_csv('../input/sample_submission.csv')
# Write 0/1 integers rather than True/False, matching how the validation and
# out-of-fold predictions are cast. Item assignment sets the column whether or
# not it already exists; attribute assignment would not create it.
sub["prediction"] = (test_preds > search_result['threshold']).astype(int)
sub.to_csv("submission.csv", index=False)