In [None]:
# Install sacremoses from the local ../input directory; pip's stdout is discarded.
!pip install ../input/sacremoses/sacremoses-master/ > /dev/null
import sys
# Put the bundled transformers source tree first on sys.path so that a later
# `import transformers` resolves to it.
sys.path.insert(0, "../input/transformers/transformers-master/")

In [None]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import torch
import torch.utils.data as D
import tqdm as notebook
import numpy as np
import transformers
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
import keras.backend as K
import time
import gc
import random
import os
import torch
import warnings
import multiprocessing
import numpy as np
import pandas as pd
import tensorflow as tf
from contextlib import contextmanager
from logging import getLogger, Formatter, StreamHandler, FileHandler, INFO
from scipy.stats import spearmanr
from sklearn.model_selection import GroupKFold
from joblib import Parallel, delayed
from tqdm import tqdm, tqdm_notebook
from transformers import BertPreTrainedModel, BertModel, BertTokenizer
import torch.nn as nn
import torch.nn.functional as F
from scipy.special import expit
import tensorflow_hub as hub
from sklearn.preprocessing import OneHotEncoder
import re
import sys
import unicodedata
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
warnings.filterwarnings("ignore")

In [None]:
# --- Run identity and logging -------------------------------------------------
COMPETITION_NAME = 'Google QUEST Q&A Labeling'
MODEL_NAME = 'v001'
LOGFORMAT = '%(asctime)s %(levelname)s %(message)s'
logger = getLogger(COMPETITION_NAME)

# --- Input locations ----------------------------------------------------------
PATH = '../input/google-quest-challenge/'
PRETRAINED = '../input/distilbertbaseuncased'

# --- Cross-validation ---------------------------------------------------------
N_SPLITS = 3
GROUP = 'question_body'  # column whose values define the GroupKFold groups
SEED = 396

# --- Hardware -----------------------------------------------------------------
cpu_count = multiprocessing.cpu_count()
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# --- Label columns, in the order used for targets and predictions -------------
TARGET = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

In [None]:
def get_model_device(model):
    """Return the ``torch.device`` that holds ``model``'s parameters.

    The device is read from the first parameter yielded by
    ``model.parameters()``. A module without any parameters is reported as
    living on the CPU instead of raising ``StopIteration``.

    Args:
        model: a ``torch.nn.Module``.

    Returns:
        torch.device: device of the first parameter, or ``cpu`` when the
        module has no parameters.
    """
    first_param = next(model.parameters(), None)
    if first_param is None:
        return torch.device('cpu')
    # Tensor.device already carries type and index, so no manual
    # "cuda:{index}" formatting is required.
    return first_param.device

class FastTokenIter(D.Dataset):
    """Length-sorted batch iterator over a fully materialised tensor dataset.

    Every item of ``ds`` is fetched once in the constructor and the fields are
    stacked into one tensor per field. Items are ordered by the row sums of
    field 1 (callers in this notebook pass the attention mask there, so the
    sum is the unpadded length) and served in consecutive slices of
    ``batch_size``. Each batch is cut along dim 1 to the largest row sum in
    the batch, capped at ``max_len``.

    Args:
        ds: indexable dataset whose items are tuples of equally shaped tensors.
        max_len: upper bound for the per-batch cut along dim 1.
        batch_size: number of items per batch.
        shuffle: if True the batch order is shuffled once, at construction.
        return_order: if True (or when ``shuffle`` is True) every batch is
            prefixed with the dataset indices of its rows.

    Attributes:
        reorder: permutation that maps concatenated batch outputs (unshuffled)
            back to the original dataset order.
    """
    def __init__(self, ds,max_len=512, batch_size=8,shuffle = False,return_order=False):
        self.ds = ds
        self.max_len = max_len
        self.batch_size = batch_size
        self.num_items = len(ds)
        self.len = int(np.ceil(float(self.num_items) / self.batch_size))
        list_items = [ds[i] for i in notebook.tqdm(range(self.num_items), leave=False)]
        # One stacked tensor per tuple position.
        self.items = [
            torch.stack([item[i] for item in list_items])
            for i in range(len(list_items[0]))
        ]
        self.item_len = self.items[1].sum(1)
        self.item_order = np.argsort(self.item_len.numpy())
        self.reorder = np.argsort(self.item_order)
        self.batch_order = np.arange(self.len)
        self.len_tuple = len(self.items)
        if shuffle:
            # Fixed: the original called the non-existent `np.rand.shuffle`.
            np.random.shuffle(self.batch_order)
        self.return_order = return_order or shuffle
        self.idx = 0

    def __len__(self):
        """Number of batches served per pass."""
        return self.len

    def __iter__(self):
        self.idx = 0
        return self

    def __next__(self):
        if self.idx >= self.len:
            raise StopIteration
        sidx = self.batch_order[self.idx]
        self.idx += 1
        # Dataset indices of the rows that make up this batch.
        rows = self.item_order[sidx * self.batch_size:(1 + sidx) * self.batch_size]
        mlen = min(int(self.item_len[rows].max()), self.max_len)
        ret = tuple(field[rows][:, :mlen] for field in self.items)
        return (rows,) + ret if self.return_order else ret

def fetch_vectors_full(ds,model,batch_size=8,num_workers=8):
    """Encode every item of ``ds`` with ``model`` and return one vector per item.

    Batches come from ``FastTokenIter`` (unshuffled). For each batch the model
    is called with field 0 as ``input_ids`` and field 1 as ``attention_mask``;
    position 0 along dim 1 of the model's first output is kept. The stacked
    rows are put back into the original dataset order before returning.

    Args:
        ds: dataset accepted by ``FastTokenIter``.
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        batch_size: batch size handed to ``FastTokenIter``.
        num_workers: accepted for interface compatibility; not used.

    Returns:
        numpy array with one row per dataset item.
    """
    target_device = get_model_device(model)
    batches = FastTokenIter(ds, batch_size=batch_size, shuffle=False)
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in notebook.tqdm(batches, total=batches.len, leave=False):
            first_output = model(
                input_ids=batch[0].to(target_device),
                attention_mask=batch[1].to(target_device),
            )[0]
            collected.append(first_output[:, 0, :].detach().cpu().numpy())
    stacked = np.vstack(collected)
    # Batches were served in length-sorted order; undo that permutation.
    return stacked[batches.reorder]

class TextDataset(D.Dataset):
    """Tokenise a list of texts into fixed-length id / mask / type tensors.

    Args:
        text_list: sequence of strings.
        tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token_id`` and ``sep_token_id``.
        max_len: length of every returned tensor, special tokens included.
    """
    def __init__(self,text_list,tokenizer,max_len=512):
        self.text_list=text_list
        self.tokenizer = tokenizer
        self.max_len=max_len

    def __len__(self):
        return len(self.text_list)

    def __getitem__(self,idx):
        """Return ``(token_ids, attention_mask, token_type_ids)`` for item ``idx``.

        The text is truncated to ``max_len - 2`` tokens, wrapped in the
        tokenizer's CLS/SEP ids and right-padded with zeros. The mask is 1 on
        real tokens and 0 on padding; token types are all zero.
        """
        tokens = self.tokenizer.tokenize(self.text_list[idx])
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)[:self.max_len-2]
        token_ids = [self.tokenizer.cls_token_id]+token_ids+[self.tokenizer.sep_token_id]
        token_ids_tensor=torch.zeros(self.max_len,dtype=torch.long)
        mask_tensor=torch.zeros(self.max_len,dtype=torch.long)
        token_type_tensor=torch.zeros(self.max_len,dtype=torch.long)
        # Fixed: the ids were previously assigned back onto the Python list and
        # the all-zero token-type tensor was returned in their place.
        token_ids_tensor[:len(token_ids)]=torch.tensor(token_ids,dtype=torch.long)
        mask_tensor[:len(token_ids)]=1
        return (token_ids_tensor,mask_tensor,token_type_tensor)

In [None]:
# Hand-written character translation table (str.maketrans form, i.e. keyed by
# code point). Entries mapped to None are deleted by str.translate; the rest
# are replaced by the given ASCII character.
CUSTOM_TABLE = str.maketrans(
    {
        # Characters that are removed outright.
        "\xad": None,
        "\x7f": None,
        "\ufeff": None,
        "\u200b": None,
        "\u200e": None,
        "\u202a": None,
        "\u202c": None,
        # Quote variants -> ASCII quotes.
        "‘": "'",
        "’": "'",
        "`": "'",
        "“": '"',
        "”": '"',
        "«": '"',
        "»": '"',
        # Look-alike letters -> ASCII capitals.
        "ɢ": "G",
        "ɪ": "I",
        "ɴ": "N",
        "ʀ": "R",
        "ʏ": "Y",
        "ʙ": "B",
        "ʜ": "H",
        "ʟ": "L",
        "ғ": "F",
        "ᴀ": "A",
        "ᴄ": "C",
        "ᴅ": "D",
        "ᴇ": "E",
        "ᴊ": "J",
        "ᴋ": "K",
        "ᴍ": "M",
        "Μ": "M",
        "ᴏ": "O",
        "ᴘ": "P",
        "ᴛ": "T",
        "ᴜ": "U",
        "ᴡ": "W",
        "ᴠ": "V",
        "ĸ": "K",
        "в": "B",
        "м": "M",
        "н": "H",
        "т": "T",
        "ѕ": "S",
        # Dashes -> ASCII hyphen.
        "—": "-",
        "–": "-",
    }
)

# (masked spelling, replacement) pairs. The "*" in the left-hand side is a
# literal asterisk as typed by users, not a wildcard.
WORDS_REPLACER = [
    ("sh*t", "shit"),
    ("s**t", "shit"),
    ("f*ck", "fuck"),
    ("fu*k", "fuck"),
    ("f**k", "fuck"),
    ("f*****g", "fucking"),
    ("f***ing", "fucking"),
    ("f**king", "fucking"),
    ("p*ssy", "pussy"),
    ("p***y", "pussy"),
    ("pu**y", "pussy"),
    ("p*ss", "piss"),
    ("b*tch", "bitch"),
    ("bit*h", "bitch"),
    ("h*ll", "hell"),
    ("h**l", "hell"),
    ("cr*p", "crap"),
    ("d*mn", "damn"),
    ("stu*pid", "stupid"),
    ("st*pid", "stupid"),
    ("n*gger", "nigger"),
    ("n***ga", "nigger"),
    ("f*ggot", "faggot"),
    ("scr*w", "screw"),
    ("pr*ck", "prick"),
    ("g*d", "god"),
    ("s*x", "sex"),
    ("a*s", "ass"),
    ("a**hole", "asshole"),
    ("a***ole", "asshole"),
    ("a**", "ass"),
]

# Compiled, case-insensitive form of WORDS_REPLACER, applied in list order.
# re.escape() replaces the former "\*" string literal, which is an invalid
# escape sequence; for these patterns it yields the same regex source.
REGEX_REPLACER = [
    (re.compile(re.escape(pat), flags=re.IGNORECASE), repl)
    for pat, repl in WORDS_REPLACER
]

# Single whitespace character / run of whitespace characters.
RE_SPACE = re.compile(r"\s")
RE_MULTI_SPACE = re.compile(r"\s+")

# Translation table that deletes every code point whose Unicode general
# category is "Mn" (each such code point maps to None).
NMS_TABLE = {
    codepoint: None
    for codepoint in range(sys.maxunicode + 1)
    if unicodedata.category(chr(codepoint)) == "Mn"
}

# Per-script tables: every code point in the range is collapsed onto a single
# placeholder character of that script. The hexadecimal bounds are inclusive
# endpoints, so the range() stop value is "last + 1"; previously the final
# code point of each range was left out.
HEBREW_TABLE = {i: "א" for i in range(0x0590, 0x05FF + 1)}
ARABIC_TABLE = {i: "ا" for i in range(0x0600, 0x06FF + 1)}
CHINESE_TABLE = {i: "是" for i in range(0x4E00, 0x9FFF + 1)}
KANJI_TABLE = {i: "ッ" for i in range(0x2E80, 0x2FD5 + 1)}
HIRAGANA_TABLE = {i: "ッ" for i in range(0x3041, 0x3096 + 1)}
KATAKANA_TABLE = {i: "ッ" for i in range(0x30A0, 0x30FF + 1)}

# Single translation table used by normalize(): the partial tables are merged
# in the order listed, so a later table wins on any shared key.
TABLE = {}
for _partial_table in (
    CUSTOM_TABLE,
    NMS_TABLE,
    # Non-english languages
    CHINESE_TABLE,
    HEBREW_TABLE,
    ARABIC_TABLE,
    HIRAGANA_TABLE,
    KATAKANA_TABLE,
    KANJI_TABLE,
):
    TABLE.update(_partial_table)
del _partial_table


def fix_tokens(tokens):
    """Expand common English contractions in a stream of lower-cased tokens.

    Yields tokens one by one:
      * "gov't" / "govt"         -> "government"
      * "i'm"                    -> "i", "am"
      * "...n't"                 -> stem, "not"
      * "...'re" / "'ll" / "'ve" -> stem, "are" / "will" / "have"
      * "...'s"                  -> stem, "'s"
      * anything else is passed through unchanged.

    The stem is the token with the suffix removed and may be empty when the
    token consists of the suffix alone.
    """
    three_char_suffixes = (
        ("n't", "not"),
        ("'re", "are"),
        ("'ll", "will"),
        ("'ve", "have"),
    )
    for tok in tokens:
        if tok in ("gov't", "govt"):
            yield "government"
            continue
        if tok == "i'm":
            yield "i"
            yield "am"
            continue
        for suffix, expansion in three_char_suffixes:
            if tok.endswith(suffix):
                yield tok[:-3]
                yield expansion
                break
        else:
            if tok.endswith("'s"):
                yield tok[:-2]
                yield "'s"
            else:
                yield tok


def normalize(text: str) -> str:
    """Clean one text string.

    Steps, in order: turn every whitespace character into a plain space,
    apply NFKD normalisation, translate through ``TABLE``, collapse whitespace
    runs and strip the ends, then apply every ``REGEX_REPLACER`` substitution.
    """
    decomposed = unicodedata.normalize("NFKD", RE_SPACE.sub(" ", text))
    cleaned = RE_MULTI_SPACE.sub(" ", decomposed.translate(TABLE)).strip()
    for regex, replacement in REGEX_REPLACER:
        cleaned = regex.sub(replacement, cleaned)
    return cleaned

In [None]:
# Load the sentence-encoder module from the local input directory via TF Hub.
embed = hub.load("../input/universalsentenceencoderlarge5")
# Reset the Keras backend session state after loading.
K.clear_session()

In [None]:
# Read the competition tables from the directory configured in PATH instead of
# repeating the literal path here.
train = pd.read_csv(PATH + 'train.csv')
test = pd.read_csv(PATH + 'test.csv')

In [None]:
# Clean the three free-text columns of both frames in place with normalize().
for text_column in tqdm(['question_title','question_body','answer']):
    train[text_column] = train[text_column].apply(normalize)
    test[text_column] = test[text_column].apply(normalize)

In [None]:
# Sentence-encoder embeddings of the training texts: each text is embedded in
# its own call, and the Keras session is cleared after every column.
_train_use_vectors = []
for _text_column in ("question_title", "question_body", "answer"):
    _train_use_vectors.append(
        np.vstack([
            embed([text]).numpy()
            for text in tqdm_notebook(train[_text_column].tolist())
        ])
    )
    K.clear_session()
question_title, question_body, answer = _train_use_vectors

In [None]:
# Sentence-encoder embeddings of the test texts: each text is embedded in its
# own call, and the Keras session is cleared after every column.
_test_use_vectors = []
for _text_column in ("question_title", "question_body", "answer"):
    _test_use_vectors.append(
        np.vstack([
            embed([text]).numpy()
            for text in tqdm_notebook(test[_text_column].tolist())
        ])
    )
    K.clear_session()
question_title_, question_body_, answer_ = _test_use_vectors

In [None]:
# Drop the sentence encoder now that all embeddings are computed, then reset
# the Keras session, empty PyTorch's CUDA cache and run the garbage collector.
del embed
K.clear_session()
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Select GPU 0 through numba and close its CUDA context.
# NOTE(review): presumably done to release GPU memory still held after the
# TensorFlow encoder ran — confirm this is still needed.
from numba import cuda
cuda.select_device(0)
cuda.close()

In [None]:
# Select GPU 0 again after the context was closed in the previous cell.
cuda.select_device(0) 

In [None]:
# Load the DistilBERT tokenizer and encoder from the local PRETRAINED directory.
tokenizer = transformers.DistilBertTokenizer.from_pretrained(PRETRAINED)
model = transformers.DistilBertModel.from_pretrained(PRETRAINED)
# Use the configured DEVICE (falls back to CPU when no GPU is available)
# instead of a hard-coded 'cuda:0'.
model.to(DEVICE)

In [None]:
# Diagnostic: print GPU status after the DistilBERT model was loaded.
!nvidia-smi

In [None]:
# Columns that are one-hot encoded, and text columns embedded with DistilBERT.
_CATEGORICAL_COLUMNS = ['category', 'host']
_TEXT_COLUMNS = ['question_title', 'question_body', 'answer']

# Fit the encoder on train + test so every category value is known.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); the dense matrix
# is obtained with .toarray() on the default sparse output, which avoids the
# `sparse=` keyword that newer scikit-learn releases no longer accept.
OHE = OneHotEncoder()
OHE.fit(pd.concat([train[_CATEGORICAL_COLUMNS], test[_CATEGORICAL_COLUMNS]]).values)


def _distilbert_features(frame):
    """Return one fetch_vectors_full() matrix per text column, in _TEXT_COLUMNS order."""
    return [
        fetch_vectors_full(TextDataset(frame[column].to_list(), tokenizer, 512), model, batch_size=8)
        for column in _TEXT_COLUMNS
    ]


# Feature groups, in order: sentence-encoder title/body/answer embeddings,
# one-hot category/host, DistilBERT title/body/answer vectors.
DATA_ = [
    question_title,
    question_body,
    answer,
    OHE.transform(train[_CATEGORICAL_COLUMNS].values).toarray(),
    *_distilbert_features(train),
]
DATA_TEST = [
    question_title_,
    question_body_,
    answer_,
    OHE.transform(test[_CATEGORICAL_COLUMNS].values).toarray(),
    *_distilbert_features(test),
]

In [None]:
# Diagnostic: print GPU status after feature extraction.
!nvidia-smi

In [None]:
# Concatenate all feature groups column-wise and insert a length-1 axis at
# position 1, giving arrays of shape (rows, 1, features).
TRAIN_ = np.hstack(DATA_)[:, np.newaxis, :]
TEST_ = np.hstack(DATA_TEST)[:, np.newaxis, :]
# Label matrix with one column per entry of TARGET.
TARGET_ = train[TARGET].to_numpy()

In [None]:
# Hidden size of each recurrent layer (per direction).
LSTM_UNITS = 1024
# Width of the dense head: two bidirectional layers contribute 2 * LSTM_UNITS each.
DENSE_HIDDEN_UNITS =  4 * LSTM_UNITS
# Number of input features per step, taken from the last axis of TRAIN_.
INPUT_SHAPE = TRAIN_.shape[2]


class MODEL_v001(nn.Module):
    """Two stacked bidirectional GRUs followed by a residual dense head.

    Both recurrent outputs are mean-pooled over dim 1 and concatenated; the
    result goes through Linear -> ReLU -> LayerNorm, is added back to the
    pooled vector with weight 2, and is projected to ``output`` logits.

    Args:
        output: number of output logits.
    """
    def __init__(self, output=30):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # parameter initialisation draws are consumed identically.
        recurrent_options = dict(bidirectional=True, batch_first=True)
        self.lstm1 = nn.GRU(INPUT_SHAPE, LSTM_UNITS, **recurrent_options)
        self.lstm2 = nn.GRU(LSTM_UNITS * 2, LSTM_UNITS, **recurrent_options)
        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linearnorm = nn.LayerNorm(DENSE_HIDDEN_UNITS)
        self.linear_sub_out = nn.Linear(DENSE_HIDDEN_UNITS, output)

    def forward(self, x, lengths=None):
        """Return logits for ``x``; ``lengths`` is accepted but not used."""
        first_seq, _ = self.lstm1(x)
        second_seq, _ = self.lstm2(first_seq)

        pooled = torch.cat((first_seq.mean(dim=1), second_seq.mean(dim=1)), dim=1)
        transformed = self.linearnorm(F.relu(self.linear1(pooled)))
        return self.linear_sub_out(pooled + 2 * transformed)
    
def init_logger():
    """Attach a console handler and a ``<MODEL_NAME>.log`` file handler to ``logger``.

    Both handlers use ``LOGFORMAT`` and the logger level is set to INFO.
    Handlers attached by an earlier call are removed and closed first, so
    calling this again (e.g. when a notebook cell is re-run) does not
    duplicate every log line.
    """
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = Formatter(LOGFORMAT)

    stream_handler = StreamHandler()
    stream_handler.setLevel(INFO)
    stream_handler.setFormatter(formatter)

    file_handler = FileHandler('{}.log'.format(MODEL_NAME))
    file_handler.setFormatter(formatter)

    logger.setLevel(INFO)
    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)


@contextmanager
def timer(name):
    """Context manager that logs how long the wrapped block took.

    On normal exit an INFO record ``[<name>] done in <seconds> s`` is written
    to ``logger``; nothing is logged if the block raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    logger.info(f'[{name}] done in {elapsed:.0f} s')


def seed_everything(seed=1234):
    """Seed Python, NumPy, TensorFlow and PyTorch (CPU and CUDA) with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        tf.compat.v1.set_random_seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


def folds(data):
    """Return the GroupKFold splits of ``data`` as a list of (train_idx, valid_idx).

    Rows sharing the same value in the ``GROUP`` column are kept in the same
    fold; ``N_SPLITS`` folds are produced.
    """
    group_values = data[GROUP]
    splitter = GroupKFold(n_splits=N_SPLITS)
    return list(splitter.split(X=group_values, y=None, groups=group_values))


def _parallel(func, list_):
    """Apply ``func`` to every element of ``list_`` with joblib, using ``cpu_count`` jobs.

    Returns the results as produced by ``joblib.Parallel``; a tqdm bar tracks
    task submission.
    """
    tasks = (delayed(func)(element) for element in tqdm(list_))
    runner = Parallel(n_jobs=cpu_count)
    return runner(tasks)


def evaluation(trues, preds):
    """Mean column-wise Spearman rank correlation between ``trues`` and ``preds``.

    Args:
        trues: 2-D array of reference values, one column per target.
        preds: 2-D array of predictions with the same shape.

    Returns:
        Mean of the per-column correlations. Columns whose correlation is
        undefined (``spearmanr`` returns NaN when a column is constant) are
        left out of the mean instead of turning the whole score into NaN.
        The result is NaN only when every column is undefined.
    """
    rhos = [
        spearmanr(col_trues, col_pred).correlation
        for col_trues, col_pred in zip(trues.T, preds.T)
    ]
    return np.nanmean(rhos)

In [None]:
batch_size = 8 # mini-batch size shared by the train, validation and test DataLoaders
n_epochs = 100 # upper bound on epochs per fold; the fold loop can stop earlier via EarlyStopping

In [None]:
# Unshuffled loader over the test feature tensor; each batch is a 1-tuple.
test_loader = D.DataLoader(
    D.TensorDataset(torch.from_numpy(TEST_)),
    batch_size=batch_size,
    shuffle=False,
    num_workers=cpu_count,
)

In [None]:
# Diagnostic: print GPU status before training starts.
!nvidia-smi

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per epoch with the validation loss and the model.
    Every improvement saves ``model.state_dict()`` to ``path``; after
    ``patience`` consecutive calls without improvement ``early_stop`` is True.
    """
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): File the best model's state_dict is written to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf instead of the np.Inf alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record ``val_loss``; checkpoint ``model`` on improvement, else count towards patience."""
        # Higher score is better, so the loss is negated.
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
def _run_inference(net, loader, n_rows, loss_fn=None):
    """Run ``net`` over ``loader`` in eval mode without building autograd graphs.

    Args:
        net: model to evaluate.
        loader: DataLoader whose batches hold the features at position 0 and,
            when ``loss_fn`` is given, the targets at position 1.
        n_rows: total number of rows the loader yields.
        loss_fn: optional loss; when given, the mean batch loss is returned too.

    Returns:
        (outputs, mean_loss): ``outputs`` is an (n_rows, len(TARGET)) array of
        raw model outputs in loader order; ``mean_loss`` is 0.0 without ``loss_fn``.
    """
    outputs = np.zeros((n_rows, len(TARGET)))
    mean_loss = 0.
    offset = 0
    net.eval()
    with torch.no_grad():
        for batch in loader:
            logits = net(batch[0].float().to(DEVICE))
            if loss_fn is not None:
                targets = batch[1].float().to(DEVICE)
                mean_loss += loss_fn(logits, targets).item() / len(loader)
            outputs[offset:offset + len(logits)] = logits.cpu().numpy()
            offset += len(logits)
    return outputs, mean_loss


seed_everything(SEED)
splits = folds(train)
# matrix for the out-of-fold predictions
train_preds = np.zeros((len(train), len(TARGET)))
test_preds = np.zeros((len(test), len(TARGET)))
score = []
for fold_idx, (train_idx, valid_idx) in enumerate(splits):
    # always call this before training for deterministic results
    seed_everything(SEED)
    model = MODEL_v001().to(DEVICE)
    loss_fn = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters())
    train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(
            torch.from_numpy(TRAIN_[train_idx]), torch.tensor(TARGET_[train_idx])
        ),
        batch_size=batch_size, shuffle=True, num_workers=cpu_count,
    )
    valid_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(
            torch.from_numpy(TRAIN_[valid_idx]), torch.tensor(TARGET_[valid_idx])
        ),
        batch_size=batch_size, shuffle=False, num_workers=cpu_count,
    )
    early_stopping = EarlyStopping(patience=5, verbose=True)
    print(f'Fold {fold_idx + 1}')

    for epoch in range(n_epochs):
        start_time = time.time()
        # train mode enables operations that are only applied during training
        model.train()
        avg_loss = 0.
        for x, y in tqdm_notebook(train_loader):
            # Cast both features and targets to float32 so they match the
            # model's parameter dtype inside the loss.
            x, y = x.float().to(DEVICE), y.float().to(DEVICE)

            y_pred = model(x)
            loss = loss_fn(y_pred, y)

            optimizer.zero_grad()
            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # Calling the step function on an Optimizer makes an update to its parameters
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        _, avg_val_loss = _run_inference(model, valid_loader, len(valid_idx), loss_fn)

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
            epoch + 1, n_epochs, avg_loss, avg_val_loss, elapsed_time))

        early_stopping(avg_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    # Restore the best checkpoint BEFORE computing out-of-fold predictions, so
    # the fold score and the test predictions come from the same weights.
    # Previously the out-of-fold predictions used the last-epoch weights.
    model.load_state_dict(torch.load('checkpoint.pt', map_location=DEVICE))
    valid_preds_fold, best_val_loss = _run_inference(model, valid_loader, len(valid_idx), loss_fn)
    print('Best checkpoint \t val_loss={:.4f}'.format(best_val_loss))
    # predict all samples in the test set batch per batch
    test_preds_fold, _ = _run_inference(model, test_loader, len(test))

    train_preds[valid_idx] = valid_preds_fold
    score.append(evaluation(train[TARGET].iloc[valid_idx].values, train_preds[valid_idx]))
    print(score[-1])
    # The optimizer also references the parameters; drop both before emptying the cache.
    del model, optimizer
    torch.cuda.empty_cache()
    test_preds += test_preds_fold / len(splits)

In [None]:
# Mean of the per-fold validation scores collected in `score`.
np.mean(score)

In [None]:
# Rank the averaged test predictions per target column, scale the ranks by the
# number of test rows, attach qa_id as the last column and write the CSV.
submission = (
    pd.DataFrame(test_preds, columns=TARGET)
    .rank()
    .div(len(test_preds))
    .assign(qa_id=test['qa_id'].values)
)
submission.to_csv("submission.csv", index = False)