In [None]:
import numpy as np
import os
import time
import random
import pickle
import pandas as pd

from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch.nn import functional as F

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy and PyTorch (including the CUDA
    generator), exports ``PYTHONHASHSEED`` and asks cuDNN to use
    deterministic algorithms.

    Args:
        seed: Integer seed shared by all generators (default ``1234``).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Host-side generators share the same one-argument signature.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    # PyTorch generators, then force deterministic cuDNN kernels.
    for torch_seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        torch_seed_fn(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

# Location of the pretrained LSTM weights loaded into NeuralNet further down.
MODEL_WEIGHT_PATH =  '../input/jigsaw-toxicity/lstm_weights.pt'
NUM_MODELS = 2
# Hidden size of each LSTM direction; the pooled feature vector is 4x this
# (max-pool + mean-pool over a bidirectional output).
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
# Every comment is padded/truncated to this many tokens.
MAX_LEN = 220
# Number of auxiliary outputs of the pretrained model (passed as
# ``num_aux_targets``); together with the main output that gives 7 logits.
CLASSES_NUM = 6

sigmoid = nn.Sigmoid()

# NOTE(review): pickle can execute arbitrary code while loading, so these
# files must come from a trusted source.
# Context managers make sure both file handles are closed after loading.
with open('../input/jigsaw-toxicity/embedding_matrix.pickle', 'rb') as embedding_file:
    embedding_matrix = pickle.load(embedding_file)
with open('../input/jigsaw-toxicity/tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# Every character listed here is deleted from a comment during cleaning.
# The backslash before the multiplication sign is written as an explicit
# escape ('\\'); a bare '\×' is an invalid escape sequence that newer Python
# versions warn about, while the resulting character set is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

# Deletion table built once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', punct)

# Substitutions applied, in this order, before punctuation is stripped.
_REPLACEMENTS = (
    ('\n', ' '),
    ('-', ' '),
    ('?', ' question '),
    ('!', ' exclamation '),
)


def clean_special_chars(text):
    """Normalise one comment string for tokenisation.

    Newlines and hyphens become spaces, ``?`` and ``!`` are spelled out as
    the words ``question`` / ``exclamation``, the text is lower-cased and
    every character in ``punct`` is removed.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    for old, new in _REPLACEMENTS:
        text = text.replace(old, new)
    return text.lower().translate(_PUNCT_TABLE)


def preprocess(article):
    """Clean a single comment.

    Thin wrapper that forwards ``article`` to ``clean_special_chars`` and
    returns whatever that function produces.
    """
    cleaned_article = clean_special_chars(article)
    return cleaned_article


def preprocess_full_dataset(data):
    """Apply ``preprocess`` to every item of ``data``.

    Args:
        data: An iterable of raw comments.

    Returns:
        A list with one cleaned comment per input item, in input order.
    """
    return [preprocess(entry) for entry in data]

In [None]:
class SpatialDropout(nn.Dropout2d):
    """Channel-wise dropout for sequence tensors.

    Reshapes the input so that ``nn.Dropout2d`` sees the last axis as the
    channel axis, which makes it mask whole feature channels across all
    time steps rather than individual elements.
    """

    def forward(self, x):
        """Apply the dropout; the output has the same shape as ``x``.

        Shape comments follow the (N, T, K) convention for ``x``.
        """
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T)
        channels_first = x.unsqueeze(2).permute(0, 3, 2, 1)
        # Dropout2d zeroes entire (N, K) channels while training.
        masked = super().forward(channels_first)
        # (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K)
        return masked.permute(0, 3, 2, 1).squeeze(2)

class NeuralNet(nn.Module):
    """Two-layer bidirectional LSTM classifier over frozen word embeddings.

    The forward pass returns one main logit followed by ``num_aux_targets``
    auxiliary logits for every input sequence.

    Args:
        embedding_matrix: 2-D array-like of shape
            ``(vocabulary_size, embed_size)`` holding the pretrained word
            vectors.
        num_aux_targets: Number of auxiliary outputs.
    """

    def __init__(self, embedding_matrix, num_aux_targets):
        super().__init__()
        embed_size = embedding_matrix.shape[1]

        # Build the layer directly from the pretrained matrix so that
        # ``num_embeddings`` matches the real vocabulary size. The previous
        # code created the layer with MAX_LEN (a sequence length) rows and
        # then swapped in a differently-shaped weight. ``freeze=True`` keeps
        # the vectors fixed; the state-dict key is still ``embedding.weight``.
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,
        )
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)

        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)

    def forward(self, x):
        """Compute the logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, one row per sequence.

        Returns:
            Tensor of shape ``(batch, 1 + num_aux_targets)``: the main
            output in column 0 followed by the auxiliary outputs.
        """
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)

        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # Global average and max pooling over the time axis.
        avg_pool = torch.mean(h_lstm2, 1)
        max_pool, _ = torch.max(h_lstm2, 1)

        # 2 * (2 * LSTM_UNITS) features == DENSE_HIDDEN_UNITS.
        h_conc = torch.cat((max_pool, avg_pool), 1)
        # Both dense branches read the pooled features; their outputs are
        # added back onto those features as a residual sum.
        h_conc_linear1 = F.relu(self.linear1(h_conc))
        h_conc_linear2 = F.relu(self.linear2(h_conc))

        hidden = h_conc + h_conc_linear1 + h_conc_linear2

        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)

        return out


In [None]:
class RankingNet(nn.Module):
    """Maps the outputs of a wrapped model to a single score in (0, 1).

    Args:
        model: Module whose forward pass returns ``in_features`` values per
            example.
        in_features: Width of ``model``'s output. Defaults to 7, i.e. the
            main output plus six auxiliary outputs of the pretrained LSTM
            used in this notebook.
    """

    def __init__(self, model, in_features=7):
        super().__init__()
        # NOTE(review): calling .train() on this wrapper also puts the
        # wrapped model into training mode (standard nn.Module behaviour),
        # so its dropout is active even if its weights are frozen — confirm
        # that is intended.
        self.lstm_model = model
        self.linears = nn.Sequential(
            nn.Linear(in_features, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of sigmoid-squashed scores."""
        return self.linears(self.lstm_model(x))

In [None]:
# Pairwise data used for fitting: the `less_toxic` and `more_toxic` columns
# are read further down. Note that it is loaded from validation_data.csv.
train = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# Comments to score for the submission; the `text` column is read further down.
test = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
# Row counts of the pairwise frame and of the frame to be scored.
len(train), len(test)

In [None]:
# Cleaned text of both sides of every comparison pair, in row order.
less_toxic = []
more_toxic = []
# zip() has no length, so pass the row count explicitly; otherwise the
# progress bar cannot show a total or an estimate of the remaining time.
for less_item, more_item in tqdm(zip(train.less_toxic, train.more_toxic), total=len(train)):
    less_toxic.append(preprocess(less_item))
    more_toxic.append(preprocess(more_item))

In [None]:
# Convert the cleaned strings to token-id sequences with the pickled
# tokenizer, then pad/truncate every sequence to MAX_LEN entries.
# NOTE(review): the names are rebound from lists of strings to padded
# sequences, so re-running this cell without re-running the previous one
# feeds already-tokenised data back into the tokenizer.
less_toxic = tokenizer.texts_to_sequences(less_toxic)
less_toxic = sequence.pad_sequences(less_toxic, maxlen=MAX_LEN)

more_toxic = tokenizer.texts_to_sequences(more_toxic)
more_toxic = sequence.pad_sequences(more_toxic, maxlen=MAX_LEN)

In [None]:
# Move both sides of every pair to the GPU as integer token-id tensors.
x_train_less = torch.tensor(less_toxic, dtype=torch.long).cuda()
x_train_more = torch.tensor(more_toxic, dtype=torch.long).cuda()
# Ranking target: always 1, because the loss is called with the "more toxic"
# score as its first input, which should rank higher.
# NOTE(review): the target is an integer tensor while the predictions are
# floats — presumably relies on type promotion inside the loss; confirm.
y_train = torch.ones((len(x_train_more), 1), dtype=torch.long).cuda()

In [None]:
# Positional split: the first `train_len` pairs form the training subset and
# the remainder the validation subset (no shuffling is done here).
train_len = 25_000
# `all_dataset` contains every pair, i.e. it overlaps with `val_dataset`.
all_dataset = TensorDataset(x_train_less, x_train_more, y_train)
train_dataset = TensorDataset(x_train_less[:train_len], x_train_more[:train_len], y_train[:train_len])
val_dataset = TensorDataset(x_train_less[train_len:], x_train_more[train_len:], y_train[train_len:])

In [None]:
# Pairwise hinge loss: the first score must exceed the second by the margin.
criterion = nn.MarginRankingLoss(margin=0.5)


def training(model, epochs=1, batch_size=128, lr = 1e-3, train_on_all=True):
    """Fit ``model`` on comment pairs with a margin ranking loss.

    Each batch holds ``(less, more, y)``; the model scores both comments
    and the loss pushes the ``more`` score above the ``less`` score. After
    every epoch the loss on ``val_dataset`` is measured and a one-line
    summary is printed.

    Args:
        model: Module mapping a batch of token ids to one score per row.
        epochs: Number of passes over the training data.
        batch_size: Batch size for both loaders.
        lr: Adam learning rate.
        train_on_all: When True (the default, matching the previous
            behaviour) the model is trained on ``all_dataset``, which
            contains the validation pairs, so the reported ``val_loss`` is
            optimistic. Pass False to train on ``train_dataset`` only and
            obtain a genuine hold-out estimate.
    """
    fit_dataset = all_dataset if train_on_all else train_dataset
    fit_loader = DataLoader(fit_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        start_time = time.time()
        avg_loss = 0.
        val_avg_loss = 0.

        model.train()
        for less, more, y in tqdm(fit_loader):
            optimizer.zero_grad()
            y_less_pred = model(less)
            y_more_pred = model(more)
            loss = criterion(y_more_pred, y_less_pred, y)
            loss.backward()
            optimizer.step()
            # Running mean of the per-batch losses.
            avg_loss += loss.item() / len(fit_loader)

        model.eval()
        # Evaluation needs no gradients; skipping the autograd graph saves
        # memory and time without changing the reported numbers.
        with torch.no_grad():
            for less, more, y in tqdm(val_loader):
                y_less_pred = model(less)
                y_more_pred = model(more)
                loss = criterion(y_more_pred, y_less_pred, y)
                val_avg_loss += loss.item() / len(val_loader)

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
              epoch + 1, epochs, avg_loss, val_avg_loss, elapsed_time))

In [None]:
# Re-seed so model construction starts from a known RNG state.
seed_everything(1234)
    
# Rebuild the LSTM classifier on the GPU and restore its saved weights.
pretrained_model = NeuralNet(embedding_matrix, CLASSES_NUM).cuda()
# NOTE(review): torch.load unpickles the checkpoint file — only load
# checkpoints from a trusted source.
pretrained_model.load_state_dict(torch.load(MODEL_WEIGHT_PATH))
# Switch to evaluation mode (disables dropout).
pretrained_model.eval()

In [None]:
# Freeze the pretrained model: none of its parameters will receive gradients,
# so only layers added on top of it can be trained.
for param in pretrained_model.parameters():
    param.requires_grad = False

In [None]:
# Re-seed before creating the new layers so their initial weights and the
# batch shuffling start from a known RNG state.
seed_everything(1234)

# Wrap the frozen LSTM with the trainable ranking head and fit it.
model = RankingNet(pretrained_model).cuda()
training(model, epochs=5, batch_size=256, lr=1e-2)

In [None]:
# Preview the first rows of the comments that will be scored.
test.head()

In [None]:
# Clean every comment to score (with a progress bar), then turn the cleaned
# strings into token-id sequences padded/truncated to MAX_LEN.
test_comments = [preprocess(comment) for comment in tqdm(test.text)]
test_comments = sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_comments),
    maxlen=MAX_LEN,
)

In [None]:
# Token ids of the comments to score, moved to the GPU.
x_test = torch.tensor(test_comments, dtype=torch.long).cuda()
test_dataset = TensorDataset(x_test)
# shuffle=False keeps predictions in the same order as the rows of `test`.
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

In [None]:
model.eval()
# One score per comment, collected in the order of `test_loader`.
submission = []
# Inference only: disabling autograd avoids building a graph for every
# batch, which lowers GPU memory use and makes `.detach()` unnecessary.
with torch.no_grad():
    for data in tqdm(test_loader):
        x = data[0]  # TensorDataset yields 1-tuples
        submission += list(model(x).flatten().cpu().numpy())

In [None]:
submission_pd = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
# Item assignment always targets the `score` column. Attribute assignment
# would only set a plain Python attribute if that column were missing, and
# the scores would then silently never reach the CSV.
submission_pd['score'] = submission

In [None]:
# Sanity-check the first rows of the submission frame before writing it.
submission_pd.head()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission_pd.to_csv('submission.csv', index=False)