In [None]:
import os
import gc
import sys
import json
import time
import math
import random
from datetime import datetime
from collections import Counter, defaultdict
import joblib
import scipy as sp
import numpy as np
import pandas as pd
# Raise pandas' display limits (rows, columns, total width) so frames shown
# in notebook output are truncated less aggressively.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / torch.
warnings.filterwarnings("ignore")

# Global compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class CFG:
    """Notebook-wide constants, read as plain class attributes (``CFG.NAME``)."""

    # --- data loading -----------------------------------------------------
    BATCH_SIZE = 32      # batch size passed to the DataLoader
    NUM_WORKERS = 4      # DataLoader worker processes
    SEQ_LEN = 200        # fixed length every token sequence is padded/cut to

    # --- model shape ------------------------------------------------------
    N_LAYERS = 2         # stacked LSTM layers
    HIDDEN_DIM = 128     # LSTM hidden size per direction
    OUTPUT_SIZE = 1      # values predicted per sample

    # --- ensemble ---------------------------------------------------------
    N_FOLDS = 5          # number of fold checkpoints that are combined

    # --- optimisation -----------------------------------------------------
    # Not referenced by the inference code visible in this notebook;
    # presumably carried over from the training configuration.
    LR = 1e-4
    WEIGHT_DECAY = 1e-6
    EPOCHS = 20

In [None]:
def seed_everything(seed=42):
    """Seed every random number source this notebook uses.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and the current CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic,
    non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Stored as a string because environment variables are text.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
class AverageMeter:
    """Tracks the latest value and the running weighted average of a metric.

    Attributes are ``val`` (last value seen), ``sum`` (weighted total),
    ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        # A fresh meter is simply a meter that has just been reset.
        self.reset()

    def reset(self):
        """Zero all four statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average.

        Args:
            val: The new measurement.
            n: Weight of the measurement (e.g. a batch size).
        """
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
class TestToxicDataset:
    """Map-style dataset that serves rows of an encoded comment matrix.

    Args:
        toxic: Object supporting ``len()`` and ``[row, :]`` indexing, one row
            of token ids per comment (e.g. a 2-D NumPy array).
    """

    def __init__(self, toxic):
        self.toxic = toxic

    def __len__(self):
        """Number of rows (comments) available."""
        return len(self.toxic)

    def __getitem__(self, item):
        """Return ``{"toxic": LongTensor}`` holding the token ids of row ``item``."""
        token_ids = self.toxic[item, :]
        sample = dict(toxic=torch.tensor(token_ids, dtype=torch.long))
        return sample

In [None]:
# English stop words, stored as a set so the per-token membership test in
# remove_stopwords() is a hash lookup instead of a scan over the whole list.
# Note: this rebinds the name `stopwords`, which the import cell bound to the
# nltk corpus reader; the corpus is therefore reached through `nltk.corpus`
# so the cell can be re-run safely.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Shared Porter stemmer instance used by stemming().
porter_stemmer = PorterStemmer()

def remove_stopwords(text):
    """Drop every token that appears in the module-level ``stopwords`` collection.

    Args:
        text: Iterable of tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

def stemming(text):
    """Stem every token with the module-level ``porter_stemmer``.

    Args:
        text: Iterable of word tokens.

    Returns:
        List of stemmed tokens, same length and order as the input.
    """
    return list(map(porter_stemmer.stem, text))

def tokenization(text):
    """Split ``text`` on single space characters.

    Consecutive, leading or trailing spaces produce empty-string tokens,
    because the separator is given explicitly.
    """
    return text.split(' ')

def text_to_sequences(word2idx, seq):
    """Encode tokenised sentences as lists of vocabulary indices.

    Args:
        word2idx: Mapping from token to integer index.
        seq: Iterable of sentences, each an iterable of tokens.

    Returns:
        A new list with one list of indices per sentence. Tokens missing from
        ``word2idx`` are encoded as ``0``. The input ``seq`` is left untouched
        (the result is built fresh rather than written back into the
        caller's list).
    """
    return [[word2idx.get(word, 0) for word in sentence] for sentence in seq]

def pad_sequences(sentences, seq_len):
    """Build a fixed-width integer matrix from variable-length sequences.

    Sequences shorter than ``seq_len`` are right-aligned, with zeros filling
    the left side; longer ones keep only their first ``seq_len`` items; empty
    ones yield an all-zero row.

    Args:
        sentences: Sequence of integer sequences.
        seq_len: Width of the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sentences), seq_len)`` and dtype ``int``.
    """
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, tokens in enumerate(sentences):
        if len(tokens) == 0:
            continue
        clipped = np.array(tokens)[:seq_len]
        # Start column is 0 for over-long inputs, otherwise the left pad width.
        padded[row, seq_len - len(clipped):] = clipped
    return padded

def preprocess(df):
    """Add a ``toxic_text`` column of cleaned, tokenised comments to ``df``.

    Steps applied to ``df['text']``:
      1. every run of digits becomes the single character ``0``;
      2. every run of non-word characters becomes one space;
      3. the string is split with ``tokenization``;
      4. stop words are dropped with ``remove_stopwords``.

    Args:
        df: DataFrame with a string column ``text``.

    Returns:
        The same DataFrame object, modified in place with the new
        ``toxic_text`` column (a list of tokens per row).
    """
    # regex=True is explicit on purpose: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently skip both substitutions.
    # Raw strings avoid invalid-escape warnings for \d and \W.
    cleaned = (
        df['text']
        .str.replace(r'\d+', '0', regex=True)
        .str.replace(r'\W+', ' ', regex=True)
    )
    df['toxic_text'] = cleaned.apply(tokenization).apply(remove_stopwords)
    return df


In [None]:
def predict_one_step(model, data, device):
    """Run the model on a single batch.

    Every value in ``data`` is moved to ``device`` (the dict is updated in
    place), then ``data['toxic']`` is fed to ``model``.

    Args:
        model: Callable taking the ``toxic`` tensor.
        data: Dict of tensors; must contain the key ``'toxic'``.
        device: Target device for the tensors.

    Returns:
        Whatever ``model`` returns for the batch.
    """
    for name in data:
        data[name] = data[name].to(device)
    return model(data['toxic'])

 

def predict_one_epoch(model, test_loader, device):
    """Predict over a whole loader and return one flat NumPy array.

    The model is put in eval mode, gradients are disabled, and each batch's
    output is flattened and moved to the CPU before all batches are
    concatenated in loader order.

    Args:
        model: Module with an ``eval()`` method, callable via ``predict_one_step``.
        test_loader: Sized iterable of batch dicts.
        device: Device the batches are moved to.

    Returns:
        1-D ``numpy.ndarray`` with the predictions of every batch.
    """
    model.eval()
    progress = tqdm(test_loader, total=len(test_loader))
    batch_outputs = []
    with torch.no_grad():
        for batch in progress:
            logits = predict_one_step(model, batch, device)
            batch_outputs.append(logits.view(-1).detach().cpu().numpy())
    return np.concatenate(batch_outputs)


In [None]:
class ToxicModel(nn.Module):
    """Bidirectional LSTM regressor on top of a supplied embedding matrix.

    Token ids are embedded, passed through a stacked bidirectional LSTM, and
    the LSTM outputs are mean-pooled and max-pooled over the sequence axis.
    The two pooled vectors are concatenated and projected to ``output_size``
    values by a linear layer.

    Args:
        output_size: Number of values predicted per sample.
        embedding_matrix: 2-D array-like of shape ``(num_words, embed_dim)``
            used as the initial (trainable) embedding weights.
        hidden_dim: LSTM hidden size per direction.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout applied between LSTM layers.
    """

    def __init__(self, output_size, embedding_matrix, hidden_dim, n_layers, drop_prob=0.3):
        super(ToxicModel, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        num_words = embedding_matrix.shape[0]
        embed_dim = embedding_matrix.shape[1]

        # Create the layer, then replace its random weights with the supplied matrix.
        self.embedding = nn.Embedding(num_words, embed_dim)
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32)
        )

        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            n_layers,
            dropout=drop_prob,
            bidirectional=True,
            batch_first=True,
        )
        # Input width = hidden_dim * 2 directions * 2 pooled vectors (mean, max).
        # Equals the previously hard-coded 512 when hidden_dim == 128, so
        # checkpoints saved with that size still load.
        self.fc = nn.Linear(hidden_dim * 2 * 2, output_size)

    def init_hidden(self, batch_size):
        """Return zeroed ``(h0, c0)`` for the LSTM.

        The tensors take dtype and device from the model's own parameters,
        so the model runs on whichever device it has been moved to.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Tuple of two tensors of shape
            ``(n_layers * 2, batch_size, hidden_dim)``.
        """
        reference = next(self.parameters())
        state_shape = (self.n_layers * 2, batch_size, self.hidden_dim)
        return (reference.new_zeros(state_shape), reference.new_zeros(state_shape))

    def forward(self, toxic):
        """Score a batch of token-id sequences.

        Args:
            toxic: Integer tensor indexed as ``(batch, sequence)``.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        hidden = self.init_hidden(toxic.size(0))
        embeds = self.embedding(toxic.long())
        lstm_out, _ = self.lstm(embeds, hidden)
        # Pool over dim 1, the sequence axis (batch_first=True).
        mean_pooled = torch.mean(lstm_out, 1)
        max_pooled, _ = torch.max(lstm_out, 1)
        pooled = torch.cat((mean_pooled, max_pooled), 1)
        return self.fc(pooled)

In [None]:
def inference_fn(fold, seed, test_sentences, embedding_matrix):
    """Load one fold's checkpoint and predict on the encoded test comments.

    Args:
        fold: Fold index; selects ``fold{fold}_best.pth``.
        seed: Accepted for interface compatibility; not used by this function.
        test_sentences: Padded token-id matrix, one row per comment.
        embedding_matrix: Embedding weights used to build the model before
            the checkpoint is loaded.

    Returns:
        Flat NumPy array of predictions in the row order of ``test_sentences``.
    """
    test_dataset = TestToxicDataset(test_sentences)

    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.BATCH_SIZE,
        shuffle=False,  # keep predictions aligned with the input rows
        num_workers=CFG.NUM_WORKERS,
        # Pinned host memory only helps (and only avoids a warning) with CUDA.
        pin_memory=(device.type == 'cuda'),
        drop_last=False,
    )

    model = ToxicModel(CFG.OUTPUT_SIZE, embedding_matrix, CFG.HIDDEN_DIM, CFG.N_LAYERS)
    # torch.load unpickles the file: only load checkpoints from a trusted source.
    state = torch.load(f'../input/jigsaw-metadata/fold{fold}_best.pth', map_location=device)
    model.load_state_dict(state['model'])
    # Use the same global device as the checkpoint and the batches instead of
    # a hard-coded 'cuda', so the function also works without a GPU.
    model.to(device)

    preds = predict_one_epoch(model, test_loader, device)
    return preds

In [None]:
def run_inference():
    """Score ``comments_to_score.csv`` with every fold model and add up the results.

    Returns:
        Element-wise sum (not the mean) of the per-fold prediction arrays, one
        value per comment; the integer ``0`` if ``CFG.N_FOLDS`` is 0.
    """
    comments_path = os.path.join('../input/jigsaw-toxic-severity-rating','comments_to_score.csv')
    comments = pd.read_csv(comments_path)

    # joblib.load unpickles its input: these artifacts must come from a trusted source.
    embedding_matrix = joblib.load('../input/jigsaw-metadata/embedding_matrix.pkl')
    word2idx = joblib.load('../input/jigsaw-metadata/word2idx.pkl')

    comments = preprocess(comments)

    # Tokens -> vocabulary indices -> fixed-length matrix.
    encoded = text_to_sequences(word2idx, list(comments['toxic_text']))
    padded = pad_sequences(encoded, CFG.SEQ_LEN)

    fold_total = 0
    for fold in range(CFG.N_FOLDS):
        fold_total += inference_fn(fold, 42, padded, embedding_matrix)

    return fold_total

In [None]:
# Summed per-fold predictions, one value per comment to score.
preds = run_inference()

# The sample submission provides the frame to fill in; only `score` is overwritten.
sub = pd.read_csv(os.path.join('../input/jigsaw-toxic-severity-rating','sample_submission.csv'))

# Submit ranks rather than raw scores; method='first' breaks ties by row order.
sub['score'] = pd.Series(preds, index=sub.index).rank(method='first')
sub.to_csv('submission.csv', index=False)