# Inference kernel

### This is the inference kernel for [Toxic Trainer | FIT | Multi Label :)](https://www.kaggle.com/kishalmandal/toxic-trainer-fit-multi-label/edit/run/79270347)

In [None]:

import gc
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import transformers

from transformers import AdamW, AutoTokenizer, AutoModel
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
import time
from tqdm import tqdm

import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import re
from nltk.corpus import stopwords

In [None]:
class Config:
    """Static settings shared by the model, the dataset and the inference loop.

    Only ``model_name`` and ``batch_size`` are read by the code visible in this
    notebook; the remaining fields look like training-time settings carried
    over from the trainer notebook (not referenced here).
    """

    # Path handed to ``AutoTokenizer`` / ``AutoModel`` ``from_pretrained``.
    model_name: str = "../input/roberta-base"
    # Base batch size; ``inference`` multiplies it by 16 for its DataLoader.
    batch_size: int = 64
    # NOTE(review): ``inference`` builds ``ToxicDataset`` without passing this,
    # so the dataset default (196) is what is actually used -- confirm intent.
    max_length: int = 128

    # The fields below are not referenced by the code visible in this file.
    lr: float = 1e-4
    weight_decay: float = 0.01
    scheduler: str = "CosineAnnealingLR"
    epochs: int = 20
    early_stopping_epochs: int = 1

In [None]:
class ToxicDataset:
    """Indexable dataset that tokenizes one comment per lookup.

    Args:
        comments: Indexable collection of comment strings.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_len: Value passed as ``max_length`` to the tokenizer, which is
            asked to truncate and to pad with ``padding='max_length'``.
    """

    def __init__(self, comments, tokenizer, max_len=196):
        self.comments, self.tokenizer, self.max_len = comments, tokenizer, max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Tokenize comment ``idx`` and return its tensors.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'``, each converted
            to a ``torch.long`` tensor.
        """
        encoding = self.tokenizer.encode_plus(
            self.comments[idx],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        return {
            field: torch.tensor(encoding[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }

In [None]:
class ToxicModel(nn.Module):
    """Pretrained backbone followed by six independent single-logit heads.

    The head attribute names (``toxic``, ``stoxic``, ``obs``, ``threat``,
    ``insult``, ``id_hate``) are part of the ``state_dict`` keys, so they must
    stay as they are for saved checkpoints to load.

    Args:
        args: Settings object; ``args.model_name`` is passed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.model = AutoModel.from_pretrained(args.model_name)
        self.dropout = nn.Dropout(p=0.2)
        # One Linear(768, 1) per label, created in the same order as the
        # columns returned by ``forward``.
        for head_name in ('toxic', 'stoxic', 'obs', 'threat', 'insult', 'id_hate'):
            setattr(self, head_name, nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        """Run the backbone and concatenate the six head outputs.

        Returns:
            Tensor whose last dimension holds the head outputs in the order
            toxic, stoxic, obs, threat, insult, id_hate.
        """
        backbone_out = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        # Element 1 of the backbone output feeds the heads -- presumably the
        # pooled sentence representation; confirm for the backbone in use.
        features = self.dropout(backbone_out[1])

        heads = (
            self.toxic,
            self.stoxic,
            self.obs,
            self.threat,
            self.insult,
            self.id_hate,
        )
        return torch.cat([head(features) for head in heads], dim=-1)
        

In [None]:
def get_predictions(args, dataloader, model):
    """Run ``model`` over ``dataloader`` and return one summed score per sample.

    For every batch the model output is moved to the CPU and each row is
    summed across its last axis, collapsing the per-head outputs into a single
    number per sample.

    Args:
        args: Settings object. Kept for interface compatibility; it is not
            read by this function.
        dataloader: Iterable with ``len()`` yielding dicts that contain
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        model: Module called as ``model(input_ids, attention_mask)``.

    Returns:
        1-D ``numpy`` array with one value per sample, in dataloader order.
        An empty array is returned when the dataloader yields no batches.
    """
    model.eval()

    # Send inputs to wherever the model's weights live instead of assuming
    # CUDA. A parameterless model falls back to CUDA when it is available,
    # which matches what the previous hard-coded ``.cuda()`` calls did.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    all_outputs = []
    bar = tqdm(dataloader, total=len(dataloader))
    with torch.no_grad():
        for data in bar:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            outputs = outputs.cpu().detach().numpy()
            # Vectorized row sum replaces a Python-level sum() per sample.
            all_outputs.append(outputs.sum(axis=1))

            bar.set_postfix(Stage='Inference')

    # np.hstack raises on an empty list, so handle "no batches" explicitly.
    if not all_outputs:
        return np.array([])
    return np.hstack(all_outputs)

In [None]:
def washing_machine(comments):
    """Normalize raw comments into lower-cased, stemmed, lemmatized text.

    Per comment: every character that is not an ASCII letter becomes a space,
    the text is lower-cased and split on whitespace, English stop words are
    dropped, the remaining words are Snowball-stemmed and then passed through
    the WordNet lemmatizer, and the result is joined with single spaces.

    Args:
        comments: Sized iterable of comment strings.

    Returns:
        List of cleaned strings, one per input comment, in input order.
    """
    # Build the NLTK helpers once. They were previously rebuilt for every
    # comment, and the stop-word set was rebuilt for every single word.
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile('[^a-zA-Z]')

    corpus = []
    for raw_comment in tqdm(comments):
        words = non_letters.sub(' ', raw_comment).lower().split()
        stems = [stemmer.stem(word) for word in words if word not in stop_words]
        lemmas = [lemmatizer.lemmatize(stem) for stem in stems]
        corpus.append(' '.join(lemmas))

    return corpus

# Inference

In [None]:
def inference(data):
    """Score ``data`` with every saved fold checkpoint and average the results.

    A fresh ``ToxicModel`` is built for each checkpoint, moved to CUDA, loaded
    from ``model_fold_{fold}.bin`` and run through ``get_predictions``. The
    per-checkpoint predictions are averaged with equal weight.

    Args:
        data: Indexable collection of comment strings, passed to
            ``ToxicDataset`` as-is (no cleaning is applied here).

    Returns:
        1-D ``numpy`` array with the mean prediction per comment.
    """
    args = Config()
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # (checkpoint directory, number of fold checkpoints loaded from it).
    checkpoint_sets = [
        # "2 folds test" in the original notes; five folds are loaded from it.
        ('../input/2-folds-test/', 5),
        # "roberta base un cleaned" in the original notes.
        ('../input/robbaseuc/', 2),
    ]

    # NOTE(review): max_len is left at the ToxicDataset default (196) rather
    # than Config.max_length (128) -- confirm which one training used.
    dataset = ToxicDataset(data, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16*args.batch_size)

    final_preds = []
    for base_path, num_folds in checkpoint_sets:
        for fold in range(num_folds):
            model = ToxicModel(args)
            model = model.cuda()
            path = base_path + f'model_fold_{fold}.bin'
            model.load_state_dict(torch.load(path))

            print(f"Getting predictions for model {fold+1}")
            preds = get_predictions(args, dataloader, model)
            final_preds.append(np.vstack(preds))

            # Drop the model before building the next one to limit memory use.
            del model
            gc.collect()

    # Divide by the number of checkpoints actually run instead of a literal 7,
    # so the average stays correct if the fold counts above change.
    return np.hstack(sum(final_preds) / len(final_preds))

# Validation

In [None]:
def show_validation():
    """Print the pairwise ranking accuracy on the validation pairs.

    Reads ``validation_data.csv``, scores the ``less_toxic`` and ``more_toxic``
    columns separately with ``inference`` and prints the fraction of pairs in
    which the ``less_toxic`` comment received a strictly lower score.
    """
    pairs = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

    # The raw text is scored directly; the washing_machine cleaning step was
    # left disabled for validation in the original notebook.
    less_toxic_scores = inference(pairs['less_toxic'].values)
    more_toxic_scores = inference(pairs['more_toxic'].values)

    # 1 where the pair is ordered correctly, 0 otherwise (ties count as 0).
    hits = [
        1 if low < high else 0
        for low, high in zip(less_toxic_scores, more_toxic_scores)
    ]
    mean_score = np.mean(hits)

    print('-'*50)
    print('Validation Score :',mean_score)

In [None]:
# show_validation()

# Prediction and submission

In [None]:
# Load the comments that have to be scored for the submission.
df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Clean the raw 'text' column with washing_machine before scoring it.
comments = washing_machine(df['text'].values)

In [None]:
# Score the cleaned comments with the averaged fold checkpoints.
pred = inference(comments)

In [None]:
# Attach the predictions to the frame as the 'score' column.
df['score'] = pred

In [None]:
# Write only comment_id and score to submission.csv, without the index column.
df[['comment_id', 'score']].to_csv('submission.csv', index=False)