<span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; color:chartreuse; border-style: inset; border-color: limegreen;border-radius:30px; text-align:center; border-width:8px; padding:20px;"> Huggingface + Deepspeed Starter</h1></span>

<h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center"> Installing Contractions </h1>

In [None]:
!pip install --no-dependencies /kaggle/input/hf-deepspeed-jigsaw-starter-libraries/anyascii-0.3.0-py3-none-any.whl
!pip install --no-dependencies /kaggle/input/hf-deepspeed-jigsaw-starter-libraries/textsearch
!pip install --no-dependencies /kaggle/input/hf-deepspeed-jigsaw-starter-libraries/pyahocorasick
!pip install --no-dependencies /kaggle/input/hf-deepspeed-jigsaw-starter-libraries/contractions

<h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center"> Redefining Model </h1>

In [None]:
from transformers import RobertaTokenizer,RobertaModel
import torch
# Tokenizer loaded from a local directory of the attached dataset. It is kept
# as a module-level global and picked up by JigsawEvalDataset.__init__.
tokenizer = RobertaTokenizer.from_pretrained('/kaggle/input/hf-deepspeed-jigsaw-starter-libraries/tokenizer/')

class JigsawModel(torch.nn.Module):
    """RoBERTa encoder followed by a single-output linear scoring head.

    The attribute names ``model`` and ``fc`` are part of the checkpoint format:
    ``load_state_dict`` matches saved weights by these names, so they must not
    be renamed.
    """

    def __init__(self, model_path='/kaggle/input/hf-deepspeed-jigsaw-starter-libraries/tokenizer/'):
        """Build the encoder from ``model_path`` and attach the scoring head.

        Args:
            model_path: Location handed to ``RobertaModel.from_pretrained``.
                Defaults to the directory this notebook has always used.
                NOTE(review): the directory is named ``tokenizer`` but is also
                used as the model source -- presumably it holds the model
                config/weights as well; confirm.
        """
        super().__init__()
        self.model = RobertaModel.from_pretrained(model_path)
        # Size the head from the encoder's own config rather than hard-coding
        # 1024, so the class keeps working if a different-width encoder is
        # loaded. An explicit in_features is still required here:
        # LazyLinear won't work with huggingface trainer.
        self.fc = torch.nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, ids, mask):
        """Return one score per input sequence.

        Args:
            ids: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.

        Returns:
            The output of ``fc`` applied to the encoder's ``pooler_output``.
        """
        out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        return self.fc(out.pooler_output)

<h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center"> Loading Model </h1>

In [None]:
model = JigsawModel()
# map_location='cpu' deserialises the checkpoint onto the CPU regardless of the
# device it was saved from. The freshly built model lives on the CPU at this
# point, so this avoids a throw-away GPU copy of every weight (and a failure
# on machines without CUDA); the model is moved to the GPU later.
# NOTE(security): torch.load unpickles the file -- only load checkpoints from
# a source you trust.
model.load_state_dict(
    torch.load(
        '../input/hf-deepspeed-jigsaw-starter-training/results/pytorch_model.bin',
        map_location='cpu',
    )
)

<h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center">  Inference Dataset </h1>

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import re
import contractions

# Ordered regex clean-up rules: each pattern is replaced by its value, in the
# order listed (dict insertion order), by JigsawEvalDataset.transform_text.
#
# Every pattern/replacement is a raw string. Without the r-prefix Python turns
# '\1', '\2' and '\3' into the control characters \x01-\x03 before `re` ever
# sees them, so the backreference rules below would never match or substitute
# as intended.
# NOTE(review): if the model was trained with the non-raw version of these
# rules, the last two were effectively no-ops during training -- confirm that
# training and inference preprocessing still agree.
RE_COMBINATIONS = {
    r'\n': ' ',                                   # newlines -> space
    r'https?://\S+|www\.\S+': ' link ',           # URLs -> placeholder token
    r'[/.]': ' ',                                 # slashes and dots -> space
    r'[ .-]': ' ',                                # spaces, dots, hyphens -> space
    r'([A-Za-z])\1{2,}': r'\1',                   # 3+ repeats of a letter -> one
    # letters + a symbol repeated 3+ times + letters -> keep a single symbol
    r"([A-Za-z]{1,})([*!?'])\2{2,}([A-Za-z]{1,})": r'\1\2\3',
}

# Passed as `max_length` when the evaluation dataset is constructed.
MAX_LENGTH = 190

In [None]:
class JigsawEvalDataset(torch.utils.data.Dataset):
    """Dataset that cleans and tokenizes one comment per item, on access."""

    def __init__(self, df, max_length):
        """Keep the frame, its ``text`` column values and the tokenizer settings."""
        self.text = df['text'].values
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def transform_text(self, text):
        """Normalise a raw comment down to space-separated alphabetic tokens."""
        # dataset-specific clean-up rules, applied in dict order
        for pattern, replacement in RE_COMBINATIONS.items():
            text = re.sub(pattern, replacement, text)

        # lower-case, then expand contractions word by word
        expanded = ' '.join(contractions.fix(word) for word in text.lower().split())

        # tokenise and keep purely alphabetic tokens only (drops punctuation)
        alphabetic = [token for token in word_tokenize(expanded) if token.isalpha()]

        return ' '.join(alphabetic)

    def __getitem__(self, index):
        """Return ``{'ids', 'mask'}`` long tensors for the comment at ``index``."""
        cleaned = self.transform_text(self.text[index])
        encoded = self.tokenizer.encode_plus(
            cleaned,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
        )

        fields = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {
            out_key: torch.tensor(encoded[enc_key], dtype=torch.long)
            for out_key, enc_key in fields
        }

<h1 style = "font-family: garamond; font-size: 35px; font-style: normal; letter-spacing: 3px; border-width:5px; color:#90afc5; border-style: inset; border-radius: 100px 100px; border-color: #2a3132; padding:5px; text-align:center">  Validation Function </h1>

In [None]:
from tqdm import tqdm
import numpy as np
@torch.no_grad()
def valid_fn(model, dataloader):
    """Run ``model`` over ``dataloader`` and return every prediction in one array.

    Args:
        model: Module called as ``model(ids, mask)``. It is switched to eval
            mode and is NOT moved: batches are sent to whatever device its
            parameters already live on.
        dataloader: Sized iterable of batches, each a mapping with ``'ids'``
            and ``'mask'`` tensors.

    Returns:
        1-D numpy array with the flattened model outputs of all batches, in
        iteration order. An empty array if the loader yields no batches.
    """
    model.eval()

    # Follow the model's device instead of hard-coding .cuda(): behaviour is
    # unchanged for a model already on the GPU, and CPU inference now works.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        # Parameter-less model: keep the old preference for CUDA if present.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    preds = []
    for batch in tqdm(dataloader, total=len(dataloader)):
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)

        outputs = model(ids, mask)
        preds.append(outputs.view(-1).detach().cpu().numpy())

    # np.concatenate raises on an empty list; return an empty result instead.
    if not preds:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(preds)

In [None]:
import pandas as pd
# Comments to be scored. The evaluation dataset built from this frame reads
# its `text` column.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df.head()

In [None]:
# Unshuffled loader, so predictions come back in the same order as the rows
# of `df`.
test_dataset = JigsawEvalDataset(df, max_length=MAX_LENGTH)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [None]:
# Move the model to the GPU and predict one raw score per row of `df`
# (the loader is unshuffled, so outputs line up with the frame's row order).
df['score'] = valid_fn(model.cuda(),test_loader)
# Replace the raw outputs with their rank. method='first' breaks ties by order
# of appearance, so every row receives a distinct value.
df['score'] = df['score'].rank(method='first')
df.head()

In [None]:
# Write the submission without the raw text column. The column is dropped on a
# copy rather than with inplace=True, which leaves `df` intact: this cell (and
# any earlier cell that reads df['text']) can be re-run without a KeyError.
df.drop(columns=['text']).to_csv("submission.csv", index=False)