In [None]:
import os
import re
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import warnings
warnings.filterwarnings('ignore')

import transformers
from transformers import BertTokenizer,BertForSequenceClassification
from transformers import AutoTokenizer, AutoModel

from tqdm import tqdm
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

# Read Dataset

In [None]:
# Comments that must be scored for this competition, plus the sample submission file.
test_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
sample_submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')

# Scored comments derived from the first Jigsaw competition
# (https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge);
# used further down as training data for the TF-IDF model.
jc_df_score = pd.read_csv('../input/jc-df-score/jc_df_score.csv')

# Show all three frames, in load order.
display(test_df, sample_submission, jc_df_score)

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's hash seed environment variable, the `random` module,
    NumPy's legacy global RNG and PyTorch (CPU and all CUDA devices), and
    configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not only the current one;
    # it is a safe no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN auto-tune (and thus vary) the kernels it
    # selects between runs, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything()

# Clean text

In [None]:
#https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train

def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated ASCII letters and digit characters.

    Steps, applied in this order:
    1. Strip URLs (http://, https:// and www. prefixes).
    2. Parse the remainder as HTML and keep only the text content.
    3. Delete characters matched by the emoji pattern (removed, not replaced
       by a space, so neighbouring characters are joined).
    4. Replace every character that is not a-z, A-Z or a digit with a space.
    5. Collapse runs of spaces and trim both ends.

    text - Text piece to be cleaned.

    Returns the cleaned string.
    '''
    # 1. URLs
    url_re = re.compile(r'https?://\S+|www\.\S+')
    without_urls = url_re.sub(r'', text)

    # 2. HTML tags
    without_html = BeautifulSoup(without_urls, 'lxml').get_text()

    # 3. Emojis. NOTE: the last range runs from U+24C2 to U+1F251, so it
    # matches far more than emoji (e.g. CJK ideographs fall inside it).
    emoji_re = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          "]+", flags=re.UNICODE)
    without_emoji = emoji_re.sub(r'', without_html)

    # 4. Anything that is not a letter or digit becomes a space
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)

    # 5. Normalise spacing
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()


# Overwrite the raw comment text with its cleaned form, element by element.
test_df['text'] = test_df['text'].map(text_cleaning)

# Model 1: RoBERTa

In [None]:
# RoBERTa: directory with the base tokenizer/encoder files, and directory
# holding the per-fold fine-tuned checkpoints (model_<fold>.pth).
model_name = "../input/roberta-base"
pretrained_weight = "../input/roberta-pretrained"

In [None]:
# Token sequence length every comment is padded / truncated to.
max_len=256
# Width of the encoder's pooled output; used as the input size of the
# LayerNorm and first Linear layer of the regression head.
output_logits=768

# Prefer the GPU, but fall back to CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing at the first .to(device).
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class TestDataset(Dataset):
    """Dataset that tokenizes raw texts on the fly for inference.

    Args:
        text: Indexable collection of strings (supports len() and [idx]).
        tokenizer: Either a model name / path handed to
            AutoTokenizer.from_pretrained, or an already constructed
            tokenizer object exposing encode_plus.
        max_length: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self,text,tokenizer,max_length):
        self.text=text
        # Accept a ready-made tokenizer so callers can share one instance;
        # strings and path-likes are loaded exactly as before.
        if isinstance(tokenizer,(str,os.PathLike)):
            tokenizer=AutoTokenizer.from_pretrained(tokenizer)
        self.tokenizer=tokenizer
        # Use the constructor argument (previously the module-level
        # `max_len` was read here and `max_length` was silently ignored).
        self.max_len=max_length

    def __len__(self):
        return len(self.text)

    def __getitem__(self,idx):
        """Return {'ids', 'attention_mask'} long tensors for sample idx."""
        text=self.text[idx]

        encoded=self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Replacement for the deprecated pad_to_max_length=True.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )

        ids=encoded['input_ids']
        attention_mask=encoded['attention_mask']

        return {
            'ids':torch.tensor(ids,dtype=torch.long),
            'attention_mask':torch.tensor(attention_mask,dtype=torch.long),
        }

In [None]:
class BertModel(nn.Module):
    """Pretrained encoder with a small regression head on its pooled output.

    The pooled output goes through LayerNorm -> Dropout -> Linear ->
    LeakyReLU -> Dropout -> Linear, giving one value per sample.

    Attribute names (`model`, `layer_norm`, `dropout`, `dense`) and the order
    of layers inside `dense` define the state_dict keys, so they must stay
    as they are for saved checkpoints to load.
    """

    def __init__(self):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple.
        self.model = AutoModel.from_pretrained(model_name, return_dict=False)
        self.layer_norm = nn.LayerNorm(output_logits)
        self.dropout = nn.Dropout(0.5)
        head_layers = [
            nn.Linear(output_logits, 256),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Dropout(0.5),
            nn.Linear(256, 1),
        ]
        self.dense = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Score a batch; the last dimension of the result has size 1.

        Args:
            input_ids: Token id tensor (presumably (batch, seq_len) - confirm
                against the caller).
            attention_mask: Mask tensor matching input_ids.
        """
        # The encoder yields exactly two items here; only the pooled one is used.
        _, pooled = self.model(input_ids, attention_mask)
        return self.dense(self.dropout(self.layer_norm(pooled)))

In [None]:
def test_fc(dataloader,model):
    """Run `model` over `dataloader` in eval mode and collect its outputs.

    Args:
        dataloader: Iterable of dict batches with 'ids' and 'attention_mask'
            tensors.
        model: Callable module taking (input_ids, attention_mask).

    Returns:
        One NumPy array: the per-batch outputs (trailing axis squeezed),
        concatenated in iteration order.
    """
    model.eval()
    batch_outputs = []

    # Inference only: no autograd bookkeeping for the whole pass.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)

            # Drop the trailing size-1 axis produced by the model's last layer.
            scores = model(ids, mask).squeeze(-1)
            batch_outputs.append(scores.detach().cpu().numpy())

    return np.concatenate(batch_outputs)

In [None]:
#predict

# The test inputs do not depend on the fold, so build the dataset (and load
# its tokenizer) and the loader once instead of once per fold.
X_test=test_df['text'].values
test_dataset=TestDataset(X_test,model_name,max_len)

# Batch the inputs: without batch_size the DataLoader feeds one comment at a
# time. Every sample is padded to the same length, so default collation can
# stack them. Order is preserved (shuffle=False), so predictions stay aligned
# with test_df rows.
batch_size=32
test_dataloader=DataLoader(test_dataset,batch_size=batch_size,shuffle=False)

test_preds_1=[]

for fold in range(5):
    model=BertModel()
    model.to(device)
    # map_location lets a checkpoint saved on GPU load onto whatever
    # device is in use now.
    state_dict=torch.load(f'{pretrained_weight}/model_{fold}.pth',map_location=device)
    model.load_state_dict(state_dict)

    test_pred = test_fc(test_dataloader,model)
    test_preds_1.append(test_pred)

# Average the five fold predictions per comment.
test_preds_1=np.mean(np.array(test_preds_1),axis=0)
test_preds_1

# Model 2: TF-IDF + Ridge

In [None]:
#https://www.kaggle.com/vitaleey/tfidf-ridge/notebook#Compare-few-models

# Clean the training comments with the same function used on the test text.
jc_df_score['comment_text'] = jc_df_score['comment_text'].apply(text_cleaning)

# Features (cleaned text) and regression target.
X = jc_df_score['comment_text']
y = jc_df_score['toxic_score']

# Character n-gram (3 to 5, within word boundaries) TF-IDF features.
# The vocabulary is fitted on the training comments only and then reused
# to transform the test comments.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
)
train_X = vectorizer.fit_transform(X)
test_X = vectorizer.transform(test_df['text'])

# Fit the ridge regressor on the TF-IDF features.
tfidf_ridge_model = Ridge(alpha=0.8, random_state=42)
tfidf_ridge_model.fit(train_X, y)

# Score the test comments.
test_preds_2 = tfidf_ridge_model.predict(test_X)
test_preds_2

# Ensemble

In [None]:
# Weighted average of the two models' predictions:
# 0.892 for the transformer folds' mean, 0.108 for TF-IDF + Ridge.
ensumble_df = pd.DataFrame({'score': 0.892 * test_preds_1 + 0.108 * test_preds_2})
ensumble_df

# Submit

In [None]:
# Re-read the untouched competition file and attach the ensemble score
# (the Series is matched to rows by index).
sub_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv").assign(
    score=ensumble_df['score']
)
sub_df

In [None]:
# Write only the id and score columns, without the index column.
sub_df.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)