## 0.84+ score by ensemble of simple TF-Idf and Ridge regression

### Ensemble of TfIdf - Ridge models using data from 
- Toxic competition
- Toxic CLEANED competition
- Ruddit toxic data
- Toxic multilingual competition
- Toxic tweet data

### Analysis of bad predictions


#### Some cool starter notebooks:
- https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768
- https://www.kaggle.com/steubk/jrsotc-ridgeregression-ensemble-of-3
- https://www.kaggle.com/samarthagarwal23/mega-b-ridge-to-the-top-lb-0-85/notebook

# Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

import re 
import scipy
from scipy import sparse
import gc 

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100


In [None]:
def timer(func):
    """Decorator that prints how long each call to ``func`` took, in minutes.

    The wrapped function's return value is passed through unchanged. The
    elapsed time is printed only when ``func`` returns normally.
    """
    # Local import keeps this cell self-contained.
    from functools import wraps

    @wraps(func)  # preserve __name__/__doc__ of the decorated function
    def wrapper(*args, **kws):
        st = time.time()
        res = func(*args, **kws)
        et = time.time()
        tt = (et - st) / 60
        print(f'Time taken is {tt:.2f} mins')
        return res
    return wrapper


## Load Test, Validation data  


In [None]:
# Validation data: read from validation_data.csv. Later cells use its
# `less_toxic` and `more_toxic` text columns.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
df_val.shape

In [None]:
# Test data: the comments to score. Later cells use its `text` column for
# prediction and write `comment_id` + `score` to the submission file.

df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub.shape

# Create Sklearn Pipeline with 
-  TFIDF - Take 'char_wb' as analyzer to capture subwords well
-  Ridge - Ridge is linear regression with L2 regularisation, which helps reduce overfitting

## Train pipeline

- Load folds data
- train pipeline
- Predict on validation data
- Predict on test data

# Training function

In [None]:
# Number of fold files per dataset; passed to train_pipeline(), which reads
# `<name>_fld0.csv` .. `<name>_fld{n_folds-1}.csv`.
n_folds = 7

In [None]:
def clean(data, col):
    """Normalise the text column ``col`` of ``data`` in place and return ``data``.

    Args:
        data: DataFrame holding the text; the column is overwritten.
        col: name of the string column to clean.

    Returns:
        The same ``data`` object, with ``data[col]`` replaced by cleaned text.

    Every pattern rule passes ``regex=True`` explicitly: since pandas 2.0
    ``Series.str.replace`` defaults to literal matching, which would turn all
    of the regex rules below into silent no-ops.
    """
    text = data[col]

    # Put spaces around newlines so they tokenise separately.
    text = text.str.replace('\n', ' \n ', regex=False)
    # Remove IP addresses (three or more dot-separated digit groups).
    text = text.str.replace(r'(([0-9]+\.){2,}[0-9]+)', ' ', regex=True)
    # Separate a single / ! ? . that is glued between two words.
    text = text.str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3', regex=True)
    # Runs of four or more identical * ! ? ' are shortened to three.
    text = text.str.replace(r"([*!?'])\1\1{2,}", r'\1\1\1', regex=True)
    # A letter repeated 3+ times at the end of a word is shortened to two.
    text = text.str.replace(r'([a-zA-Z])\1{2,}\b', r'\1\1', regex=True)
    # A letter repeated 4+ times inside a word is shortened to three.
    text = text.str.replace(r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1', regex=True)
    # Collapse runs of spaces and trim the ends.
    text = text.str.replace(r'[ ]{2,}', ' ', regex=True).str.strip()
    # Pad runs of * ! ? ' with spaces (done after the strip, so a trailing
    # space can remain, exactly as before).
    text = text.str.replace(r"([*!?']+)", r' \1 ', regex=True)

    data[col] = text
    return data

In [None]:
@timer
def train_pipeline(pipeline, data_path_name, n_folds, clean_prm = False):
    """Fit ``pipeline`` on each fold file and predict validation and test text.

    For every fold ``k`` in ``range(n_folds)`` the file
    ``../input/jigsaw-folded-csv-files/{data_path_name}_fld{k}.csv`` is read
    and the pipeline is fitted on its ``text`` / ``y`` columns. The fitted
    pipeline then scores the module-level ``df_val`` (``less_toxic`` and
    ``more_toxic`` columns) and ``df_sub`` (``text`` column).

    Args:
        pipeline: sklearn Pipeline with a ``'features'`` step and a ``'clf'``
            step exposing ``coef_``.
        data_path_name: file-name prefix of the fold CSVs.
        n_folds: number of fold files to train on.
        clean_prm: when True, the validation/test text is passed through
            ``clean()`` before prediction.

    Returns:
        ``(val_less_toxic, val_more_toxic, test)`` arrays of shape
        ``(n_rows, n_folds)``, one column of predictions per fold.
    """
    # Prepare the prediction inputs once, and clean *copies*. Cleaning the
    # global frames in place inside the fold loop re-cleaned the text on every
    # fold and leaked the modified text into later runs and analysis cells.
    if clean_prm:
        less_toxic = clean(df_val[['less_toxic']].copy(), 'less_toxic')['less_toxic']
        more_toxic = clean(df_val[['more_toxic']].copy(), 'more_toxic')['more_toxic']
        test_text = clean(df_sub[['text']].copy(), 'text')['text']
    else:
        less_toxic = df_val['less_toxic']
        more_toxic = df_val['more_toxic']
        test_text = df_sub['text']

    val_preds_arr1_tmp = np.zeros((df_val.shape[0], n_folds))
    val_preds_arr2_tmp = np.zeros((df_val.shape[0], n_folds))
    test_preds_arr_tmp = np.zeros((df_sub.shape[0], n_folds))

    for fld in range(n_folds):
        print("\n\n")
        print(f' ****************************** FOLD: {fld} ******************************')
        df = pd.read_csv(f'../input/jigsaw-folded-csv-files/{data_path_name}_fld{fld}.csv')
        print(df.shape)

        print("\nTrain:")
        # Train the pipeline
        pipeline.fit(df['text'], df['y'])

        # What are the important features for toxicity
        feature_step = pipeline['features']
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older versions.
        if hasattr(feature_step, 'get_feature_names_out'):
            feature_names = feature_step.get_feature_names_out()
        else:
            feature_names = feature_step.get_feature_names()

        print('\nTotal number of features:', len(feature_names))

        feature_wts = sorted(zip(feature_names, np.round(pipeline['clf'].coef_, 2)),
                             key=lambda x: x[1],
                             reverse=True)

        # Show the 50 features with the largest positive weights.
        display(pd.DataFrame(feature_wts[:50], columns=['feat', 'val']).T)

        print("\npredict validation data ")
        val_preds_arr1_tmp[:, fld] = pipeline.predict(less_toxic)
        val_preds_arr2_tmp[:, fld] = pipeline.predict(more_toxic)

        print("\npredict test data ")
        test_preds_arr_tmp[:, fld] = pipeline.predict(test_text)
    return val_preds_arr1_tmp, val_preds_arr2_tmp, test_preds_arr_tmp

# Toxic Training

In [None]:
# TF-IDF over 3-5 character n-grams (word-boundary aware) feeding a Ridge
# regressor, trained on the "df" fold files without cleaning.
features = FeatureUnion(transformer_list=[
    (
        "vect3",
        TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=3, max_df=0.5),
    ),
])
pipeline = Pipeline(steps=[("features", features), ("clf", Ridge())])

val_preds_arr1, val_preds_arr2, test_preds_arr = train_pipeline(
    pipeline, "df", n_folds, clean_prm=False
)


# Toxic __clean__ Training

In [None]:
# Same TF-IDF + Ridge pipeline, trained on the "df_clean" fold files; the
# validation/test text is cleaned before prediction (clean_prm=True).
features = FeatureUnion(transformer_list=[
    (
        "vect3",
        TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=3, max_df=0.5),
    ),
])
pipeline = Pipeline(steps=[("features", features), ("clf", Ridge())])

val_preds_arr1c, val_preds_arr2c, test_preds_arrc = train_pipeline(
    pipeline, "df_clean", n_folds, clean_prm=True
)


## Ruddit data Training

In [None]:
# Same TF-IDF + Ridge pipeline, trained on the "dfr" (Ruddit) fold files
# without cleaning.
features = FeatureUnion(transformer_list=[
    (
        "vect3",
        TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=3, max_df=0.5),
    ),
])
pipeline = Pipeline(steps=[("features", features), ("clf", Ridge())])

val_preds_arr1_, val_preds_arr2_, test_preds_arr_ = train_pipeline(
    pipeline, "dfr", n_folds, clean_prm=False
)


# Validate the pipeline 

In [None]:
# For each model, average the per-fold predictions and report the share of
# validation pairs where the less-toxic comment scores below the more-toxic one.
print(" Toxic data ")
p1, p2 = val_preds_arr1.mean(axis=1), val_preds_arr2.mean(axis=1)
print(f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}')

print(" Ruddit data ")
p3, p4 = val_preds_arr1_.mean(axis=1), val_preds_arr2_.mean(axis=1)
print(f'Validation Accuracy is { np.round((p3 < p4).mean() * 100,2)}')

print(" Toxic CLEAN data ")
p5, p6 = val_preds_arr1c.mean(axis=1), val_preds_arr2c.mean(axis=1)
print(f'Validation Accuracy is { np.round((p5 < p6).mean() * 100,2)}')


## Optimize the model weights for ensemble

In [None]:
print("Find right weight")

# Grid-search the blend weights: w1 (Toxic) runs over 0.30..0.69, w3 (Toxic
# clean) is the remainder 1 - w1 - w2 (about j/100, i.e. 0.00..0.19), and
# w2 (Ruddit) takes the rest. Each entry is (w1, w2, w3, pairwise accuracy in %).
wts_acc = []
for i in range(30,70,1):
    for j in range(0,20,1):
        w1 = i/100
        w2 = (100 - i - j)/100
        w3 = (1 - w1 - w2 )
        p1_wt = w1*p1 + w2*p3 + w3*p5
        p2_wt = w1*p2 + w2*p4 + w3*p6
        wts_acc.append( (w1,w2,w3, 
                         np.round((p1_wt < p2_wt).mean() * 100,2))
                      )

# Print the five most accurate combinations, best first. The sorted result
# used to be discarded and only the first combination tried was printed.
# wts_acc itself keeps its original order.
print(*sorted(wts_acc, key=lambda x: x[3], reverse=True)[:5], sep="\n")

In [None]:
# Pick the combination with the highest validation accuracy. The accuracy is
# element 3 of each tuple; element 2 is w3, so ranking by it always selected
# the largest w3 regardless of accuracy. max() returns the first best entry,
# matching a stable descending sort.
w1, w2, w3, _ = max(wts_acc, key=lambda x: x[3])

print(f'w1 : {w1}, w2: {w2}, w3: {w3}')

# Blended validation scores for the less-toxic (p1_wt) and more-toxic (p2_wt) side.
p1_wt = w1*p1 + w2*p3 + w3*p5
p2_wt = w1*p2 + w2*p4 + w3*p6

## Analyze bad predictions 
### Incorrect predictions with similar scores
### Incorrect predictions with different scores

In [None]:
# Attach the blended scores to the validation pairs for error analysis.
df_val['p1'] = p1_wt  # blended score of the less_toxic comment
df_val['p2'] = p2_wt  # blended score of the more_toxic comment
df_val['diff'] = np.abs(p2_wt - p1_wt)  # absolute score gap within the pair

# 1 when the less-toxic comment got the strictly lower score, else 0.
df_val['correct'] = (p1_wt < p2_wt).astype('int')


In [None]:

### Incorrect predictions with similar scores
# Mis-ordered pairs whose less-toxic score is below half of the maximum p1,
# smallest score gap first.

df_val[(df_val.correct == 0) & (df_val.p1 < 0.5*df_val.p1.max())].sort_values('diff', ascending=True).head(20)

In [None]:
# Mis-ordered pairs whose less-toxic score is above half of the maximum p1,
# smallest score gap first.
df_val[(df_val.correct == 0) & (df_val.p1 > 0.5*df_val.p1.max())].sort_values('diff', ascending=True).head(20)

#### Some of these just look incorrectly tagged 


In [None]:
### Incorrect predictions with dissimilar scores
# Mis-ordered pairs with the largest score gap first.

df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20)

In [None]:
# Mis-ordered pairs whose gap is below 40% of the largest gap, largest first.
df_val[(df_val.correct == 0) & (df_val['diff'] < 0.4*df_val['diff'].max())].sort_values('diff', ascending=False).head(20)


# Predict on test data 

In [None]:
# Blend the fold-averaged test predictions of the three TF-IDF/Ridge models
# using the selected weights (Toxic, Ruddit, Toxic clean).
df_sub['score'] = (
    w1 * test_preds_arr.mean(axis=1)
    + w2 * test_preds_arr_.mean(axis=1)
    + w3 * test_preds_arrc.mean(axis=1)
)

## Correct the rank ordering

In [None]:
# Number of rows whose score duplicates another row's score
# (non-null scores minus distinct scores).

df_sub['score'].count() - df_sub['score'].nunique()

In [None]:
# The ten most frequent score values together with their counts.
same_score = df_sub['score'].value_counts().reset_index().head(10)
same_score

In [None]:
# Rows whose score is one of the most frequent values. The score values sit in
# the first column of `same_score`; select it by position because its name
# depends on the pandas version ('index' before 2.0, 'score' from 2.0 on).
df_sub[df_sub['score'].isin(same_score.iloc[:, 0].tolist())]

In [None]:
# Same comments have same score - which is ok 

In [None]:
# # Rank the predictions 

# df_sub['score']  = scipy.stats.rankdata(df_sub['score'], method='ordinal')

# print(df_sub['score'].rank().nunique())

# Ensemble DLM

In [None]:
import os
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

from tqdm import tqdm

In [None]:
# Ask CUDA to run kernel launches synchronously (a debugging aid: errors are
# then reported at the call that caused them).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Settings shared by the transformer inference cells. set_model_name() later
# adds the 'model_name' and 'tokenizer' entries.
CONFIG = dict(
        seed=42,  # passed to set_seed()
        test_batch_size=64,  # DataLoader batch size
        max_length=128,  # tokenizer truncation / padding length
        num_classes=1,  # output width of the JigsawModel linear head
        device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    )

In [None]:
def set_model_name(model_code):
    """Select the pretrained backbone and load its tokenizer into ``CONFIG``.

    Sets ``CONFIG['model_name']`` to the checkpoint directory for
    ``model_code`` and ``CONFIG['tokenizer']`` to the matching tokenizer.

    Args:
        model_code: ``'roberta'`` or ``'hatebert'``.

    Raises:
        ValueError: if ``model_code`` is not a known code (this used to
            surface as an opaque ``UnboundLocalError``).
    """
    model_dirs = {
        'roberta': '../input/roberta-base',
        'hatebert': '../input/d/withmaster/hatebert/GroNLP/model',
    }
    if model_code not in model_dirs:
        raise ValueError(
            f'Unknown model_code {model_code!r}; expected one of {sorted(model_dirs)}'
        )

    CONFIG['model_name'] = model_dirs[model_code]
    CONFIG['tokenizer'] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

def set_model_path_name(model_path_name):
    """Return the five fold checkpoint paths under ``../input/{model_path_name}``.

    The files are named ``Loss-Fold-0.bin`` through ``Loss-Fold-4.bin``.
    """
    return [
        f'../input/{model_path_name}/Loss-Fold-{fold}.bin'
        for fold in range(5)
    ]

In [None]:
def set_seed(seed=42):
    """Seed NumPy, ``random`` and torch (CPU and CUDA) for repeatable runs.

    Also switches cuDNN to deterministic mode with benchmarking disabled and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.
    """
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    
class JigsawDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a dataframe on access.

    Each item is a dict with ``'ids'`` (token ids) and ``'mask'`` (attention
    mask), both ``torch.long`` tensors padded/truncated to ``max_length``.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Keep the raw strings as an array so items are looked up by position.
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        return {
            out_key: torch.tensor(encoded[enc_key], dtype=torch.long)
            for out_key, enc_key in (('ids', 'input_ids'), ('mask', 'attention_mask'))
        }
    
class JigsawModel(nn.Module):
    """Pretrained transformer followed by dropout and a linear output head.

    The head maps a 768-wide vector to ``CONFIG['num_classes']`` outputs.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, CONFIG['num_classes'])

    def forward(self, ids, mask):
        encoded = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Element 1 of the backbone output feeds the head
        # (presumably the pooled sentence representation; confirm for the backbone used).
        return self.fc(self.drop(encoded[1]))
    
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode and collect its outputs.

    Args:
        model: module called as ``model(ids, mask)``.
        dataloader: yields dicts with ``'ids'`` and ``'mask'`` tensors.
        device: device the batch tensors are moved to.

    Returns:
        1-D NumPy array with the flattened outputs of all batches, in
        dataloader order.
    """
    model.eval()

    PREDS = []

    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        # No detach() needed: gradients are disabled by the decorator.
        PREDS.append(outputs.view(-1).cpu().numpy())

    PREDS = np.concatenate(PREDS)
    gc.collect()

    return PREDS

def inference(model_paths, dataloader, device):
    """Average the predictions of several checkpoints over ``dataloader``.

    A fresh ``JigsawModel`` (backbone ``CONFIG['model_name']``) is built for
    every path, loaded with that checkpoint's weights and evaluated with
    ``valid_fn``.

    Args:
        model_paths: iterable of checkpoint file paths.
        dataloader: data to predict on.
        device: device used for the model, the checkpoint tensors and the batches.

    Returns:
        NumPy array holding the element-wise mean prediction across checkpoints.
    """
    final_preds = []
    for i, path in enumerate(model_paths):
        model = JigsawModel(CONFIG['model_name'])
        # Use the `device` argument consistently instead of CONFIG['device'].
        model.to(device)
        # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
        model.load_state_dict(torch.load(path, map_location=device))

        print(f'Getting predictions for model {i+1}')
        preds = valid_fn(model, dataloader, device)
        final_preds.append(preds)

    final_preds = np.array(final_preds)
    final_preds = np.mean(final_preds, axis=0)
    return final_preds

In [None]:
# Comments to score, read again for the transformer models (the same file that
# df_sub was loaded from).
df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
df.head()

In [None]:
# RoBERTa checkpoints from the `pytorch-w-b-jigsaw-starter` dataset.
set_model_name('roberta')
# Pass only the dataset directory name: set_model_path_name() already prepends
# '../input/', so a full path produced '../input/../input/...'.
MODEL_PATHS = set_model_path_name('pytorch-w-b-jigsaw-starter')
set_seed(CONFIG['seed'])

test_dataset = JigsawDataset(df, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                        num_workers=2, shuffle=False, pin_memory=True)
# Mean prediction over the fold checkpoints.
preds1 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

In [None]:
# RoBERTa checkpoints from the `jigsaw-starter-roberta-rowruddit-mrl` dataset.
set_model_name('roberta')
# Pass only the dataset directory name: set_model_path_name() already prepends
# '../input/', so a full path produced '../input/../input/...'.
MODEL_PATHS = set_model_path_name('jigsaw-starter-roberta-rowruddit-mrl')
set_seed(CONFIG['seed'])

test_dataset = JigsawDataset(df, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                        num_workers=2, shuffle=False, pin_memory=True)
# Mean prediction over the fold checkpoints.
preds2 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

In [None]:
# RoBERTa checkpoints from the `jigsaw-starter-roberta-1st-mrl` dataset.
set_model_name('roberta')
# Pass only the dataset directory name: set_model_path_name() already prepends
# '../input/', so a full path produced '../input/../input/...'.
MODEL_PATHS = set_model_path_name('jigsaw-starter-roberta-1st-mrl')

set_seed(CONFIG['seed'])

test_dataset = JigsawDataset(df, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                        num_workers=2, shuffle=False, pin_memory=True)
# Mean prediction over the fold checkpoints.
preds3 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

In [None]:
# Min-max scale each transformer ensemble's predictions to [0, 1], then take
# the plain average of the three.
preds = sum(
    (p - p.min()) / (p.max() - p.min())
    for p in (preds1, preds2, preds3)
)

preds = preds/3

In [None]:
# Final blend: 85% existing df_sub['score'] + 15% `preds`.
# NOTE(review): this overwrites df_sub['score'] in place, so re-running the
# cell blends `preds` in a second time.
df_sub['score'] = df_sub['score']*0.85+preds*0.15

In [None]:
# Write the submission file with the comment ids and their final scores.
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)