In [None]:
import torch
import torch.nn as nn
import re
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import gc
import scipy
import os
import nltk
import itertools

from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from transformers import BertModel, BertTokenizerFast, BertConfig, BertForSequenceClassification, TrainingArguments, Trainer

from datasets import Dataset, load_dataset, DatasetDict
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn import metrics, model_selection, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm
# Register tqdm with pandas so that `progress_apply` (used below) is available.
tqdm.pandas()
# Environment switches set before any tokenizer/Trainer is created.
# NOTE(review): presumably these enable tokenizers' parallelism and disable the
# Weights & Biases integration — confirm against the library docs.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["WANDB_DISABLED"] = "true"

# 0. Introduction.

In this notebook I'm going to create an ensemble of different models to implement a pointwise ranking approach. 
In other words, I'm going to build a regressor that predicts the toxicity of each document in the test dataset separately.

Here I'm going to build the following pipeline (though no actual pipeline objects are used in this notebook):
1. Split the test data into two groups (using some binary classifier toxic/nontoxic) and assign to each group some base toxicity score.
2. Measure the toxicity of all test data (using some regressor) and add it to the base score.

$$ \textbf{FinalRegressor}(x) = baseScore \cdot \textbf{BinClassifier}(x) + \textbf{SomeRegressor}(x)$$

## 0.1. Data for models.

1. Jigsaw toxic comment classification challenge competition data.
2. Ruddit data.

## 0.2. Steps.
1. Fine-tune a model built on top of the pretrained BERT model to solve the binary classification task (nontoxic/toxic). For this purpose I will use the Hugging Face libraries (datasets, Trainer, etc.).
2. Train an ensemble of linear regressors (Ridge) on TF-IDF features (using different TfidfVectorizer parameters). 
3. Find the best $baseScore$ for the classifier from the first step using the competition validation data.


## 0.3. Text preprocessing methods.

Below I'm going to define all preprocessing methods that I found at some public notebooks and modified a bit (they are quite popular so I don't know their true origin).

In [None]:
# Maps a canonical replacement token to the list of regular expressions that
# should be rewritten to it.  `clean` applies the entries in insertion order,
# substituting every pattern of a key with that key (including its spaces).
#
# All patterns are raw strings: in a normal string literal '\b' is a backspace
# character rather than a regex word boundary, and escapes such as '\$' or
# '\s' are invalid string escapes that newer Python versions warn about.
RE_PATTERNS = {
    ' fuck':
        [
            r'(f)([^a-z]*)(u)([^a-z]*)(c)([^a-z]*)(k)',
            r' f[!@#\$%\^\&\*]*u[!@#\$%\^&\*]*k',
            r' f[!@#\$%\^\&\*]*u[!@#\$%\^&\*]*[ck]+[a-z@#\$%\^&\*]*',
            r'f u u c',
            r'(f)(c|[^a-z ])(u|[^a-z ])(k)',
            r'feck ', r' fux ', r'f\*\*',
            r'f\-ing', r'f\.u\.', r'f###', r' fu ', r'f@ck', r'f u c k', r'f uck\b', r'\bf ck\b', r'\bfuk\b', r'wtf', r'fucck', r'f cking', r'fcking'
        ],

    ' ass ':
        [
            r'[^a-z]ass ', r'[^a-z]azz ', r'arrse', r' arse ', r'@\$\$',
            r'[^a-z]anus', r' a\*s\*s', r'[^a-z]ass[^a-z ]',
            r'a[@#\$%\^&\*][@#\$%\^&\*]', r'[^a-z]anal ', r'a s s'
        ],

    ' asshole ':
        [
            r' a[s|z]*wipe', r'a[s|z]*[w]*h[o|0]+[l]*e', r'@\$\$hole', r'ass hole'
        ],

    ' bitch ':
        [
            r'b[w]*i[t]*ch', r'b!tch',
            r'bi\+ch', r'b!\+ch', r'(b)([^a-z]*)(i)([^a-z]*)(t)([^a-z]*)(c)([^a-z]*)(h)',
            r'biatch', r'bi\*\*h', r'bytch', r'b i t c h', r'beetch'
        ],

    ' bastard ':
        [
            r'ba[s|z]+t[e|a]+rd'
        ],

    ' transgender':
        [
            r'trans gender'
        ],

    ' cock ':
        [
            r'[^a-z]cock', r'c0ck', r'[^a-z]cok ', r'c0k', r'[^a-z]cok[^aeiou]', r' cawk',
            r'(c)([^a-z ])(o)([^a-z ]*)(c)([^a-z ]*)(k)', r'c o c k'
        ],

    ' dick ':
        [
            r' dick[^aeiou]', r'deek', r'd i c k', r'diick ', r'd+\s?[\*i1!-]+\s?[\*c-]+\s?[\*k-]+'
        ],

    ' suck ':
        [
            r'sucker', r'(s)([^a-z ]*)(u)([^a-z ]*)(c)([^a-z ]*)(k)', r'sucks', r'5uck', r's u c k'
        ],

    ' cunt ':
        [
            r'c u n t', r'\sc+[ -]?[@u*]+[ -]*[n*-]{1,3}\s?[t*-]'
        ],

    ' bullshit ':
        [
            r'bullsh\*t', r'bull\$hit'
        ],

    ' homosexual':
        [
            r'homo sexual', r'homosex'
        ],

    ' idiot ':
        [
            r'i[d]+io[t]+', r'(i)([^a-z ]*)(d)([^a-z ]*)(i)([^a-z ]*)(o)([^a-z ]*)(t)', r'idiots', r'i d i o t'
        ],

    ' dumb ':
        [
            r'(d)([^a-z ]*)(u)([^a-z ]*)(m)([^a-z ]*)(b)'
        ],

    ' shit ':
        [
            r'shitty', r's[ -]?[h*][ -]*[i!*][ -]*t+', r'shite', r'\$hit\b', r's h i t'
        ],

    ' shithole ':
        [
            r'shythole', r'shit hole'
        ],

    ' retard ':
        [
            r'returd', r'retad', r'retard', r'wiktard', r'wikitud'
        ],

    ' dumbass':
        [
            r'dumb ass', r'dubass'
        ],

    ' asshead':
        [
            r'butthead', r'ass head'
        ],

    ' sex ':
        [
            r's3x',
        ],

    ' nigger ':
        [
            r'nigger', r'ni[g]+a', r' nigr ', r'negrito', r'niguh', r'n3gr', r'n i g g e r', r'\sn+[ -]?[i*]+[ -]*[g*-]{1,3}[ae*-]+\s?[r*]?'
        ],

    ' shut the fuck up':
        [
            r'stfu'
        ],

    ' pussy ':
        [
            r'pussy[^c]', r'pusy', r'pussi[^l]', r'pusses', r'\sp+[ -]?[u*]+[ -]*[$s*-]{1,3}\s?[yi]'
        ],

    ' faggot ':
        [
            r'faggot', r' fa[g]+[s]*[^a-z ]', r'fagot', r'f a g g o t', r'faggit',
            r'(f)([^a-z ]*)(a)([^a-z ]*)([g]+)([^a-z ]*)(o)([^a-z ]*)(t)', r'fau[g]+ot', r'fae[g]+ot',
        ],

    ' motherfucker':
        [
            r' motha ', r' motha f', r' mother f', r'motherucker', r'mother fucker'
        ],

    ' whore ':
        [
            r'wh\*\*\*', r'w h o r e', r'\sw+[ -]?[h*]+[ -]*[o*-]{1,3}\s?[r*]+\s?[e*]?'
        ],
    ' kill ':
        [
            r'\sk+[ -]?[!1i*]+[ -]*[1l*-]{1,3}'
        ],
    ' cocksucker ':
        [
            r'\sc+[ -]?[!o0*]+[ -]?[c*-]{1,3}[ -]?[*k-]+[ -]?[s*]+[u*]+[ -]?[a-z]*'
        ]
}

In [None]:
# these methods are from https://www.kaggle.com/andre112/0-826-hate-speech-ridgeregression-ensemble

def replace_abbrev(text):
    """Expand common English contractions in ``text`` and return the result.

    The rules are applied one after another in the order listed, so more
    specific rules (e.g. ``'scuse``) must stay ahead of the generic ``'s`` rule.
    """
    rules = (
        (r"what's", "what is "),
        (r"\'ve", " have "),
        (r"(\w+)(n't)", r"\1 not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        # complete -ing: "goin' " becomes "going "
        (r'(\w+in)(\')(\s)', r'\1g\3'),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text



def replace_multi_punc(text):
    """Collapse runs of four or more identical '.', '!', '?' or '*' characters.

    Dots, exclamation marks and question marks become the marker tokens
    ' mpm ', ' mxm ' and ' mqm ' respectively; asterisks collapse to one '*'.
    """
    for pattern, token in (
        (r'([.])\1\1{2,}', r' mpm '),
        (r'([!])\1\1{2,}', r' mxm '),
        (r'([?])\1\1{2,}', r' mqm '),
        (r'([*])\1\1{2,}', r'*'),
    ):
        text = re.sub(pattern, token, text)
    return text

def replace_url(text):
    """Replace URLs with the token "url" and strip the '#' from hashtags.

    A URL is anything starting with ``www.`` or ``http(s)://`` up to the next
    whitespace.  Afterwards every ``#word`` is reduced to ``word``.
    """
    # Raw strings: '\.' and '\s' are not valid escapes in a normal string literal.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'url', text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text



In [None]:
# One character class made of the code-point ranges below; matches one or more
# consecutive characters from any of them.
emoji_pattern = re.compile(
    "[" + "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )) + "]+",
    flags=re.UNICODE,
)


def stem(text, stemmer=SnowballStemmer('english')):
    """Stem each whitespace-separated token of ``text`` and join them with single spaces."""
    stemmed_tokens = map(stemmer.stem, text.split())
    return ' '.join(stemmed_tokens)

def lemm(text, lemmatizer=WordNetLemmatizer()):
    """Tokenize ``text`` with NLTK, lemmatize every token and join them with single spaces."""
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in nltk.tokenize.word_tokenize(text)
    )

def clean(data, stem_on=False, lemm_on=True) -> str:
    """Normalise one raw comment for the toxicity models.

    The steps run in a fixed order: lower-casing, contraction expansion,
    replacement of user names / dates / times / URLs / IP addresses with
    placeholder tokens, collapsing of repeated punctuation and letters,
    canonicalisation of obfuscated profanity via ``RE_PATTERNS``, removal of
    every character other than ``a-z . ! ? '``, and whitespace squeezing.

    Args:
        data: the comment text.
        stem_on: when true, stem the cleaned text with ``stem``.
        lemm_on: when true, lemmatize the cleaned text with ``lemm``.

    Returns:
        The cleaned text.
    """
    data = data.lower()
    data = data.strip("\" ")

    data = replace_abbrev(data)
    # replace "user:<name>" mentions with a placeholder
    # (the text is already lower-case, so the upper-case alternatives never fire)
    data = re.sub(r"(u|U)ser:[a-zA-Z\d]{3,15}", 'stmsr', data)
    # replace dates such as "12 jan 2010 (utc)" with a placeholder
    data = re.sub(r"([\d]{1,2}\s([jJ]an|[fF]eb|[mM]ar|[aA]pr|[mM]ay|[jJ]un|[jJ]ul|[aA]ug|[sS]ep|[oO]ct|[nN]ov|[dD]ec),?[a-z]{0,6},?\s[\d]{4}(\s?\([a-zA-Z]{3}\))?)", 'dttm', data)
    # replace times such as "12:30" with the same placeholder
    data = re.sub(r'[\d]{2}:[\d]{2}', 'dttm', data)

    data = replace_url(data)
    # newlines become spaces
    data = re.sub('\n', ' ', data)
    # replace IP-like dotted numbers with a placeholder
    data = re.sub(r'(([0-9]+\.){2,}[0-9]+)', 'stmip', data)

    # collapse runs of repeated punctuation into marker tokens
    data = replace_multi_punc(data)
    # letters repeated 3+ times at the end of a word are cut to two,
    # letters repeated 4+ times inside a word are cut to three
    data = re.sub(r'([a-z])\1{2,}\b', r'\1\1', data)
    data = re.sub(r'([a-z])\1\1{2,}\B', r'\1\1\1', data)

    # canonicalise obfuscated profanity; must run before special characters are stripped
    for target, patterns in RE_PATTERNS.items():
        for pat in patterns:
            data = re.sub(pat, target, data)

    data = emoji_pattern.sub(r'', data)
    # remove all special characters
    data = re.sub(r"[^a-z.!?\']", " ", data)

    # remove extra spaces (raw string: '\s' is not a valid escape in a normal literal)
    data = re.sub(r'\s+', ' ', data)

    if stem_on:
        data = stem(data)

    if lemm_on:
        data = lemm(data)

    return data


def simple_clean(data, lemm_on):
    """Lightweight variant of ``clean``.

    Lower-cases the text, expands contractions, replaces "user:<name>"
    mentions with a placeholder, keeps only ``a-z . ! ? '`` characters and
    squeezes whitespace.  Optionally lemmatizes the result.

    Args:
        data: the comment text.
        lemm_on: when true, lemmatize the cleaned text with ``lemm``.

    Returns:
        The cleaned text.
    """
    data = data.lower()
    data = data.strip("\" ")

    data = replace_abbrev(data)
    # replace "user:<name>" mentions with a placeholder
    data = re.sub(r"(u|U)ser:[a-zA-Z\d]{3,15}", 'stmsr', data)
    # remove all special characters
    data = re.sub(r"[^a-z.!?\']", " ", data)

    data = emoji_pattern.sub(r'', data)
    # remove extra spaces (raw string: '\s' is not a valid escape in a normal literal)
    data = re.sub(r'\s+', ' ', data)
    if lemm_on:
        data = lemm(data)

    return data


# 1. BERT Binary Classifier.

In [None]:
# Local path of the pretrained BERT checkpoint; used for both tokenizer and model.
BERT_NAME = '../input/d/xhlulu/huggingface-bert/bert-base-uncased'
# Token length every example is padded/truncated to by the tokenizer.
MAX_LEN = 400
# Per-device train/eval batch size; also the batch size of the validation dataloader.
BATCH_SIZE = 8
# Fraction of the balanced classification data held out by train_test_split.
VAL_SPLIT = 0.2
# Number of training epochs passed to TrainingArguments.
N_EPOCH = 14 * 3
# Device the model and the batches are moved to during inference.
DEVICE = 'cuda'
# Learning rate passed to TrainingArguments.
LEARNING_RATE = 5e-4

## 1.1. Data preparations.

In [None]:
# Fast BERT tokenizer loaded from the same local checkpoint as the model.
tokenizer = BertTokenizerFast.from_pretrained(BERT_NAME)
# Disabled RoBERTa alternative (neither RobertaTokenizer nor ROBERTA_NAME is defined in this notebook).
#tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_NAME)

def get_tokenize_func(column_name):
    """Return a ``Dataset.map`` callback that tokenizes the ``column_name`` field.

    The callback pads/truncates to ``MAX_LEN`` tokens and returns input ids
    and attention masks (token type ids are not returned).
    """
    def tokenize_func(examples):
        options = dict(
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            max_length=MAX_LEN,
            return_token_type_ids=False,
            return_attention_mask=True,
        )
        return tokenizer(examples[column_name], **options)

    return tokenize_func

def get_clean_column_func(column_name):
    """Return a ``Dataset.map`` callback that runs ``clean`` on one field of an example."""
    def func(ds):
        cleaned_text = clean(ds[column_name])
        ds[column_name] = cleaned_text
        return ds

    return func

### 1.1.1. Prepare the contest's test data.

In [None]:
# Load the comments to be scored, clean the `text` column, then tokenize it one
# example at a time (batched=False).
contest_test_ds = load_dataset('csv', data_files='../input/jigsaw-toxic-severity-rating/comments_to_score.csv') 
contest_test_ds = contest_test_ds.map(get_clean_column_func('text'))
contest_test_ds = contest_test_ds.map(get_tokenize_func("text"), batched=False)

# The CSV is exposed under the 'train' split; only the model inputs are returned as tensors.
contest_test_ds['train'].set_format(type='torch', columns=['input_ids', 'attention_mask'])
contest_test_dataloader = torch.utils.data.DataLoader(contest_test_ds['train'], batch_size=32)

### 1.1.2. Prepare train data (jigsaw-toxic classification).

In [None]:
# Jigsaw toxic-comment classification training data: rename `comment_text` to
# `comment` and drop the `id` column so the text is the first column.
df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')
df = df.rename(columns={"comment_text": "comment"})
df = df.drop(columns=['id'])
df.reset_index(drop=True, inplace=True)

# Run the full text cleaning on every comment (with a progress bar).
df['comment'] = df['comment'].progress_apply(clean)


In [None]:
def add_all(row):
    """Return 1 when the row carries no toxicity label at all, otherwise 0.

    The first field (the comment text) is skipped; everything after it is summed.
    """
    total = row[1:].sum()
    return 0 if total > 0 else 1

# Row-wise flag: 1 when none of the label columns is set, 0 otherwise.
df['nontoxic'] = df.progress_apply(add_all, axis='columns')
df.head()

In [None]:
# Character length of every (already cleaned) comment, aligned with the rows of `df`.
lens = np.array([len(comment) for comment in df['comment']])

In [None]:
# Balance the classes: keep every toxic comment and draw an equally sized random
# sample of non-toxic comments whose cleaned text is longer than 45 characters.
# Both random draws are seeded (same seed as the TF-IDF balancing cell further
# down) so that Restart & Run All reproduces the same training set.
new_df = pd.concat([
    df[np.logical_and(df['nontoxic'] == 1, lens > 45)].sample((df['nontoxic'] == 0).sum(), random_state=42),
    df[df['nontoxic'] == 0],
])
# Shuffle so toxic and non-toxic rows are interleaved.
new_df = new_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
def gather_labels(row):
    """Flip a 0/1 `nontoxic` flag into the classifier label (1 = toxic, 0 = non-toxic)."""
    return 1 - row

# Vectorised: one arithmetic operation on the column instead of a row-wise apply
# over a one-column frame (which is slow and returns a DataFrame).
new_df['labels'] = gather_labels(new_df['nontoxic'])
# Only the text and the binary label are needed from here on.
new_df = new_df.drop(columns=['identity_hate', 'obscene', 'toxic', 'insult', 'threat', 'severe_toxic', 'nontoxic'])

In [None]:
# Build the Hugging Face dataset, hold out VAL_SPLIT of it for evaluation and
# tokenize the comments in batches.  The split is seeded so the train/eval
# partition is the same on every run.
classification_ds = Dataset.from_pandas(new_df)
classification_ds = classification_ds.train_test_split(VAL_SPLIT, seed=42)
classification_ds = classification_ds.map(get_tokenize_func("comment"), batched=True)
# Expose only the model inputs and the labels, as torch tensors.
classification_ds.set_format(type='torch', columns=['attention_mask', 'input_ids', 'labels'])

### 1.1.3. Contest validation data.

In [None]:
# Contest validation pairs: each row has a `less_toxic` and a `more_toxic` comment.
# Both columns are cleaned first.
contest_val_ds = load_dataset('csv', data_files='/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv') 
contest_val_ds = contest_val_ds.map(get_clean_column_func('less_toxic'))
contest_val_ds = contest_val_ds.map(get_clean_column_func('more_toxic'))

# Tokenize `less_toxic`, then move the generic tokenizer outputs to `less_*`
# columns so the second tokenization pass does not overwrite them.
contest_val_ds = contest_val_ds.map(get_tokenize_func("less_toxic"), batched=False)
contest_val_ds = contest_val_ds.map(lambda example: {'less_attention_mask': example['attention_mask'], 'less_input_ids': example['input_ids']}, 
                                     remove_columns=['attention_mask', 'input_ids'])

# Same for `more_toxic`, stored under `more_*` columns.
contest_val_ds = contest_val_ds.map(get_tokenize_func("more_toxic"), batched=False)
contest_val_ds = contest_val_ds.map(lambda example: {'more_attention_mask': example['attention_mask'], 'more_input_ids': example['input_ids']}, 
                                     remove_columns=['attention_mask', 'input_ids'])

In [None]:
# Return the four tokenized columns as torch tensors and batch them for inference.
contest_val_ds['train'].set_format(type='torch', columns=['less_input_ids', 'less_attention_mask', 'more_input_ids', 'more_attention_mask'])
contest_val_dataloader = torch.utils.data.DataLoader(contest_val_ds['train'], batch_size=BATCH_SIZE)

## 1.2. BERT binary classifier training.

In [None]:
# Start from the checkpoint's own configuration and override only what this task
# needs, instead of relying on the library defaults happening to match the
# architecture stored at BERT_NAME.
configuration = BertConfig.from_pretrained(BERT_NAME, classifier_dropout=0.003, num_labels=2)
model = BertForSequenceClassification.from_pretrained(BERT_NAME, config=configuration)

# Freeze every parameter except the classification head.
for name, param in model.named_parameters():
    if 'classifier' not in name:
        param.requires_grad = False

# Evaluate and checkpoint once per epoch; keep a single checkpoint on disk and
# reload the best one when training finishes.
training_args = TrainingArguments(
    "test_trainer",
    learning_rate=LEARNING_RATE,
    num_train_epochs=N_EPOCH,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    save_total_limit=1,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=classification_ds['train'],
    eval_dataset=classification_ds['test'],
)
trainer.train()

In [None]:
# Run the garbage collector and release cached CUDA memory after training.
gc.collect()
torch.cuda.empty_cache()

Now I'm going to get nontoxic/toxic classes for the validation and test data.

In [None]:
def validate_model(model, dataloader):
    """Predict the binary class of both comments of every validation pair.

    Args:
        model: sequence classifier whose output exposes ``.logits``.
        dataloader: yields dict batches with the keys ``less_input_ids``,
            ``less_attention_mask``, ``more_input_ids`` and ``more_attention_mask``.

    Returns:
        A ``(less_classes, more_classes)`` tuple of float arrays holding the
        arg-max class index of each comment, in dataloader order.
    """
    model.eval()
    model.to(DEVICE)
    less_classes = []
    more_classes = []
    with torch.no_grad():
        for data in tqdm(dataloader, total=len(dataloader)):
            less_logits = model(data['less_input_ids'].to(DEVICE), data['less_attention_mask'].to(DEVICE)).logits
            more_logits = model(data['more_input_ids'].to(DEVICE), data['more_attention_mask'].to(DEVICE)).logits
            less_classes.extend(less_logits.argmax(dim=-1).cpu().tolist())
            more_classes.extend(more_logits.argmax(dim=-1).cpu().tolist())
    # The builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
    return np.array(less_classes, dtype=float), np.array(more_classes, dtype=float)

# (less_toxic, more_toxic) predicted classes for the contest validation pairs.
bert_val_scores = validate_model(model, contest_val_dataloader)

In [None]:
# Classify every contest test comment with the fine-tuned model.
model.eval()
model.to(DEVICE)
all_scores = []  # one logits row per test comment
with torch.no_grad():
    for _, data in tqdm(enumerate(contest_test_dataloader), total=len(contest_test_dataloader)):
        logits = model(data['input_ids'].to(DEVICE), data['attention_mask'].to(DEVICE)).logits
        all_scores.extend(logits.cpu().detach().numpy())
# Predicted class index (arg-max over the logits) per comment.
bert_test_scores = np.array([np.argmax(row) for row in all_scores])

# 2. Linear regressors on TF-IDF documents features.

## 2.1. Data preparations.

In [None]:
# Reload the raw Jigsaw classification data; this rebinds `df`, discarding the
# cleaned frame built for the BERT classifier.
df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')
df = df.rename(columns={"comment_text": "comment"})
df = df.drop(columns=['id'])
df.reset_index(drop=True, inplace=True)

In [None]:
def add_all(row):
    """Return 1 when the row carries no toxicity label at all, otherwise 0.

    The first field (the comment text) is skipped; everything after it is summed.
    This repeats the definition used in the BERT section.
    """
    total = row[1:].sum()
    return 0 if total > 0 else 1

# Row-wise flag: 1 when none of the label columns is set, 0 otherwise.
df['nontoxic'] = df.apply(add_all, axis='columns')

In [None]:
# Character lengths of the raw comments (cleaning is applied only afterwards).
lens = np.array(list(map(lambda x: len(x), df['comment'])))
# Balance the classes: all toxic rows plus an equally sized, seeded sample of
# non-toxic rows longer than 25 characters.
new_clf_df = pd.concat([df[np.logical_and(df['nontoxic'] == 1, lens > 25)].sample((df['nontoxic'] == 0).sum(), random_state=42), df[df['nontoxic'] == 0]])
# Seeded shuffle so the two classes are interleaved.
new_clf_df = new_clf_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Clean only the balanced subset rather than the whole training file.
new_clf_df['comment'] = new_clf_df['comment'].apply(clean)

In [None]:
# Label columns and the weight of each one in the severity score; the two
# sequences are aligned by position.
labels = ['obscene', 'threat', 'insult', 'identity_hate', 'toxic', 'severe_toxic']
label_weights = np.array([0.06, 0.09, 0.13, 0.15, 0.45, 0.25])

In [None]:
def gather_labels(row):
    """Return the weighted sum of one row's label values using ``label_weights``.

    Note: this redefines the ``gather_labels`` helper from the BERT section.
    """
    label_values = np.array(row.values)
    return np.dot(label_weights, label_values)

# Regression target: weighted sum of the six label columns, computed row by row.
new_clf_df['score'] = new_clf_df[labels].apply(gather_labels, axis=1)

In [None]:
# Keep only the text and the regression target.
new_clf_df = new_clf_df.drop(columns=[*labels, 'nontoxic'])


In [None]:
# Ruddit data: drop deleted comments and the id/url columns, then rename the
# columns to the `comment` / `score` schema shared with `new_clf_df`.
df = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
df = df[df['txt'] != '[deleted]']
df = df.drop(columns=['post_id', 'comment_id', 'url'])
df.reset_index(drop=True, inplace=True)
df = df.rename(columns={"txt": "comment", "offensiveness_score" : "score"})
df["comment"] = df["comment"].apply(clean)

# Scores that are not strictly positive are set to 0.
df['score'] = df['score'].apply(lambda x: x if x > 0 else 0 )
ruddit_df = df
ruddit_df

In [None]:
# Contest validation pairs as a DataFrame, with cleaned copies of both comments
# stored in the `*_clean` columns that `validate` reads.
val_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

val_df['less_toxic_clean'] = val_df['less_toxic'].apply(clean)
val_df['more_toxic_clean'] = val_df['more_toxic'].apply(clean)

## 2.2. Training.

In [None]:
def train_model(train_df, tfidf_vec):
    """Fit a Ridge regressor (alpha=0.01) on TF-IDF features of ``train_df``.

    Args:
        train_df: frame with a ``comment`` text column and a ``score`` target column.
        tfidf_vec: an already fitted vectorizer providing ``transform``.

    Returns:
        The fitted Ridge model.
    """
    features = tfidf_vec.transform(train_df['comment'].values.tolist())
    targets = train_df["score"].values
    return linear_model.Ridge(alpha=0.01).fit(features, targets)

In [None]:
def validate(model, tfidf_vec, val_df):
    """Score both comments of every validation pair with one regressor.

    Args:
        model: fitted regressor providing ``predict``.
        tfidf_vec: fitted vectorizer providing ``transform``.
        val_df: frame with ``less_toxic_clean`` and ``more_toxic_clean`` columns.

    Returns:
        A ``(pred_less, pred_more)`` tuple of arrays with the predicted scores
        of the less-toxic and the more-toxic comments, in row order.
    """
    less_features = tfidf_vec.transform(val_df['less_toxic_clean'].values)
    more_features = tfidf_vec.transform(val_df['more_toxic_clean'].values)

    pred_less = np.array(model.predict(less_features))
    pred_more = np.array(model.predict(more_features))

    # The former sorted score differences and the re-read of the validation CSV
    # were never used, so they are gone; the predictions are returned as-is.
    return pred_less, pred_more

In [None]:
# Every (training frame, vectorizer parameters) combination yields one ensemble member.
dataframes = [new_clf_df, ruddit_df]
params = [{'analyzer': 'char_wb', 'ngram_range': (3, 4), 'sublinear_tf': True}, {'analyzer': 'word', 'ngram_range': (1, 2), 'sublinear_tf': True}]
# Parallel lists filled by the training loop: models[i] was trained with vecs[i]
# and produced val_preds[i].
models = []
val_preds = []
vecs = []

In [None]:
# Train one Ridge regressor per (training frame, vectorizer parameters) pair.
# The loop uses its own names (`train_df`, `regressor`) so that it no longer
# rebinds the global `df` or the BERT classifier stored in `model`.
# Note: the results are appended, so re-running this cell without re-running the
# cell that creates the lists duplicates the ensemble members.
for train_df in dataframes:
    for param in params:
        tfidf_vec = TfidfVectorizer(stop_words='english', **param)
        # Only fitting is needed here; train_model does the transform itself.
        tfidf_vec.fit(train_df['comment'].values.tolist())
        regressor = train_model(train_df, tfidf_vec)
        models.append(regressor)
        vecs.append(tfidf_vec)
        val_preds.append(validate(regressor, tfidf_vec, val_df))

# 3. BaseScore and regressors' weights tuning.

In [None]:
def vote(val_preds, model_weights, toxic_classes, base_score):
    """Weighted-majority accuracy of the ensemble on the validation pairs.

    Each regressor votes for a pair when its score for the less-toxic comment
    plus ``base_score * class`` is strictly below the same quantity for the
    more-toxic comment.  A pair counts as correct when the weighted votes reach
    at least half of the total weight.

    Args:
        val_preds: one ``(pred_less, pred_more)`` tuple of arrays per regressor.
        model_weights: one vote weight per regressor.
        toxic_classes: ``(less_classes, more_classes)`` arrays from the classifier.
        base_score: offset added for comments the classifier marks as class 1.

    Returns:
        The fraction of pairs ranked correctly.
    """
    weights = np.array(model_weights)
    res = np.stack(
        [t[0] + base_score * toxic_classes[0] < t[1] + base_score * toxic_classes[1] for t in val_preds],
        axis=1,
    )
    # Half of the total weight; identical for every pair, so it is computed once.
    threshold = np.dot(np.ones(len(val_preds)), weights) / 2

    votes = sum(1 for r in res if np.dot(r, weights) >= threshold)
    return votes / len(res)

In [None]:
# Exhaustive search over the classifier offset (base score) and the vote weight
# of every regressor; the first combination with the highest validation score wins.
base_scores = np.linspace(0.15, 1, 3)
best_base_score, max_val_score = 0, 0
best_regr_weights = [0] * len(val_preds)
for base_score, model_weights in itertools.product(
    base_scores,
    itertools.product(np.linspace(0.1, 1, 3), repeat=len(val_preds)),
):
    val_score = vote(val_preds, model_weights, bert_val_scores, base_score)
    if val_score > max_val_score:
        max_val_score, best_base_score, best_regr_weights = val_score, base_score, model_weights

In [None]:
# Report the best validation score together with the parameters that produced it.
print("Validation score: ", max_val_score, "\nBase score: ", best_base_score, "\nModel weights: ", best_regr_weights)

# 4. Final submission.

In [None]:
# Score the contest comments with every regressor and accumulate the
# predictions, each multiplied by its tuned weight.
sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

sub['text'] = sub['text'].apply(clean)
sub['score'] = np.zeros(len(sub))
for i in range(len(models)):
    comms = vecs[i].transform(sub['text'].values)
    sub['score'] += np.array(models[i].predict(comms)) * best_regr_weights[i]

# Divide by the sum of the weights to obtain a weighted average.
sub['score'] = sub['score'] / np.dot(np.ones(len(models)), np.array(best_regr_weights))


In [None]:
# Add the base score to every comment the BERT classifier assigned class 1.
sub['score'] += bert_test_scores * best_base_score

In [None]:
# Replace the raw scores by their ordinal ranks and write the submission file.
sub['score']  = scipy.stats.rankdata(sub['score'], method='ordinal')
sub[['comment_id', 'score']].to_csv('submission.csv', index=False)