### Training an ensemble of three TF-IDF linear models using the dataset described in this [paper](https://arxiv.org/pdf/2009.10277.pdf)
Original dataset link [here](https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speech)

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import rankdata

# Read data

In [None]:
# Competition validation pairs; the 'less_toxic' and 'more_toxic' columns are used below.
validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
# External training corpus; 'comment_id', 'text' and 'hate_speech_score' are used below.
hspeech = pd.read_csv('../input/measuring-hate-speech/measuring_hate_speech.csv')
# Comments that must be scored for the submission ('comment_id' and 'text' are used below).
submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
# Collapse the per-row scores into one mean score per comment_id.
# GroupBy.mean() is the built-in aggregation; it avoids the per-group Python
# call that .apply(np.mean) makes.
scores_dict = hspeech.groupby('comment_id')['hate_speech_score'].mean().to_dict()

# Keep a single row per comment_id. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not write into a
# filtered selection of the original frame (avoids SettingWithCopyWarning).
hspeech = hspeech.drop_duplicates(subset='comment_id').copy()
hspeech['hate_speech_score'] = hspeech['comment_id'].map(scores_dict)

# Plots

## Plot Hate Speech Scores Histogram

In [None]:
# Histogram (100 bins) of the per-comment hate speech scores, i.e. the regression target.
hspeech['hate_speech_score'].plot.hist(bins=100, title='Hate speech scores')

## Compare text size with competition validation data

In [None]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())


# Overlay the word-count distributions of the training corpus and of the
# competition validation texts. Each histogram carries an explicit label and
# a legend so the reader does not have to infer which colour is which.
hspeech['text'].apply(_word_count).plot.hist(
    bins=100,
    label='hspeech',
    legend=True,
)
validation_texts = pd.concat([
    validation['less_toxic'],
    validation['more_toxic'],
])
validation_texts.apply(_word_count).plot.hist(
    bins=100,
    alpha=0.5,  # semi-transparent so the first histogram stays visible
    figsize=(14, 7),
    title='Text size',
    label='validation',
    legend=True,
)

# Text cleaning

The text is already quite clean but some extra pre-processing is added:
* Replace URLs with 'url'
* Remove unicode strings
* Remove numbers
* Lemmatization

In [None]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

# Single shared lemmatizer instance, reused by my_cleaner for every token.
wordnet_lemmatizer = WordNetLemmatizer()
def replaceURL(text):
    """Replace URLs with the token "url" and strip the "#" from hashtags.

    A URL is any run of non-whitespace characters that starts with "www."
    or with "http://" / "https://". For hashtags only the leading "#" is
    dropped; the tag text itself is kept.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Raw strings keep the regex escapes (\. and \S) from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'www\.\S+|https?://\S+', 'url', text)
    text = re.sub(r'#(\S+)', r'\1', text)
    return text

def replaceAbbrev(text):
    """Expand common English contractions in *text*.

    The rules are case-sensitive and applied one after another. Order
    matters: "can't" has to be handled before the generic "n't" rule, and
    "'scuse" before the generic "'s" rule (which simply drops the suffix).

    Args:
        text: The string to normalise.

    Returns:
        The string with the contractions expanded.
    """
    expansions = (
        (r"what's", "what is "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
    )
    for contraction, expansion in expansions:
        text = re.sub(contraction, expansion, text)
    return text

def removeUnicode(text):
    r"""Blank out escaped and non-ASCII characters.

    Literal escape sequences (a backslash, a "u" and one or more hex digits)
    are replaced by one space each, and so is every character outside the
    ASCII range 0x00-0x7f.

    Args:
        text: The string to normalise.

    Returns:
        The string with those sequences and characters replaced by spaces.
    """
    without_escapes = re.sub(r'(\\u[0-9A-Fa-f]+)', ' ', text)
    return re.sub(r'[^\x00-\x7f]', ' ', without_escapes)
def removeRepeatPattern(text):
    """Shorten exaggerated letter repetitions and collapse runs of spaces.

    Rules, applied in this order:
      1. a run of three or more identical letters that ends at a word
         boundary is cut down to two letters;
      2. a run of four or more identical letters that is followed by another
         word character is cut down to three letters;
      3. two or more consecutive spaces become a single space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    squeeze_rules = (
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        (r'[ ]{2,}', ' '),
    )
    for pattern, replacement in squeeze_rules:
        text = re.sub(pattern, replacement, text)
    return text

def replaceAtUser(text):
    """Replace every "@mention" with the token "atUser".

    A mention is an "@" followed by one or more non-whitespace characters;
    the whole run (including trailing punctuation) is replaced.

    Args:
        text: The string to normalise.

    Returns:
        The string with mentions replaced.
    """
    # Raw string so that \S reaches the regex engine instead of being parsed
    # as an (invalid) Python string escape.
    return re.sub(r'@\S+', 'atUser', text)

def replaceMultiToxicWords(text):
    """Normalise obfuscated or stretched spellings of common toxic words.

    Each rule maps spelling variants (repeated letters, inserted spaces,
    "*", "_", "!", "$" or "@" used as stand-ins for letters) to one canonical
    form so that they end up as the same feature later on. The rules are
    case-sensitive and applied in the listed order.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Inside a character class "|" is a literal pipe, not an alternation, so
    # the classes list only the characters that really stand in for a letter.
    # With pipes in the classes, input such as "|||" was rewritten to "ass".
    rules = (
        (r'(fuckfuck)', 'fuck fuck '),
        (r'(f+)( *)([u*_]+)( *)([c*_]+)( *)(k)+', 'fuck'),
        (r'(h+)(a+)(h+)(a+)', 'ha ha '),
        (r'(s+ *h+ *[i!]+ *t+)', 'shit'),
        (r'\b(n+)(i+)(g+)(a+)\b', 'nigga'),
        (r'\b(n+)([i!]+)(g+)(e+)(r+)\b', 'nigger'),
        (r'\b(d+)(o+)(u+)(c+)(h+)(e+)( *)(b+)(a+)(g+)\b', 'douchebag'),
        (r'([a@][$s][s$])', 'ass'),
        (r'(\bfuk\b)', 'fuck'),
    )
    for pattern, canonical in rules:
        text = re.sub(pattern, canonical, text)
    return text

def removeNumbers(text):
    """Drop standalone numbers and map leftover look-alike digits to letters.

    A run of digits at the start of the string or directly after a non-word
    character is replaced (together with that preceding character) by one
    space. Of the digits that remain, "5", "1" and "0" are turned into
    "s", "i" and "o" respectively.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    without_numbers = re.sub(r"(^|\W)\d+", " ", text)
    digit_to_letter = str.maketrans({"5": "s", "1": "i", "0": "o"})
    return without_numbers.translate(digit_to_letter)
                  
def replaceMultiPunc(text):
    """Replace long runs of "!", "?" or "*" with a compact marker.

    Four or more consecutive "!" become " mxm ", four or more consecutive
    "?" become " mqm ", and four or more consecutive "*" become a single
    "*". Shorter runs are left untouched.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    run_markers = (
        (r'([!])\1\1{2,}', r' mxm '),
        (r'([?])\1\1{2,}', r' mqm '),
        (r'([*])\1\1{2,}', r'*'),
    )
    for pattern, marker in run_markers:
        text = re.sub(pattern, marker, text)
    return text


# Punctuation characters that my_cleaner treats as token separators.
separators = set('"%&\'()+,-./:;<=>@[\\]^_`{|}~')

# Replacement text per separator: a single space for each of them,
# except '&', which is spelled out as a word.
replace_pun = dict.fromkeys(separators, ' ')
replace_pun['&'] = ' and '

def my_cleaner(s):
    """Run the complete text-cleaning pipeline on one string.

    The regex normalisers are applied first, in a fixed order. Every
    character in ``separators`` is then replaced according to
    ``replace_pun``, and finally the text is tokenised with NLTK and each
    token is lemmatised.

    Args:
        s: The raw text.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # The case of the text is left untouched here.
    normalisers = (
        replaceURL,
        removeUnicode,
        removeNumbers,
        replaceAbbrev,
        replaceMultiToxicWords,
        replaceMultiPunc,
        removeRepeatPattern,
    )
    for normalise in normalisers:
        s = normalise(s)

    # Remove / replace punctuation in one pass. None of the replacement
    # strings contains a separator, so this equals replacing them one by one.
    punctuation_table = str.maketrans({punc: replace_pun[punc] for punc in separators})
    s = s.translate(punctuation_table)

    # Split into word tokens and lemmatise each of them.
    lemmas = (
        wordnet_lemmatizer.lemmatize(token)
        for token in nltk.tokenize.word_tokenize(s)
    )
    return ' '.join(lemmas)

Clean all texts (train data, validation, and submission)

In [None]:
# The training frame keeps its raw 'text'; the cleaned copy goes to 'text_clean'.
hspeech['text_clean'] = hspeech['text'].apply(my_cleaner)

# Validation pairs and submission comments are cleaned in place.
for frame, column in (
    (validation, 'less_toxic'),
    (validation, 'more_toxic'),
    (submission, 'text'),
):
    frame[column] = frame[column].apply(my_cleaner)

# Vectorize the text with TF-IDF

The data is vectorized with the 'char_wb' analyzer (character n-grams built inside word boundaries). Datasets such as the toxic comments and unintended bias ("unbias") ones benefit greatly from this.

In [None]:
# TF-IDF over character n-grams of length 3 to 5, built inside word
# boundaries ('char_wb'); the text is lower-cased by the vectorizer.
# No stop_words argument is passed: scikit-learn only applies stop words when
# analyzer == 'word', so the previous stop_words=['english'] (a one-word list,
# not the built-in 'english' list) had no effect on the features.
vec = TfidfVectorizer(lowercase=True, analyzer='char_wb', ngram_range=(3, 5))

# Fit the vocabulary and IDF weights on the training corpus only, then reuse
# them for the validation pairs and the comments to score.
X = vec.fit_transform(hspeech['text_clean'])
x_less_toxic = vec.transform(validation['less_toxic'])
x_more_toxic = vec.transform(validation['more_toxic'])
x_test = vec.transform(submission['text'])

# Build 3 Ridge Models and build an Ensemble

Build three Ridge models with different regularization strengths (`alpha`).

In [None]:
# Regression target shared by all three models.
y_train = hspeech['hate_speech_score']


def _pair_accuracy(model):
    """Share of validation pairs whose 'less_toxic' text gets the strictly lower score."""
    return (model.predict(x_less_toxic) < model.predict(x_more_toxic)).mean()


# Same features and target for every model; only the Ridge alpha differs.
model_1 = Ridge(alpha=0.5).fit(X, y_train)
print(f'Model 1 validation accuracy score:  {_pair_accuracy(model_1)}')

model_2 = Ridge(alpha=1).fit(X, y_train)
print(f'Model 2 validation accuracy score:  {_pair_accuracy(model_2)}')

model_3 = Ridge(alpha=2).fit(X, y_train)
print(f'Model 3 validation accuracy score: {_pair_accuracy(model_3)}')

# Average the model scores and submit

In [None]:
# Score the comments to rank with each of the three fitted models.
p1, p2, p3 = (
    model.predict(x_test)
    for model in (model_1, model_2, model_3)
)

In [None]:
# Ensemble score: unweighted mean of the three model predictions.
prediction_sum = p1 + p2 + p3
submission['score'] = prediction_sum / 3

In [None]:
import scipy
from scipy.stats import rankdata

# Read the comments file again so that `test` holds the original text
# (submission['text'] was overwritten with its cleaned version). Both frames
# come from the same file, so their rows line up by position.
test = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Turn the averaged scores into ordinal ranks before writing them out.
ordinal_ranks = rankdata(submission['score'].values, method='ordinal')
test['score'] = ordinal_ranks

output_columns = ['comment_id', 'score']
test[output_columns].to_csv("submission.csv", index=False)

In [None]:
# Display the comments ordered from the highest to the lowest value of 'score'.
test.sort_values('score', ascending=False)

#trustyourcv!