# Classification of toxic comments

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None
import seaborn as sns
#sns.set(style = 'white')
import nltk
#import string
from textblob import TextBlob
import string
import re
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 100)

In [None]:
import transformers
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification,\
                              AdamW, get_linear_schedule_with_warmup, DistilBertConfig
import torch
from pylab import rcParams
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
import tensorflow as tf
import datasets 

# Seed every random number generator the notebook relies on so that sampling
# and training are repeatable after Restart & Run All.
RANDOM_SEED = 331
np.random.seed(RANDOM_SEED)      # NumPy global RNG (also used by pandas `.sample()` when no random_state is given)
torch.manual_seed(RANDOM_SEED)   # PyTorch
tf.random.set_seed(RANDOM_SEED)  # TensorFlow/Keras: the model below is trained with `model.fit`

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

#https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

In [None]:
# Training data: Jigsaw "Toxic Comment Classification Challenge".
# Other inputs tried during development (kept for reference, not loaded):
#   ../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv   -> df
#   ../input/jigsaw-toxic-severity-rating/comments_to_score.csv            -> test_df
train_csv_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Peek at five randomly chosen rows of the training frame.
df.sample(5)

In [None]:
# Column names of the training frame (the label columns are listed in the text below).
df.columns

There are six targets to be predicted: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

We can create one multi-label-classification model or six binary-classification models.

# Target analysis

Here we will study the distribution of the data for each target.

In [None]:
# Number of rows per value of the `toxic` label.
df.toxic.value_counts()

In [None]:
# One histogram per label, laid out on a 2x3 grid (row-major, same order as
# the label list).
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

fig, axs = plt.subplots(2, 3, figsize=(20, 7))

for panel, label in zip(axs.flat, label_names):
    sns.histplot(data=df, x=label, ax=panel)

plt.show()

In [None]:
# Balanced view of the data: every toxic comment plus an equally sized random
# sample of non-toxic comments. The sample size is derived from the data
# (the previous hard-coded 15294 presumably was the toxic-row count) and the
# draw is seeded so re-running the cell gives the same subset.
toxic_rows = df[df.toxic == 1]
non_toxic_rows = df[df.toxic == 0]
df0 = pd.concat([
    toxic_rows,
    non_toxic_rows.sample(n=min(len(toxic_rows), len(non_toxic_rows)),
                          random_state=RANDOM_SEED),
])

fig, axs = plt.subplots(2, 3, figsize=(20, 7))

# Same six label histograms as above, now on the balanced subset.
for ax, label in zip(
    axs.flat,
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
):
    sns.histplot(data=df0, x=label, ax=ax)

plt.show()

## Text analysis

In [None]:
from nltk import word_tokenize

# Number of NLTK tokens in each raw comment, followed by its distribution.
df['len_tokenized_sents'] = df['comment_text'].map(lambda comment: len(word_tokenize(comment)))
sns.histplot(data=df, x="len_tokenized_sents")

In [None]:
# Show the comment with the most tokens. `np.argmax` returns a position, so
# look it up with `.iloc` rather than as an index label (the two only
# coincide while the frame still has its default 0..n-1 index).
df["comment_text"].iloc[np.argmax(df['len_tokenized_sents'])]

We need to preprocess the text.

# Text preprocessing

In [None]:
def text_cleaning(text):
    """Strip a comment down to single-space-separated letters and digits.

    Steps, in order: delete URLs, parse what is left as HTML with
    BeautifulSoup (``lxml`` parser) and keep only its text, delete emoji and
    pictograph code points, turn every character that is not a letter
    ``a-z``/``A-Z`` or a regex digit into a space, collapse runs of spaces
    and trim both ends. Letter case is left untouched.

    Args:
        text: the raw comment as a string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Website links (http://, https:// or www.)
    stripped = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # HTML markup: keep only the text content
    stripped = BeautifulSoup(stripped, 'lxml').get_text()

    # Emoji and pictographs
    stripped = re.sub("["
                      u"\U0001F600-\U0001F64F"  # emoticons
                      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                      u"\U0001F680-\U0001F6FF"  # transport & map symbols
                      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                      u"\U00002702-\U000027B0"
                      u"\U000024C2-\U0001F251"
                      "]+", r'', stripped, flags=re.UNICODE)

    # Everything that is not a letter or digit becomes a space ...
    stripped = re.sub(r"[^a-zA-Z\d]", " ", stripped)
    # ... and runs of spaces shrink to a single one.
    stripped = re.sub(' +', ' ', stripped)

    return stripped.strip()


# Literal (non-regex) substitutions applied, in this order, to the lower-cased
# text. Order matters: the digit/symbol look-alikes are mapped to letters first
# so that the word-level patterns below also catch obfuscated spellings, and
# longer patterns are listed before the shorter ones they contain.
_LITERAL_REPLACEMENTS = (
    # --- leetspeak / symbol look-alikes ---
    ("4", "a"),
    ("2", "l"),
    ("5", "s"),
    ("1", "i"),
    ("!", "i"),
    ("|", "i"),
    ("0", "o"),
    ("l3", "b"),
    ("7", "t"),
    # NOTE(review): the original followed this with replace("7", "+"), which
    # could never match. It may have been meant as "+" -> "t"; left out rather
    # than guessed at, so "+" is still turned into a space by text_cleaning.
    ("8", "ate"),
    ("3", "e"),
    ("9", "g"),
    ("6", "g"),
    ("@", "a"),
    ("$", "s"),
    # --- run-together words, slang and dataset-specific spellings ---
    ("#ofc", " of fuckin course "),
    ("fggt", " faggot "),
    ("your", " your "),
    ("self", " self "),
    ("cuntbag", " cunt bag "),
    ("fartchina", " fart china "),
    ("youi", " you i "),
    ("cunti", " cunt i "),
    ("sucki", " suck i "),
    ("pagedelete", " page delete "),
    ("cuntsi", " cuntsi "),
    ("i'm", " i am "),
    ("offuck", " of fuck "),
    ("centraliststupid", " central ist stupid "),
    ("hitleri", " hitler i "),
    ("i've", " i have "),
    ("i'll", " i will "),  # was " sick ", which replaced the contraction with an unrelated word
    ("fuck", " fuck "),
    ("f u c k", " fuck "),
    ("shit", " shit "),
    ("bunksteve", " bunk steve "),
    ("wikipedia", " social medium "),
    ("faggot", " faggot "),
    ("delanoy", " delanoy "),
    ("jewish", " jewish "),
    ("sexsex", " sex "),
    ("allii", " all ii "),
    ("i'd", " i had "),
    ("'scuse", " excuse "),  # must run before "'s", otherwise it can never match
    ("'s", " is "),
    ("youbollocks", " you bollocks "),
    ("dick", " dick "),
    ("mothjer", " mother "),
    ("cuntfranks", " cunt "),
    ("ullmann", " jewish "),
    ("mr.", " mister "),
    ("aidsaids", " aids "),
    ("njgw", " nigger "),
    ("wiki", " social medium "),
    ("administrator", " admin "),
    ("gamaliel", " jewish "),
    ("rvv", " vanadalism "),
    ("admins", " admin "),
    ("pensnsnniensnsn", " penis "),
    ("pneis", " penis "),
    ("pennnis", " penis "),
    ("pov.", " point of view "),
    ("vandalising", " vandalism "),
    ("cock", " dick "),
    ("asshole", " asshole "),
    ("afd", " all fucking day "),
    ("sockpuppets", " sockpuppetry "),
    ("iiprick", " iprick "),
    ("penisi", " penis "),
    ("warrior", " warrior "),
    ("loil", " laughing out insanely loud "),
    ("vandalise", " vanadalism "),
    ("helli", " helli "),
    ("lunchablesi", " lunchablesi "),
    ("special", " special "),
    ("ilol", " i lol "),
)

# Generic contractions, expanded after the word list above and before
# apostrophes are stripped. The original spelled most of these as r"\'ve" etc.
# with regex=False, i.e. with a literal backslash, so they never matched.
_CONTRACTION_REPLACEMENTS = (
    ("'ve", " have "),
    ("can't", "cannot "),
    ("n't", " not "),
    ("'re", " are "),
    ("'d", " would "),
    ("'ll", " will "),
)


def clean(data, col):
    """Normalise the text column ``col`` of ``data`` for model input.

    The column is overwritten in place on ``data`` (the caller's frame is
    modified) and the same frame is returned.

    Pipeline, in order:
      1. URLs are replaced by the phrase " social medium ".
      2. Text is lower-cased.
      3. ``_LITERAL_REPLACEMENTS`` then a lone "u" -> "you", then
         ``_CONTRACTION_REPLACEMENTS``.
      4. Whitespace runs collapse to one space; any character repeated more
         than twice in a row is cut to two.
      5. A fixed set of punctuation characters (including the apostrophe) is
         deleted; a space is put around punctuation glued between two words.
      6. Letter runs that re-formed after step 5 are shortened again, spaces
         are collapsed, and ``text_cleaning`` is applied to each value with a
         tqdm progress bar.

    Every literal replacement passes ``regex=False`` explicitly because the
    default of ``Series.str.replace`` differs between pandas 1.x and 2.x.

    Steps of the earlier version that could never match have been removed:
    a second contraction pass after apostrophes were already deleted, a
    newline-padding step after all whitespace had been collapsed, and the
    handling of repeated ``* ! ? '`` after those characters were gone.

    Args:
        data: DataFrame holding the text column.
        col: name of the column to clean; its values must be strings.

    Returns:
        ``data`` itself, with ``data[col]`` replaced by the cleaned text.
    """
    text = data[col]

    text = text.str.replace(r'https?://\S+|www\.\S+', ' social medium ', regex=True)
    text = text.str.lower()

    for old, new in _LITERAL_REPLACEMENTS:
        text = text.str.replace(old, new, regex=False)

    # A stand-alone "u" is chat shorthand for "you".
    text = text.str.replace(r'\b[uU]\b', 'you', regex=True)

    for old, new in _CONTRACTION_REPLACEMENTS:
        text = text.str.replace(old, new, regex=False)

    # Collapse whitespace (this also removes newlines and tabs).
    text = text.str.replace(r'\s+', ' ', regex=True)
    # Three or more of the same character in a row -> two.
    text = text.str.replace(r'(.)\1+', r'\1\1', regex=True)
    # Delete assorted punctuation. This is the same character set as the
    # original "[:|♣|'|...|&|-|#|...]" class: there the "|-|" was parsed as a
    # range from "|" to "|", so the hyphen was (and still is) NOT removed here.
    text = text.str.replace("[:|♣'§♠*/?=%&#•~^><►_]", '', regex=True)

    # Put spaces around punctuation stuck between two words ("end.start").
    text = text.str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3', regex=True)
    # Deleting punctuation can re-join letter runs ("aa_aa" -> "aaaa"); trim them again.
    text = text.str.replace(r'([a-zA-Z])\1{2,}\b', r'\1\1', regex=True)
    text = text.str.replace(r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1', regex=True)
    text = text.str.replace(r'[ ]{2,}', ' ', regex=True).str.strip()

    tqdm.pandas()  # registers Series.progress_apply
    text = text.progress_apply(text_cleaning)

    data[col] = text.str.lower()
    return data

In [None]:
# Clean the training comments; `clean` rewrites the column on the frame it is
# given and hands the same frame back.
# (The equivalent call for the competition file would be: clean(test_df, 'text').)
df = clean(df, col='comment_text')

In [None]:
# Recompute the per-comment token count on the cleaned text and plot it again.
df['len_tokenized_sents'] = df['comment_text'].map(lambda comment: len(word_tokenize(comment)))
sns.histplot(data=df, x="len_tokenized_sents")

In [None]:
# Longest cleaned comment. `np.argmax` yields a position, hence `.iloc`
# (label lookup only happens to agree with it on a default 0..n-1 index).
df["comment_text"].iloc[np.argmax(df['len_tokenized_sents'])]

In [None]:
# Fraction of comments longer than 200 tokens.
float((df['len_tokenized_sents'] > 200).mean())

# Prepare data for training

In [None]:
# Balance the classes: keep every toxic comment and an equally sized random
# sample of the non-toxic ones. The size is derived from the data (the former
# hard-coded 15294 presumably was the toxic-row count) and the draw is seeded.
toxic_rows = df[df.toxic == 1]
non_toxic_rows = df[df.toxic == 0]
df = pd.concat([
    toxic_rows,
    non_toxic_rows.sample(n=min(len(toxic_rows), len(non_toxic_rows)),
                          random_state=RANDOM_SEED),
])
df['target'] = df['toxic']

# 90 % train, 5 % validation, 5 % test.
df.reset_index(inplace=True)  # keeps the old index as an extra "index" column
df_train, df_valid = train_test_split(df, test_size=0.1, random_state=42)
df_valid, df_test = train_test_split(df_valid, test_size=0.5, random_state=42)

train_ds = datasets.Dataset.from_pandas(df_train[['comment_text', 'target']])
valid_ds = datasets.Dataset.from_pandas(df_valid[['comment_text', 'target']])

checkpoint = "distilbert-base-uncased"
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the `comment_text` field of a batch, truncating long inputs."""
    return tokenizer(example["comment_text"], truncation=True)


tokenized_train_ds = train_ds.map(tokenize_function, batched=True)
tokenized_valid_ds = valid_ds.map(tokenize_function, batched=True)

batch_size = 16

# Pads each batch to the length of its longest sequence.
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)

tf_train_ds = tokenized_train_ds.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["target"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)

tf_valid_ds = tokenized_valid_ds.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["target"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=batch_size,
)

# NLP model: DistilBERT

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Sequence-classification head on top of `checkpoint`; num_labels=1 gives a
# single output logit per comment.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

In [None]:
num_epochs = 2
num_train_steps = len(tf_train_ds) * num_epochs  # batches per epoch * epochs

# Learning rate decays from 5e-5 to 0 over `num_train_steps` optimizer steps
# (PolynomialDecay with its default power).
lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

# The model emits raw logits (the loss uses from_logits=True), so accuracy has
# to threshold at logit 0.0, i.e. probability 0.5. The plain 'accuracy' string
# would threshold the logits themselves at 0.5 and report a skewed number.
# The metric keeps the name 'accuracy' so history keys stay the same.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_scheduler),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

model.summary()

In [None]:
# Train for `num_epochs` epochs: the learning-rate schedule was sized as
# len(tf_train_ds) * num_epochs steps, so training for fewer epochs would stop
# before the rate has decayed to its end value.
fit_history = model.fit(tf_train_ds,
                        epochs=num_epochs,
                        validation_data=tf_valid_ds,
                        verbose=1)

In [None]:
# Persist the fitted model to a local directory.
# NOTE(review): the directory is named "model_identity_hate", while the
# data-preparation cell above sets `target` from the `toxic` column — confirm
# which label this run was meant to model.
model.save_pretrained("./model_identity_hate")

# Test model

In [None]:
from transformers import TFDistilBertForSequenceClassification

# Rebinds `model` to weights loaded from an attached input directory, so the
# evaluation below uses this stored model, not the one fitted in this session.
model = TFDistilBertForSequenceClassification.from_pretrained('../input/d/taoufikelkhaouja/my-model/toxic')

In [None]:
# Held-out split -> tokenized -> batched TF dataset. No label column is
# requested because this dataset is only used for prediction.
test_ds = datasets.Dataset.from_pandas(df_test)
tokenized_test_ds = test_ds.map(tokenize_function, batched=True)

test_dataset_options = dict(
    columns=["attention_mask", "input_ids"],
    shuffle=False,  # keep row order so predictions line up with df_test
    collate_fn=data_collator,
    batch_size=batch_size,
)
tf_test_ds = tokenized_test_ds.to_tf_dataset(**test_dataset_options)

In [None]:
# Run the model over the held-out batches and keep the raw logits
# (the sigmoid is applied in the next cell).
raw_result = model.predict(tf_test_ds)
result = raw_result.logits

In [None]:
# Logit -> probability, then a hard 0/1 label: below 0.5 is 0, everything else is 1.
test_probabilities = tf.math.sigmoid(result)[:, 0].numpy()
df_test['score'] = np.where(test_probabilities < 0.5, 0, 1)

In [None]:
from sklearn.metrics import f1_score

# F1 of the thresholded predictions against the held-out labels.
f1_score(y_true=df_test['target'], y_pred=df_test['score'])

# Score the competition comments

In [None]:
# `test_df` was used here without ever being defined (its loading line near
# the top of the notebook is commented out), so this cell raised NameError on a
# fresh run. Load the comments to score and clean their `text` column the same
# way the training text was cleaned.
test_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
test_df = clean(test_df, 'text')


def tokenize_test_function(example):
    """Tokenize the `text` field of a batch, truncating long inputs."""
    return tokenizer(example["text"], truncation=True)


test_ds = datasets.Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(tokenize_test_function, batched=True)
tf_test_ds = tokenized_test_ds.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    shuffle=False,  # keep row order so predictions line up with test_df
    collate_fn=data_collator,
    batch_size=batch_size,
)

In [None]:
# Predict on the competition comments: logits -> sigmoid probability -> 0/1.
raw_result = model.predict(tf_test_ds)
result = raw_result.logits
test_df['pred'] = tf.math.sigmoid(result)[:, 0]
# Threshold the column that was just written. The original read
# test_df['score'] here, a column that does not exist yet at this point.
test_df['pred'] = test_df['pred'].apply(lambda x: 0 if x<0.5 else 1)

In [None]:
# Inspect the predicted probabilities (sigmoid of the logits).
tf.math.sigmoid(result)

In [None]:
# Hard 0/1 score per comment (below 0.5 is 0, everything else is 1), then the
# two columns that go into the submission file.
comment_probabilities = tf.math.sigmoid(result)[:, 0].numpy()
test_df['score'] = np.where(comment_probabilities < 0.5, 0, 1)
submission_df = test_df[['comment_id', 'score']]


In [None]:
# Scores in ascending order.
test_df['score'].sort_values()

In [None]:
# Full rows ordered by ascending score.
test_df.sort_values('score')

In [None]:
# Write the submission frame to disk twice, under two file names.
# The first file omits the DataFrame index (index=False).
submission_df.to_csv("submission1.csv", index=False)
sub=submission_df
# NOTE(review): this second file includes the index as an extra first column
# and its name is spelled 'submitions.csv' — confirm which file and format the
# competition expects.
sub.to_csv('submitions.csv')

In [None]:
def weighting(target):
    """Return the sampling weight for a `target` score.

    The score is matched against ascending bands with exclusive upper bounds
    0.1, 0.2, 0.3 and 0.4; the weight is the reciprocal of the band's divisor
    (1.25, 0.15, 0.1 and 0.07 respectively). Scores of 0.4 or more, and
    anything that compares below none of the bounds, get weight 0.
    """
    # (exclusive upper bound, divisor), checked from the lowest band upwards
    bands = ((0.1, 1.25), (0.2, 0.15), (0.3, 0.1), (0.4, 0.07))
    for upper_bound, divisor in bands:
        if target < upper_bound:
            return 1 / divisor
    return 0

# NOTE(review): this cell compares `target` against fractional thresholds and
# draws 700000 rows, so it presumably belongs to a run where `target` is a
# continuous score on a much larger frame (e.g. the commented-out
# "unintended bias" CSV) — confirm before running it on the 0/1 `toxic` target.

# Per-row sampling weight from the target band, normalised to sum to 1.
df['weights']=df['target'].apply(weighting)
df.weights=df.weights/df.weights.sum()

# Down-sample the rows with target <= 0.4 to 700000 using those weights and
# keep every row with target > 0.4.
df_0=df[df.target<=0.4]
df_1=df[df.target>0.4]
df_0=df_0.sample(n=700000, weights=df_0.weights)
df=pd.concat([df_0,df_1])
del df_0
del df_1

# Cap every 0.01-wide target bin [i/100, (i+1)/100) at 5000 rows: each pass
# rebuilds `df` from the rows outside the bin plus a sample of the rows inside it.
for i in range(100):
    l=len(df[(df.target<(i+1)/100) & (df.target>=(i)/100)])
    df=pd.concat([df[(df.target>=(i+1)/100) | (df.target<(i)/100)],
                  df[(df.target<(i+1)/100) & (df.target>=(i)/100)].sample(n=min(l,5000))
                  ])

In [None]:
# Rebuild `df` as a mix of four subsets and use `identity_hate` as the target.
# NOTE(review): the first subset filters on `identity_hate` but the other
# three filter on `threat` — confirm whether `threat` is intended here.
df=pd.concat([df[(df.toxic==1)&(df.identity_hate==1)],
              df[(df.toxic==1)&(df.threat==0)].sample(2000),
              df[(df.toxic==0)&(df.threat==0)].sample(300),
              df[(df.toxic==0)&(df.threat==1)],
             ])

df['target']=df.identity_hate

# Alternative target kept for reference: a weighted sum of all six labels.
#d={'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}
#df['target']=0
#for k in d:
#    df['target'] = df.target + d[k]*df[k]

In [None]:
# Severity-style target: weighted sum of the six label columns.
d = {
    'obscene': 0.16,
    'toxic': 0.32,
    'threat': 1.5,
    'insult': 0.64,
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
}
weighted_total = 0
for k, label_weight in d.items():
    weighted_total = weighted_total + label_weight * df[k]
df['target'] = weighted_total

In [None]:
# Same weighted-sum target as the previous cell, recomputed from the label columns.
d = {
    'obscene': 0.16,
    'toxic': 0.32,
    'threat': 1.5,
    'insult': 0.64,
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
}
weighted_total = 0
for k, label_weight in d.items():
    weighted_total = weighted_total + label_weight * df[k]
df['target'] = weighted_total