In [None]:
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Notebook-wide settings read by the preprocessing helpers below.
CONFIG = dict(
    seed=2022,             # random_state used when shuffling the stratified folds
    n_fold=5,              # number of folds assigned to the training sets
    text_cleaning=True,    # default of preprocess_text(clean_text=...)
    w_toxic=[1, 1, 0.25],  # weights for "toxic", "severe_toxic", "toxic_ind"
)

# Text cleaning 

In [None]:
# Patterns are compiled once, when the cell runs, instead of on every call.
# Raw strings keep the regex escapes out of Python's own escape processing.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7E]+')
# ASCII control characters (0x00-0x1F).
_CONTROL_RE = re.compile(r'[\x00-\x1F]+')
_POSSESSIVE_RE = re.compile(r"'s")
# The hyphen is escaped on purpose: an unescaped `+-/` inside a character class
# is a RANGE that also matches "," and ".", which must survive so that the
# repeated-punctuation rules below have something to work on.
_SPECIAL_RE = re.compile(r"""["#$%&'()*+\-/:;<=>@\[\]\\^_`{|}~]""")
_REPEATED_END_RE = re.compile(r'[.!?]{2,}')
_REPEATED_COMMA_RE = re.compile(r',+')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text, lower_case=True, clean_text=CONFIG["text_cleaning"]):
    """Normalise one raw comment into plain ASCII text.

    Steps, in order: optional lower-casing; removal of URLs, HTML tags and
    non-ASCII characters; control characters replaced by a space; optional
    extra cleaning; whitespace collapsed and trimmed.

    Parameters
    ----------
    text : str
        Raw comment. Missing values (None / NaN) are treated as an empty
        string; any other non-string value is converted with ``str``.
    lower_case : bool, default True
        Lower-case the text before anything else.
    clean_text : bool
        When true, additionally drop ``'s``, replace special characters and
        digits by spaces, collapse runs of ``.``/``!``/``?`` into a single
        ``.`` and runs of commas into a single comma. The default is read
        from ``CONFIG["text_cleaning"]`` when this function is defined, so
        re-run this cell after changing ``CONFIG``.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    if lower_case:
        text = text.lower()

    # Remove website links
    text = _URL_RE.sub('', text)
    # Remove HTML tags
    text = _HTML_TAG_RE.sub('', text)
    # Remove non-ASCII characters
    text = _NON_ASCII_RE.sub('', text)
    # Replace non-printable (control) characters
    text = _CONTROL_RE.sub(' ', text)

    if clean_text:
        # Remove special characters; "!", ",", "." and "?" are kept
        text = _POSSESSIVE_RE.sub('', text)
        text = _SPECIAL_RE.sub(' ', text)
        # Replace multiple punctuation
        text = _REPEATED_END_RE.sub('.', text)
        text = _REPEATED_COMMA_RE.sub(',', text)
        # Remove numbers
        text = _DIGITS_RE.sub(' ', text)

    # Collapse whitespace runs, then trim both ends
    return _WHITESPACE_RE.sub(' ', text).strip()

# Create Folds

In [None]:
def make_folds(df, label_col_name, label_value, num_folds=3):
    """Add an integer ``kfold`` column with stratified fold ids to ``df``.

    Rows are stratified on the boolean ``df[label_col_name] == label_value``,
    shuffled with ``CONFIG['seed']``. Each row receives the id of the fold in
    which it is part of the validation split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place and also returned.
    label_col_name : str
        Column compared against ``label_value`` to build the strata.
    label_value : object
        Value that defines the positive stratum.
    num_folds : int, default 3
        Number of folds.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with the ``kfold`` column.
    """
    strata = (df[label_col_name] == label_value).to_numpy()
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=CONFIG['seed'])

    # skf.split yields POSITIONS, not index labels. Writing them into a plain
    # array (rather than through df.loc) keeps the assignment correct even
    # when df does not carry a default 0..n-1 index.
    fold_of_row = np.full(len(df), -1, dtype=int)
    # Only the number of samples is taken from X, so a zero placeholder is enough.
    for fold, (_, val_pos) in enumerate(skf.split(X=np.zeros(len(df)), y=strata)):
        fold_of_row[val_pos] = fold

    df["kfold"] = fold_of_row

    return df

# Preprocessing the validation data

In [None]:
# Read the validation pairs of the severity-rating competition.
valid_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print("Number of validation exaples:", len(valid_df))

# Run both comment columns through the shared text cleaner.
valid_df = valid_df.assign(
    less_toxic=valid_df["less_toxic"].map(preprocess_text),
    more_toxic=valid_df["more_toxic"].map(preprocess_text),
)

valid_df.head()

In [None]:
# Store the cleaned validation pairs in the working directory, without the row index.
valid_df.to_csv(path_or_buf="valid_data.csv", index=False)

# Preprocessing the test data

In [None]:
# Read the comments that have to be scored.
test_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
print("Number of test exaples:", len(test_df))

# Clean the comment text with the shared cleaner.
test_df = test_df.assign(text=test_df["text"].map(preprocess_text))

test_df.head()


In [None]:
# Store the cleaned comments-to-score in the working directory, without the row index.
test_df.to_csv(path_or_buf="test_data.csv", index=False)

# Preprocessing first Kaggle competition dataset

In [None]:
# Training split of the toxic-comment-classification competition.
train1_df = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip",
    compression="zip",
)
print("Number of training exaples:", len(train1_df))

# Test split: the comments and their labels come in separate files and are
# joined on the columns they share.
# NOTE: this rebinds `test_df`, which the earlier cell used for
# comments_to_score.csv; re-running that cell's save afterwards would write
# this frame instead.
test_lbl_df = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip",
    compression="zip",
)
test_df = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip",
    compression="zip",
).merge(test_lbl_df)
print("Number of testing exaples:", len(test_df))

# Stack both splits, drop the id column and keep only rows whose `toxic`
# label is non-negative (presumably unlabelled test rows carry a negative
# marker - verify against the dataset description).
train1_df = pd.concat([test_df, train1_df], ignore_index=True).drop(columns=["id"])
train1_df = train1_df.loc[train1_df["toxic"] >= 0].reset_index(drop=True)
print("Final number of training exaples:", len(train1_df), "\n")

# First three rows of describe(): count, mean, std.
train1_df.describe().head(3)

## Make target value 

In [None]:
# Distribution of comments over the (toxic, severe_toxic) label combinations:
# absolute count and share of all counted comments in percent.
toxic_count = (
    train1_df.groupby(["toxic", "severe_toxic"])[["comment_text"]]
    .count()
    .rename(columns={"comment_text": "count"})
)
toxic_count["precentage"] = (
    toxic_count["count"] / toxic_count["count"].sum() * 100
).round(2)
toxic_count

In [None]:
# Number of secondary toxicity labels set on each comment.
train1_df['toxic_ind'] = (
    train1_df["obscene"] + train1_df["threat"]
    + train1_df["insult"] + train1_df["identity_hate"]
)

# Count and percentage per (toxic, severe_toxic, toxic_ind) combination.
toxic_count = (
    train1_df.groupby(["toxic", "severe_toxic", "toxic_ind"])[["comment_text"]]
    .count()
    .rename(columns={"comment_text": "count"})
)
toxic_count["precentage"] = (
    toxic_count["count"] / toxic_count["count"].sum() * 100
).round(2)
toxic_count

In [None]:
# Define toxicity indicator
def add_label_column(df):
    """Return a copy of ``df`` with ``toxic_ind`` and the target ``y`` added.

    ``toxic_ind`` is the sum of the ``obscene``, ``threat``, ``insult`` and
    ``identity_hate`` columns, capped at 3. ``y`` is the weighted sum of
    ``toxic``, ``severe_toxic`` and ``toxic_ind`` using ``CONFIG["w_toxic"]``,
    divided by its maximum so that the largest value is 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``toxic``, ``severe_toxic``, ``obscene``,
        ``threat``, ``insult`` and ``identity_hate``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the additional (or overwritten) columns ``toxic_ind``
        and ``y``.
    """
    weights = CONFIG["w_toxic"]

    # Vectorised cap instead of a per-row Python min().
    toxic_ind = (df.obscene + df.threat + df.insult + df.identity_hate).clip(upper=3)
    # assign() returns a new frame, so the caller's frame is left untouched.
    df = df.assign(toxic_ind=toxic_ind)

    # Define the target value
    y = (weights[0] * df.toxic +
         weights[1] * df.severe_toxic +
         weights[2] * df.toxic_ind)

    # Series.max() skips NaN; the guard avoids turning an all-zero target
    # into NaN through 0 / 0.
    y_max = y.max()
    if y_max > 0:
        y = y / y_max

    return df.assign(y=y)

# Attach toxic_ind and the target y, then plot the non-zero targets.
train1_df = add_label_column(train1_df)
train1_df["y"].loc[lambda s: s > 0].hist(bins=50)

# Count and percentage of comments per distinct target value.
toxic_count = (
    train1_df.groupby(["y"])[["comment_text"]]
    .count()
    .rename(columns={"comment_text": "count"})
)
toxic_count["precentage"] = (
    toxic_count["count"] / toxic_count["count"].sum() * 100
).round(2)
toxic_count

## Dataset preprocessing

In [None]:
# Keep only the text and the target, cleaning the text on the way.
train1_df = train1_df[["comment_text", "y"]].assign(
    comment_text=lambda d: d["comment_text"].map(preprocess_text)
)
# Discard comments that are empty after cleaning and renumber the rows.
train1_df = train1_df[train1_df["comment_text"] != ''].reset_index(drop=True)
# Assign folds, stratified on whether the target equals 0.
train1_df = make_folds(train1_df, "y", 0, num_folds=CONFIG["n_fold"])


## Save DataFrame

In [None]:
# Store the first training set (text, target, fold) without the row index.
train1_df.to_csv(path_or_buf="train1_data.csv", index=False)

# Preprocessing second Kaggle competition dataset

In [None]:
# Read the unintended-bias training set, keep the id, the text and the six
# toxicity scores, and rename the scores to the column names used for the
# first dataset.
train2_df = (
    pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
    .loc[:, ["id", "comment_text", "target", "severe_toxicity", "obscene",
             "identity_attack", "insult", "threat"]]
    .rename(columns={"target": "toxic",
                     "severe_toxicity": "severe_toxic",
                     "identity_attack": "identity_hate"})
)
print("Number of training exaples:", len(train2_df), "\n")

train2_df.head()

## Make target value 

In [None]:
# Let's look at the distribution of toxic and severe-toxic comments.
# Binarise the scores with one vectorised comparison instead of the deprecated,
# element-wise DataFrame.applymap: a score becomes 0 when it is <= 0 and 1
# otherwise (so a missing score still maps to 1, as with the former lambda).
train2_int_df = (~(train2_df[["toxic", "severe_toxic", "obscene",
                              "identity_hate", "insult", "threat"]] <= 0)).astype(int)
train2_int_df = pd.concat((train2_df[["id", "comment_text"]], train2_int_df), axis=1)

# Count and percentage per (toxic, severe_toxic) combination.
toxic_count = train2_int_df.groupby(["toxic", "severe_toxic"]).count()
toxic_count = toxic_count.assign(
    prec=(toxic_count.comment_text / toxic_count.comment_text.sum() * 100).round(2)
)
toxic_count = toxic_count[["comment_text", "prec"]]
toxic_count.columns = ["count", "precentage"]
toxic_count

In [None]:
# Number of secondary toxicity flags set on each comment (binarised scores).
train2_int_df['toxic_ind'] = (
    train2_int_df["obscene"] + train2_int_df["threat"]
    + train2_int_df["insult"] + train2_int_df["identity_hate"]
)

# Count and percentage per (toxic, severe_toxic, toxic_ind) combination.
toxic_count = (
    train2_int_df.groupby(["toxic", "severe_toxic", "toxic_ind"])[["comment_text"]]
    .count()
    .rename(columns={"comment_text": "count"})
)
toxic_count["precentage"] = (
    toxic_count["count"] / toxic_count["count"].sum() * 100
).round(2)
toxic_count

In [None]:
# Attach toxic_ind and the target y, then plot the strictly positive targets.
train2_df = add_label_column(train2_df)
y_poz = train2_df["y"].loc[lambda s: s > 0]
y_poz.hist(bins=150)


## Dataset preprocessing

In [None]:
# Keep only the text and the target; copy so the column assignment below
# works on an independent frame.
train2_df = train2_df[["comment_text", "y"]].copy()

# Preprocess text
train2_df["comment_text"] = train2_df["comment_text"].map(preprocess_text)
# Drop comments that are empty after cleaning, as is done for the first
# dataset: to_csv writes an empty string as an empty field, which read_csv
# loads back as NaN by default. The index is renumbered before folding.
train2_df = train2_df.loc[train2_df["comment_text"] != ''].reset_index(drop=True)
# Make folds
train2_df = make_folds(train2_df, "y", 0, num_folds=CONFIG["n_fold"])


In [None]:
# Store the second training set (text, target, fold) without the row index.
train2_df.to_csv(path_or_buf="train2_data.csv", index=False)

# Preprocessing Ruddit dataset

In [None]:
# Load the Ruddit data, keep the text and its offensiveness score, and rename
# both to the column names used for the other training sets.
train3_df = (
    pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
    .loc[:, ["txt", "offensiveness_score"]]
    .rename(columns={"offensiveness_score": "y",
                     "txt": "comment_text"})
)
print("Number of training exaples:", train3_df.shape[0], "\n")

# A notebook cell only renders its LAST expression, so the preview has to be
# displayed explicitly (display() is provided by the IPython/Jupyter kernel).
display(train3_df.head())

train3_df["y"].hist(bins=50)

In [None]:
# Preprocess text
train3_df["comment_text"] = train3_df["comment_text"].map(preprocess_text)
# Drop comments that are empty after cleaning, as is done for the first
# dataset: to_csv writes an empty string as an empty field, which read_csv
# loads back as NaN by default.
train3_df = train3_df.loc[train3_df["comment_text"] != ''].reset_index(drop=True)

# Make one fold
train3_df["kfold"] = 0

In [None]:
# Store the Ruddit training set (text, target, fold) without the row index.
train3_df.to_csv(path_or_buf="train3_data.csv", index=False)