## Disclaimer:
### Explicit content and rude language are present in the dataset

In [None]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from tqdm.notebook import tqdm

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) and seaborn grid style for all plots.
plt.rcParams['figure.figsize'] = (12, 5);
sns.set_style('whitegrid')

# Display the value of every top-level expression in a cell, not only the
# last one. Cells below rely on this to show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Directory holding the competition files; change it here when running
# outside the original environment.
DATA_DIR = Path("../input/jigsaw-toxic-severity-rating")

# validation_data.csv holds the (less_toxic, more_toxic) comment pairs that
# the rest of the notebook treats as training data.
train_df = pd.read_csv(DATA_DIR / "validation_data.csv")
test_df = pd.read_csv(DATA_DIR / "comments_to_score.csv")

In [None]:
# Peek at the last three rows of each frame. Both expressions are rendered
# because ast_node_interactivity is set to "all" in the setup cell.
train_df.tail(3)
test_df.tail(3)

In [None]:
# Flag every row whose (less_toxic, more_toxic) pair repeats an earlier row.
# The mask is computed once and reused for both statistics.
is_repeated_pair = train_df.duplicated(["less_toxic", "more_toxic"])

# Absolute number of repeated pairs ...
is_repeated_pair.sum()
# ... and their share of all rows (mean of a boolean mask == sum / length).
is_repeated_pair.mean()

#### *Incredible, almost half of all comment pairs are duplicates! Let's use this fact to shorten our dataset.*

In [None]:
# Keep the first occurrence of every (less_toxic, more_toxic) pair.
# drop=True discards the old row labels instead of inserting them as an
# "index" column, so the frame keeps its original columns and the cell can be
# re-run safely (without it, a re-run adds "level_0" and a further re-run
# raises ValueError).
train_df = train_df.drop_duplicates(["less_toxic", "more_toxic"]).reset_index(drop=True)
train_df.shape

In [None]:
# Substring -> replacement map used as the default for fix_words().
# fix_words() applies these with plain str.replace in insertion order, so
# every occurrence is rewritten: e.g. each "3" or "4" in the text is replaced,
# including those inside ordinary numbers.
letter_patterns = {
    "3": "e",
    "4": "for",
    " u ": " you ",
    "n't": " not"
}

def remove_unprintables(text):
    """Replace newlines, carriage returns and tabs with spaces.

    After the substitution, every run of consecutive spaces is collapsed
    into a single space. Leading/trailing spaces are not stripped.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    whitespace_to_space = str.maketrans({"\n": " ", "\r": " ", "\t": " "})
    flattened = text.translate(whitespace_to_space)
    return re.sub(" +", " ", flattened)

def fix_words(text, patterns=letter_patterns):
    """Apply a set of substring replacements to ``text``.

    Replacements are applied one after another in the mapping's iteration
    order, each on the result of the previous one.

    Args:
        text: Input string.
        patterns: Mapping of substring -> replacement. Defaults to
            ``letter_patterns``.

    Returns:
        The string with all replacements applied.
    """
    for needle, replacement in patterns.items():
        text = text.replace(needle, replacement)
    return text

def tokenize(text, lower=True):
    """Split ``text`` into tokens on spaces after stripping punctuation.

    Each listed punctuation character is turned into a space, the text is
    passed through ``remove_unprintables`` and then split on single spaces;
    empty pieces are discarded.

    Args:
        text: Input string.
        lower: When true (default), every token is lower-cased.

    Returns:
        List of token strings.
    """
    punctuation = ",.~!@#$%^&*()_=-[]{}\"'"
    # Map every punctuation character to a space in a single pass.
    text = text.translate(str.maketrans(punctuation, " " * len(punctuation)))
    words = [piece for piece in remove_unprintables(text).split(" ") if piece]
    if lower:
        words = [word.lower() for word in words]
    return words

def count_caps_words(text):
    """Count the tokens of ``text`` that are written in upper case.

    The text is tokenized with ``tokenize(text, lower=False)`` so the original
    casing is preserved. A token counts when ``str.isupper()`` is true for it,
    i.e. it has at least one cased character and none of them is lower case.

    Args:
        text: Input string.

    Returns:
        Number of upper-case tokens as an ``int`` (0 for empty text).
    """
    tokens = tokenize(text, lower=False)
    # The previous version referenced an undefined name (`t`) and called
    # `.sum()` on a list, so it raised on any non-empty input.
    return sum(1 for token in tokens if token.isupper())

def count_sticky_keys(text, min_run=5):
    """Count runs of one character repeated at least ``min_run`` times.

    Each maximal run is counted once, however long it is. Any character
    qualifies, including spaces and newlines.

    Unlike the previous implementation, a later run of the *same* character
    is counted as well (e.g. ``"aaaaa b aaaaa"`` contains two runs).

    Args:
        text: Input string.
        min_run: Minimum number of consecutive identical characters that
            makes a run count. Defaults to 5, the length the original
            five-character window detected.

    Returns:
        Number of qualifying runs as an ``int``.
    """
    count = 0
    run_char, run_len = None, 0
    for char in text:
        if char == run_char:
            run_len += 1
        else:
            # A different character starts a new run.
            run_char, run_len = char, 1
        # Count the run exactly once: at the moment it reaches the threshold.
        if run_len == min_run:
            count += 1
    return count

In [None]:
# Number of rows whose less_toxic comment also appears somewhere in the
# more_toxic column. Series.isin does the membership test in one vectorized
# pass instead of rebuilding a Python list for every row.
c = int(train_df["less_toxic"].isin(train_df["more_toxic"]).sum())
c

In [None]:
def find_least_toxic(df=None):
    """Find the rows used as starting points for comment chains.

    A row qualifies when its ``less_toxic`` text

    * also occurs somewhere in the ``more_toxic`` column, and
    * does not occur as ``less_toxic`` in any other row.

    This is the same selection as the previous row-by-row loop, computed
    with vectorized pandas operations instead of rebuilding Python lists and
    dropping a row from the frame on every iteration.
    NOTE(review): missing values (NaN) are matched as equal by ``isin`` /
    ``duplicated``, unlike the previous ``in`` checks; the comment columns
    are presumably free of NaN — confirm.

    Args:
        df: Frame with ``less_toxic`` and ``more_toxic`` columns. Defaults to
            the notebook-level ``train_df``.

    Returns:
        List of index labels of the qualifying rows, in frame order.
    """
    if df is None:
        df = train_df
    less = df["less_toxic"]
    appears_as_more_toxic = less.isin(df["more_toxic"])
    # keep=False marks *every* member of a duplicated group.
    unique_as_less_toxic = ~less.duplicated(keep=False)
    mask = (appears_as_more_toxic & unique_as_less_toxic).to_numpy()
    return df.index[mask].tolist()

def find_more_toxic(idx):
    """Return the rows that continue a chain from row ``idx``.

    Looks up the ``more_toxic`` text of row ``idx`` in ``train_df`` and
    collects every row whose ``less_toxic`` text equals it.

    Args:
        idx: Index label of a row in ``train_df``.

    Returns:
        List of index labels of the matching rows (possibly empty).
    """
    more_toxic_text = train_df.loc[idx, "more_toxic"]
    continues_chain = train_df["less_toxic"] == more_toxic_text
    return train_df.index[continues_chain].tolist()

def find_all_chains():
    """Build chains of rows ordered from less toxic to more toxic.

    Every row returned by ``find_least_toxic()`` starts a chain. From the
    current row the walk moves to the first row returned by
    ``find_more_toxic()`` that has not been visited in this chain yet, and
    stops when no such row exists. Tracking visited rows guarantees
    termination even when the comparisons form a cycle.

    Changes from the previous implementation:

    * visited rows are tracked in a set instead of copying the whole
      DataFrame for every start row and rebuilding its index list for every
      membership test;
    * the chain records the row the walk actually moves to. Previously it
      always recorded the first candidate, even when that row had already
      been visited and the walk continued from a different one;
    * the end of a chain is detected explicitly rather than by catching a
      broad ``Exception``.

    Returns:
        List of chains, each a tuple of ``train_df`` index labels with at
        least two entries, ordered by starting row.
    """
    chains = []

    for start_idx in tqdm(find_least_toxic()):
        chain = [start_idx]
        visited = {start_idx}
        idx = start_idx

        while True:
            # First continuation that has not been used in this chain yet.
            next_idx = next(
                (cand for cand in find_more_toxic(idx) if cand not in visited),
                None,
            )
            if next_idx is None:
                break
            visited.add(next_idx)
            chain.append(next_idx)
            idx = next_idx

        # A lone starting row is not a chain.
        if len(chain) > 1:
            chains.append(tuple(chain))

    # Order-preserving de-duplication keeps the result reproducible.
    return list(dict.fromkeys(chains))

In [None]:
# Build all chains once and keep them for the cells below.
chains = find_all_chains()

In [None]:
# Number of chains found.
len(chains)

#### *Let's look at some relative ratings. We might encounter some inconsistencies due to, well, the subjectivity of the task.*

In [None]:
# Print a few example chains, taken from the end of the list.
# Expression statements inside a loop are not auto-displayed by the notebook
# (only top-level expressions are), so the text is printed explicitly.
# Slicing also avoids an IndexError when fewer chains exist, and avoids the
# `chains[-0]` pitfall that mixed the first chain in with the last ones.
N_EXAMPLES = 15
for chain in chains[-N_EXAMPLES:]:
    # Start with the least toxic comment, then one line per step up the chain.
    lines = [train_df.loc[chain[0], "less_toxic"]]
    lines += [
        "{more toxic} " + train_df.loc[idx, "more_toxic"].replace("\n", " ")
        for idx in chain
    ]
    print("\n".join(lines))
    print("-" * 30)

*Work in progress*