## Introduction

I use data from two competitions:
* [Jigsaw Rate Severity of Toxic Comments (2021-2022)](https://www.kaggle.com/c/jigsaw-toxic-severity-rating)
* [Toxic Comment Classification Challenge (2018)](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

I use the following data files:
* comments_to_score.csv
* validation_data.csv
* train.csv

To extract part-of-speech tags from words I use:

#### nltk.tag.pos_tag(tokens, tagset=None, lang='eng')

```
>>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
[('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]

>>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
[('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'), ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
```

https://www.nltk.org/api/nltk.tag.html

> This package contains classes and interfaces for part-of-speech tagging, or simply "tagging".  
> A "tag" is a case-sensitive string that specifies some property of a token, such as its part of speech.

# 1. Import & Set & Load

In [None]:
import os
import numpy as np
import pandas as pd

import calendar
import textwrap
import re
from string import punctuation
from bs4 import BeautifulSoup

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def create_valid_data(data: pd.DataFrame) -> pd.DataFrame:
    """Reshape paired validation rows into one long frame of comments.

    Works on a copy of ``data``. Every input row gets an ``id`` of the form
    ``"<row index>_<worker>"``; the ``less_toxic`` and ``more_toxic`` columns
    are then each renamed to ``text`` and stacked (all ``less_toxic`` rows
    first), so both comments of a pair share the same ``id``.

    Returns a frame with the columns ``id`` and ``text`` and a fresh
    0..n-1 index.
    """
    pairs = data.copy()
    pairs['id'] = pairs.index.astype(str) + '_' + pairs['worker'].astype(str)

    stacked = [
        pairs[['id', side]].rename(columns={side: 'text'})
        for side in ('less_toxic', 'more_toxic')
    ]
    return pd.concat(stacked, ignore_index=True)


def get_short_data(data: pd.DataFrame, frac_n: "float or int", rs: int = None) -> pd.DataFrame:
    """Return a reproducible random subset of ``data`` in original row order.

    ``frac_n`` is interpreted by its magnitude:

    * ``0 < frac_n < 1``   -- fraction of the rows (``0.1`` -> 10 %);
    * ``1 < frac_n < 100`` -- percentage of the rows (``10`` -> 10 %);
    * ``frac_n >= 100``    -- absolute number of rows (truncated to an int).

    ``rs`` is the random seed handed to ``DataFrame.sample``. Only ``None``
    falls back to the default seed 1234, so an explicit seed of ``0`` is
    honoured.

    Raises:
        ValueError: if ``frac_n`` is ``<= 0`` or exactly ``1`` (which could
            mean either 100 % or 1 %), or is not comparable as a number.
    """
    seed = 1234 if rs is None else rs

    if 0 < frac_n < 1:
        sampled = data.sample(frac=frac_n, random_state=seed)
    elif 1 < frac_n < 100:
        sampled = data.sample(frac=frac_n / 100, random_state=seed)
    elif frac_n >= 100:
        # ``sample(n=...)`` only accepts integers, so 150.0 must become 150.
        sampled = data.sample(n=int(frac_n), random_state=seed)
    else:
        # 0 or 1
        raise ValueError("Invalid '{}' value!".format(frac_n))

    # ``sample`` shuffles the rows; restore the original ordering.
    return sampled.sort_index()

In [None]:
# Global notebook settings.
option_random = 1234567  # seed passed to get_short_data() for reproducible sampling
option_color = "green"  # base colour for the styled tables
cm = sns.light_palette(option_color, as_cmap=True)  # colormap used for table gradients
pd.set_option("max_colwidth", 90)  # display up to 90 characters per column

DEBUG_MODE = False # Cut raw data to speed up debugging

CUT_DATA = None
# Size of the sample taken from the cleaned data before tag extraction
# (only applied when DEBUG_MODE is off); the value is passed to get_short_data():
# None   :  keep all data
# < 100  :  fraction or percentage, e.g. 0.1 or 10 both mean a 10% sample
# >= 100 :  absolute row count, e.g. 100 or 1000 rows

SAVE_RESULT = True  # Save the cleaned data and the word/tag table as csv files
REMOVE_DOUBLE = True # Drop rows with duplicated raw text
SKIP_STOP_WORDS = False  # If True, word_extractor() drops the words listed in stop_words

In [None]:
# Locations of the input files.
path_comments_to_score = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
path_validation_data = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
path_train_data = "./train.csv"  # extracted from train.csv.zip of the Toxic Comment Classification Challenge

# The train file is only available zipped; extract it into the working
# directory unless a previous run already did (IPython shell escape).
if not os.path.isfile(path_train_data):
    !unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip

In [None]:
# Load the three datasets and bring them to a common layout:
# the score data's 'comment_id' column becomes 'id', the train data's
# 'comment_text' column becomes 'text'.
raw_score_data = pd.read_csv(path_comments_to_score).rename(
                                columns={'comment_id': 'id'})
raw_train_data = pd.read_csv(path_train_data).rename(
                                columns={'comment_text': 'text'})

# The validation file holds comment pairs; create_valid_data() unfolds
# them into one comment per row.
validation_data = pd.read_csv(path_validation_data)
raw_valid_data = create_valid_data(validation_data)

In [None]:
# Validation data exactly as read from validation_data.csv
validation_data

In [None]:
# The same data after create_valid_data(): one comment per row
raw_valid_data

In [None]:
# (rows, columns) of each raw dataset: score, valid, train
print(raw_score_data.shape)
print(raw_valid_data.shape)
print(raw_train_data.shape)

# 2. Data & Text preprocessing

In [None]:
# Words removed by text_preprocessor(): a few site-specific terms followed by
# the lower-cased full and abbreviated month names from ``calendar``.
# Index 0 of month_name / month_abbr is an empty placeholder, hence ``[1:]``.
# NOTE: the month list contains "may", so the modal verb "may" is removed too.
custom_stop_words = [
    'utc', 'wikipedia', 'wiki',
    *(full_name.lower() for full_name in calendar.month_name[1:]),
    *(short_name.lower() for short_name in calendar.month_abbr[1:]),
]


def text_preprocessor(text: str, max_str_len: int = None) -> str:
    """Clean one raw comment and return it as space-separated lower-case words.

    Processing order:

    1. lower-case, replace newlines with spaces, collapse runs of spaces;
    2. cut URLs, the substrings image/file/jpg/jpeg, IP addresses, times,
       4-digit numbers, ordinals, decimal/thousand-separated numbers and
       ``n/m`` fractions;
    3. strip markup with BeautifulSoup (``lxml`` parser);
    4. split on whitespace, strip punctuation from the token edges and break
       tokens glued together by punctuation ("word!!!word");
    5. drop tokens of 30+ characters, tokens containing a digit, tokens
       without any of ``a-z A-Z ' -`` and one-character tokens except "i";
    6. optionally shorten the result to ``max_str_len`` characters;
    7. remove the words listed in ``custom_stop_words``.

    Args:
        text: raw comment text.
        max_str_len: if truthy, the cleaned text is cut on a word boundary to
            at most this many characters (before stop-word removal).

    Returns:
        The cleaned text; may be an empty string.
    """
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.strip()

    text = re.sub(' +', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # NOTE(review): this removes the substrings anywhere, also inside longer
    # words ("profile" -> "pro"). Kept as is; confirm whether whole-word
    # matching was intended.
    text = re.sub(r'image|file|jpg|jpeg', '', text)
    # Cut IP-address
    text = re.sub(r'\d{1,4}\.\d{1,4}\.\d{1,4}\.\d{1,4}', '', text)
    # Cut time, period or year
    text = re.sub(r'\d{2,}[:-]\d{2,}|\d{4}', '', text)
    # Cut ordinals such as 20th or 1st. A group is required here: the former
    # ``[th|st]`` was a character class matching a single t, h, | or s.
    text = re.sub(r'\d+(?:st|nd|rd|th)\b', '', text)
    # Cut money
    text = re.sub(r'\d{1,}[,.]\d{2,}', '', text)
    # Cut address (9/169)
    text = re.sub(r'\d{1,}/\d{1,}', '', text)

    soup = BeautifulSoup(text, 'lxml')
    text = soup.get_text()

    # "word!!!!!!word!!?!?!!" -> "word", "word"
    glue_chars = re.compile(r'[?!:;|)(+".,#&_]')

    tokens = []
    for raw_token in text.split():
        word = raw_token.strip(punctuation)
        tokens.extend(part.strip(punctuation) for part in glue_chars.split(word))

    # skip word "uhbsirtubgyihihlkjngkjbnkgjnbkf"
    max_word_len = 30

    # Purely numeric tokens need no separate check: every token that contains
    # a digit is rejected below.
    words_cleaned = [
        w for w in tokens
        if len(w) < max_word_len
        and not re.search(r'\d', w)           # skip words with numbers
        and re.search(r"[a-zA-Z'\-]", w)      # needs a letter, ' or -
        and (len(w) > 1 or w == 'i')          # skip one letter
    ]

    text = " ".join(words_cleaned)

    if max_str_len:
        # Previously referenced the undefined name ``max_len`` (NameError).
        text = textwrap.shorten(text, width=max_str_len, placeholder='')

    words_skipped = [w for w in text.split()
                         if w.lower() not in custom_stop_words]
    text = " ".join(words_skipped)

    return text

In [None]:
# Work on copies so the raw frames stay untouched.
score_data = raw_score_data.copy()
valid_data = raw_valid_data.copy()
train_data = raw_train_data.copy()

# In debug mode keep only a small fixed-size random sample of each dataset.
if DEBUG_MODE:
    score_data = get_short_data(score_data, 300, option_random)
    valid_data = get_short_data(valid_data, 150, option_random)
    train_data = get_short_data(train_data, 300, option_random)

# Keep only the first row for every distinct raw text.
if REMOVE_DOUBLE:
    score_data = score_data.drop_duplicates(subset=['text'])
    valid_data = valid_data.drop_duplicates(subset=['text'])
    train_data = train_data.drop_duplicates(subset=['text'])

# Shapes after the optional sampling / de-duplication.
print(score_data.shape)
print(valid_data.shape)
print(train_data.shape)

In [None]:
# Demonstrate text_preprocessor() on a hand-made sample containing a date,
# a time, an IP address, glued punctuation, a word with digits, a number
# and an over-long word.
raw_text = """
28 July 2008 (UTC) 12:50 20.215.60.232 I w WHAT!!?!!!Oleg's bags name F-25 Duke1V 1,000 :-)
'"\n \n\nGjalexei, You asked about whether there is an ""anti-editorializing"" policy here.
There is, and it\'s called wikipedia:neutral point of view.  It discusses at some length...
the case of what we should do when writing about a subject which most of us find repugnant.
Theeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"'
"""

text_preprocessor(raw_text)

In [None]:
%%time
# Clean the 'text' column of every dataset; the un-cleaned frames
# (score_data, valid_data, train_data) are kept for later comparison.
clean_score_data = score_data.copy()
clean_valid_data = valid_data.copy()
clean_train_data = train_data.copy()

clean_score_data['text'] = clean_score_data['text'].apply(text_preprocessor)
clean_valid_data['text'] = clean_valid_data['text'].apply(text_preprocessor)
clean_train_data['text'] = clean_train_data['text'].apply(text_preprocessor)

In [None]:
# Summary statistics of the cleaned text length (in characters) per dataset,
# including the listed percentiles; values are cast to integers for display.
percentiles = [.05, .25, .5, .75, .85, .95]
pd.DataFrame({'score': clean_score_data['text'].str.len().describe(percentiles),
              'valid': clean_valid_data['text'].str.len().describe(percentiles),
              'train': clean_train_data['text'].str.len().describe(percentiles)}
).astype(int)

In [None]:
# Rows whose cleaned text is shorter than 20 characters.
check_mask = clean_score_data['text'].str.len() < 20

# Show raw and cleaned text side by side to spot over-aggressive cleaning.
print("\n=== SCORE DATA: Text columns before/after text preprocessing ===")
pd.DataFrame({'id': clean_score_data.loc[check_mask, 'id'],
              'raw_text': score_data.loc[check_mask, 'text'],
              'clean_text': clean_score_data.loc[check_mask, 'text']}
)

In [None]:
# Rows whose cleaned text is shorter than 20 characters.
check_mask = clean_valid_data['text'].str.len() < 20

# Show raw and cleaned text side by side to spot over-aggressive cleaning.
print("\n=== VALID DATA: Text columns before/after text preprocessing ===")
pd.DataFrame({'id': clean_valid_data.loc[check_mask, 'id'],
              'raw_text': valid_data.loc[check_mask, 'text'],
              'clean_text': clean_valid_data.loc[check_mask, 'text']}
)

In [None]:
# Rows whose cleaned text is shorter than 20 characters.
check_mask = clean_train_data['text'].str.len() < 20

# Show raw and cleaned text side by side to spot over-aggressive cleaning.
print("\n=== TRAIN DATA: Text columns before/after text preprocessing ===")
pd.DataFrame({'id': clean_train_data.loc[check_mask, 'id'],
              'raw_text': train_data.loc[check_mask, 'text'],
              'clean_text': clean_train_data.loc[check_mask, 'text']}
)

In [None]:
# Optionally write the cleaned datasets to the working directory; the frame
# index is stored in a column named 'index'.
if SAVE_RESULT:
    clean_score_data.to_csv('clean_score_data.csv', index_label='index')
    clean_valid_data.to_csv('clean_valid_data.csv', index_label='index')
    clean_train_data.to_csv('clean_train_data.csv', index_label='index')

# Report whether the files were written.
print(SAVE_RESULT)

# 3. Word & Tag extraction

In [None]:
# Words that word_extractor() can skip: NLTK's English stop-word list
# plus two extra contractions.
stop_words = stopwords.words('english') \
                + ["can't", "i'm"]


def word_extractor(data: pd.Series, max_str_len: int = None, is_stopwords: bool = False) -> pd.DataFrame:
    """Split every text into words and tag each word with its part of speech.

    Args:
        data: series of (already cleaned) texts.
        max_str_len: if truthy, each text is first cut on a word boundary to
            at most this many characters.
        is_stopwords: if truthy, words listed in ``stop_words`` are removed
            before tagging.

    Returns:
        A frame with one row per word occurrence and the columns ``word``
        and ``tag`` (universal tagset, stored as a categorical column).
        Empty input yields an empty frame with the same columns.
    """
    # Build the lookup set once per call instead of scanning the stop-word
    # list for every single token.
    excluded = set(stop_words) if is_stopwords else None

    words_and_tags = []

    for string in data.values:
        if max_str_len:
            string = textwrap.shorten(string, width=max_str_len, placeholder='')

        tokens = string.split()

        if excluded is not None:
            tokens = [w for w in tokens if w not in excluded]

        # Each text is tagged separately as one token sequence.
        words_and_tags.extend(pos_tag(tokens, tagset='universal'))

    result = pd.DataFrame.from_records(words_and_tags, columns=['word', 'tag'])

    result['tag'] = result['tag'].astype("category")

    return result

In [None]:
# Optionally reduce the cleaned data before the tagging step.
# Skipped in debug mode and when CUT_DATA is None (or otherwise falsy).
if not DEBUG_MODE and CUT_DATA:
    clean_score_data = get_short_data(clean_score_data, CUT_DATA, option_random)
    clean_valid_data = get_short_data(clean_valid_data, CUT_DATA, option_random)
    clean_train_data = get_short_data(clean_train_data, CUT_DATA, option_random)

    # Shapes after cutting.
    print(clean_score_data.shape)
    print(clean_valid_data.shape)
    print(clean_train_data.shape)

In [None]:
# Demonstrate the tagger on one short sentence.
clean_text = "I'm busy You asked about whether there is an anti-editorializing policy"

# Using nltk.tag.pos_tag(tokens, tagset=None, lang='eng')
# tagset='universal'  < None, universal, wsj, brown
# The sentence is first cut to at most 70 characters on a word boundary.
# Note: this demo tokenises with word_tokenize(), whereas word_extractor()
# splits on whitespace only.
text_tagged = pos_tag(
    word_tokenize(
        textwrap.shorten(
            clean_text, width=70, placeholder=''
        )
    ),
    tagset='universal'
)

print(*text_tagged, " ...")

In [None]:
%%time
# Tag the words of every dataset (each text cut to max_str_len characters)
# and stack the three results; the dataset name becomes the outer index
# level 'data', the per-dataset row number the inner level 'index'.
max_str_len = 500
words_and_tags = pd.concat(
    [word_extractor(clean_score_data['text'], max_str_len, SKIP_STOP_WORDS),
     word_extractor(clean_valid_data['text'], max_str_len, SKIP_STOP_WORDS),
     word_extractor(clean_train_data['text'], max_str_len, SKIP_STOP_WORDS)
    ], keys=['score', 'valid', 'train'],
       names=['data', 'index']
)

In [None]:
# Store 'tag' as a categorical column again after the concatenation.
words_and_tags['tag'] = words_and_tags['tag'].astype("category")
# Turn the 'data' index level into a regular column and renumber the rows.
words_and_tags = words_and_tags.reset_index(level='data').reset_index(drop=True)
words_and_tags['data'] = words_and_tags['data'].astype("category")

# Column dtypes and the real memory footprint (including string contents).
words_and_tags.info(memory_usage='deep')

In [None]:
# One row per word occurrence: dataset name, word, tag
words_and_tags

In [None]:
# Optionally write the word/tag table to the working directory (without the index).
if SAVE_RESULT:
    words_and_tags.to_csv('words_and_tags.csv', index=False)

# Report whether the file was written.
print(SAVE_RESULT)

# 4. Tag & Word analysis

In [None]:
def tags_info(data: pd.DataFrame, is_norm: bool = False, is_all: bool = False, is_style: bool = True) -> pd.DataFrame:
    """Cross-tabulate tags (rows) against datasets (columns).

    Args:
        data: frame with the columns ``tag`` and ``data``.
        is_norm: if truthy, show column-wise percentages (rounded to two
            decimals) instead of counts.
        is_all: if truthy, request margins; a resulting ``All`` row is
            dropped again, while an ``All`` column is kept and used to sort
            the rows in descending order.
        is_style: if truthy and ``is_norm`` is falsy, the table is returned
            as a pandas Styler: a bar on the ``All`` column when it exists,
            a background gradient otherwise.

    Returns:
        A DataFrame, or a Styler wrapping it when styling applies.
    """
    # crosstab wants the axis name to normalise over, not a bare True.
    normalize = 'columns' if is_norm else is_norm

    table = pd.crosstab(data["tag"], data["data"],
                        normalize=normalize, margins=is_all)

    if is_norm:
        table = (table * 100).round(2)

    if is_all and 'All' in table.index:
        table = table.drop(index='All')

    has_total = 'All' in table.columns
    if has_total:
        table = table.sort_values(by='All', ascending=False)

    # Percent tables and unstyled requests are returned as plain frames.
    if not is_style or is_norm:
        return table

    if has_total:
        return table.style.bar(subset=['All'], color=option_color)

    return table.style.background_gradient(cmap=cm)


def tags_plot(data: pd.DataFrame, figsize: tuple = (12, 6)) -> plt.figure:
    """Draw the share of every tag per dataset as filled horizontal bars.

    Args:
        data: frame with the columns ``data`` (one bar per value) and
            ``tag`` (the colour segments of each bar).
        figsize: size of the matplotlib figure.

    The figure is shown directly; nothing is returned.
    """
    # Order the colour segments from the most to the least frequent tag.
    tag_counts = data["tag"].value_counts(ascending=False)
    segment_order = tag_counts.index.to_list()

    plt.figure(figsize=figsize)
    sns.histplot(
        data=data,
        y="data",
        hue="tag",
        hue_order=segment_order,
        multiple='fill',
        shrink=.75,
    )
    plt.title("Frequency of using tags in datasets")
    plt.xlabel("")
    plt.ylabel("")
    plt.show()

## 4.1. Tags info

In [None]:
# Tag counts per dataset, shown with a background gradient
tags_info(words_and_tags)

In [None]:
# All tag labels, in the row order of the tag table. The plain DataFrame is
# requested (is_style=False) so the labels are read from the frame itself
# instead of reaching through a Styler built only to be discarded.
tag_types = tags_info(words_and_tags, is_style=False).index.to_list()
max_num_words = 20

# For every tag: number of distinct words and the first few occurrences.
for x_tag in tag_types:
    x_data = words_and_tags.loc[words_and_tags.tag == x_tag, 'word']
    print()
    print("Tag:", x_tag)
    print("Nunique:", x_data.nunique())
    print("Sample words:", *x_data.values[:max_num_words])

In [None]:
# Number of occurrences of every tag per dataset, sorted by the overall total
tags_info(words_and_tags, is_all=True)

In [None]:
# Share of every tag within each dataset, in percent
tags_info(words_and_tags, is_norm=True, is_all=True)

In [None]:
# The tag shares per dataset as a filled bar chart
tags_plot(words_and_tags)

## 4.2. Words info

In [None]:
def popular_words(data: pd.DataFrame, n: int = 10, is_norm: bool = False) -> pd.DataFrame:
    """Return the ``n`` most frequent words with their per-dataset figures.

    Args:
        data: frame with the columns ``word`` and ``data``.
        n: number of words to return.
        is_norm: if truthy, report column-wise percentages (rounded to two
            decimals) instead of counts.

    Returns:
        The crosstab of words (rows) by datasets (columns) plus an ``All``
        column, limited to the ``n`` rows with the largest ``All`` value.
    """
    # crosstab wants the axis name to normalise over, not a bare True.
    normalize = 'columns' if is_norm else is_norm

    table = pd.crosstab(data["word"], data["data"],
                        normalize=normalize, margins=True)

    if is_norm:
        table = (table * 100).round(2)

    # The margin row would otherwise always rank first.
    table = table.drop(index='All', errors='ignore')

    return table.nlargest(n, 'All')

In [None]:
# The 15 most frequent words overall, with counts per dataset
popular_words(words_and_tags, 15)

In [None]:
# The 10 most frequent words overall, as a percentage of each dataset's words
popular_words(words_and_tags, 10, is_norm=True)

In [None]:
# Most frequent words for every tag, except the "." and "X" tags.
for x_tag in tag_types:
    if x_tag in [".", "X"]:
        continue

    # Restrict the table to the rows carrying the current tag.
    x_data = words_and_tags.loc[words_and_tags.tag == x_tag]
    print(f"Tag: {x_tag}")
    print("Frequency of mentioning the words (TOP-10) in percent")
    # display() is provided by the notebook environment.
    display(popular_words(x_data, is_norm=True))

# 5. Stop words

In [None]:
# Stop words removed by text_preprocessor(): how many, then the words themselves
print(len(custom_stop_words))
print(*custom_stop_words)

In [None]:
# Stop words available to word_extractor(): how many, then the words themselves
print(len(stop_words))
print(*stop_words)