# Imports, helper definitions & data loading

In [None]:
# Core numeric / tabular libraries.
import numpy as np
import pandas as pd
# Allow up to 80 characters per column when a DataFrame is displayed.
pd.set_option("max_colwidth", 80)

import warnings
# Silence UserWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# VADER sentiment analyzer; its 'neg' score is used below as a toxicity proxy.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
def count_duplicates(data: pd.Series) -> pd.DataFrame:
    """Tabulate how often each distinct value of ``data`` occurs.

    The returned frame has the columns ``count`` and ``text`` in the order
    produced by ``Series.value_counts`` (most frequent first), and its row
    labels start at 1 instead of 0.
    """
    frequencies = data.value_counts()

    table = pd.DataFrame({'count': frequencies.values,
                          'text': frequencies.index})

    # Shift the default 0-based labels so they read as positions 1..n.
    table.index = table.index + 1

    return table


def toxic_rate(string: str, analyzer=None) -> float:
    """Return the VADER negativity score of ``string`` as a toxicity proxy.

    Args:
        string: Text to score.
        analyzer: Optional object exposing ``polarity_scores(text)`` that
            returns a mapping with a ``'neg'`` entry. When omitted, the
            notebook-level ``sid`` analyzer is used, so it must have been
            created before the first call.

    Returns:
        The ``'neg'`` component as a float; ``0.0`` when it is missing or zero.
    """
    if analyzer is None:
        # Resolved at call time: `sid` is created in a later notebook cell.
        analyzer = sid

    neg_value = analyzer.polarity_scores(string).get('neg')

    # Always return a float so the result matches the annotation
    # (previously the fallback was the int 0).
    return float(neg_value) if neg_value else 0.0

In [None]:
# Relative location of the validation CSV under ../input.
validation_data_path = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
valid_data = pd.read_csv(validation_data_path)
# Last expression of the cell: shows (rows, columns) of the loaded frame.
valid_data.shape

# 1. Duplicates in rows

In [None]:
# Boolean mask: True for a row whose (less_toxic, more_toxic) pair already
# appeared in an earlier row; the first occurrence of each pair stays False.
is_duplicate = valid_data.duplicated(subset=['less_toxic', 'more_toxic'])

# The mean of a boolean mask is the fraction of True values -> percentage.
print("Duplicates: {} %".format(round(is_duplicate.mean() * 100, 2)))

In [None]:
# Row counts: all rows, repeated pairs only, and what remains once the
# repeats are dropped. Summing a boolean mask counts its True entries.
print("With duplicates:   ", len(valid_data))
print("Count duplicates:  ", int(is_duplicate.sum()))
print("Without duplicates:", int((~is_duplicate).sum()))

In [None]:
# Preview the first 14 rows flagged as repeated pairs.
valid_data[is_duplicate].head(14)

In [None]:
# Repeated rows whose 'less_toxic' text contains the straw-poll sentence.
valid_data.loc[
    is_duplicate
    & valid_data['less_toxic'].str.contains(
        'Straw poll being conducted on Catholic Church')
]

In [None]:
# Rows NOT flagged as repeats (first occurrences and unique pairs) whose
# 'less_toxic' text contains the straw-poll sentence.
valid_data.loc[
    ~is_duplicate
    & valid_data['less_toxic'].str.contains(
        'Straw poll being conducted on Catholic Church')
]

In [None]:
# Glue both comments of a row into one string (single space in between) and
# tabulate how often each joined pair occurs.
n_joined = count_duplicates(valid_data['less_toxic'] \
                            + " "
                            + valid_data['more_toxic'])
# Display the resulting count/text table.
n_joined

In [None]:
# Distribution of repetition counts: how many distinct pairs occur once, twice, ...
n_joined['count'].value_counts()

# 2. Duplicates in columns

## 2.1. Column 'less_toxic'

In [None]:
# Percentage of rows whose 'less_toxic' text already appeared in an earlier row.
print("Duplicates: {} %".format(
                round(valid_data['less_toxic'].duplicated() \
                          .mean() * 100, 2)))

In [None]:
# Occurrence count of every distinct 'less_toxic' text, most frequent first.
n_less_toxic = count_duplicates(valid_data['less_toxic'])
# Display the resulting count/text table.
n_less_toxic

In [None]:
# `str.contains` interprets its pattern as a regular expression by default,
# so the trailing '?' would make the preceding 'e' optional instead of
# matching a literal question mark. regex=False forces a plain substring
# search for the exact sentence.
valid_data[valid_data['less_toxic'].str.contains(
    'How many sockpuppets do you have?', regex=False)]

In [None]:
# Histogram of how many times each distinct 'less_toxic' text occurs.
# The trailing ';' keeps Jupyter from echoing the call's return value.
n_less_toxic['count'].hist(bins=13, figsize=(12, 5));

In [None]:
# Share (in percent, two decimals) of distinct 'less_toxic' texts per
# repetition count, rendered as strings such as "12.34 %".
(
    (n_less_toxic['count'].value_counts(normalize=True) * 100)
    .round(2)
    .map("{} %".format)
)

## 2.2. Column 'more_toxic'

In [None]:
# Percentage of rows whose 'more_toxic' text already appeared in an earlier row.
print("Duplicates: {} %".format(
                round(valid_data['more_toxic'].duplicated() \
                          .mean() * 100, 2)))

In [None]:
# Occurrence count of every distinct 'more_toxic' text, most frequent first.
n_more_toxic = count_duplicates(valid_data['more_toxic'])
# Display the resulting count/text table.
n_more_toxic

In [None]:
# All rows whose 'more_toxic' text contains the given phrase.
valid_data[valid_data['more_toxic'].str.contains('ALL NIGHT BITCH')]

In [None]:
# Histogram of how many times each distinct 'more_toxic' text occurs.
# The trailing ';' keeps Jupyter from echoing the call's return value.
n_more_toxic['count'].hist(bins=14, figsize=(12, 5));

In [None]:
# Share (in percent, two decimals) of distinct 'more_toxic' texts per
# repetition count, rendered as strings such as "12.34 %".
(
    (n_more_toxic['count'].value_counts(normalize=True) * 100)
    .round(2)
    .map("{} %".format)
)

# 3. Check score with/without duplicates

### Sample usage of the VADER sentiment analyzer

> from nltk.sentiment.vader import SentimentIntensityAnalyzer  

https://www.nltk.org/howto/sentiment.html

In [None]:
# Notebook-level analyzer instance; `toxic_rate` reads this name when called.
sid = SentimentIntensityAnalyzer()
# Sample call on one sentence; the cell displays the returned scores.
sid.polarity_scores("How many sockpuppets do you have?")

## 3.1. Score with duplicates

In [None]:
%%time
# Work on a copy so the raw texts in `valid_data` stay untouched.
toxic_data = valid_data.copy()

# Replace every comment by its toxicity score.
toxic_data['less_toxic'] = toxic_data['less_toxic'].apply(toxic_rate)
toxic_data['more_toxic'] = toxic_data['more_toxic'].apply(toxic_rate)

# Fraction of pairs in which the "less toxic" comment scored strictly lower.
toxic_score = (toxic_data['less_toxic'] < toxic_data['more_toxic']).mean()

print("\nSCORE: {}\n".format(round(toxic_score, 5)))

## 3.2. Score without duplicates

In [None]:
%%time
# Keep only rows not flagged as repeated pairs, copied so that
# `valid_data` itself is left unchanged.
toxic_data = valid_data.loc[~is_duplicate].copy()

# Replace every comment by its toxicity score.
toxic_data['less_toxic'] = toxic_data['less_toxic'].apply(toxic_rate)
toxic_data['more_toxic'] = toxic_data['more_toxic'].apply(toxic_rate)

# Fraction of pairs in which the "less toxic" comment scored strictly lower.
toxic_score = (toxic_data['less_toxic'] < toxic_data['more_toxic']).mean()

print("\nSCORE: {}\n".format(round(toxic_score, 5)))