In [None]:
# Installations
!pip install -q nltk
!pip install -q WordCloud
!pip install -q plotly
!pip install -q transformers
!pip install -q twython

In [None]:
import pandas as pd
import numpy as np
import os

#sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

#NLP
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords
from wordcloud  import WordCloud, STOPWORDS
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
# Tweet tokenizer does not split at apostophes which is what we want
from nltk.tokenize import TweetTokenizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

#Modelling

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

from kaggle_datasets import KaggleDatasets

In [None]:
# Show what is currently stored under /kaggle/working (the directory the submission is written to at the end)
os.listdir('/kaggle/working')

###### Code Header
- **Notebook Name**: Multilingual Toxicity Scoring
- **Author(s)**: Vallabh Reddy
- **Date**: 13th May 2020
- **Edits to be made**:
- **Additions Planned**:
- **Workflow Plan**: 
    - Import data and preliminary setup
    - Inspect datasets
    - Visualize dataset properties
    - Follow text preprocessing steps such as stemming, lemmatization, case generalization
    - Wrangle the text datasets to extract unigrams, bigrams and trigrams
    - Visualize the top used n-grams for toxicity and non-toxicity through word clouds and other means
    - Investigate need for any other text representations required like tfidf, word vectors etc
    - Pick models, train models. Should I train them only in English? Or would translating to other languages and then training models on that data help? Instead I could just translate test to english and then pass into model
    - Use validation dataset to tune hyperparameters
    - Test models on test dataset after translating
    - Investigate value of ensembling
- **Notes to Self**:
    - How do I deal with spelling mistakes? Is there a way to coerce words to the right spelling using sentence context? Explore existing text analysis models for this.
    

### Acknowledgments

1. [VADER(Valence Aware Dictionary and sEntiment Reasoner)](https://pypi.org/project/vaderSentiment/) - The NLTK package contains the VADER tool, which allows us to score the sentiment of a piece of text.
2. [HuggingFace's Transformers and Tokenizers](https://huggingface.co/transformers/) - HuggingFace has a collection of pretrained NLP models to pick from including Facebook's RoBERTa and Google's BERT. The same package comes with tokenizers to preprocess the text for these models
3. [Jigsaw TPU: XLM-RoBERTa](https://www.kaggle.com/xhlulu/jigsaw-tpu-xlm-roberta) ~ Xhlulu

## Contents:
- [About This Project](#Problem-Space)
- [About The Datasets](#about-datasets)
- [Setup](#Setup)
    - [Wrangling](#Wrangling)
    - [EDA](#eda)
- [Modelling](#Modelling)
    - [1. XLM-RoBERTa](#Roberta)

## About this project <a class="anchor" id="Problem-Space"></a>

The Conversation AI team, a research initiative founded by Google and Jigsaw, is tasked with improving the vigilance against online toxicity in conversation. The goal of [this competition](https://www.kaggle.com/c/jigsaw-multilingual-toxic-comment-classification/overview) is to be able to predict the toxicity of multilingual comments using only English comments as our training data. 

Excerpts from the competition are given below.

>It only takes one toxic comment to sour an online discussion. The Conversation AI team, a research initiative founded by Jigsaw and Google, builds technology to protect voices in conversation. A main area of focus is machine learning models that can identify toxicity in online conversations, where toxicity is defined as anything rude, disrespectful or otherwise likely to make someone leave a discussion. If these toxic contributions can be identified, we could have a safer, more collaborative internet.
>
>In the previous 2018 Toxic Comment Classification Challenge, Kagglers built multi-headed models to recognize toxicity and several subtypes of toxicity. In 2019, in the Unintended Bias in Toxicity Classification Challenge, you worked to build toxicity models that operate fairly across a diverse range of conversations. This year, we're taking advantage of Kaggle's new TPU support and challenging you to build multilingual models with English-only training data.
>
>Jigsaw's API, Perspective, serves toxicity models and others in a growing set of languages (see our documentation for the full list). Over the past year, the field has seen impressive multilingual capabilities from the latest model innovations, including few- and zero-shot learning. We're excited to learn whether these results "translate" (pun intended!) to toxicity classification. Your training data will be the English data provided for our previous two competitions and your test data will be Wikipedia talk page comments in several different languages.
>
>As our computing resources and modeling capabilities grow, so does our potential to support healthy conversations across the globe. Develop strategies to build effective multilingual models and you'll help Conversation AI and the entire industry realize that potential.
>
>*Disclaimer: The dataset for this competition contains text that may be considered profane, vulgar, or offensive.*

### About the datasets <a class = 'anchor' id = 'about-datasets'></a>
We are given the following datasets.

**Training set 1**
- *Comment_text*: Contains the string that stores the comment.
- *Toxic*: A boolean value, 1 = toxic, 0 = non-toxic.

**Training set 2**: Has 'Comment_text' and 'toxic' similar to Training Set 1, but the 'Toxic' column is a probability. Also has several other descriptor probabilities.

**Validation Set**
- *Comment_text*: Same as Training Set 1.
- *Toxic*: Same as Training Set 1.
- *Lang*: Two-letter code for the language of the comment, e.g. 'es' = Spanish (Español), 'it' = Italian.

**Test Set**
- *Content*: The comment text, equivalent to *Comment_text* in Training Set 1, but stored in a column named `content`.
- *Lang*: same as Validation Set.
- Does not have a 'Toxic' flag and we are tasked with predicting it.


## Setup <a class="anchor" id="Setup"></a>

In [None]:
# Importing all the required datasets.
# dir_path already ends with a separator, so the file names are attached with
# os.path.join instead of being prefixed with another "/" (which built "//" paths).
dir_path = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/'

train_data_1 = pd.read_csv(os.path.join(dir_path, "jigsaw-toxic-comment-train.csv"))
train_data_2 = pd.read_csv(os.path.join(dir_path, "jigsaw-unintended-bias-train.csv"))
validation_data = pd.read_csv(os.path.join(dir_path, "validation.csv"))
test_data = pd.read_csv(os.path.join(dir_path, "test.csv"))

In [None]:
# (rows, columns) of training set 1
train_data_1.shape

In [None]:
# First rows of training set 1
train_data_1.head()

In [None]:
# (rows, columns) of training set 2
train_data_2.shape

In [None]:
# First rows of training set 2
train_data_2.head()

In [None]:
# (rows, columns) of the validation set
validation_data.shape

In [None]:
# First rows of the validation set
validation_data.head()

In [None]:
# (rows, columns) of the test set
test_data.shape

In [None]:
# First rows of the test set
test_data.head()

### Wrangling <a class="anchor" id="Wrangling"></a>

In [None]:
# In the second dataset, the toxicity is not 1 or 0 but instead a probability, we will round it to convert to a 1/0 column
train_data_2['toxic'] = train_data_2['toxic'].round().astype(int)

# We combine the entire training set 1 with all the toxic comments of training set 2
# and 200k non-toxic comments from set 2.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# row labels of the source frames are carried over and repeat, so a label lookup
# such as train_data.comment_text[1] may match more than one row.
train_data = pd.concat(
    [
        train_data_1[['comment_text', 'toxic']],
        train_data_2[['comment_text', 'toxic']].query('toxic == 1'),
        train_data_2[['comment_text', 'toxic']].query('toxic == 0').sample(n = 200000, random_state = 1993),
    ],
    ignore_index=True,
)


In [None]:
# (rows, columns) of the combined training frame
train_data.shape

In [None]:
# Class balance of the combined training data.
# The Series is passed as x= explicitly: seaborn's first positional parameter is
# `data` in newer releases, and positional use is deprecated in older ones.
sns.countplot(x=train_data.toxic)

In [None]:
# Class balance of the validation data.
# The Series is passed as x= explicitly: seaborn's first positional parameter is
# `data` in newer releases, and positional use is deprecated in older ones.
sns.countplot(x=validation_data.toxic)

In [None]:
# Tokenize the second comment as an example.
# .iloc[1] selects by position; train_data is a concatenation of several frames,
# so the label lookup comment_text[1] can match more than one row and hand
# word_tokenize a Series instead of a string.
word_tokenize(train_data.comment_text.iloc[1])


In [None]:
#count_vectorizer = CountVectorizer(stop_words = 'english', ngram_range=(1,3))

In [None]:
#trial = count_vectorizer.fit_transform(train_data.comment_text)

In [None]:
#trial.shape

In [None]:
# Splitting up the comment into single words.
# .iloc[1] selects by position; train_data is a concatenation of several frames,
# so the label lookup comment_text[1] can match more than one row.
text_words = word_tokenize(train_data.comment_text.iloc[1])
# Converting to lower case
text_words = [word.lower() for word in text_words]

In [None]:
# NLTK's English stopword list with 'not' taken out, so 'not' is never filtered away
modified_stopwords = stopwords.words('english')
modified_stopwords.remove('not')
# Removing stopwords and symbols: keep only purely alphabetic tokens that are not stopwords
text_words = [
    token
    for token in text_words
    if token.isalpha() and token not in modified_stopwords
]
len(text_words)

In [None]:
# Tokens that remain after lower-casing and stopword/symbol removal
text_words

In [None]:
# Reproducible 10,000-row sample used for the EDA below, renumbered from 0
train_sample = (
    train_data
    .sample(n=10000, random_state=1993)
    .reset_index(drop=True)
)

### EDA <a class = 'anchor'  id ='eda'></a>


In [None]:
# Filtering comment text column, replacing newline characters and filtering out unexpected data types from the column
def nan_filter(x):
    """Normalise one comment before it is joined into the word-cloud text.

    Parameters
    ----------
    x : object
        A value from the ``comment_text`` column; may be a non-string (e.g. NaN).

    Returns
    -------
    str
        The lower-cased comment with every newline replaced by a single space,
        or an empty string when ``x`` is not a string.
    """
    if not isinstance(x, str):
        return ""
    # Newlines become spaces rather than "": deleting them glues the last word
    # of one line to the first word of the next and creates bogus tokens.
    return x.replace("\n", " ").lower()

# One long space-separated string per class, built from the cleaned sample comments
nontoxic_text = ' '.join(
    map(nan_filter, train_sample.query('toxic==0')['comment_text'])
)
toxic_text = ' '.join(
    map(nan_filter, train_sample.query('toxic == 1')['comment_text'])
)

In [None]:
# Word cloud of the non-toxic text; collocations=True lets two-word phrases appear
wordcloud = WordCloud(
    max_font_size=300,
    background_color='white',
    stopwords=modified_stopwords,
    collocations=True,
    max_words=100,
    width=1200,
    height=1000,
).generate(nontoxic_text)

# Render the cloud as an image through plotly
fig = px.imshow(wordcloud)

fig.update_layout(title_text='Non-Toxic Word Cloud(with bigrams)')

In [None]:
# Word cloud of the non-toxic text; collocations=False restricts it to single words
wordcloud = WordCloud(
    max_font_size=300,
    background_color='white',
    stopwords=modified_stopwords,
    collocations=False,
    max_words=100,
    width=1200,
    height=1000,
).generate(nontoxic_text)

# Render the cloud as an image through plotly
fig = px.imshow(wordcloud)

fig.update_layout(title_text='Non-Toxic Word Cloud(unigrams)')

In [None]:
# Word cloud of the toxic text; collocations=True lets two-word phrases appear
wordcloud = WordCloud(
    max_font_size=300,
    background_color='white',
    stopwords=modified_stopwords,
    collocations=True,
    width=1200,
    max_words=100,
    height=1000,
).generate(toxic_text)

# Render the cloud as an image through plotly
fig = px.imshow(wordcloud)

fig.update_layout(title_text='Toxic Word Cloud(with bigrams)')

In [None]:
# Word cloud of the toxic text; collocations=False restricts it to single words
wordcloud = WordCloud(
    max_font_size=300,
    background_color='white',
    stopwords=modified_stopwords,
    collocations=False,
    max_words=100,
    width=1200,
    height=1000,
).generate(toxic_text)

# Render the cloud as an image through plotly
fig = px.imshow(wordcloud)

fig.update_layout(title_text='Toxic Word Cloud(unigrams)')

It's interesting to note that in the toxic word cloud we see both 'hate' and 'like' as high frequency unigrams. On further inspection, I realized 'like' is used more often to compare the subject to something derogatory. "You're acting like a ..." , " You're just like .." etc and less often in the sense "I like ...".

In [None]:
# Value at row position 1, column position 0 of the sample
# (comment_text is the first column selected when train_data was assembled)
train_sample.iloc[1,0]

In [None]:
# Comment size visualizations

def text_len(x):
    """Return how many whitespace-separated tokens ``x`` holds; 0 when ``x`` is not a str."""
    return len(x.split()) if type(x) is str else 0
    

# Word count of every sampled comment
train_sample['comment_size'] = train_sample['comment_text'].apply(text_len)

# Word counts of comments under 200 words, per class; sample(frac=1) returns all rows in a seeded random order
toxic_text_lengths = (
    train_sample
    .query('toxic == 1 and comment_size < 200')['comment_size']
    .sample(frac=1, random_state=1993)
)
nontoxic_text_lengths = (
    train_sample
    .query('toxic == 0 and comment_size < 200')['comment_size']
    .sample(frac=1, random_state=1993)
)


In [None]:
# Distribution of word counts for toxic comments
plt.figure(figsize=(13, 5))
ax = sns.distplot(toxic_text_lengths)
ax.set_title('Toxic Comment Lengths')
ax.set_xlabel('Comment Length')
ax.set_xticks(np.arange(0, 210, 10))
ax.set_yticks(np.arange(0, 0.025, 0.0025));

In [None]:
# Distribution of word counts for non-toxic comments
plt.figure(figsize=(13, 5))
ax = sns.distplot(nontoxic_text_lengths)
ax.set_title('Non-Toxic Comment Lengths')
ax.set_xlabel('Comment Length')
ax.set_xticks(np.arange(0, 210, 10))
ax.set_yticks(np.arange(0, 0.025, 0.0025));

### Sentiment Scores using VADER
We'll try scoring the sentiment of the comments using the VADER component of NLTK. Here is an [article](http://t-redactyl.io/blog/2017/04/using-vader-to-handle-sentiment-analysis-with-social-media-text.html) that expands on the procedure, it works better with social media content than general approaches. Here's a link to the original team's [paper](http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf).

Note that the negative sentiment here is not the same as the toxicity we are looking for. Negativity might simply be portrayal of discontent, which is not toxic.

In [None]:
def sentiment(x):
    """Score one comment with the VADER analyzer held in the global ``SIA``.

    Parameters
    ----------
    x : object
        A value from the ``comment_text`` column; may be a non-string (e.g. NaN).

    Returns
    -------
    dict
        The result of ``SIA.polarity_scores(x)`` for a string. For anything else,
        a dict with the keys 'neg', 'neu', 'pos' and 'compound' all set to 0.0.
    """
    if isinstance(x, str):
        return SIA.polarity_scores(x)
    # Always hand back a dict: the columns derived from this one index the result
    # with ['neg'] / ['pos'], which would raise on a bare number such as 1000.
    return {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}

# Vader outputs 4 scores, Negative, Neutral, Positive and Compound;
# they are stored together, per comment, in the 'polarity' column
SIA = SentimentIntensityAnalyzer()
train_sample['polarity'] = train_sample['comment_text'].apply(sentiment)


In [None]:
# First ten non-toxic rows of the sample, including the new polarity column
train_sample.query('toxic == 0').head(10)

In [None]:
# First ten toxic rows of the sample, including the new polarity column
train_sample.query('toxic==1').head(10)

In [None]:
# This comment has a negative score of 0 despite clearly being toxic.
# Label 22 is also row position 22 here, because train_sample's index was reset after sampling.
train_sample.comment_text[22]

This comment has a negative score of 0 despite clearly being toxic.

On first look, it appears that VADER does not recognize negative terms when the writer masks characters with \*.

In [None]:
# Pull the negative and positive components out of each VADER score dict into their own columns
train_sample['negativity'] = train_sample['polarity'].apply(lambda scores: scores['neg'])
train_sample['positivity'] = train_sample['polarity'].apply(lambda scores: scores['pos'])

### Comparing the Negativity Score with Toxicity

In [None]:
# Negativity scores per class; sample(frac=1) returns all rows in a seeded random order
nontoxic_negativity = (
    train_sample
    .query('toxic == 0')
    .sample(frac=1, random_state=1993)['negativity']
)
toxic_negativity = (
    train_sample
    .query('toxic == 1')
    .sample(frac=1, random_state=1993)['negativity']
)

# Density curves only (no histogram bars), one per class
plot = ff.create_distplot(
    [nontoxic_negativity, toxic_negativity],
    group_labels=['Non-Toxic', 'Toxic'],
    colors=['Green', 'Red'],
    show_hist=False,
)
plot.update_layout(
    title_text='Negativity vs Toxicity',
    xaxis_title='Negativity',
    xaxis=dict(tickmode='linear', tick0=0, dtick=0.1),
)

plot.show()

The negativity score seems to be able to differentiate the toxic comments from the non-toxic ones to a certain extent: a greater share of non-toxic comments have lower negativity, while many toxic comments have at least a slight negative sentiment of around 0.1-0.3.

### Comparing the Positivity Score with Toxicity

In [None]:
# Positivity scores per class; sample(frac=1) returns all rows in a seeded random order
nontoxic_positivity = (
    train_sample
    .query('toxic == 0')
    .sample(frac=1, random_state=1993)['positivity']
)
toxic_positivity = (
    train_sample
    .query('toxic == 1')
    .sample(frac=1, random_state=1993)['positivity']
)

# Density curves only (no histogram bars), one per class
plot = ff.create_distplot(
    [nontoxic_positivity, toxic_positivity],
    group_labels=['Non-Toxic', 'Toxic'],
    colors=['Green', 'Red'],
    show_hist=False,
)

plot.update_layout(
    title_text='Positivity vs Toxicity',
    xaxis_title='Positivity',
    xaxis=dict(tickmode='linear', tick0=0, dtick=0.1),
)
plot.show()

Though we see that non-toxic comments have more observations at higher positivity levels, the positivity score does not seem to be able to differentiate toxic from non-toxic comments very well.

### Modelling <a class=anchor id='Modelling'></a>

### Model 1: XLM RoBERTa <a class = 'anchor' id = 'Roberta'></a>

In [None]:
# Pretrained model identifier; passed to both AutoTokenizer.from_pretrained and TFAutoModel.from_pretrained below
roberta_string = 'jplu/tf-xlm-roberta-large'

#### Encoding
We must first encode our data before we feed it to the model. The reason we must encode it is because ML algorithms cannot directly interpret text. They are designed to work on numbers. So we must convert our text input to numbers in a manner such that the algorithm can interpret it and we also maintain the sequence of the text as it is integral to the interpretation of the text as a whole. 

Here is a brief [article](https://towardsdatascience.com/text-encoding-a-review-7c929514cccf) to read on encoding.

In [None]:
# Tokenizer that matches the pretrained model named in roberta_string; used by encode() below
tokenizer = AutoTokenizer.from_pretrained(roberta_string)

Let's take a look at the first two comments and their encoded values

In [None]:
# Raw text of the first sampled comment
print('Comment 1: - ' + '\n\n' + train_sample['comment_text'].values[0])

In [None]:
# Raw text of the second sampled comment
print('Comment 2: - ' + '\n\n' + train_sample['comment_text'].values[1])

Below is an array which contains the encoded versions of both these comments

In [None]:
# Encode the first two sampled comments, padded to 512 ids each;
# attention masks and token type ids are switched off so only the ids come back
sample_encoded = tokenizer.batch_encode_plus(
    train_sample.comment_text.values[0:2],
    return_attention_masks=False,
    return_token_type_ids=False,
    pad_to_max_length=True,
    max_length=512,
)
sample_encoded

The result is a dictionary with a single key because we disabled the rest by setting the return parameters to False. The value of this key is a list of 2 lists. One for each comment in our input.

We see that each comment is now a sequence of numeric values followed by a long run of 1's. These numbers are all indices into the tokenizer's vocabulary. The value 1 is the index used for padding: the long trailing run of 1's is padding (often called 'zero-padding', although the padding index here is 1 rather than 0), which brings every input up to the same size because the model requires equally sized inputs. This size is the 'max_length' value of 512 we set in the tokenizer; no encoded comment can be longer than this.

We only need the numeric values but get a dictionary, so let's create a function that encodes the text and returns the values in our desired format, as we need to repeat this a few times.

In [None]:
def encode(text, max_len = 512):
    """Turn a batch of texts into a NumPy array of token ids.

    Calls the global ``tokenizer`` with attention masks and token type ids
    switched off and padding to ``max_len`` switched on, then returns the
    'input_ids' entry of the result wrapped in ``np.array``.
    """
    tokenizer_options = {
        'return_attention_masks': False,
        'return_token_type_ids': False,
        'pad_to_max_length': True,
        'max_length': max_len,
    }
    encoded = tokenizer.batch_encode_plus(text, **tokenizer_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

# Sequence length used when encoding the train/validation/test text and as the model's input width
MAX_LEN = 192

Let's setup the TPU config as the model training operations can get compute intensive

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    # Only reached when the resolver above was created without raising
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU": fall through to the default strategy
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy on top of it
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The replica count is used further down to scale BATCH_SIZE
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Lets tf.data choose the prefetch buffer size; passed to .prefetch() in the dataset pipelines
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# NOTE(review): GCS_DS_PATH is not referenced anywhere else in the visible notebook
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
EPOCHS = 2
# Global batch size: 16 per replica, multiplied by the number of replicas in the strategy
BATCH_SIZE = 16 * strategy.num_replicas_in_sync



Now let us continue with the encoding

In [None]:
# Features (X): every text column is encoded to token ids of length MAX_LEN.
# Note that the test set keeps its text in the `content` column.
X_train = encode(train_data['comment_text'].values, MAX_LEN)
X_valid = encode(validation_data['comment_text'].values, MAX_LEN)
X_test = encode(test_data['content'].values, MAX_LEN)

# Targets (y): the toxic column is used as-is, no encoding is applied to it
y_train = train_data['toxic'].values
y_valid = validation_data['toxic'].values

In [None]:
# train_data was assembled by concatenating whole blocks (training set 1, then the
# toxic rows of set 2, then the non-toxic rows of set 2), so X_train/y_train arrive
# grouped by source and label. The 2048-element shuffle buffer below only mixes
# nearby rows, so the examples are first put into a seeded random order.
train_order = np.random.RandomState(1993).permutation(len(X_train))

# Training pipeline: endless stream of shuffled batches; fit() is told how many
# steps make up one epoch.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train[train_order], y_train[train_order]))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: fixed order, batched once and cached for reuse
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# Test pipeline: features only, fixed order so predictions line up with the test rows
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(X_test)
    .batch(BATCH_SIZE)
)

In [None]:
%%time
# Build and compile the classifier inside the distribution strategy's scope.
with strategy.scope():
    # Pretrained transformer named by roberta_string
    transformer_layer = TFAutoModel.from_pretrained(roberta_string)
    
    # One row of MAX_LEN token ids per example, as produced by encode()
    input_word_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")
    # First element of the transformer's output
    # (presumably the per-token hidden states, shape (batch, MAX_LEN, hidden) -- TODO confirm)
    sequence_output = transformer_layer(input_word_ids)[0]
    # Keep only position 0 along the second axis, i.e. the vector for the first token
    cls_token = sequence_output[:, 0, :]
    # Single sigmoid unit: one value in [0, 1] per example
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
model.summary()

In [None]:
# train_dataset repeats forever, so one epoch is defined as the number of full batches in X_train
n_steps = len(X_train) // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

In [None]:
# Continue training on the validation set; it is repeated here, so one epoch
# is defined as the number of full batches in X_valid
n_steps = len(X_valid) // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
)

In [None]:
# Start from the provided submission template, which lives in the same input
# directory as the other competition files (dir_path).
submission = pd.read_csv(os.path.join(dir_path, 'sample_submission.csv'))
# The model ends in Dense(1), so predict() yields one column per row, shape (n, 1);
# flatten it to a 1-D array before storing it as a single DataFrame column.
submission['toxic'] = model.predict(test_dataset, verbose = 1).ravel()
submission.to_csv('/kaggle/working/submission.csv', index = False)