## Multi-class classification with a simple Keras transformer
The training set is based on data from previous competitions, which is available in this dataset: https://www.kaggle.com/julian3833/jigsaw-toxic-comment-classification-challenge
<br>The target is the sum of the original toxicity labels in the dataset and ranges from 0 to 6. The final score is calculated as the dot product of the class values (0 to 6) and their predicted probabilities for each comment.
<br>The model is taken from the 'Keras Code examples' section.

Credits:
* https://www.kaggle.com/steubk/jrsotc-ridgeregression 
* https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768
* https://www.kaggle.com/devkhant24/jigsaw-comment-toxicity-gru/
* https://keras.io/examples/nlp/text_classification_with_transformer/

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

import re
import unidecode
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.stats import rankdata
from sklearn.model_selection import train_test_split

# Single seed reused for the non-toxic sampling, the train/validation split
# and TensorFlow's random generator further down.
seed = 42

# Labeled comments used for training, and the comments that have to be scored.
train_set, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jigsaw-toxic-comment-classification-challenge/train.csv',
        '../input/jigsaw-toxic-severity-rating/comments_to_score.csv',
    )
)
train_set.head(3)

In [None]:
# Function for cleaning comments
def clean_data(data):
    """Normalize raw comment strings before tokenization.

    For every comment, in order: escaped and real newlines, tabs and
    backslashes become spaces; HTML markup is stripped; ``http...`` URLs and
    bare ``word.com`` tokens are removed; text is transliterated to ASCII and
    lower-cased; punctuation is replaced by spaces; English stop words
    (NLTK list) are dropped.

    Args:
        data: Iterable of comment strings.

    Returns:
        list[str]: One cleaned string per input comment, in input order.
    """
    # Loop-invariant: load the stop-word list once instead of once per
    # comment, and use a set so each membership test is O(1).
    stoplist = set(stopwords.words("english"))

    final = []
    for sent in data:
        sent = sent.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')
        soup = BeautifulSoup(sent, "html.parser")
        sent = soup.get_text(separator=" ")
        remove_https = re.sub(r'http\S+', '', sent)
        sent = re.sub(r"\ [A-Za-z]*\.com", " ", remove_https)
        sent = unidecode.unidecode(sent)
        sent = sent.lower()
        # NOTE(review): inside the character class `$-,` is a range ('$' through
        # ','), so it also covers & ' * + -- confirm this is intended.
        sent = re.sub(r"[^a-zA-Z0-9:$-,()%.?!]+", ' ', sent)
        sent = re.sub(r"[:$-,()%.?!]+", ' ', sent)
        sent = " ".join(word for word in word_tokenize(sent) if word not in stoplist)
        final.append(sent)

    return final

#### Most of the comments in the loaded dataset are non-toxic. Only 'n' of them are used for training.

In [None]:
# Per-comment target: the row-wise sum of every column except the id and the text.
# 'toxicity' is excluded as well (errors='ignore' covers the first run, when the
# column does not exist yet) so that re-running this cell does not add the
# previous total into the new one.
train_set['toxicity'] = (
    train_set
    .drop(['id', 'comment_text', 'toxicity'], axis=1, errors='ignore')
    .sum(axis=1)
)
train_set.toxicity.value_counts()

In [None]:
# Keep every toxic comment, but only 'n' randomly drawn non-toxic ones
n = 15000

is_nontoxic = train_set.toxicity == 0
nontoxic_sample = train_set[is_nontoxic].sample(n, random_state=seed)
toxic_rows = train_set[~is_nontoxic]

# Restore the original row order and keep just the text and the target.
train = pd.concat([toxic_rows, nontoxic_sample]).sort_index()[['comment_text', 'toxicity']]
train.toxicity.value_counts()

In [None]:
# Run the same cleaning routine over the training and the test comments.
train['comment_text'] = clean_data(train.comment_text)
test['text'] = clean_data(test.text)

max_sequence_len = 250

# The vocabulary is fitted on the training comments only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train.comment_text)
total_words = 1 + len(tokenizer.word_index)  # vocabulary size handed to the embedding layer

# Integer-encode the comments and pad them to a fixed length.
X = pad_sequences(
    tokenizer.texts_to_sequences(train.comment_text),
    maxlen=max_sequence_len,
    padding='pre',
)
x_test = pad_sequences(
    tokenizer.texts_to_sequences(test.text),
    maxlen=max_sequence_len,
)

# Integer class labels; clipping them with .clip(0,4) was tried and left disabled.
y = train.toxicity.astype(np.int8)

In [None]:
# Word frequencies in the non-toxic comments, most frequent first.
non_toxic_tokenizer = Tokenizer()
non_toxic_tokenizer.fit_on_texts(train.loc[train.toxicity == 0, 'comment_text'])
non_toxic_count = sorted(
    non_toxic_tokenizer.word_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Same for the toxic comments, keeping only the 200 most frequent words.
toxic_tokenizer = Tokenizer()
toxic_tokenizer.fit_on_texts(train.loc[train.toxicity > 0, 'comment_text'])
toxic_count = sorted(
    toxic_tokenizer.word_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:200]

# Join on the word (column 0); the counts end up in '1_x' (toxic) and '1_y' (non-toxic).
# Words that never occur in non-toxic comments are dropped.
toxic_frame = pd.DataFrame(toxic_count)
non_toxic_frame = pd.DataFrame(non_toxic_count)
all_words = toxic_frame.merge(non_toxic_frame, how='left', on=0).dropna().reset_index()

# Log of the toxic / non-toxic count ratio for each word.
all_words['ratio'] = np.log(all_words['1_x'] / all_words['1_y'])

In [None]:
import plotly.express as px

# Human-readable captions for the two count columns of `all_words`.
axis_labels = {
    "1_x": "Number of word's appearance in toxic comments",
    "1_y": "Number of word's appearance in non-toxic comments",
}

# Log-log scatter of the first 100 rows, each point labeled with its word
# and colored by the log count ratio.
fig = px.scatter(
    all_words.iloc[:100],
    x="1_y",
    y="1_x",
    text=0,
    log_x=True,
    log_y=True,
    color="ratio",
    color_continuous_scale='Portland',
    labels=axis_labels,
)

fig.update_traces(textposition='top center')
fig.update_layout(
    height=800,
    title_text='100 most frequent words in toxic comments',
    coloraxis_showscale=False,
)

fig.show()

In [None]:
embedding_dim = 256  # Embedding size for each token.
num_heads = 4  # Number of attention heads
ff_dim = 384  # Hidden layer size in feedforward network.
batch_size = 128  # Batch size.
# Number of output classes. Labels are used directly as class indices by the
# sparse categorical loss, so the output layer must span 0..max(y) even if some
# intermediate label value happens to be absent from the sampled data
# (counting unique values would under-size it in that case).
classes = int(y.max()) + 1


class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Multi-head self-attention followed by a two-layer feed-forward network;
    each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: Size of the token embeddings; used as the attention
            ``key_dim`` and as the output width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # Use the constructor argument rather than the module-level
        # `embedding_dim`, so the block honours the size it is given.
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        Args:
            inputs: Tensor the block attends over (self-attention).
            training: Forwarded to the dropout layers; defaults to None so the
                layer can also be called without an explicit flag.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual around attention
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual around feed-forward

    
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # Use the constructor argument rather than the module-level
        # `embedding_dim`, so the layer honours the size it is given.
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position index."""
        seq_len = tf.shape(x)[-1]  # length of the last axis of the input
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


In [None]:
# Input: one padded sequence of token ids per comment.
inputs = layers.Input(shape=(max_sequence_len,))

# Token + position embeddings, followed by two stacked transformer blocks.
embedding_layer = TokenAndPositionEmbedding(max_sequence_len, total_words, embedding_dim)
x = embedding_layer(inputs)
for _ in range(2):
    x = TransformerBlock(embedding_dim, num_heads, ff_dim)(x)

# Classification head: flatten, then a dense layer with dropout on both sides.
head_layers = (
    layers.Flatten(),
    layers.Dropout(0.2),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.2),
)
for head_layer in head_layers:
    x = head_layer(x)

# One softmax probability per class.
outputs = layers.Dense(classes, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

optimizer = tf.keras.optimizers.RMSprop(learning_rate=7e-5)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Draw the model graph, annotating each layer with its shapes.
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# NOTE(review): the model is constructed in an earlier cell, before this seed
# is set, so the seed presumably does not cover its weight initialization --
# consider seeding before the model is built.
tf.random.set_seed(seed)

# Hold out 10% of the training data for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=seed
)

# Stop after 7 epochs without improvement and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=7,
    restore_best_weights=True,
)

fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=40,
    batch_size=batch_size,
    shuffle=True,
    callbacks=[early_stopping],
)
model.fit(X_train, y_train, **fit_options)

In [None]:
### validate
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# Clean both sides of every comment pair with the training-time routine.
for side in ('less_toxic', 'more_toxic'):
    df_val[side] = clean_data(df_val[side])

# Encode and pad with the tokenizer fitted on the training comments.
X_less_toxic = pad_sequences(
    tokenizer.texts_to_sequences(df_val['less_toxic']),
    maxlen=max_sequence_len,
)
X_more_toxic = pad_sequences(
    tokenizer.texts_to_sequences(df_val['more_toxic']),
    maxlen=max_sequence_len,
)

# Score = sum over classes of (class value * predicted probability).
class_values = np.linspace(0, classes - 1, classes)
p1 = (class_values * model.predict(X_less_toxic)).sum(axis=1)
p2 = (class_values * model.predict(X_more_toxic)).sum(axis=1)

# Validation Accuracy
(p1 < p2).mean()

In [None]:
# Encode the cleaned test comments and pad them to the model's input length.
x_test = pad_sequences(
    tokenizer.texts_to_sequences(test.text),
    maxlen=max_sequence_len,
)

In [None]:
# Collapse the class probabilities into one score per comment:
# sum over classes of (class value * predicted probability).
class_values = np.linspace(0, classes - 1, classes)
preds = (class_values * model.predict(x_test)).sum(axis=1)

In [None]:
# Making submission file

# The submitted score is the rank of each prediction, not the raw value.
final = pd.DataFrame(
    {
        "comment_id": test["comment_id"],
        "score": rankdata(preds),
    }
)
final.to_csv("submission.csv", index=False)