# Jigsaw Toxicity Word2Vec+TFIDF Inference
## Table of Contents
* [1. Overview](#1.)
* [2. Configuration](#2.)
* [3. Setup](#3.)
* [4. Model](#4.)
    * [4.1 FNet Encoder](#4.1)
    * [4.2 Positional Embedding](#4.2)
    * [4.3 FNet Classification Model](#4.3)
    * [4.4 Validation Accuracy](#Validation-Accuracy)
* [5. Submission](#5.)
* [6. References](#6.)

<font color="red" size="3">If you find this notebook useful and would like to support my work, please upvote it.</font>

<a id="1."></a>
## 1. Overview
This is the inference notebook; for the training notebook, please see [here](https://www.kaggle.com/lonnieqin/jigsaw-toxicity-word2vec-tfidf-training).

<a id="2."></a>
## 2. Configuration

In [None]:
class Config:
    """Hyperparameters and checkpoint locations shared by the notebook cells."""

    # --- Text vectorization ---
    # Passed as `max_tokens` to both TextVectorization layers and as the
    # token-embedding input dimension.
    vocab_size = 15000
    # Passed as `output_sequence_length` of the sequence vectorizer and as the
    # number of position embeddings.
    sequence_length = 100

    # --- Network sizes ---
    embed_dim = 256  # width of the token/position embeddings
    latent_dim = 256  # hidden width of the encoder's feed-forward projection

    # --- Prediction ---
    batch_size = 1024  # batch size of the test dataset

    # --- Saved weights ---
    # Directory prefix; file names below are appended to it verbatim, so the
    # trailing slash is required.
    base_model_path = "../input/jigsaw-word2vec-tfidf-model/"
    best_auc_model_path = "model_best_auc.tf"
    best_acc_model_path = "model_best_acc.tf"
    # The attribute name is misspelled ("lastest"); it is kept because other
    # cells read it under this name.
    lastest_model_path = "model_latest.tf"


config = Config()

<a id="3."></a>
## 3. Setup

In [None]:
import pandas as pd
import tensorflow as tf
import pathlib
import random
import string
import re
import sys
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import os
import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer 
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from scipy.stats import rankdata
import json

In [None]:
def custom_standardization(input_data):
    """Normalize raw text before it is split into tokens.

    The steps run in this order: lowercase, replace the literal ``<br />``
    tag with a space, delete ASCII punctuation, replace runs of digits with a
    space, collapse repeated spaces, and strip surrounding whitespace.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        The cleaned string tensor produced by ``tf.strings.strip``.
    """
    # (pattern, replacement) pairs, applied sequentially. The order matters:
    # "<br />" must be handled before punctuation removal deletes "<", "/", ">".
    substitutions = (
        ("<br />", " "),
        (f"[{re.escape(string.punctuation)}]", ""),
        ("[0-9]+", " "),
        ("[ ]+", " "),
    )
    cleaned = tf.strings.lower(input_data)
    for pattern, replacement in substitutions:
        cleaned = tf.strings.regex_replace(cleaned, pattern, replacement)
    return tf.strings.strip(cleaned)

In [None]:
# Load the paired validation comments and the labelled classification data.
validation_data = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv")
train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
train = train[["comment_text", "toxic"]]
train.columns = ["text", "label"]
# Add the "more_toxic" validation comments as positive examples to mitigate
# the class imbalance problem.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps each frame's own index.
extra_toxic = pd.DataFrame({"text": validation_data["more_toxic"], "label": [1] * len(validation_data)})
train = pd.concat([train, extra_toxic])
# Bag-of-n-grams vectorizer: TF-IDF weights over unigrams and bigrams.
tfidf_vectozier = layers.TextVectorization(
    standardize=custom_standardization, 
    max_tokens=config.vocab_size, 
    output_mode="tf-idf", 
    ngrams=2
)
# Sequence vectorizer: fixed-length integer token ids for the embedding branch.
word2vec_vectozier = layers.TextVectorization(
    standardize=custom_standardization, 
    max_tokens=config.vocab_size, 
    output_sequence_length=config.sequence_length
)
# Fit both vocabularies on the combined text, pinned to the CPU device.
with tf.device("CPU"):
    tfidf_vectozier.adapt(list(train["text"]))
    word2vec_vectozier.adapt(list(train["text"]))

<a id="4."></a>
## 4. Model

<a id="4.1"></a>
### 4.1 FNet Encoder

In [None]:
class FNetEncoder(layers.Layer):
    """Encoder block that mixes its input with a 2-D FFT instead of attention.

    The block adds the real part of the FFT to the input, layer-normalizes
    the sum, passes it through a two-layer feed-forward projection, and
    layer-normalizes the residual sum of that projection.

    Args:
        embed_dim: Output width of the feed-forward projection; it has to
            match the last dimension of the input for the residual add.
        dense_dim: Hidden width of the feed-forward projection.
        dropout_rate: Accepted for interface compatibility but currently
            unused; the layer creates no dropout.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        feed_forward_layers = [
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.dense_proj = keras.Sequential(feed_forward_layers)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs):
        """Apply Fourier mixing followed by the feed-forward sub-block."""
        # fft2d needs a complex input; only the real part of the transform
        # is kept so it can be added back to the real-valued input.
        spectrum = tf.signal.fft2d(tf.cast(inputs, tf.complex64))
        mixed = self.layernorm_1(inputs + tf.math.real(spectrum))
        # Residual connection around the feed-forward projection.
        return self.layernorm_2(mixed + self.dense_proj(mixed))

<a id="4.2"></a>
### 4.2 Positional Embedding

In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: Number of distinct positions that can be embedded.
        vocab_size: Number of distinct token ids that can be embedded.
        embed_dim: Width of both embedding tables.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )

    def call(self, inputs):
        """Embed the token ids and add the embedding of each position index."""
        # Positions 0..L-1, where L is the size of the last input axis.
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=num_positions, delta=1)
        return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is False wherever the token id is 0."""
        return tf.math.not_equal(inputs, 0)


<a id="4.3"></a>
### 4.3 Classification Model

In [None]:
def get_word2vec_model(config, inputs):
    """Build the token-sequence branch of the classifier.

    The branch vectorizes the text into token ids, embeds them with
    positions, runs one FNet encoder block, averages over the sequence axis
    and applies three 100-unit ReLU layers. Every Dropout layer uses rate 0.

    Args:
        config: Object providing ``sequence_length``, ``vocab_size``,
            ``embed_dim`` and ``latent_dim``.
        inputs: Symbolic string input tensor.

    Returns:
        The symbolic output tensor of the last Dropout layer.
    """
    token_ids = word2vec_vectozier(inputs)
    embedded = PositionalEmbedding(
        config.sequence_length, config.vocab_size, config.embed_dim
    )(token_ids)
    encoded = FNetEncoder(config.embed_dim, config.latent_dim)(embedded)
    features = layers.GlobalAveragePooling1D()(encoded)
    features = layers.Dropout(0)(features)
    for _ in range(3):
        features = layers.Dense(100, activation="relu")(features)
        features = layers.Dropout(0)(features)
    return features

In [None]:
def get_tfidf_model(config, inputs):
    """Build the TF-IDF branch of the classifier.

    The text is vectorized with the TF-IDF vectorizer and passed through two
    L2-regularized ReLU layers of 256 and 100 units.

    Args:
        config: Unused here; kept so both branch builders share a signature.
        inputs: Symbolic string input tensor.

    Returns:
        The symbolic output tensor of the 100-unit layer.
    """
    hidden = tfidf_vectozier(inputs)
    for units in (256, 100):
        hidden = layers.Dense(units, activation="relu", kernel_regularizer="l2")(hidden)
    return hidden

In [None]:
def get_model(config):
    """Assemble the two-branch classifier.

    Both branches read the same string input; their outputs are concatenated
    and fed to a single sigmoid unit.

    Args:
        config: Configuration object forwarded to both branch builders.

    Returns:
        A ``keras.Model`` named ``"model"`` mapping the string input to the
        sigmoid output.
    """
    text_input = keras.Input(shape=(None, ), dtype="string", name="inputs")
    # The sequence branch is built first, then the TF-IDF branch.
    branch_outputs = [
        get_word2vec_model(config, text_input),
        get_tfidf_model(config, text_input),
    ]
    merged = layers.Concatenate()(branch_outputs)
    probability = layers.Dense(1, activation="sigmoid")(merged)
    return keras.Model(text_input, probability, name="model")

In [None]:
# Build the network architecture; trained weights are loaded later from the
# saved checkpoint files.
model = get_model(config)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

Let's visualize the Model.

In [None]:
# Draw the layer graph, annotating each layer with its tensor shapes.
keras.utils.plot_model(model, show_shapes=True)

### Validation Accuracy

In [None]:
def inference(model, paths, data):
    """Average the model's predictions over several weight checkpoints.

    Args:
        model: Model exposing ``load_weights`` and ``predict``. Its weights
            are overwritten by each checkpoint in turn, so after the call it
            holds the weights of the last entry in ``paths``.
        paths: Checkpoint file names, each appended to
            ``config.base_model_path``.
        data: Input passed unchanged to ``model.predict``.

    Returns:
        The element-wise mean of the flattened per-checkpoint predictions.
    """

    def predict_with(weight_file):
        # Load one checkpoint, then score the data with it as a flat vector.
        model.load_weights(config.base_model_path + weight_file)
        return model.predict(data).reshape(-1)

    per_checkpoint = [predict_with(weight_file) for weight_file in paths]
    return np.mean(per_checkpoint, axis=0)

In [None]:
# Preview the first rows of the paired validation comments.
validation_data.head()

In [None]:
# Checkpoints whose predictions are averaged: best accuracy, best AUC, latest.
paths = [config.best_acc_model_path, config.best_auc_model_path, config.lastest_model_path]

In [None]:
# Checkpoint-averaged scores for the "less_toxic" comment of each pair.
less_pred = inference(model, paths, validation_data["less_toxic"])

In [None]:
# Checkpoint-averaged scores for the "more_toxic" comment of each pair.
more_pred = inference(model, paths, validation_data["more_toxic"])

In [None]:
# Fraction of validation pairs in which the "less_toxic" comment received a
# strictly lower score than its "more_toxic" counterpart.
(less_pred < more_pred).mean()

<a id="5."></a>
## 5. Submission

In [None]:
# Load the comments to score and the submission template.
test = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv")
sample_submission = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv")
# Batch the text once; the cached dataset is iterated once per checkpoint.
test_ds = tf.data.Dataset.from_tensor_slices((test["text"])).batch(config.batch_size).cache().prefetch(1)
# Reuse the checkpoint-averaging helper instead of repeating its loop here.
# To ensemble fewer checkpoints, pass a shorter list than `paths`.
score = inference(model, paths, test_ds)
print(score.shape)
# Submit ordinal ranks rather than raw scores; method='ordinal' gives every
# comment a distinct rank, breaking ties by order of appearance.
# NOTE(review): assumes the rows of sample_submission are in the same order as
# the rows of comments_to_score - confirm.
sample_submission["score"] = rankdata(score, method='ordinal')
sample_submission.to_csv("submission.csv", index=False)
sample_submission.head()


<a id="6."></a>
## 6. References
- [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824v3)
- [Attention Is All You Need](https://arxiv.org/abs/1706.03762v5)
- [Text Generation using FNet](https://keras.io/examples/nlp/text_generation_fnet/)
- [English-Spanish Translation: FNet](https://www.kaggle.com/lonnieqin/english-spanish-translation-fnet)