# TF-IDF Vectorization with Keras


Thanks to the authors of these notebooks:
- https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768 
- https://www.kaggle.com/steubk/jrsotc-ridgeregression

[julian3833](https://www.kaggle.com/julian3833) and [steubk](https://www.kaggle.com/steubk) both show a good way to combine TF-IDF vectorization with scikit-learn models such as Ridge Regression and Naive Bayes. Recently I was looking for a way to use TF-IDF vectorization with Keras, so that we can work with different kinds of neural networks. Luckily, I found a way to do it with the Keras `TextVectorization` layer.

I will also build a multi-label classification model and train it with all labels from the [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge). After training, I will use the model to predict the probability of every label and multiply these probabilities by per-label weights to generate a final result for calculating the toxicity ranking.

I also keep several saved versions of the model and average their results to calculate the final score.

In [None]:
import pandas as pd
from scipy.stats import rankdata
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np
import sklearn 
import os
from sklearn import model_selection
from sklearn import metrics

In [None]:
class Config:
    """Notebook-wide settings: vectorizer size, training schedule, labels and weight files."""

    # --- Vectorizer / training hyper-parameters ---
    vocab_size = 20000
    batch_size = 256
    epochs = 50

    # --- Targets ---
    # Label columns; `label_weights` is matched to this list position by position.
    labels = [
        'toxic',
        'severe_toxic',
        'obscene',
        'threat',
        'insult',
        'identity_hate',
    ]
    label_weights = [1, 2, 2, 5, 1, 2]

    # --- Run mode ---
    modes = ["training", "inference"]
    mode = modes[0]

    # --- Weight files ---
    # Directory joined in front of the weight-file names when running in
    # "inference" mode. The attribute name is misspelled ("ouput") but is kept
    # as-is because other cells read it under this exact name.
    ouput_dataset_path = "../input/tfidf-vectorization-with-keras-output"
    best_acc_path = "model_best_acc.tf"
    best_auc_path = "model_best_auc.tf"
    best_loss_path = "model_best_loss.tf"
    latest_path = "model_latest.tf"
    # Every saved variant, in the order used for evaluation.
    model_paths = [best_acc_path, best_auc_path, latest_path, best_loss_path]


config = Config()

## Prepare the data


In [None]:
# Load the training set and expose the comment column under the name `text`.
df = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv"
).rename(columns={'comment_text': 'text'})
# Preview the first rows as the cell output.
df.head()

## TF-IDF vectorization

In [None]:
# Raw comment strings used as model input.
X = df["text"]

# Vectorizer that turns each string into a TF-IDF weighted n-gram vector,
# keeping at most `config.vocab_size` vocabulary entries.
text_vectorizer = layers.TextVectorization(
    max_tokens=config.vocab_size,
    ngrams=2,
    output_mode="tf-idf",
)

# `adapt()` indexes the n-grams and learns the TF-IDF weights from the corpus.
# It is pinned to the CPU because a bug prevents it from running on GPU for now.
with tf.device("CPU"):
    text_vectorizer.adapt(X)

In [None]:
# Multi-label targets: one column per entry in `config.labels`.
y = df[config.labels]
# Summary statistics of the label columns, shown as the cell output.
y.describe()

The output of the vectorizer:

In [None]:
# Vectorize the first `config.batch_size` comments to inspect the layer's output.
sample = text_vectorizer(X[0:config.batch_size])
# Shape of the vectorized batch, shown as the cell output.
sample.shape

## Model Development

In [None]:
# End-to-end model: raw strings go in, the adapted TF-IDF vectorizer turns them
# into features, and the final sigmoid layer emits one independent probability
# per label in `config.labels`.
model = keras.Sequential([
        keras.Input(shape=(None, ), dtype="string"),
        text_vectorizer,
        layers.Dense(256, activation="relu", kernel_regularizer="l2"),
        layers.Dense(32, activation="relu", kernel_regularizer="l2"),
        layers.Dense(len(config.labels), activation="sigmoid")
    ])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=[
        # NOTE(review): categorical accuracy compares argmax positions, which is
        # questionable for multi-label targets. It is kept because the training
        # cell monitors "val_categorical_accuracy".
        "categorical_accuracy",
        # The name is pinned explicitly: an unnamed AUC instance gets an
        # auto-numbered name ("auc_1", "auc_2", ...) when this cell is re-run in
        # the same session, and the "val_auc" checkpoint monitor would then no
        # longer match any logged metric.
        keras.metrics.AUC(name="auc"),
    ],
)
model.summary()

In [None]:
# Render a diagram of the model's layers as the cell output.
keras.utils.plot_model(model)

## Train Validation Split

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
# Shapes of the four resulting pieces, shown as the cell output.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

## Model Training

In [None]:
# Train only when the notebook runs in "training" mode (config.modes[0]).
if config.mode == config.modes[0]:
    # `mode` is set explicitly on every checkpoint. With the default
    # mode="auto", Keras only picks "max" for monitor names containing "acc"
    # (or starting with "fmeasure"); "val_auc" would silently be treated as a
    # quantity to minimise, so the "best AUC" file would hold the worst epoch.
    model_best_acc_checkpoint = keras.callbacks.ModelCheckpoint(
        config.best_acc_path,
        monitor="val_categorical_accuracy",
        mode="max",
        save_best_only=True,
        save_weights_only=True,
    )
    model_best_auc_checkpoint = keras.callbacks.ModelCheckpoint(
        config.best_auc_path,
        monitor="val_auc",
        mode="max",
        save_best_only=True,
        save_weights_only=True,
    )
    model_best_loss_checkpoint = keras.callbacks.ModelCheckpoint(
        config.best_loss_path,
        monitor="val_loss",
        mode="min",
        save_best_only=True,
        save_weights_only=True,
    )
    # Halve the learning rate after 5 epochs without improvement of the
    # callback's default monitored quantity.
    reduce_lr = keras.callbacks.ReduceLROnPlateau(patience=5, factor=0.5)
    model.fit(
        X_train,
        y_train,
        epochs=config.epochs,
        batch_size=config.batch_size,
        validation_data=(X_val, y_val),
        callbacks=[
            model_best_acc_checkpoint,
            model_best_auc_checkpoint,
            model_best_loss_checkpoint,
            reduce_lr,
        ],
    )
    # Also keep the weights from the final epoch, regardless of their score.
    model.save_weights(config.latest_path)

## Model Evaluation


In [None]:
def evaluate(model, model_path, X_val, y_val):
    """Load one saved set of weights and print a classification report per label.

    Args:
        model: Model whose weights are overwritten with those stored at
            ``model_path``.
        model_path: Weight-file name. In "inference" mode it is joined onto
            ``config.ouput_dataset_path`` before loading.
        X_val: Validation inputs passed to ``model.predict``.
        y_val: Ground-truth labels, indexed by label name.

    Side effects:
        Replaces the weights of ``model`` and prints to stdout.
    """
    print("Evaluation of %s"%(model_path))

    in_inference_mode = config.mode == config.modes[1]
    weights_file = (
        os.path.join(config.ouput_dataset_path, model_path)
        if in_inference_mode
        else model_path
    )
    model.load_weights(weights_file)

    # Turn probabilities into hard 0/1 decisions at the 0.5 threshold.
    probabilities = model.predict(X_val)
    decisions = np.array(probabilities > 0.5, dtype=int)

    for column, label in enumerate(config.labels):
        report = metrics.classification_report(y_val[label], decisions[:, column])
        print("Classification Report of %s"%label)
        print(report)

In [None]:
# Print the per-label reports for every saved weight file listed in the config.
for path in config.model_paths:
    evaluate(model, path, X_val, y_val)

# Submission

In [None]:
# Comments that have to be ranked for the submission.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Settings that stay the same for every saved model.
use_dataset_weights = config.mode == config.modes[1]
label_weights = np.array(config.label_weights)

# One weighted toxicity score per comment for each saved set of weights.
scores = []
for path in config.model_paths:
    if use_dataset_weights:
        path = os.path.join(config.ouput_dataset_path, path)
    model.load_weights(path)
    probabilities = model.predict(df_sub["text"], batch_size=config.batch_size)
    # Collapse the per-label probabilities into a single weighted sum per row.
    scores.append((probabilities * label_weights).sum(axis=1))

# Average across models, then convert the averaged scores into ordinal ranks.
score = np.mean(scores, axis=0)
df_sub['score'] = rankdata(score, method='ordinal')
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)
df_sub.head()


<font color="red" size="5">If you found it useful and would like to back me up, just upvote.</font>

