In [None]:
# Download the training dataset (train.csv.zip) from Kaggle and extract it to toxicity/train.csv:
# https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data?select=train.csv.zip

In [2]:
import pandas as pd

def parse_label(row):
    """Collapse the six toxicity flags of one record into a single binary label.

    Args:
        row: Mapping-like record (for example a DataFrame row) indexable by
            "toxic", "severe_toxic", "obscene", "threat", "insult" and
            "identity_hate".

    Returns:
        'toxic' if at least one flag is truthy, otherwise 'nontoxic'.
    """
    flag_columns = ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
    # any() stops at the first truthy flag, checking the columns in the listed order.
    if any(row[column] for column in flag_columns):
        return 'toxic'
    return 'nontoxic'

def create_label(original_file, new_file):
    """Add a binary 'label' column to a toxicity CSV and write the result.

    A record is labelled 'toxic' when any of its six toxicity flag columns
    is truthy, and 'nontoxic' otherwise.

    Args:
        original_file: Path of the comma-separated input file. It must
            contain the columns "toxic", "severe_toxic", "obscene",
            "threat", "insult" and "identity_hate".
        new_file: Path the labelled CSV is written to.

    Raises:
        KeyError: If any of the flag columns is missing from the input.
    """
    flag_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    df = pd.read_csv(original_file, sep=',')
    # Vectorized form of "any flag is truthy"; avoids a Python-level call per row.
    is_toxic = df[flag_columns].astype(bool).any(axis=1)
    df['label'] = is_toxic.map({True: 'toxic', False: 'nontoxic'})
    # index=False keeps the DataFrame's row index from being written as an
    # extra unnamed leading column.
    df.to_csv(new_file, index=False)

# Read the raw Kaggle CSV and write a copy that carries the extra 'label' column.
create_label(
    original_file='toxicity/train.csv',
    new_file='toxicity/train_ok.csv',
)

In [4]:
import numpy as np
import os

from tflite_model_maker import model_spec
from tflite_model_maker import text_classifier
from tflite_model_maker.config import ExportFormat
from tflite_model_maker.text_classifier import AverageWordVecSpec
from tflite_model_maker.text_classifier import DataLoader

import tensorflow as tf
# Fail fast unless the installed TensorFlow reports a version string starting with '2'.
assert tf.__version__.startswith('2')
# Raise TensorFlow's logger threshold to ERROR to cut down on log noise.
tf.get_logger().setLevel('ERROR')

In [6]:
# Look up the model spec registered under the name 'average_word_vec';
# the same spec is reused for data loading and model creation below.
spec = model_spec.get('average_word_vec')

In [16]:
# Load the labelled CSV using the comment text as input and 'label' as target.
labelled_data = DataLoader.from_csv(
    filename='toxicity/train_ok.csv',
    text_column='comment_text',
    label_column='label',
    model_spec=spec,
    is_training=True,
)
# Split into two datasets at the 0.8 mark; presumably the first holds 80% of
# the examples (confirm against the Model Maker DataLoader.split docs).
train_data, test_data = labelled_data.split(0.8)

In [17]:
# Build and train the text classifier on the training split for 10 epochs.
model = text_classifier.create(train_data, model_spec=spec, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# Print the layer-by-layer architecture and parameter counts of the trained model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 256, 16)           160048    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                272       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 34        
Total params: 160,354
Trainable params: 160,354
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Evaluate on the held-out split; the result is unpacked as (loss, accuracy).
loss, acc = model.evaluate(test_data)



In [20]:
# Export the trained model into the 'average_word_vec_toxicity' directory.
# NOTE(review): the tf2onnx conversion cell must read from this same directory.
model.export(export_dir='average_word_vec_toxicity')

In [9]:
!python -m tf2onnx.convert --opset 13 --tflite average_word_vec/model.tflite --output average_word_vec/model.onnx

2021-09-18 00:43:12,328 - INFO - Using tensorflow=2.6.0, onnx=1.10.1, tf2onnx=1.9.2/0f28b7
2021-09-18 00:43:12,328 - INFO - Using opset <onnx, 13>
2021-09-18 00:43:12,366 - INFO - Optimizing ONNX model
2021-09-18 00:43:12,394 - INFO - After optimization: Const -2 (7->5), Identity -1 (1->0), Transpose -2 (2->0)
2021-09-18 00:43:12,396 - INFO - 
2021-09-18 00:43:12,396 - INFO - Successfully converted TensorFlow model average_word_vec/model.tflite to ONNX
2021-09-18 00:43:12,396 - INFO - Model inputs: ['input_1']
2021-09-18 00:43:12,397 - INFO - Model outputs: ['Identity']
2021-09-18 00:43:12,397 - INFO - ONNX model is saved at average_word_vec/model.onnx


In [29]:
# Write the vocabulary held by the model spec to 'vocab.txt' (a relative path,
# so it is resolved against the current working directory).
model.model_spec.save_vocab('vocab.txt')