In [None]:
!pip install tensorflowjs

Importing all the necessary dependencies

In [22]:
import tensorflow as tf
import tensorflowjs as tfjs
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive

# Attach Google Drive so files under /content/drive can be read by path.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


Importing the dataset

In [3]:
# Load the labelled corpus; the first CSV column is used as the row index.
rawDataset = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Codes/finalData.csv',
    index_col=[0],
)

In [4]:
# Force every entry of the text column to a string before tokenizing.
rawDataset['text'] = rawDataset['text'].values.astype(str)

Building the word vocabulary with a tokenizer

In [5]:
# Fit a word-level tokenizer on the full text column. The OOV token is the
# placeholder used for words that are not in the fitted vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<shobdobhandarErBahire>')
tokenizer.fit_on_texts(rawDataset['text'])

# word -> integer index mapping produced by the fit
vocab = tokenizer.word_index
vocabSize = len(vocab)
vocabSize

217799

Testing the tokenized words

Converting the whole dataset to index sequences and padding them to a uniform length

In [6]:
# Turn each text into a list of word indices, then left-pad with zeros so
# every row has the same length.
textData = tokenizer.texts_to_sequences(texts=rawDataset['text'])
paddedTextData = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=textData,
    padding='pre',
)
(paddedTextData.shape, paddedTextData)

((184354, 1403), array([[    0,     0,     0, ...,    17,   669,    75],
        [    0,     0,     0, ...,    21,  7480,  1142],
        [    0,     0,     0, ...,    16, 16598,  7024],
        ...,
        [    0,     0,     0, ...,   548,   794,  9428],
        [    0,     0,     0, ...,     3,    38,    57],
        [    0,     0,     0, ...,     5, 17461,    82]], dtype=int32))

Splitting into training and testing datasets

In [7]:
# The first `trainLength` rows are used for training, the remainder for testing.
trainLength = 150000

# Features: padded index sequences, cut at the same row as the labels.
xTrain, xTest = np.split(paddedTextData, [trainLength])

# Labels: the is_offensive column, cut at the same row as the features.
yTrain, yTest = np.split(rawDataset['is_offensive'].to_numpy(), [trainLength])

In [8]:
# Sanity check: test labels and test features must have the same row count.
(yTest.shape, xTest.shape)

((34354,), (34354, 1403))

Defining the model

In [9]:
# Binary offensive-text classifier:
#   embedding -> average over the sequence axis -> dense(32, relu) -> sigmoid.
#
# The tokenizer numbers words from 1 to vocabSize and padding uses index 0, so
# the embedding table needs vocabSize + 1 rows. With only vocabSize rows the
# highest word index falls outside the table.
# The input length is read from the training matrix so it cannot drift from
# the width produced by pad_sequences.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocabSize + 1, 64, input_length=xTrain.shape[1]))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Training the model

In [10]:
# Train for five passes over the training split.
model.fit(x=xTrain, y=yTrain, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0d100611d0>

Evaluating on testing dataset

In [11]:
# Report loss and accuracy on the held-out rows.
model.evaluate(x=xTest, y=yTest)



[0.1267794668674469, 0.9559294581413269]

Real-life testing

In [16]:
# Hand-written sentences to spot-check the classifier.
texts = ["We are doing our project", "you are a piece of shit", "you bloody bitch", "good night brother"]

sequencedText = tokenizer.texts_to_sequences(texts)

# Pad to the width of the training matrix rather than repeating a literal,
# so inference input always matches what the model was trained on.
paddedText = tf.keras.preprocessing.sequence.pad_sequences(sequencedText, padding='pre', maxlen=xTrain.shape[1])
print(paddedText)
model.predict(paddedText)

[[   0    0    0 ...  268  175  467]
 [   0    0    0 ...  661    5  236]
 [   0    0    0 ...    7 2414  126]
 [   0    0    0 ...   98 1104 1893]]


array([[0.106747  ],
       [0.9902535 ],
       [0.99791414],
       [0.05730198]], dtype=float32)

In [None]:
# Classify a single sentence typed by the user.
text = input()
sequencedText = tokenizer.texts_to_sequences([text])

# Pad to the width of the training matrix rather than repeating a literal,
# so inference input always matches what the model was trained on.
paddedText = tf.keras.preprocessing.sequence.pad_sequences(sequencedText, padding='pre', maxlen=xTrain.shape[1])
print(paddedText)
model.predict(paddedText)

In [23]:
# Export the trained Keras model in TensorFlow.js format.
tfjs.converters.save_keras_model(model, artifacts_dir='/content/model')

In [28]:
# Recursively pack the /content/model directory into a single archive at
# /content/file.zip.
!zip -r /content/file.zip /content/model

  adding: content/model/ (stored 0%)
  adding: content/model/group1-shard9of14.bin (deflated 7%)
  adding: content/model/model.json (deflated 72%)
  adding: content/model/group1-shard10of14.bin (deflated 7%)
  adding: content/model/group1-shard1of14.bin (deflated 7%)
  adding: content/model/group1-shard7of14.bin (deflated 7%)
  adding: content/model/group1-shard4of14.bin (deflated 7%)
  adding: content/model/group1-shard6of14.bin (deflated 7%)
  adding: content/model/group1-shard2of14.bin (deflated 8%)
  adding: content/model/group1-shard5of14.bin (deflated 7%)
  adding: content/model/group1-shard3of14.bin (deflated 7%)
  adding: content/model/group1-shard13of14.bin (deflated 8%)
  adding: content/model/group1-shard12of14.bin (deflated 7%)
  adding: content/model/group1-shard11of14.bin (deflated 7%)
  adding: content/model/group1-shard8of14.bin (deflated 7%)
  adding: content/model/group1-shard14of14.bin (deflated 8%)


In [29]:
from google.colab import files

# Send the zipped model to the browser as a download.
files.download(filename='/content/file.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
import json

# Write the word -> index vocabulary as real JSON.
# Writing str() of the encoded bytes would store a Python bytes repr
# (b'...' wrapper, doubled backslashes), which JSON parsers reject.
# ensure_ascii=False keeps non-ASCII words readable; the file is UTF-8.
with open('tokens.json', 'w', encoding='utf-8') as f:
  json.dump(vocab, f, ensure_ascii=False)

files.download('tokens.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>