Importing all the necessary dependencies

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np

Importing the dataset

In [3]:
rawDataset = pd.read_csv('/home/rumbleftw/Documents/Codes/global-censorship/datasets/text/finalData.csv', index_col=[0])

In [5]:
rawDataset['text'] = rawDataset['text'].values.astype('str')

Tokenizing the word vocab

In [7]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.oov_token = '<shobdobhandarErBahire>'
tokenizer.fit_on_texts(rawDataset['text'])
vocab = tokenizer.word_index
vocabSize = len(vocab)
vocabSize

217799

Testing the tokenized words

Sequencing the whole dataset and padding to uniform shape

In [8]:
textData = tokenizer.texts_to_sequences(rawDataset['text'])
paddedTextData = tf.keras.preprocessing.sequence.pad_sequences(textData, padding='pre')
paddedTextData.shape, paddedTextData

((184354, 1403),
 array([[    0,     0,     0, ...,    17,   669,    75],
        [    0,     0,     0, ...,    21,  7480,  1142],
        [    0,     0,     0, ...,    16, 16598,  7024],
        ...,
        [    0,     0,     0, ...,   548,   794,  9428],
        [    0,     0,     0, ...,     3,    38,    57],
        [    0,     0,     0, ...,     5, 17461,    82]], dtype=int32))

Splitting into training and testing datasets

In [9]:
trainLength = 150000


xTrain = paddedTextData[:trainLength]
yTrain = rawDataset['is_offensive'].to_numpy()[:trainLength]

xTest = paddedTextData[trainLength:]
yTest = rawDataset['is_offensive'].to_numpy()[trainLength:]

In [10]:
yTest.shape, xTest.shape

((34354,), (34354, 1403))

Defining the model

In [12]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocabSize, 64, input_length=1403))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Training the model

In [14]:
model.fit(xTrain, yTrain, epochs=5)

2022-06-26 00:26:10.249110: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 841800000 exceeds 10% of free system memory.


Epoch 1/5
 165/4688 [>.............................] - ETA: 28:52 - loss: 0.5341 - accuracy: 0.8040

KeyboardInterrupt: 

Evaluating on testing dataset

In [81]:
model.evaluate(xTest[:100], yTest[:100])



[0.34257325530052185, 0.8999999761581421]

Real life testing

In [82]:
texts = ["We are doing our project", "you are a piece of shit", "you bloody bitch", "good night brother"]

sequencedText = tokenizer.texts_to_sequences(texts)
print(sequencedText)
paddedText = tf.keras.preprocessing.sequence.pad_sequences(sequencedText, padding='pre')
print(paddedText)
model.predict(paddedText)

[[58, 18, 264, 170, 507], [7, 18, 5, 893, 4, 430], [7, 3286, 110], [99, 1038, 1673]]
[[   0   58   18  264  170  507]
 [   7   18    5  893    4  430]
 [   0    0    0    7 3286  110]
 [   0    0    0   99 1038 1673]]


array([[1.2183372e-11],
       [9.9992746e-01],
       [1.0000000e+00],
       [1.2749022e-01]], dtype=float32)