In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import random

In [2]:
back_df = pd.read_csv(r"C:\Users\hvvel\Downloads/tweet_translated.csv",engine='python')

In [3]:
back_df.shape

(38360, 3)

In [6]:
back_df.head

<bound method NDFrame.head of           id                                          text_data  profane_class
6478    6478  Dear inconsiderate ghetto shit on the train in...              1
26131  26131                          It has one Wrestlinglvoer              0
11719  11719  Tell me I'm beautiful like you say to all the ...              1
22081  22081  Question   HiI was just wondering if this seas...              0
36631  36631   With respect to the article on Tejas please t...              0
...      ...                                                ...            ...
24241  24241          Many thanks Technopat for drawing the ...              0
29387  29387       Removed those entries that are SMP clustered              0
24896  24896   sorry i have problems being  called usual sub...              0
26666  26666   Your mother looks like a toughasnails birdI s...              0
35573  35573  Kilgarvans population is calculated using DED ...              0

[38360 rows x 3 colum

In [5]:
back_df = back_df.sample(frac=1)

In [10]:
# remove stopwords
# pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hvvel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [11]:
back_df["text_data"] = back_df.text_data.map(remove_stopwords)

In [13]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count


counter = counter_word(back_df.text_data)

In [14]:
len(counter)

66494

In [15]:
counter.most_common(5)

[('bitch', 5910),
 ('article', 5689),
 ('like', 5174),
 ('page', 4859),
 ('talk', 3593)]

In [16]:
num_unique_words = len(counter)

In [17]:

# Split dataset into training and test set
random.seed(100)
train_size = int(back_df.shape[0] * 0.7)

train_df = back_df[:train_size]
test_df = back_df[train_size:]

# split text and labels
train_sentences = train_df.text_data.to_numpy()
train_labels = train_df.profane_class.to_numpy()
test_sentences = test_df.text_data.to_numpy()
test_labels = test_df.profane_class.to_numpy()

In [18]:
train_sentences.shape, test_sentences.shape

((26852,), (11508,))

In [19]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [20]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [21]:
print(train_sentences[0:5])
print(train_sequences[0:5])

['dear inconsiderate ghetto shit train front know phone options right phone speaker'
 'one wrestlinglvoer' "tell i'm beautiful like say bitches."
 'question hii wondering season still'
 'respect article tejas please tell revisions removed sources bloggers far tell indian media outlets addition insisted changing type radar aircraft supposedly equipped without actually providing source substantiating claim provide credible source dead link way protest edit note still havent removed source added even though accessible link anymore']
[[878, 7676, 507, 32, 2642, 1040, 13, 807, 2994, 44, 807, 5459], [7, 21257], [147, 28, 1272, 3, 35, 10], [152, 21258, 1113, 1114, 53], [576, 2, 21259, 9, 147, 5117, 123, 50, 14861, 230, 147, 990, 319, 7677, 714, 11755, 879, 633, 6382, 2373, 3330, 14862, 97, 155, 2112, 60, 21260, 280, 439, 1695, 60, 778, 103, 42, 3223, 21, 114, 53, 489, 123, 60, 132, 29, 175, 3624, 103, 981]]


In [22]:
len_sequences = []
for one_seq in train_sentences:
    word_list = one_seq.split() 
    #print(len(word_list))
    len_sequences.append(len(word_list))
#print(len_sequences)
pd.Series(len_sequences).describe()

count    26852.000000
mean        18.654774
std         35.945907
min          0.000000
25%          5.000000
50%          9.000000
75%         15.000000
max       1000.000000
dtype: float64

In [23]:

# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 50

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, test_padded.shape

((26852, 50), (11508, 50))

In [24]:
# Create LSTM model
from tensorflow.keras import layers

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have 
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a 
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 32)            2127808   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 2,152,705
Trainable params: 2,152,705
Non-trainable params: 0
_________________________________________________________________


In [25]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [26]:
model.fit(train_padded, train_labels, epochs=20, validation_split=0.1, verbose=2)

Epoch 1/20
756/756 - 28s - loss: 0.4571 - accuracy: 0.8017 - val_loss: 0.4538 - val_accuracy: 0.8038
Epoch 2/20
756/756 - 23s - loss: 0.4511 - accuracy: 0.7965 - val_loss: 0.4629 - val_accuracy: 0.7777
Epoch 3/20
756/756 - 36s - loss: 0.3738 - accuracy: 0.8202 - val_loss: 0.2341 - val_accuracy: 0.9088
Epoch 4/20
756/756 - 36s - loss: 0.1663 - accuracy: 0.9462 - val_loss: 0.1247 - val_accuracy: 0.9576
Epoch 5/20
756/756 - 29s - loss: 0.0631 - accuracy: 0.9794 - val_loss: 0.1044 - val_accuracy: 0.9676
Epoch 6/20
756/756 - 23s - loss: 0.0282 - accuracy: 0.9917 - val_loss: 0.1166 - val_accuracy: 0.9643
Epoch 7/20
756/756 - 19s - loss: 0.0250 - accuracy: 0.9924 - val_loss: 0.1507 - val_accuracy: 0.9516
Epoch 8/20
756/756 - 19s - loss: 0.0174 - accuracy: 0.9952 - val_loss: 0.1517 - val_accuracy: 0.9650
Epoch 9/20
756/756 - 19s - loss: 0.0097 - accuracy: 0.9969 - val_loss: 0.1996 - val_accuracy: 0.9605
Epoch 10/20
756/756 - 22s - loss: 0.0066 - accuracy: 0.9982 - val_loss: 0.2037 - val_accura

<tensorflow.python.keras.callbacks.History at 0x1a184316c88>

In [27]:
predictions = model.predict(test_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]

In [28]:
print(test_sentences[10:20])

print(test_labels[10:20])
print(predictions[10:20])

['trash'
 'well used avoid seem irresistably drawn trying resolve brushfire wars hopefully something satisfactory come one conspire past ops'
 "know wouldn't."
 'im sorry feeling experience check block log rr blocks year ago three due reports filed one vanished editor editwarring time faced sanctions last one due rr report filed editor since banned sockpuppet blocked reverting edits twice anyway interesting proposal id like help however talk'
 'please ronald mallet john titor time travel articles fringe science published gave code sources web sources news paper sources ronald mallet oh please'
 'oh, please, faggot.'
 "let retards go food store, step boots foodtown, yall knows step blacks' shoes."
 'first thing zombie outbreak sell hoe'
 'bitch ralphy thanks tcoaakwohdfgv' 'men biggest hoes lmmao']
[0 0 1 0 0 1 1 1 1 1]
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]


In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(test_labels, predictions)

0.9604622871046229

In [30]:
from sklearn.metrics import precision_score
precision_score(test_labels, predictions)

0.9652679830747531

In [31]:
from sklearn.metrics import recall_score
recall_score(test_labels, predictions)

0.9549973835688121