In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import random

In [2]:
cap_df = pd.read_csv(r"C:\Users\hvvel\Downloads/combined_original_clean_data.csv",engine='python')

In [3]:
cap_df.shape

(55802, 3)

In [6]:
cap_df.head

<bound method NDFrame.head of           ID                                          text_data  profane_class
15759  15759    These hoes really would sit in your face and...              1
49080  49080   bullshiti guess the poor have all the moneyta...              1
35265  35265  The details may help the aid of banning this u...              0
33727  33727  There is disagreement regarding whether Norman...              0
39085  39085                Thats hoser eh to you tcoQuLrtOWmLB              0
...      ...                                                ...            ...
49733  49733                   Hitler is also dead Coincidence               0
14390  14390  Great  If you had not removed it I was going t...              0
51224  51224   unblock  reason  Derogatory comments about ot...              0
36696  36696  Shut up I can do what the fuck I want   July  ...              1
10069  10069  The Rock isnt a blood relation of any of the A...              0

[55802 rows x 3 colum

In [5]:
cap_df = cap_df.sample(frac=1)

In [7]:
# remove stopwords
# pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hvvel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
cap_df["text_data"] = cap_df.text_data.map(remove_stopwords)

In [9]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count


counter = counter_word(cap_df.text_data)

In [10]:
len(counter)

87527

In [11]:
counter.most_common(5)

[('fuck', 9878),
 ('bitch', 9438),
 ('article', 9051),
 ('page', 8372),
 ('like', 8258)]

In [12]:
num_unique_words = len(counter)

In [15]:

# Split dataset into training and test set
random.seed(100)
train_size = int(cap_df.shape[0] * 0.7)

train_df = cap_df[:train_size]
test_df = cap_df[train_size:]

# split text and labels
train_sentences = train_df.text_data.to_numpy()
train_labels = train_df.profane_class.to_numpy()
test_sentences = test_df.text_data.to_numpy()
test_labels = test_df.profane_class.to_numpy()

In [16]:
train_sentences.shape, test_sentences.shape

((39061,), (16741,))

In [17]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [18]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [19]:
print(train_sentences[0:5])
print(train_sequences[0:5])

['hoes really would sit face act like puttin world relationship fuckin whole'
 'bullshiti guess poor moneytax money movers rate working peopleyou teabaggers traitors'
 'details may help aid banning user vandalism continued ip country canada city nanaimo british columbia country code ca currency cad canada dollars private ip known proxy'
 'disagreement regarding whether norman finkelstein juan cole andor memri lie andor mislead since lot noncontroversial news sources available hamas go first controversial sources appropriate describing two sides taken various controversies important matters sources heard another substantive acrosstheboard accepted source opinion directly opposed finkelsteincolememri'
 'thats hoser eh tcoqulrtowmlb']
[[40, 46, 11, 1220, 457, 570, 5, 16066, 167, 1565, 313, 311], [29402, 425, 948, 29403, 454, 29404, 1774, 514, 20399, 3481, 11747], [787, 37, 77, 3584, 3203, 69, 133, 2013, 227, 510, 812, 481, 29405, 624, 4987, 510, 1410, 4818, 3670, 16067, 812, 3070, 1566, 2

In [20]:
len_sequences = []
for one_seq in train_sentences:
    word_list = one_seq.split() 
    #print(len(word_list))
    len_sequences.append(len(word_list))
#print(len_sequences)
pd.Series(len_sequences).describe()

count    39061.000000
mean        22.090371
std         44.589042
min          0.000000
25%          6.000000
50%         10.000000
75%         20.000000
max       1001.000000
dtype: float64

In [21]:

# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 50

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, test_padded.shape

((39061, 50), (16741, 50))

In [22]:
# Create LSTM model
from tensorflow.keras import layers

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have 
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a 
# dense vector of floating point values (the length of the vector is a parameter you specify).

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))

# The layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer (i.e. word index) in the input should be no larger than num_words (vocabulary size).
# Now model.output_shape is (None, input_length, 32), where `None` is the batch dimension.


model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 32)            2800864   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 2,825,761
Trainable params: 2,825,761
Non-trainable params: 0
_________________________________________________________________


In [23]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [24]:
model.fit(train_padded, train_labels, epochs=20, validation_split=0.1, verbose=2)

Epoch 1/20
1099/1099 - 102s - loss: 0.2401 - accuracy: 0.9047 - val_loss: 0.1711 - val_accuracy: 0.9378
Epoch 2/20
1099/1099 - 93s - loss: 0.0920 - accuracy: 0.9712 - val_loss: 0.1865 - val_accuracy: 0.9352
Epoch 3/20
1099/1099 - 95s - loss: 0.0569 - accuracy: 0.9846 - val_loss: 0.2181 - val_accuracy: 0.9401
Epoch 4/20
1099/1099 - 52s - loss: 0.0420 - accuracy: 0.9885 - val_loss: 0.2555 - val_accuracy: 0.9409
Epoch 5/20
1099/1099 - 43s - loss: 0.0370 - accuracy: 0.9914 - val_loss: 0.2191 - val_accuracy: 0.9399
Epoch 6/20
1099/1099 - 46s - loss: 0.0283 - accuracy: 0.9933 - val_loss: 0.2131 - val_accuracy: 0.9386
Epoch 7/20
1099/1099 - 51s - loss: 0.0230 - accuracy: 0.9949 - val_loss: 0.2239 - val_accuracy: 0.9399
Epoch 8/20
1099/1099 - 49s - loss: 0.0220 - accuracy: 0.9942 - val_loss: 0.2879 - val_accuracy: 0.9340
Epoch 9/20
1099/1099 - 48s - loss: 0.0250 - accuracy: 0.9938 - val_loss: 0.2749 - val_accuracy: 0.9304
Epoch 10/20
1099/1099 - 52s - loss: 0.0219 - accuracy: 0.9935 - val_loss

<tensorflow.python.keras.callbacks.History at 0x292359767c8>

In [25]:
predictions = model.predict(test_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]

In [26]:
print(test_sentences[10:20])

print(test_labels[10:20])
print(predictions[10:20])

['thanks already voted'
 'dont think per precedent set originality expression necessary copyright protection mere photograph outofcopyright twodimensional work may protected american copyright law talk majestic titan'
 'birds feather stand peta stand sanity'
 'reply cant fuckin block kiss royal thai ass'
 'walter issacson president cnn footnoted winterberg recent biography einstein issacson wacko'
 'left suggestions talk page' 'sorry youre karpinski'
 'welcome hello welcome wikipedia appreciate encyclopedic contributions recent edits ones page gucci conform policies information see wikipedias policies vandalism limits acceptable additions youd like experiment wikis syntax please sandbox rather articles still questions new contributors help page write helpme message along question someone along answer shortly may also find following pages useful general introduction wikipedia five pillars wikipedia help pages tutorial hope enjoy editing wikipedian please sign name talk pages using four 

In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(test_labels, predictions)

0.9218087330505943

In [28]:
from sklearn.metrics import precision_score
precision_score(test_labels, predictions)

0.9273858921161826

In [29]:
from sklearn.metrics import recall_score
recall_score(test_labels, predictions)

0.9141104294478528