# Cyberbullying model using LSTM

In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [157]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

## Preprocessing the dataset

In [2]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.update(list(string.punctuation))

In [3]:
df = pd.read_csv("anti-bully-data.csv")
df.head()

Unnamed: 0,label_bullying,text_message
0,0,yeah I got 2 backups for all that. I just hate...
1,0,I hate using my BB but love my iPhone. Haven'...
2,1,Get fucking real dude.
3,1,She is as dirty as they come and that crook ...
4,1,why did you fuck it up. I could do it all day...


In [4]:
messages = df['text_message']
y = df['label_bullying']

In [258]:
df['text_message']

0       yeah I got 2 backups for all that. I just hate...
1       I hate using my BB  but love my iPhone. Haven'...
2                                  Get fucking real dude.
3        She is as dirty as they come  and that crook ...
4        why did you fuck it up. I could do it all day...
5        Dude they dont finish enclosing the fucking s...
6        WTF are you talking about Men? No men thats n...
7       Ill save you the trouble sister. Here comes a ...
8        Im dead serious.Real athletes never cheat don...
9           wow lol sounds like a lot of piss then hehehe
10      not a damn thang..the typical rap beef. one pe...
11      ...go absolutely insane.hate to be the bearer ...
12      well damn!! where have you been when i have ne...
13      watching without a trace too...hate when i mis...
14      which they do most of the time:-P I don't hate...
15      Lmao  im watching the same thing ahaha. The ga...
16      LOL  no he said  What do you call a jail cell ...
17      truth 

In [43]:
def get_simple_pos(tag) :
    if tag.startswith('J') :
        return wordnet.ADJ
    elif tag.startswith('V') :
        return wordnet.VERB
    elif tag.startswith('N') :
        return wordnet.NOUN
    elif tag.startswith('R') :
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_text(review) :
    global max_len
    words = word_tokenize(review)
    output_words = []
    for word in words :
        if word.lower() not in stop_words :
            pos = pos_tag([word])
            clean_word = lemmatizer.lemmatize(word,pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    max_len = max(max_len, len(output_words))
    return " ".join(output_words)

In [44]:
max_len = 0

In [257]:
print(messages[0])
messages = [clean_text(message) for message in messages]
print(messages[0])

yeah get 2 backup hate happen strugglin week ... handle tho
yeah get 2 backup hate happen strugglin week ... handle tho


In [237]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding="utf8") as file:
        word_to_vec_map = {}
        word_to_index = {}
        index_to_word = {}
        index = 0
        for line in file:
            line = line.strip().split()
            curr_word = line[0]
            word_to_index[curr_word] = index
            index_to_word[index] = curr_word
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
            index += 1
    return word_to_index, index_to_word, word_to_vec_map

In [238]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [239]:
def sentences_to_indices(X, word_to_index, max_len):
    m = len(X)
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        sentence_words = [w.lower() for w in X[i].split()]
        j = 0
        for word in sentence_words:
            if word in word_to_index:
                X_indices[i, j] = word_to_index[word]
            j += 1
    return X_indices

## The LSTM model

In [240]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1
    emb_dim = word_to_vec_map["cucumber"].shape[0]
    
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [241]:
def NLPModel(input_shape, word_to_vec_map, word_to_index):
    sentence_indices = Input(input_shape, dtype='int32')
    
    embeddings = pretrained_embedding_layer(word_to_vec_map, word_to_index)(sentence_indices)
    
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.5)(X)
    X = LSTM(128, return_sequences=False)(X)
    X = Dropout(0.5)(X)
    X = Dense(1)(X)
    X = Activation('sigmoid')(X)
    model = Model(inputs=sentence_indices, outputs=X)
    
    return model

In [242]:
model = NLPModel((max_len,), word_to_vec_map, word_to_index)
model.summary()

Model: "model_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_24 (InputLayer)        (None, 30)                0         
_________________________________________________________________
embedding_23 (Embedding)     (None, 30, 50)            20000050  
_________________________________________________________________
lstm_39 (LSTM)               (None, 30, 128)           91648     
_________________________________________________________________
dropout_37 (Dropout)         (None, 30, 128)           0         
_________________________________________________________________
lstm_40 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_38 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 129

In [243]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [244]:
from sklearn.model_selection import train_test_split

In [245]:
X_train, X_test, Y_train, Y_test = train_test_split(messages, y, random_state = 0, test_size = 0.1)

In [246]:
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len)

In [247]:
model.fit(X_train_indices, Y_train, epochs = 50, batch_size = 32, shuffle = True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x1db47d04ef0>

In [248]:
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len)

In [249]:
loss, acc = model.evaluate(X_test_indices, Y_test)
print("Test accuracy = ", acc)

Test accuracy =  0.6734693646430969


## Accuracy of LSTM: 67.25%

In [250]:
model.save("model.h5")

## Predictions

In [279]:
text = "suck it"
text = [clean_text(text)]
text

['suck']

In [280]:
text = sentences_to_indices(text, word_to_index, max_len)

In [281]:
text

array([[23695.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.]])

In [282]:
model.predict(text)[0][0]

0.48035333

## Extras

In [283]:
import pickle

In [284]:
pickle.dump(word_to_index, open('word_to_index.pkl', 'wb'))