In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

import nltk
nltk.download('stopwords')
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.manifold import TSNE

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rutuj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Read the labelled comments from disk and print a structural summary of the frame.
data_path = 'data.csv'
df = pd.read_csv(data_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20001 entries, 0 to 20000
Data columns (total 3 columns):
Unnamed: 0    20001 non-null int64
content       20001 non-null object
label         20001 non-null int64
dtypes: int64(2), object(1)
memory usage: 468.9+ KB


## What does cyber bullying look like?

In [7]:
# First rows whose label is 0 (presumably the non-bullying class -- TODO confirm).
df.loc[df['label'] == 0].head()

Unnamed: 0.1,Unnamed: 0,content,label
7822,7822,yeah I got 2 backups for all that. I just hate...,0
7823,7823,I hate using my BB but love my iPhone. Haven'...,0
7824,7824,wow lol sounds like a lot of piss then hehehe,0
7825,7825,not a damn thang..the typical rap beef. one pe...,0
7826,7826,well damn!! where have you been when i have ne...,0


In [8]:
# First rows whose label is 1 (presumably the bullying class -- TODO confirm).
df.loc[df['label'] == 1].head()

Unnamed: 0.1,Unnamed: 0,content,label
0,0,Get fucking real dude.,1
1,1,She is as dirty as they come and that crook ...,1
2,2,why did you fuck it up. I could do it all day...,1
3,3,Dude they dont finish enclosing the fucking s...,1
4,4,WTF are you talking about Men? No men thats n...,1


## Text cleaning

In [9]:
import re
# Ordered (pattern, replacement) pairs applied by clean_text.
# Order matters: contractions are expanded before the bare apostrophe is
# dropped, and whitespace is collapsed last.
_CLEAN_SUBSTITUTIONS = (
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1990s" -> "1990". A regex "\0" is the NUL character, so the digit
    # zero must be written literally for this rule to ever match.
    (r"0s\b", "0"),
    # Keep the surrounding spaces so neighbouring words are not fused.
    (r" 9 11 ", " 911 "),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)


def _default_stopwords():
    """Return the NLTK English stop-word set, built once and then reused."""
    cached = getattr(_default_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _default_stopwords._cache = cached
    return cached


def _default_stemmer():
    """Return a shared English Snowball stemmer, built once and then reused."""
    cached = getattr(_default_stemmer, "_cache", None)
    if cached is None:
        cached = SnowballStemmer('english')
        _default_stemmer._cache = cached
    return cached


def clean_text(text, stops=None, stemmer=None):
    """Normalise one piece of text for tokenisation.

    Steps, in order:
      1. lower-case and split on whitespace;
      2. drop tokens that are stop words or shorter than 3 characters;
      3. apply the regex substitutions in ``_CLEAN_SUBSTITUTIONS``
         (contraction expansion, punctuation spacing/removal, a few
         hand-written normalisations, whitespace collapsing);
      4. stem every remaining token.

    Parameters
    ----------
    text : str
        Raw input text.
    stops : collection of str, optional
        Tokens to discard in step 2. Defaults to the NLTK English stop words.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer used in step 4. Defaults to ``SnowballStemmer('english')``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    if stops is None:
        stops = _default_stopwords()
    if stemmer is None:
        stemmer = _default_stemmer()

    # No str.translate() step here: handing string.punctuation to translate()
    # as a table strips nothing and rewrites control characters (e.g. "\n"
    # becomes "+"). Punctuation is handled by the substitutions below, which
    # rely on characters such as "'" and "!" still being present.
    kept = [w for w in text.lower().split() if w not in stops and len(w) >= 3]
    text = " ".join(kept)

    for pattern, replacement in _CLEAN_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    return " ".join(stemmer.stem(word) for word in text.split())

# Overwrite the raw text column with its cleaned form (clean_text is applied row by row).
df['content'] = df['content'].map(clean_text)

In [10]:
# Preview the first five rows after cleaning.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,content,label
0,0,get fuck real dude,1
1,1,dirti come crook rengel dem fuck corrupt joke ...,1
2,2,fuck up could day too let hour ping later sche...,1
3,3,dude dont finish enclos fuck shower hate half ...,1
4,4,wtf talk men men that menag that gay,1


## Tokenize sentences and create sequences

In [11]:
### Create sequence
# Fit the tokenizer on the cleaned text, encode every row as a list of word
# indices, then pad/truncate each row to exactly 10 indices.
vocabulary_size = 1000
tokenizer = Tokenizer(num_words=vocabulary_size)
cleaned_texts = df['content']
tokenizer.fit_on_texts(cleaned_texts)
sequences = tokenizer.texts_to_sequences(cleaned_texts)
data = pad_sequences(sequences, maxlen=10)

## 1. CNN + LSTM model

In [63]:
def create_conv_model():
    """Build and compile the Embedding -> Conv1D -> MaxPooling1D -> LSTM classifier.

    The embedding layer is sized from the module-level ``vocabulary_size`` and
    expects inputs of length 10. The network ends in a single sigmoid unit and
    is compiled with binary cross-entropy, the Adam optimizer and an accuracy
    metric.

    Returns the compiled ``Sequential`` model.
    """
    stack = [
        Embedding(vocabulary_size, 100, input_length=10),
        Dropout(0.2),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(100),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(stack)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
model_conv = create_conv_model()

# validation_split holds out the LAST 20% of the rows, taken before any
# shuffling. The frames displayed above suggest the rows are grouped by label
# (label-1 rows start at index 0, label-0 rows at index 7822), so an unshuffled
# split would validate on a skewed slice. Shuffle once, with a fixed seed so the
# split is reproducible, before handing the arrays to fit().
shuffle_order = np.random.RandomState(42).permutation(len(data))
model_conv.fit(
    data[shuffle_order],
    df['label'].values[shuffle_order],
    validation_split=0.2,
    epochs=10,
)

Train on 16000 samples, validate on 4001 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2274506fbe0>

In [13]:
import numpy as np
# Parse the GloVe text file into {word: float32 vector}. Each line is split on
# whitespace: the first field is the word, the remaining fields its coefficients.
embeddings_index = dict()
# A forward slash works on Windows as well as POSIX, unlike a hard-coded
# backslash; `with` guarantees the file is closed even if a line fails to parse.
with open('glove.6B/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [14]:
# Build the (vocabulary_size, 100) weight matrix for the embedding layer:
# row i holds the pre-trained vector of the word with tokenizer index i.
# Rows for words missing from embeddings_index (and row 0) stay all-zero.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index >= vocabulary_size:
        # Skip instead of break: breaking is only correct if word_index happens
        # to iterate in ascending index order, which a dict does not promise
        # on every Python version.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [17]:
# Same layer stack as the CNN + LSTM model, except that the embedding layer is
# initialised from embedding_matrix and frozen (trainable=False).
model_glove = Sequential([
    Embedding(vocabulary_size, 100, input_length=10, weights=[embedding_matrix], trainable=False),
    Dropout(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split holds out the LAST 20% of the rows, taken before any
# shuffling. The frames displayed above suggest the rows are grouped by label
# (label-1 rows start at index 0, label-0 rows at index 7822), so an unshuffled
# split would validate on a skewed slice. Shuffle once, with a fixed seed so the
# split is reproducible, before handing the arrays to fit().
shuffle_order = np.random.RandomState(42).permutation(len(data))
model_glove.fit(
    data[shuffle_order],
    df['label'].values[shuffle_order],
    validation_split=0.2,
    epochs=10,
)

Train on 16000 samples, validate on 4001 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c275f5dfd0>