In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
# Silence every warning category for the rest of the session so that
# library deprecation chatter does not clutter the cell outputs.
warnings.filterwarnings(action="ignore")

In [3]:
# Raw labelled tweets; later cells read the 'tweet_text' and
# 'cyberbullying_type' columns from this file.
TWEETS_CSV = 'files/cyberbullying_tweets.csv'
df = pd.read_csv(TWEETS_CSV)

In [4]:
# One-hot encode the label column: one indicator column per cyberbullying type.
label_column = 'cyberbullying_type'
df = pd.get_dummies(data=df, columns=[label_column])

In [5]:
# Expected schema after stripping the one-hot prefix from the indicator columns.
new_cols = ['tweet_text', 'age','ethnicity','gender','not_cyberbullying','other_cyberbullying','religion']

# Rename by name rather than by position: assigning `df.columns = new_cols`
# would silently mislabel columns if get_dummies ever produced them in a
# different order or with a different set of categories.
df = df.rename(columns=lambda name: name.replace('cyberbullying_type_', '', 1))

# Fail loudly if the dataset's label set no longer matches what the
# following cells expect.
assert list(df.columns) == new_cols, f"unexpected columns: {list(df.columns)}"

## For a binary classifier, we only need to know whether a tweet is cyberbullying or not, so the individual cyberbullying categories can be dropped.

In [6]:
# Only the 'not_cyberbullying' indicator is needed for a binary target,
# so discard the per-category indicator columns.
unused_category_columns = ['age','ethnicity','gender','other_cyberbullying','religion']
df = df.drop(columns=unused_category_columns)

## I'd prefer to have my target be something that evaluates True for potentially harmful, not for 'not cyberbullying', so I'm going to switch that up.

In [7]:
# Invert the indicator: 1 where the tweet is NOT labelled 'not_cyberbullying'
# (i.e. potentially harmful), 0 otherwise. `eq(0)` is true for both a 0 and a
# False indicator value, matching `int(not val)`.
harmful_flag = df['not_cyberbullying'].eq(0).astype('int64')
df = df.assign(potentially_harmful=harmful_flag).drop(columns=['not_cyberbullying'])

## I'm going to see if I can get away with not lemmatizing when I preprocess the text, because lemmatizing would take ages to run and I'm doing all of this on my local machine.

In [8]:
from data_preprocessing.preprocess_text import pre_process_text
import tqdm

# Clean every tweet with the project's pre_process_text helper, with
# lemmatization switched off (see the note above this cell).
# The flag is passed as the boolean False: the string 'False' is a non-empty
# string and therefore truthy, so a plain `if lemmatizer:` check inside the
# helper would have lemmatized anyway.
# NOTE(review): assumes pre_process_text treats `lemmatizer` as a boolean
# flag — confirm against data_preprocessing/preprocess_text.py.
df['tweet_text'] = [pre_process_text(text, lemmatizer=False) for text in tqdm.tqdm(df['tweet_text'])]

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sangersteel/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sangersteel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


symbols:  ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '®', '\t', '\n', '\r']


100%|██████████| 47692/47692 [00:02<00:00, 16394.37it/s]


In [9]:
# Shuffle all rows. The seed is fixed so that the row order, and therefore
# the train/test split and every downstream score, is reproducible after
# Restart Kernel -> Run All.
df = df.sample(frac=1, random_state=0)

In [10]:
# Display the shuffled frame (the rendered output elides the middle rows).
df

Unnamed: 0,tweet_text,potentially_harmful
9227,throw bum show single fortune company call you...,1
21246,tweet directed commie muslim men looking like ...,1
4109,halalfam biebervalue greenlinerzjm going block...,0
42184,el p weak az fuck big words uses makes look du...,1
598,woo wait see happens mkr,0
...,...,...
26297,legal advice common sense help people know say...,1
30832,looks like interesting night mkr http co doncc...,1
3345,need skype verification keybase io,0
7041,pickaxe new crowbar http co bcspxtotge,0


In [11]:
# Features are the cleaned tweet strings; the target is the 0/1 harmful flag.
x, y = df['tweet_text'], df['potentially_harmful']

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for the final evaluation; the splitter is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [13]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
import tensorflow

def build_model(max_tokens, max_len, dropout, adapt_texts=None):
    """Build and compile an LSTM binary classifier that accepts raw strings.

    The text vectorization step lives inside the model, so callers can pass
    tweet strings directly to ``fit`` / ``predict`` / ``evaluate``.

    Args:
        max_tokens: Vocabulary size cap for the TextVectorization layer.
        max_len: Length every tokenized tweet is padded/truncated to.
        dropout: Rate used for both the input and recurrent dropout of the LSTM.
        adapt_texts: Optional collection of strings used to learn the
            vocabulary. Defaults to the notebook-level training split
            ``x_train``.

    Returns:
        A compiled ``Sequential`` model (adam, binary cross-entropy, accuracy).
    """
    if adapt_texts is None:
        # Learn the vocabulary from the training split only. Adapting on the
        # full dataset would let held-out test tweets influence the model's
        # vocabulary (data leakage) and inflate the reported test score.
        adapt_texts = x_train

    vectorize_layer = TextVectorization(
        max_tokens=max_tokens,
        output_mode="int",
        output_sequence_length=max_len,
    )
    vectorize_layer.adapt(adapt_texts)

    model = Sequential()
    model.add(Input(shape=(1,), dtype=tensorflow.string))   # enable str inputs
    model.add(vectorize_layer)    # TextVectorization runs inside the Sequential model
    model.add(Embedding(max_tokens + 1, 128))
    model.add(LSTM(64, dropout=dropout, recurrent_dropout=dropout))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))   # single probability output
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [14]:
# Search space: every combination below is scored with 3-fold cross-validation.
# "epochs" is not a build_model argument; the training logs show it driving
# the number of epochs per fit.
param_grid = dict(
    max_tokens=[100, 1000],
    max_len=[10, 100],
    dropout=[0.1, 0.2],
    epochs=[3, 6],
)

# Wrap the Keras builder so scikit-learn can treat it as an estimator.
keras_estimator = KerasClassifier(build_model)
model = GridSearchCV(
    keras_estimator,
    param_grid,
    cv=3,
    scoring='accuracy',
)

In [15]:
# Run the grid search on the training split only; verbose=2 yields the
# one-line-per-epoch training log shown below.
model.fit(X=x_train, y=y_train, verbose=2)

2022-02-03 16:46:12.600737: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/3
795/795 - 6s - loss: 0.3367 - accuracy: 0.8449 - 6s/epoch - 8ms/step
Epoch 2/3
795/795 - 6s - loss: 0.3145 - accuracy: 0.8537 - 6s/epoch - 7ms/step
Epoch 3/3
795/795 - 6s - loss: 0.3124 - accuracy: 0.8544 - 6s/epoch - 7ms/step
Epoch 1/3
795/795 - 6s - loss: 0.3406 - accuracy: 0.8462 - 6s/epoch - 8ms/step
Epoch 2/3
795/795 - 5s - loss: 0.3176 - accuracy: 0.8546 - 5s/epoch - 7ms/step
Epoch 3/3
795/795 - 6s - loss: 0.3150 - accuracy: 0.8546 - 6s/epoch - 7ms/step
Epoch 1/3
795/795 - 6s - loss: 0.3394 - accuracy: 0.8457 - 6s/epoch - 8ms/step
Epoch 2/3
795/795 - 6s - loss: 0.3149 - accuracy: 0.8550 - 6s/epoch - 7ms/step
Epoch 3/3
795/795 - 6s - loss: 0.3117 - accuracy: 0.8554 - 6s/epoch - 7ms/step
Epoch 1/3
795/795 - 7s - loss: 0.3212 - accuracy: 0.8532 - 7s/epoch - 9ms/step
Epoch 2/3
795/795 - 5s - loss: 0.2797 - accuracy: 0.8644 - 5s/epoch - 7ms/step
Epoch 3/3
795/795 - 6s - loss: 0.2666 - accuracy: 0.8716 - 6s/epoch - 7ms/step
Epoch 1/3
795/795 - 6s - loss: 0.3210 - accuracy: 0.

GridSearchCV(cv=3,
             estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x1188635b0>,
             param_grid={'dropout': [0.1, 0.2], 'epochs': [3, 6],
                         'max_len': [10, 100], 'max_tokens': [100, 1000]},
             scoring='accuracy')

In [18]:
# Keep the parameter combination that the grid search selected as best.
best_params = model.best_params_

In [19]:
# Show the selected hyper-parameter combination.
best_params

{'dropout': 0.1, 'epochs': 3, 'max_len': 10, 'max_tokens': 1000}

In [20]:
# Rebuild the model from the grid-search winner instead of hard-coded copies
# of its values, so this cell stays correct if a re-run selects different
# parameters. "epochs" is a training setting, not a build_model argument,
# so it is excluded here.
build_kwargs = {name: value for name, value in best_params.items() if name != 'epochs'}
optimized_model = build_model(**build_kwargs)

In [25]:
# Stop once validation loss has failed to improve for 2 epochs, and roll the
# model back to the best epoch's weights; without restore_best_weights the
# model keeps the weights of the last (non-improving) epoch.
early_stopping = tensorflow.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

# Train with the epoch count chosen by the grid search; the last 20% of the
# training split is held out as validation data for early stopping.
# Keras documents `callbacks` as a list of callback instances.
optimized_model.fit(
    x_train,
    y_train,
    epochs=best_params['epochs'],
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 00003: early stopping


<keras.callbacks.History at 0x17d272d60>

In [26]:
# Score the final model on the held-out test split; the result is
# [loss, accuracy], matching the loss and metric the model was compiled with.
optimized_model.evaluate(x=x_test, y=y_test)



[0.3918721079826355, 0.853024423122406]