<a href="https://colab.research.google.com/github/rootdrew27/cyberbullying-ml/blob/main/RNN_working.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langdetect
!pip install nltk
!pip install contractions

In [3]:
# data management
import pandas as pd
import numpy as np

# model
import tensorflow as tf
import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, LSTM

In [None]:
# Never truncate the tweet column when a frame is rendered.
pd.set_option('display.max_colwidth', None)

# Raw labelled tweets; the first row of the file holds the column names.
df = pd.read_csv('./cyberbullying_tweets (1).csv', header=0)

# Reproducible preview of 200 rows.
df.sample(n=200, random_state=20)

In [None]:
# Flag rows that are exact repeats of an earlier row (the first occurrence is kept).
duplicate_mask = df.duplicated()

# Report the count explicitly: a bare expression that is not the last line of
# a cell is evaluated and silently discarded.
print(f"Dropping {duplicate_mask.sum()} duplicated rows")

# .copy() detaches the result from the original frame so that later column
# assignments do not raise SettingWithCopyWarning.
df = df[~duplicate_mask].copy()

In [None]:
# Integer id assigned to each cyberbullying category.
class_labels = {
    'not_cyberbullying':0,
    'religion':1,
    'age':2,
    'gender':3,
    'ethnicity':4,
    'other_cyberbullying':5
}
# Map category names to their integer ids. Values that are not keys of the
# mapping (e.g. ids produced by an earlier run of this cell) are left as-is.
df['cyberbullying_type'] = df['cyberbullying_type'].replace(class_labels).astype(int)

# One-hot encode the integer labels: one row per tweet, one column per class.
# The integer cast is applied to the returned array instead of passing a dtype
# positionally, because Keras 3's to_categorical only accepts (x, num_classes).
target_labels = tf.keras.utils.to_categorical(
    df['cyberbullying_type'], num_classes=len(class_labels)
).astype('int')

6

# Preprocessing

In [None]:
import re
from langdetect import detect, DetectorFactory, LangDetectException
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import contractions

# Fetch the NLTK stop-word corpus (no-op when it is already present).
nltk.download('stopwords')

# English stop words extended with the retweet marker.
customStopWords = ['rt']
stopWords = stopwords.words('english') + customStopWords

# Negations are taken out of the stop-word list so that filtering keeps them.
for negation in ('no', 'not', 'nor'):
    stopWords.remove(negation)

# Fixed seed for the language detector.
DetectorFactory.seed = 0

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
def standardize(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_non_english(text):
    """Return ``text`` unchanged when it is detected as English, otherwise ``""``.

    Text for which language detection raises ``LangDetectException`` is
    treated as non-English.
    """
    try:
        is_english = detect(text) == "en"
    except LangDetectException:
        is_english = False

    if is_english:
        return text
    return ""

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def remove_entities(text):
    """Strip markup, links, hashtags, mentions and non-letter noise from a tweet.

    The substitutions run in a fixed order; each works on the previous result.

    Args:
        text: Tweet text. The HTML-entity pattern only matches lowercase
            entity names, so the text should be lowercased beforehand.

    Returns:
        The text with HTML entities, URLs, hashtags, non-ASCII characters,
        punctuation and digits removed, and every mention reduced to a bare
        ``@``. Each whitespace character becomes a space; runs of spaces are
        NOT collapsed here.
    """
    text = re.sub(r'&[a-z]+;', r' ', text) # remove html entities
    # `\S*` consumes the whole URL. The previous pattern used a literal `S*`,
    # which removed only the "http://" prefix and left the rest of the link.
    text = re.sub(r'https?://\S*', r' ', text) # remove links
    text = re.sub(r'(?:http[s]?://)?(?:www\.)?(?:bit\.ly|goo\.gl|t\.co|tinyurl\.com|tr\.im|is\.gd|cli\.gs|u\.nu|url\.ie|tiny\.cc|alturl\.com|ow\.ly|bit\.do|adoro\.to)\S+', '', text) # remove url shorteners
    text = re.sub(r'#\S*', r'', text) # remove hashtags
    # Mentions are normalised BEFORE punctuation is stripped. Otherwise a
    # handle such as "@some_user" is first split into "@some user" and the
    # tail of the username leaks into the text.
    text = re.sub(r'@\S*', r'@', text) # normalize mentions
    text = re.sub(r'[^\x00-\x7F]+', r'', text) # remove non-ascii characters
    text = re.sub(r'[!$%^&*+=\-_()[\]\\;|:`~\'",./?<>}{]', r' ', text) # remove punctuation and special chars
    text = re.sub(r'[0-9]', r' ', text) # remove numbers
    text = re.sub(r'\s', r' ', text) # turn each whitespace char into a space
    return text

# Lemmatize words
# def lemmatize(text):
#     words = word_tokenize(text)
#     lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
#     return ' '.join(lemmatized_words)

def remove_stop_words(text):
    """Drop every space-separated token of ``text`` that appears in ``stopWords``.

    Tokens are obtained with ``split(" ")`` and re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split(" "):
        if token in stopWords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def remove_excess_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character standing alone is left untouched.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r"\s\s+", " ", text)

def remove_blank_chars(text):
    """Remove the empty tokens produced by leading, trailing or repeated spaces.

    Returns the non-empty space-separated tokens of ``text`` joined by single
    spaces.
    """
    # Split into tokens first. Iterating the string directly yields single
    # characters (never ''), which put a space between every character and
    # removed nothing.
    return " ".join(token for token in text.split(" ") if token != "")

def remove_tweets_with_few_words(text, min_words=4):
    """Blank out tweets that contain fewer than ``min_words`` words.

    Args:
        text: Tweet text.
        min_words: Minimum number of whitespace-separated words a tweet must
            have to be kept. Defaults to 4.

    Returns:
        ``text`` unchanged when it has at least ``min_words`` words,
        otherwise the empty string.
    """
    # split() without an argument ignores leading, trailing and repeated
    # whitespace. split(" ") counted the resulting empty strings as words, so
    # " hi there " passed as a four-word tweet.
    if len(text.split()) < min_words:
        return ""
    return text

# A word character followed by two or more repeats of itself, i.e. a run of
# three or more identical characters.
elo_word_re_pattern = r'(\w)\1{2,}'

def replace_elongated_words(text):
    """Collapse every run of 3+ identical word characters to one character.

    The previous pattern used a greedy prefix group that swallowed part of the
    run, so a run of n characters shrank to n-2 ("sooooo" -> "sooo"), and a
    run at the start of a word was never matched. Matching the run on its own
    makes the result independent of run length and position.

    Known limitation (unchanged in spirit from the original): legitimate
    double letters inside an elongated run are also reduced, e.g.
    "coooool" -> "col".
    """
    return re.sub(elo_word_re_pattern, r'\1', text)

def preprocess(text):
    """Run a tweet through the cleaning steps, in order, and return the result.

    Tweets rejected by ``remove_non_english`` or
    ``remove_tweets_with_few_words`` come back as the empty string.
    """
    steps = (
        remove_non_english,
        standardize,
        replace_elongated_words,
        expand_contractions,
        remove_entities,
        # remove_stop_words is currently disabled
        remove_excess_spaces,
        remove_tweets_with_few_words,
    )
    for step in steps:
        text = step(text)
    return text


In [None]:
# Clean every tweet. preprocess() returns "" for tweets it rejects
# (non-English or too short), so those rows are dropped here instead of
# staying in the frame as blank text that still carries a class label.
cleaned_text = df['tweet_text'].apply(preprocess)
has_text = cleaned_text != ""

# Build a new frame rather than assigning through attribute access on a
# possibly sliced frame.
df = df.loc[has_text].assign(tweet_text=cleaned_text[has_text])

In [None]:
# Load the previously cleaned data set from disk.
# NOTE(review): no later cell in this notebook references clean_df; the
# following cells keep working on df. Confirm which frame is intended.
clean_df = pd.read_csv(filepath_or_buffer="./cleaned_data1")

In [None]:
# Integer id assigned to each cyberbullying category.
class_labels = {
    'not_cyberbullying':0,
    'religion':1,
    'age':2,
    'gender':3,
    'ethnicity':4,
    'other_cyberbullying':5
}
# Map category names to their integer ids. Values that are not keys of the
# mapping (e.g. ids produced by an earlier run of this cell) are left as-is.
df['cyberbullying_type'] = df['cyberbullying_type'].replace(class_labels).astype(int)

# One-hot encode the integer labels: one row per tweet, one column per class.
# The integer cast is applied to the returned array instead of passing a dtype
# positionally, because Keras 3's to_categorical only accepts (x, num_classes).
target_labels = tf.keras.utils.to_categorical(
    df['cyberbullying_type'], num_classes=len(class_labels)
).astype('int')

In [None]:
from sklearn.model_selection import train_test_split

# Inputs and settings shared by all three splits; only the test fraction varies.
_split_inputs = (df.tweet_text, df.cyberbullying_type)
_split_options = dict(random_state=100, shuffle=True)

# Split 1: 20% of the rows held out for testing
x_Train_1, x_Test_1, y_Train_1, y_Test_1 = train_test_split(
    *_split_inputs, test_size=.2, **_split_options
)

# Split 2: 30% of the rows held out for testing
x_Train_2, x_Test_2, y_Train_2, y_Test_2 = train_test_split(
    *_split_inputs, test_size=.3, **_split_options
)

# Split 3: 40% of the rows held out for testing
x_Train_3, x_Test_3, y_Train_3, y_Test_3 = train_test_split(
    *_split_inputs, test_size=.4, **_split_options
)

In [None]:
def _to_dataset(texts, labels):
    """Pair each text with its label in a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((texts, labels))

# One train/test dataset pair per split.
tfds_train_1, tfds_test_1 = _to_dataset(x_Train_1, y_Train_1), _to_dataset(x_Test_1, y_Test_1)
tfds_train_2, tfds_test_2 = _to_dataset(x_Train_2, y_Train_2), _to_dataset(x_Test_2, y_Test_2)
tfds_train_3, tfds_test_3 = _to_dataset(x_Train_3, y_Train_3), _to_dataset(x_Test_3, y_Test_3)

In [None]:
BATCH_SIZE = 64

def _batched(dataset):
    """Group ``dataset`` into batches of BATCH_SIZE and enable prefetching."""
    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Batched train/test pipelines, one pair per split.
batched_train_1, batched_test_1 = _batched(tfds_train_1), _batched(tfds_test_1)
batched_train_2, batched_test_2 = _batched(tfds_train_2), _batched(tfds_test_2)
batched_train_3, batched_test_3 = _batched(tfds_train_3), _batched(tfds_test_3)

In [None]:
# Upper bound on the number of tokens kept in the vocabulary.
VOCAB_SIZE = 35000
encoder = layers.TextVectorization(max_tokens=VOCAB_SIZE)

# Learn the vocabulary from the text of split 1's training batches only
# (labels are dropped before adapting).
train_text_batches = batched_train_1.map(lambda tweet, _label: tweet)
encoder.adapt(train_text_batches)

# Vocabulary entries whose beginning matches the pattern below, kept for inspection.
n_vocab = [
    token
    for token in encoder.get_vocabulary()
    if re.match('nig[ae]', token) is not None
]

NameError: name 'layers' is not defined

In [None]:
# Embedding layer: VOCAB_SIZE token ids mapped to 64-dimensional vectors,
# with mask_zero enabled.
embedder = layers.Embedding(VOCAB_SIZE, 64, mask_zero=True)

In [None]:
rnn_layer = layers.LSTM(64)

# Text -> token ids -> embeddings -> LSTM -> hidden dense layer -> class probabilities.
model = Sequential()
model.add(encoder)
model.add(embedder)
model.add(rnn_layer)
# The hidden layer needs a non-linearity. Without an activation it is a plain
# linear map feeding another linear map, so it adds parameters but no
# representational power.
model.add(Dense(64, activation='relu'))
# One output unit per class, normalised to probabilities.
model.add(Dense(6, activation='softmax'))

In [None]:
# The datasets pair each tweet with its integer class id
# (df.cyberbullying_type), not a one-hot vector, so the sparse variant of the
# loss is required. CategoricalCrossentropy expects one-hot targets matching
# the six softmax outputs.
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
# Train on split 1, the split whose training text the encoder was adapted on.
# The name `batched_train` is never defined in this notebook; the batched
# training sets are batched_train_1 / _2 / _3.
history = model.fit(batched_train_1, epochs=10)

In [None]:
import numpy as np

# Sanity check: run one hand-written sample through the trained model and
# display the predicted class probabilities.
sample_text = ('fuck niggas')
sample_batch = np.array([sample_text])

pred = model.predict(sample_batch)
pred



In [None]:
# Evaluate on the held-out part of split 1. The name `batched_test` is never
# defined in this notebook; the batched test sets are batched_test_1 / _2 / _3.
test_loss, test_acc = model.evaluate(batched_test_1)

