<a href="https://colab.research.google.com/github/rootdrew27/cyberbullying-ml/blob/main/DataCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning

- This notebook implements the functions to clean our cyberbullying data
- It replaces mentions with a bare `@` token and removes hashtags. (See the implementation for more details.)

In [None]:
!pip install langdetect
!pip install nltk
!pip install contractions

In [3]:
# data management
import pandas as pd
import numpy as np

# preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from langdetect import detect, DetectorFactory, LangDetectException
import contractions

import tensorflow as tf
import keras

In [None]:
# Show the full tweet text in rendered frames instead of truncating long cells.
pd.set_option('display.max_colwidth', None)

# Load the raw dataset; the first row of the CSV holds the column names.
df = pd.read_csv('./cyberbullying_tweets.csv', header=0)
df.head()

**Convert string labels to integer representation.**

In [None]:
# Map every category name to an integer code; the code is the position of the
# name in the list below.
class_labels = {
    label: code
    for code, label in enumerate([
        'not_cyberbullying',
        'religion',
        'age',
        'gender',
        'ethnicity',
        'other_cyberbullying',
    ])
}

# Swap the string labels for their integer codes and store the column as int.
df['cyberbullying_type'] = (
    df['cyberbullying_type']
    .replace(class_labels)
    .astype(int)
)

**Cleaning functions**

In [None]:
# Tweets are capped at 280 characters, so anything longer indicates a problem
# loading the data and is discarded.
def remove_bad_data(text, max_len=280):
    """Return `text` if it fits within the tweet length limit, otherwise "".

    Args:
        text: Raw tweet text.
        max_len: Largest number of characters a valid tweet may contain.

    Returns:
        `text` unchanged when ``len(text) <= max_len``; otherwise an empty
        string, which marks the tweet as removed.
    """
    # Inclusive bound: a tweet of exactly `max_len` characters is still valid.
    return text if len(text) <= max_len else ""

def standardize(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

# Keep only English text: anything detected as another language becomes "".
def remove_non_english(text):
    """Return `text` if langdetect classifies it as English, otherwise "".

    Args:
        text: Tweet text to classify.

    Returns:
        `text` unchanged when the detected language code is "en"; an empty
        string when another language is detected or detection fails.
    """
    # langdetect's algorithm is randomised unless a seed is set, so the same
    # tweet could be kept on one run and dropped on the next. Pinning the seed
    # makes the cleaning reproducible.
    DetectorFactory.seed = 0
    try:
        lang = detect(text)
    except LangDetectException:
        # detect() could not classify the text; treat it as non-English.
        lang = "unknown"
    return text if lang == "en" else ""

# Rewrite contractions in their expanded form
def expand_contractions(text):
    """Return `text` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def remove_entities(text):
    """Strip markup, links, hashtags and other non-word noise from a tweet.

    The steps run in a fixed order because later steps would otherwise break
    the tokens earlier steps look for (for example, stripping punctuation
    first would split a handle such as "@user_name" at the underscore).

    Args:
        text: Tweet text. The HTML-entity pattern is lower-case only, so the
            text is expected to be lower-cased already.

    Returns:
        The text with HTML entities, links, URL-shortener links, hashtags,
        non-ASCII characters, punctuation and digits removed, every mention
        collapsed to a bare "@", and every whitespace character turned into a
        plain space. Runs of spaces are NOT collapsed here.
    """
    # HTML entities, both named ("&amp;") and numeric ("&#39;", "&#x27;").
    text = re.sub(r'&(?:#x?[0-9a-f]+|[a-z]+);', ' ', text)
    # Full links: scheme followed by everything up to the next whitespace.
    text = re.sub(r'https?://\S*', ' ', text)
    # Scheme-less URL-shortener links such as "t.co/abc". The word boundary and
    # the required "/" stop this from eating ordinary text such as the "t.com"
    # inside "reddit.com".
    text = re.sub(r'(?:www\.)?\b(?:bit\.ly|goo\.gl|t\.co|tinyurl\.com|tr\.im|is\.gd|cli\.gs|u\.nu|url\.ie|tiny\.cc|alturl\.com|ow\.ly|bit\.do|adoro\.to)/\S*', '', text)
    text = re.sub(r'#\S*', '', text)  # remove hashtags (marker and tag text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # remove non-ascii characters
    # Normalize mentions BEFORE punctuation/digits are stripped so that the
    # whole handle (letters, digits, underscores) is consumed in one go.
    text = re.sub(r'@\w*', '@', text)
    text = re.sub(r'[!$%^&*+=\-_()\[\]\\;|:`~\'",./?<>}{]', ' ', text)  # remove punctuation and special chars
    text = re.sub(r'[0-9]', ' ', text)  # remove numbers
    text = re.sub(r'\s', ' ', text)  # turn each whitespace char (tab, newline, ...) into a space
    return text

# Lemmatize words
# def lemmatize(text):
#     words = word_tokenize(text)
#     lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
#     return ' '.join(lemmatized_words)

def remove_stop_words(text, stop_words=None):
    """Drop every space-separated token of `text` that is a stop word.

    Args:
        text: Text whose tokens are separated by single spaces.
        stop_words: Optional collection of words to drop. When omitted, the
            module-level `stopWords` collection is used.

    Returns:
        The remaining tokens joined back together with single spaces.
    """
    if stop_words is None:
        # NOTE(review): `stopWords` is not defined in any cell visible here;
        # confirm it is created (e.g. from nltk's stop-word list) before this
        # function is called without an explicit `stop_words`.
        stop_words = stopWords
    return " ".join(word for word in text.split(" ") if word not in stop_words)

def remove_excess_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character on its own is left untouched.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r"\s\s+", " ", text)

def remove_blank_chars(text):
    """Remove the empty tokens produced by leading, trailing or repeated spaces.

    Args:
        text: Text whose tokens are separated by spaces.

    Returns:
        The non-empty tokens joined with single spaces.
    """
    # Iterate over space-separated tokens, not characters: iterating the
    # string itself never yields '' and would instead put a space between
    # every pair of characters.
    return " ".join(token for token in text.split(" ") if token != '')

def remove_tweets_with_few_words(text, min_words=4):
    """Return `text` if it has at least `min_words` words, otherwise "".

    Args:
        text: Cleaned tweet text.
        min_words: Minimum number of whitespace-separated words required for
            the tweet to be kept.

    Returns:
        `text` unchanged when it has enough words; otherwise an empty string,
        which marks the tweet as removed.
    """
    # split() with no argument ignores leading, trailing and repeated
    # whitespace, so empty strings are never counted as words.
    if len(text.split()) < min_words:
        return ""
    return text

# A run of three or more copies of the same letter (e.g. the "oooo" in "soooo").
# Digits and underscores are excluded so numbers such as "1000000" are left alone.
elo_word_re_pattern = r'([^\W\d_])\1{2,}'

# Naive impl of elongated word replacer
def replace_elongated_words(text):
    """Collapse every run of 3+ identical letters down to a single letter.

    Every run in the text is handled, wherever it sits in a word, so
    "yeeeesssss" becomes "yes". Doubled letters ("cool") are not touched.
    The collapse is naive: a stretched "coool" becomes "col", not "cool".

    Args:
        text: Text to normalise.

    Returns:
        The text with each elongated letter run replaced by one letter.
    """
    return re.sub(elo_word_re_pattern, r'\1', text)

def preprocess(text):
    """Run one tweet through the cleaning steps, in order, and return the result.

    Several steps return an empty string to mark a tweet as removed, so an
    empty result means the tweet was rejected somewhere along the way.
    """
    pipeline = (
        remove_bad_data,
        remove_non_english,
        standardize,
        replace_elongated_words,
        expand_contractions,
        remove_entities,
        # remove_stop_words,  # currently disabled
        remove_excess_spaces,
        remove_tweets_with_few_words,
    )
    for step in pipeline:
        text = step(text)
    return text


In [None]:
# Run the cleaning pipeline (see preprocess) over every tweet and store the
# cleaned text back in the same column.
df['tweet_text'] = df['tweet_text'].apply(preprocess)

In [None]:
# preprocess() marks a rejected tweet with an empty string, which dropna() does
# not treat as missing, so those rows are filtered out explicitly. reset_index()
# returns a new frame, so the result is assigned back to keep the fresh index.
df = (
    df.loc[lambda frame: frame['tweet_text'].fillna('').str.strip() != '']
    .dropna(axis=0)  # drop rows that contain any null values
    .reset_index(drop=True)  # reset the indexes
)