### Naive Bayes Classifier & Logistic Regression Classifier for Classification of cyberbullying tweets

# 1. Data Preparation

Load the libraries and the dataset

In [1]:
# import the libraries
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonstohrer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonstohrer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# load the dataset cyberbullying-tweets.csv
path = "data/cyberbullying-tweets.csv"
df = pd.read_csv(path)
# Show full content of cells
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was crapilicious! #mkr",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red velvet cupcakes?,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, but not too concerned about another angry dude on twitter.",not_cyberbullying
4,"@RudhoeEnglish This is an ISIS account pretending to be a Kurdish account. Like Islam, it is all lies.",not_cyberbullying


Basic data exploration

In [6]:
def analyze_dataframe(dataframe):
    # Display basic information about the DataFrame
    print('-' * 45, '\n Nr. rows, nr. columns in the dataframe : \n', dataframe.shape, '\n')
    print('-' * 45, '\n Datatypes of the columns : \n', dataframe.dtypes, '\n')
    # Check for missing values
    missing_values = dataframe.isnull().sum()
    print('-' * 45, '\n Nr. Missing Values per column: \n', missing_values, '\n')
    # Check for duplicates
    print('-' * 45, '\n Nr. of duplicates: \n', dataframe.duplicated().sum())
    
analyze_dataframe(df)

--------------------------------------------- 
 Nr. rows, nr. columns in the dataframe : 
 (47692, 2) 

--------------------------------------------- 
 Datatypes of the columns : 
 tweet_text            object
cyberbullying_type    object
dtype: object 

--------------------------------------------- 
 Nr. Missing Values per column: 
 tweet_text            0
cyberbullying_type    0
dtype: int64 

--------------------------------------------- 
 Nr. of duplicates: 
 36


In [7]:
# Get duplicates
duplicates = df[df.duplicated()]
#(duplicates)
#df_clean = df.drop_duplicates()

#Duplicate example
#row_containing_pancakes = df[df['tweet_text'].str.contains("pancakes are selling")]
#(row_containing_pancakes)

# Print the shape of the DataFrame before and after removing duplicates
#print("Shape before removing duplicates:", df.shape)
#print("Shape after removing duplicates:", df_clean.shape)


In [8]:
value_counts = df["cyberbullying_type"].value_counts()
print('-' * 45, '\n Categories of cyberbullying: \n', value_counts, '\n')

# Group rows by the values in cyberbullying_type
grouped = df.groupby('cyberbullying_type').first()
print('-' * 45, '\n Sample of tweet per cyberbullying category: \n')
grouped

--------------------------------------------- 
 Categories of cyberbullying: 
 cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64 

--------------------------------------------- 
 Sample of tweet per cyberbullying category: 



Unnamed: 0_level_0,tweet_text
cyberbullying_type,Unnamed: 1_level_1
age,Here at home. Neighbors pick on my family and I. Mind you my son is autistic. It feels like high school. They call us names attack us for no reason and bully us all the time. Can't step on my front porch without them doing something to us
ethnicity,Hey dumb fuck celebs stop doing something for people for publicity on Facebook... Wtf happen to life u niggers are cowards.
gender,rape is real..zvasiyana nema jokes about being drunk or being gay or being lesbian...rape is not ones choice or wish..thtz where the sensitivity is coming from
not_cyberbullying,"In other words #katandandre, your food was crapilicious! #mkr"
other_cyberbullying,"@ikralla fyi, it looks like I was caught by it. I'm not a botter, so..."
religion,"Sudeep, did she invite him though? No right? Why are you getting worded up? You're okay with Parvesh Verma cause he speaks against Muslims but against an idiot like Imam because he called for chakka jam?"


In [120]:
print('Minimum nr chars in tweet_text: ', min(df['tweet_text'].str.len()))
print('Max nr of chars in tweet_text: ', max(df['tweet_text'].str.len()))
print('Average nr of chars in tweet_text: ', (df['tweet_text'].str.len()).mean().round())

Minimum nr chars in tweet_text:  1
Max nr of chars in tweet_text:  5018
Average nr of chars in tweet_text:  136.0


In [121]:
#creating a binary column is_cyberbullying
label_encoding_map = {'not_cyberbullying': 0}
df['is_cyberbullying'] = df['cyberbullying_type'].map(label_encoding_map).fillna(1).astype(int)

In [122]:
df.head()

Unnamed: 0,tweet_text,cyberbullying_type,is_cyberbullying
0,"In other words #katandandre, your food was crapilicious! #mkr",not_cyberbullying,0
1,Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc,not_cyberbullying,0
2,@XochitlSuckkks a classy whore? Or more red velvet cupcakes?,not_cyberbullying,0
3,"@Jason_Gio meh. :P thanks for the heads up, but not too concerned about another angry dude on twitter.",not_cyberbullying,0
4,"@RudhoeEnglish This is an ISIS account pretending to be a Kurdish account. Like Islam, it is all lies.",not_cyberbullying,0


In [123]:
# Check for class imbalance (= data imbalance)
print('Class distribution of' , df['cyberbullying_type'].value_counts(normalize=True))


Class distribution of cyberbullying_type
religion               0.167701
age                    0.167575
gender                 0.167177
ethnicity              0.166925
not_cyberbullying      0.166590
other_cyberbullying    0.164032
Name: proportion, dtype: float64


This dataset looks balanced.

Preprocess the data and vectorization

In [125]:
def preprocess_text(text):
    # Define regular expression patterns
    emoticon_pattern = r'[:;=][-^]?[)D(|/\\]+[Pp]'
    # Remove emoticons
    tweet = re.sub(emoticon_pattern, '', text)
    # Define other regular expression patterns
    mention_pattern = r'@\w+\s*'
    hashtag_pattern = r'#'
    punctuation_pattern = '[' + re.escape(string.punctuation) + ']'
    url_pattern = r'http\S+|www\S+|https\S+'
    # Combine patterns into a single regular expression
    combined_pattern = '|'.join([mention_pattern, hashtag_pattern, punctuation_pattern, url_pattern])
    # Remove mentions, hashtags, punctuation, and URLs
    tweet = re.sub(combined_pattern, '', tweet)
    # Tokenize and remove stop words
    tokens = word_tokenize(tweet.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text


In [132]:
# Preprocess the column tweet_text
df['cleaned_tweet'] = df['tweet_text'].apply(preprocess_text)


In [133]:
print(df['cleaned_tweet'][:1])

0    words katandandre food crapilicious mkr
Name: cleaned_tweet, dtype: object


In [None]:
def vectorize_text(texts, method='bag_of_words'):
    if method == 'bag_of_words':
        vectorizer = CountVectorizer()
    elif method == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Invalid method. Choose 'bag_of_words' or 'tfidf'.")
    # Fit and transform the texts
    vectorized_texts = vectorizer.fit_transform(texts)
    
    return vectorized_texts, vectorizer

In [134]:
# Vectorize the preprocessed text data using TF-IDF
vectorized_texts_tfidf, vectorizer_tfidf = vectorize_text(df['cleaned_tweet'], method='tfidf')
# Display full arrays
#np.set_printoptions(threshold=np.inf)

# Print only the first row of the TF-IDF matrix
#print(vectorized_texts_tfidf[:1].toarray())

In [135]:
df[['tweet_text', 'cleaned_tweet']].head()

Unnamed: 0,tweet_text,cleaned_tweet
0,"In other words #katandandre, your food was crapilicious! #mkr",words katandandre food crapilicious mkr
1,Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc,aussietv white mkr theblock imacelebrityau today sunrise studio10 neighbours wonderlandten etc
2,@XochitlSuckkks a classy whore? Or more red velvet cupcakes?,classy whore red velvet cupcakes
3,"@Jason_Gio meh. :P thanks for the heads up, but not too concerned about another angry dude on twitter.",meh p thanks heads concerned another angry dude twitter
4,"@RudhoeEnglish This is an ISIS account pretending to be a Kurdish account. Like Islam, it is all lies.",isis account pretending kurdish account like islam lies


Split the data

In [136]:
#80% training and 20% testing sets.
X = vectorized_texts_tfidf
y2 = df['cyberbullying_type'] #target

X_train, X_test, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (38153, 48807)
Testing set size: (9539, 48807)


# Naive Bayes

In [143]:
"""Training naive bayes classifier (using scikit-learn)"""
nb_classifier2 = MultinomialNB()
nb_classifier2.fit(X_train, y_train2)

In [144]:
# Function to evaluate the models
def evaluate_model(y_test, y_pred):
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted') 
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Print the evaluation results and classification report
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(classification_report(y_test, y_pred))


In [145]:
# Prediction and evaluation
nb_pred2 = nb_classifier2.predict(X_test)

print("Evaluation of Naive Bayes model on Test Set:")
print("------------------------")
evaluate_model(y_test2, nb_pred2)

Evaluation of Naive Bayes model on Test Set:
------------------------
Accuracy:  0.7448
Precision: 0.7305
Recall:    0.7448
F1-score:  0.7197
                     precision    recall  f1-score   support

                age       0.72      0.98      0.83      1603
          ethnicity       0.83      0.92      0.87      1603
             gender       0.75      0.85      0.80      1531
  not_cyberbullying       0.68      0.35      0.47      1624
other_cyberbullying       0.62      0.41      0.49      1612
           religion       0.79      0.97      0.87      1566

           accuracy                           0.74      9539
          macro avg       0.73      0.75      0.72      9539
       weighted avg       0.73      0.74      0.72      9539



Naive Bayes: High recall for the categories of age, ethnicity, religion, precision is still okay for ethnicity but for the other classes it is not that high. Good recall at predicting the category gender, precision is a bit lower. The model is not very precise and recall is not the best when it comes to other types of cyberbullying and when it is not cyberbulling. 

# Logistic regression

In [148]:
"""Training Logistic regression classifier (using scikit-learn)"""
lr_classifier2 = LogisticRegression(max_iter=1000)
lr_classifier2.fit(X_train, y_train2)

In [149]:
#Prediciton and evaluation
lr_pred2 = lr_classifier2.predict(X_test)

print("Evaluation of Logistic regression model on Test Set:")
print("------------------------")
evaluate_model(y_test2, lr_pred2)

Evaluation of Logistic regression model on Test Set:
------------------------


Accuracy:  0.8151
Precision: 0.8228
Recall:    0.8151
F1-score:  0.8179
                     precision    recall  f1-score   support

                age       0.96      0.96      0.96      1603
          ethnicity       0.98      0.96      0.97      1603
             gender       0.91      0.81      0.86      1531
  not_cyberbullying       0.57      0.56      0.57      1624
other_cyberbullying       0.58      0.67      0.62      1612
           religion       0.94      0.93      0.94      1566

           accuracy                           0.82      9539
          macro avg       0.82      0.82      0.82      9539
       weighted avg       0.82      0.82      0.82      9539



In [150]:
# for curiosity, let's look at the confusion matrix
#conf_matrix_lr2 = confusion_matrix(y_test, lr_pred2)
#print("Confusion Matrix Logistic regression:")
#print(conf_matrix_lr2)


Logistic regression: Very good at predicting the categories of age, ethnicity, religion, with high precision and recall. Very good precision at predicting the category gender even though recall is a bit lower. The model is not very precise and recall is not the best when it comes to other types of cuberbullying and when it is not cyberbulling. 

Overall, the logistic regression performs better for this dataset, it is better at predicting multiclasses.