### Experiment 1

Test model on Dataset 1 to establish a baseline (with relabeling)


In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

import json
import sys
sys.path.append('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\src')
from utils.results import append_results_to_json, create_results_file

from pathlib import Path
import random

In [3]:
# CONSTANTS
RANDOM_SEED = 115
DATA_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\data\\en_only')
EXPERIMENTS_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments')
RESULT_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments\\results\\exp1')

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [7]:
create_results_file(RESULT_PATH / 'exp1.json')

Results file already exists. Not creating new one


In [4]:
df = pd.read_csv( DATA_PATH / '48000_cyberbullying_tweets_basic_clean.csv')

In [5]:
df.dropna(axis=0, inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

In [6]:
df['label'].value_counts()

label
religion     7942
age          7910
gender       7396
ethnicity    7319
other        6731
notcb        6377
Name: count, dtype: int64

In [7]:
# relabel the data so that all cyberbullying data is a single class
# relabel the data so that all cyberbullying data is a single class

# {'age':0, 'gender':1, 'other':2, 'religion':3, 'ethnicity':4, 'notcb':5}

# Assuming 'label' column contains the labels and 'notcb' is the non-cyberbullying class
cyberbullying_classes = ['age', 'gender', 'other', 'religion', 'ethnicity']

# Relabel the data
df['label'] = df['label'].apply(lambda x: 0 if x in cyberbullying_classes else 1)

# Verify the relabeling
print(df['label'].value_counts())

label
0    37298
1     6377
Name: count, dtype: int64


In [11]:
create_results_file(RESULT_PATH / 'exp1.json')

In [12]:
# XGBClassifer Tests

# split dataset 3 ways 80/20, 70/30, 60/40

TEST_SIZES = [0.2,0.3,0.4]

for i, size in enumerate(TEST_SIZES):

    idx = 4 * i

    for j, vect in enumerate([CountVectorizer(), CountVectorizer(ngram_range=(1,2)), TfidfVectorizer(), TfidfVectorizer(ngram_range=(1,2))]):

        x_train, x_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=size, shuffle=True, random_state=RANDOM_SEED)

        x_train = vect.fit_transform(x_train)
        # calculate the number of OOV features
        vect2 = CountVectorizer()
        vect2.fit(x_test)
        feats_train = vect.get_feature_names_out()
        feats_test = vect2.get_feature_names_out()
        oov_feats = np.setdiff1d(feats_test, feats_train)

        x_test = vect.transform(x_test)

        m = xgb.XGBClassifier()
        m.fit(x_train, y_train)
        y_pred = m.predict(x_test)
        result = classification_report(y_test, y_pred, output_dict=True)
        result['test_size'] = size
        result['num_OOV_feats'] = len(oov_feats)
        result['model_name'] = 'XGBClassifier'
        result['model'] = {k: str(v) if isinstance(v, type) else v for k, v in m.get_params().items()}
        result['vectorizer'] = vect.__class__.__name__
        result['vectorizer_params'] = {k: str(v) if isinstance(v, type) else v for k, v in vect.get_params().items()}
        append_results_to_json(result, RESULT_PATH / 'exp1.json')

In [16]:
with open(RESULT_PATH / 'exp1.json', 'r') as j:
     data = json.load(j)

data['results'] = results

with open(RESULT_PATH / 'exp1.json', 'w') as j:
     json.dump(data, j)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [2]:
type(result)

NameError: name 'result' is not defined

In [29]:
# CatBoostClassifier Tests

# split dataset 3 ways 80/20, 70/30, 60/40

TEST_SIZES = [0.2,0.3,0.4]

for size in TEST_SIZES:

    for vect in [CountVectorizer(), CountVectorizer(ngram_range=(1,2)), TfidfVectorizer(), TfidfVectorizer(ngram_range=(1,2))]:

        x_train, x_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=size, shuffle=True, random_state=RANDOM_SEED)

        x_train = vect.fit_transform(x_train)
        x_test = vect.transform(x_test)

        train_pool = Pool(x_train, y_train)
        test_pool = Pool(x_test, y_test)

        m = CatBoostClassifier()
        m.fit(train_pool)
        y_pred = m.predict(test_pool)
        result = classification_report(y_test, y_pred, output_dict=True)
        result['split'] = size
        result['model_name'] = 'CatBoostClassifier'
        result['model'] = m.get_params()
        result['vectorizer'] = vect.__class__.__name__
        result['vectorizer_params'] = {k: str(v) if isinstance(v, type) else v for k, v in vect.get_params().items()}
        append_results_to_json(result, RESULT_PATH / 'exp1.json')

Learning rate set to 0.046982
0:	learn: 0.6604609	total: 72.1ms	remaining: 1m 12s
1:	learn: 0.6312614	total: 144ms	remaining: 1m 11s
2:	learn: 0.6051889	total: 214ms	remaining: 1m 11s
3:	learn: 0.5819159	total: 285ms	remaining: 1m 10s
4:	learn: 0.5592372	total: 370ms	remaining: 1m 13s
5:	learn: 0.5390894	total: 439ms	remaining: 1m 12s
6:	learn: 0.5218519	total: 510ms	remaining: 1m 12s
7:	learn: 0.5041463	total: 597ms	remaining: 1m 14s
8:	learn: 0.4853800	total: 670ms	remaining: 1m 13s
9:	learn: 0.4742055	total: 742ms	remaining: 1m 13s
10:	learn: 0.4642346	total: 812ms	remaining: 1m 12s
11:	learn: 0.4487067	total: 882ms	remaining: 1m 12s
12:	learn: 0.4384827	total: 953ms	remaining: 1m 12s
13:	learn: 0.4266573	total: 1.03s	remaining: 1m 12s
14:	learn: 0.4189148	total: 1.1s	remaining: 1m 12s
15:	learn: 0.4081025	total: 1.17s	remaining: 1m 12s
16:	learn: 0.4000228	total: 1.25s	remaining: 1m 11s
17:	learn: 0.3905706	total: 1.32s	remaining: 1m 11s
18:	learn: 0.3825471	total: 1.39s	remaining:

In [35]:
import json

with open(RESULT_PATH / 'exp1.json', 'r') as file:
    results = json.load(file)['results']

xgboost_precision_0 = [result['0']['recall'] for result in results if result['model_name'] == 'XGBClassifier']
xgboost_precision_1 = [result['1']['recall'] for result in results if result['model_name'] == 'XGBClassifier']
print(f'XGBClassifier Average Recall for class 0: {sum(xgboost_precision_0) / len(xgboost_precision_0)}')
print(f'XGBClassifier Average Recall for class 1: {sum(xgboost_precision_1) / len(xgboost_precision_1)}')

# Get the result with the highest precision for class 1
best_result = max(results, key=lambda result: result['1']['recall'] if result['model_name'] == 'XGBClassifier' else 0)
print(f'Best result with highest recall for class 1: {best_result}')


xgboost_results = [result['accuracy'] for result in results if result['model_name'] == 'XGBClassifier']

print(f'XGBClassifier Average Accuracy: {sum(xgboost_results) / len(xgboost_results)}')

XGBClassifier Average Recall for class 0: 0.9750233583547211
XGBClassifier Average Recall for class 1: 0.33446675486932664
Best result with highest recall for class 1: {'0': {'precision': 0.9002733371987078, 'recall': 0.9690620542082738, 'f1-score': 0.9334020352956331, 'support': 11216.0}, '1': {'precision': 0.6631067961165048, 'recall': 0.36195018547959723, 'f1-score': 0.46828933836133013, 'support': 1887.0}, 'accuracy': 0.8816301610318248, 'macro avg': {'precision': 0.7816900666576063, 'recall': 0.6655061198439356, 'f1-score': 0.7008456868284816, 'support': 13103.0}, 'weighted avg': {'precision': 0.8661183144541366, 'recall': 0.8816301610318248, 'f1-score': 0.8664198434987143, 'support': 13103.0}, 'split': 0.3, 'model_name': 'XGBClassifier', 'model': {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': 

In [32]:
# get the average precision for each class from the catboost results
catboost_precision_0 = [result['0']['recall'] for result in results if result['model_name'] == 'CatBoostClassifier']
catboost_precision_1 = [result['1']['recall'] for result in results if result['model_name'] == 'CatBoostClassifier']
print(f'CatBoostClassifier Average Precision for class 0: {sum(catboost_precision_0) / len(catboost_precision_0)}')
print(f'CatBoostClassifier Average Precision for class 1: {sum(catboost_precision_1) / len(catboost_precision_1)}')

# get average accuracy for CatBoostClassifier
catboost_results = [result['accuracy'] for result in results if result['model_name'] == 'CatBoostClassifier']
print(f'CatBoostClassifier Average Accuracy: {sum(catboost_results) / len(catboost_results)}')

CatBoostClassifier Average Precision for class 0: 0.9771058163378871
CatBoostClassifier Average Precision for class 1: 0.3149497674600516
CatBoostClassifier Average Accuracy: 0.8819196998789464


In [41]:
vect = CountVectorizer()
x_train, x_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, shuffle=True, random_state=RANDOM_SEED)
x_train = vect.fit_transform(x_train)
x_test = vect.transform(x_test)
m = xgb.XGBClassifier(scale_pos_weight=5)
m.fit(x_train, y_train)
y_pred = m.predict(x_test)
result = classification_report(y_test, y_pred)
print(result)

              precision    recall  f1-score   support

           0       0.98      0.80      0.88      7496
           1       0.43      0.90      0.58      1239

    accuracy                           0.82      8735
   macro avg       0.71      0.85      0.73      8735
weighted avg       0.90      0.82      0.84      8735

