### Experiment 0
Test model on Dataset 1

In [29]:
import pandas as pd
import numpy as np
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

import sys
sys.path.append('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\src')
from utils.results import append_results_to_json

from pathlib import Path
import random

This experiment tests ability of models trained on Dataset 1 to perform on Dataset 1 (via cross-validation)

In [30]:
# CONSTANTS
RANDOM_SEED = 115
DATA_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\data\\en_only')
EXPERIMENTS_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments')
RESULT_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments\\results\\exp0')

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [31]:
df = pd.read_csv( DATA_PATH / '48000_cyberbullying_tweets_basic_clean.csv')

In [32]:
df.dropna(axis=0, inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

In [33]:
df['label'].value_counts()


label
religion     7942
age          7910
gender       7396
ethnicity    7319
other        6731
notcb        6377
Name: count, dtype: int64

In [10]:
file_path = RESULT_PATH / 'exp0_D1_1.json'

# Check if the file exists, if not create it
if not file_path.exists():
    with open(file_path, 'w'):
        pass


In [34]:
df['label'] = df['label'].map({'age':0, 'gender':1, 'other':2, 'religion':3, 'ethnicity':4, 'notcb':5})


Test a model trained on Dataset 1 (with relabeling) with each vectorizing strategy



In [57]:
# XGBClassifer Tests

# split dataset 3 ways 80/20, 70/30, 60/40

TEST_SIZES = [0.2,0.3,0.4]

# with open(EXPERIMENTS_PATH / 'params.json', 'r') as file:
#     params_list = json.load(file)['wo_re']

for size in TEST_SIZES:

    x_train, x_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=size, shuffle=True, random_state=RANDOM_SEED)

    vect = TfidfVectorizer() # or CountVectorizer()

    x_train = vect.fit_transform(x_train)
    x_test = vect.transform(x_test)

    m = xgb.XGBClassifier()
    m.fit(x_train, y_train)
    y_pred = m.predict(x_test)
    result = classification_report(y_test, y_pred, output_dict=True)
    result['split'] = size
    result['model_name'] = 'XGBClassifier'
    result['model'] = m.get_params()
    result['vectorizer'] = vect.__class__.__name__
    result['vectorizer_params'] = {k: str(v) if isinstance(v, type) else v for k, v in vect.get_params().items()}
    append_results_to_json(result, RESULT_PATH / 'exp0_D1_1.json')



In [22]:
# CatBoostClassifier Tests

# split dataset 3 ways 80/20, 70/30, 60/40

TEST_SIZES = [0.2,0.3,0.4]

for size in TEST_SIZES:

    X, Y = df.drop('label', axis=1), df['label']
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=size, shuffle=True, random_state=RANDOM_SEED)

    train_pool = Pool(x_train, y_train, text_features=['tweet'])
    test_pool = Pool(x_test, y_test, text_features=['tweet'])

    m = CatBoostClassifier()
    m.fit(train_pool)
    preds = m.predict(test_pool)
    preds = [pred[0] for pred in preds]
    result = classification_report(y_test, preds, output_dict=True)
    result['test_split'] = size
    result['model_name'] = 'CatBoostClassifier'
    result['model'] = m.get_all_params()
    append_results_to_json(result, RESULT_PATH / 'exp0_D1_1.json')

Learning rate set to 0.094849
0:	learn: 1.5148195	total: 138ms	remaining: 2m 18s
1:	learn: 1.3497790	total: 323ms	remaining: 2m 41s
2:	learn: 1.2247174	total: 511ms	remaining: 2m 49s
3:	learn: 1.1308492	total: 687ms	remaining: 2m 50s
4:	learn: 1.0528921	total: 866ms	remaining: 2m 52s
5:	learn: 0.9951878	total: 1.04s	remaining: 2m 52s
6:	learn: 0.9399792	total: 1.23s	remaining: 2m 53s
7:	learn: 0.8945076	total: 1.41s	remaining: 2m 54s
8:	learn: 0.8570975	total: 1.59s	remaining: 2m 54s
9:	learn: 0.8229338	total: 1.77s	remaining: 2m 55s
10:	learn: 0.7906324	total: 1.95s	remaining: 2m 55s
11:	learn: 0.7627028	total: 2.14s	remaining: 2m 56s
12:	learn: 0.7369728	total: 2.32s	remaining: 2m 56s
13:	learn: 0.7145392	total: 2.5s	remaining: 2m 56s
14:	learn: 0.6942097	total: 2.71s	remaining: 2m 57s
15:	learn: 0.6761424	total: 2.89s	remaining: 2m 57s
16:	learn: 0.6593788	total: 3.08s	remaining: 2m 58s
17:	learn: 0.6457154	total: 3.27s	remaining: 2m 58s
18:	learn: 0.6327588	total: 3.45s	remaining: 

In [24]:
# get average accuracy for xgboost

import json

with open(RESULT_PATH / 'exp0_D1_1.json', 'r') as file:
    results = json.load(file)['results']

xgboost_results = [result['accuracy'] for result in results if result['model_name'] == 'XGBClassifier']

print(f'XGBClassifier Average Accuracy: {sum(xgboost_results) / len(xgboost_results)}')

XGBClassifier Average Accuracy: 0.8407607824714006


In [25]:
# get average accuracy for CatBoostClassifier
catboost_results = [result['accuracy'] for result in results if result['model_name'] == 'CatBoostClassifier']

print(f'CatBoostClassifier Average Accuracy: {sum(catboost_results) / len(catboost_results)}')


CatBoostClassifier Average Accuracy: 0.8415844008626236
