### Experiment 0
Test model on Dataset 1

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression

import sys
sys.path.append('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\src')
from utils.results import append_results_to_json

from pathlib import Path
import random

This experiment tests ability of models trained on Dataset 1 to perform on Dataset 1 (via cross-validation)

In [2]:
# CONSTANTS
RANDOM_SEED = 115
DATA_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\data\\en_only')
EXPERIMENTS_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments')
RESULT_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments\\results\\exp0')

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [25]:
df = pd.read_csv( DATA_PATH / '48000_cyberbullying_tweets_basic_clean.csv')

In [26]:
df.dropna(axis=0, inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

In [27]:
df['label'].value_counts()


label
religion     7942
age          7910
gender       7396
ethnicity    7319
other        6731
notcb        6377
Name: count, dtype: int64

In [28]:
df['label'] = df['label'].map({'age':0, 'gender':1, 'other':2, 'religion':3, 'ethnicity':4, 'notcb':5})
df.drop(df[df['label'] == 5].index, inplace=True)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df['label'].value_counts()


label
3    7942
0    7910
1    7396
4    7319
2    6731
Name: count, dtype: int64

In [29]:
x_train, x_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, shuffle=True, random_state=115)

vect = TfidfVectorizer(ngram_range=(1,2))
x_train = vect.fit_transform(x_train)
x_test = vect.transform(x_test)


lr = LogisticRegression(solver='saga', penalty='l2', random_state=394)
lr.fit(x_train, y_train)

In [30]:
preds = lr.predict(x_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1637
           1       0.92      0.87      0.89      1480
           2       0.84      0.82      0.83      1311
           3       0.94      0.98      0.96      1571
           4       0.96      0.98      0.97      1461

    accuracy                           0.93      7460
   macro avg       0.93      0.93      0.93      7460
weighted avg       0.93      0.93      0.93      7460



Test a model trained on Dataset 1 (with relabeling) with each vectorizing strategy



In [57]:
# XGBClassifer Tests

# split dataset 3 ways 80/20, 70/30, 60/40

TEST_SIZES = [0.2,0.3,0.4]

# with open(EXPERIMENTS_PATH / 'params.json', 'r') as file:
#     params_list = json.load(file)['wo_re']

for size in TEST_SIZES:

    x_train, x_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=size, shuffle=True, random_state=RANDOM_SEED)

    vect = TfidfVectorizer() # or CountVectorizer()

    x_train = vect.fit_transform(x_train)
    x_test = vect.transform(x_test)

    m = xgb.XGBClassifier()
    m.fit(x_train, y_train)
    y_pred = m.predict(x_test)
    result = classification_report(y_test, y_pred, output_dict=True)
    result['split'] = size
    result['model_name'] = 'XGBClassifier'
    result['model'] = m.get_params()
    result['vectorizer'] = vect.__class__.__name__
    result['vectorizer_params'] = {k: str(v) if isinstance(v, type) else v for k, v in vect.get_params().items()}
    append_results_to_json(result, RESULT_PATH / 'exp0_D1_1.json')



In [17]:
# CatBoostClassifier Tests

# split dataset 3 ways 80/20, 70/30, 60/40

TEST_SIZES = [0.2,0.3,0.4]
VECT = [CountVectorizer(), CountVectorizer(ngram_range=(1,2)), TfidfVectorizer(), TfidfVectorizer(ngram_range=(1,2))]

for size in TEST_SIZES:

    for vect in VECT:

        x_train, x_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=size, shuffle=True, random_state=RANDOM_SEED)

        x_train = vect.fit_transform(x_train)
        x_test = vect.transform(x_test)

        train_pool = Pool(x_train, y_train)
        test_pool = Pool(x_test, y_test)

        m = CatBoostClassifier()
        m.fit(train_pool)
        preds = m.predict(test_pool)
        result = classification_report(y_test, preds, output_dict=True)
        result['test_split'] = size
        result['model_name'] = 'CatBoostClassifier'
        result['model_params'] = {k: str(v) if isinstance(v, type) else v for k, v in m.get_all_params().items()}
        result['vectorizer'] = vect.__class__.__name__
        result['vectorizer_params'] = {k: str(v) if isinstance(v, type) else v for k, v in vect.get_params().items()}

        append_results_to_json(result, RESULT_PATH / 'exp0_D1_1.json')

Learning rate set to 0.094849
0:	learn: 1.5846266	total: 278ms	remaining: 4m 37s
1:	learn: 1.4567791	total: 433ms	remaining: 3m 36s
2:	learn: 1.3715213	total: 555ms	remaining: 3m 4s
3:	learn: 1.2960426	total: 672ms	remaining: 2m 47s
4:	learn: 1.2346554	total: 789ms	remaining: 2m 36s
5:	learn: 1.1808531	total: 907ms	remaining: 2m 30s
6:	learn: 1.1392268	total: 1.03s	remaining: 2m 26s
7:	learn: 1.1027438	total: 1.15s	remaining: 2m 22s
8:	learn: 1.0706063	total: 1.27s	remaining: 2m 20s
9:	learn: 1.0404226	total: 1.41s	remaining: 2m 19s
10:	learn: 1.0132835	total: 1.53s	remaining: 2m 17s
11:	learn: 0.9913116	total: 1.65s	remaining: 2m 16s
12:	learn: 0.9645376	total: 1.79s	remaining: 2m 15s
13:	learn: 0.9440576	total: 1.93s	remaining: 2m 15s
14:	learn: 0.9240716	total: 2.06s	remaining: 2m 15s
15:	learn: 0.9050951	total: 2.19s	remaining: 2m 14s
16:	learn: 0.8876225	total: 2.31s	remaining: 2m 13s
17:	learn: 0.8720253	total: 2.42s	remaining: 2m 12s
18:	learn: 0.8575821	total: 2.55s	remaining: 

In [24]:
# get average accuracy for xgboost

import json

with open(RESULT_PATH / 'exp0_D1_1.json', 'r') as file:
    results = json.load(file)['results']

xgboost_results = [result['accuracy'] for result in results if result['model_name'] == 'XGBClassifier']

print(f'XGBClassifier Average Accuracy: {sum(xgboost_results) / len(xgboost_results)}')

XGBClassifier Average Accuracy: 0.8407607824714006


In [18]:
# get average accuracy for CatBoostClassifier
catboost_results = [result['accuracy'] for result in results if result['model_name'] == 'CatBoostClassifier']

print(f'CatBoostClassifier Average Accuracy: {sum(catboost_results) / len(catboost_results)}')


CatBoostClassifier Average Accuracy: 0.8415844008626236


In [8]:
# Get average recall of class 0 for all models

import json

with open(RESULT_PATH / 'exp0_D1_1.json', 'r') as file:
    results = json.load(file)['results']

class_0_recalls = [result['0']['recall'] for result in results if result['model_name'] == 'XGBClassifier']
class_1_recalls = [result['1']['recall'] for result in results if result['model_name'] == 'XGBClassifier']
class_2_recalls = [result['2']['recall'] for result in results if result['model_name'] == 'XGBClassifier']
class_3_recalls = [result['3']['recall'] for result in results if result['model_name'] == 'XGBClassifier']
class_4_recalls = [result['4']['recall'] for result in results if result['model_name'] == 'XGBClassifier']
class_5_recalls = [result['5']['recall'] for result in results if result['model_name'] == 'XGBClassifier']

average_recall_class_0 = sum(class_0_recalls) / len(class_0_recalls)
average_recall_class_1 = sum(class_1_recalls) / len(class_1_recalls)
average_recall_class_2 = sum(class_2_recalls) / len(class_2_recalls)
average_recall_class_3 = sum(class_3_recalls) / len(class_3_recalls)
average_recall_class_4 = sum(class_4_recalls) / len(class_4_recalls)
average_recall_class_5 = sum(class_5_recalls) / len(class_5_recalls)

overall_average_recall = average_recall_class_0\
    + average_recall_class_1\
    + average_recall_class_2\
    + average_recall_class_3\
    + average_recall_class_4\
    + average_recall_class_5

print(f'Overall Average Recall: {overall_average_recall / 6:.4f}')

Overall Average Recall: 0.8276


In [9]:
with open(RESULT_PATH / 'exp0_D1_1.json', 'r') as file:
    results = json.load(file)['results']

class_0_precisions = [result['0']['precision'] for result in results if result['model_name'] == 'XGBClassifier']
class_1_precisions = [result['1']['precision'] for result in results if result['model_name'] == 'XGBClassifier']
class_2_precisions = [result['2']['precision'] for result in results if result['model_name'] == 'XGBClassifier']
class_3_precisions = [result['3']['precision'] for result in results if result['model_name'] == 'XGBClassifier']
class_4_precisions = [result['4']['precision'] for result in results if result['model_name'] == 'XGBClassifier']
class_5_precisions = [result['5']['precision'] for result in results if result['model_name'] == 'XGBClassifier']

average_precision_class_0 = sum(class_0_precisions) / len(class_0_precisions)
average_precision_class_1 = sum(class_1_precisions) / len(class_1_precisions)
average_precision_class_2 = sum(class_2_precisions) / len(class_2_precisions)
average_precision_class_3 = sum(class_3_precisions) / len(class_3_precisions)
average_precision_class_4 = sum(class_4_precisions) / len(class_4_precisions)
average_precision_class_5 = sum(class_5_precisions) / len(class_5_precisions)

overall_average_precision = average_precision_class_0 \
    + average_precision_class_1 \
    + average_precision_class_2 \
    + average_precision_class_3 \
    + average_precision_class_4 \
    + average_precision_class_5

print(f'Overall Average Precision: {overall_average_precision / 6:.4f}')

Overall Average Precision: 0.8364


In [19]:
with open(RESULT_PATH / 'exp0_D1_1.json', 'r') as file:
    results = json.load(file)['results']

class_0_recalls = [result['0']['recall'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_1_recalls = [result['1']['recall'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_2_recalls = [result['2']['recall'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_3_recalls = [result['3']['recall'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_4_recalls = [result['4']['recall'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_5_recalls = [result['5']['recall'] for result in results if result['model_name'] == 'CatBoostClassifier']

average_recall_class_0 = sum(class_0_recalls) / len(class_0_recalls)
average_recall_class_1 = sum(class_1_recalls) / len(class_1_recalls)
average_recall_class_2 = sum(class_2_recalls) / len(class_2_recalls)
average_recall_class_3 = sum(class_3_recalls) / len(class_3_recalls)
average_recall_class_4 = sum(class_4_recalls) / len(class_4_recalls)
average_recall_class_5 = sum(class_5_recalls) / len(class_5_recalls)

overall_average_recall = average_recall_class_0\
    + average_recall_class_1\
    + average_recall_class_2\
    + average_recall_class_3\
    + average_recall_class_4\
    + average_recall_class_5

print(f'CatBoost: Overall Average Recall: {overall_average_recall / 6:.4f}')

CatBoost: Overall Average Recall: 0.8255


In [20]:
with open(RESULT_PATH / 'exp0_D1_1.json', 'r') as file:
    results = json.load(file)['results']

class_0_precisions = [result['0']['precision'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_1_precisions = [result['1']['precision'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_2_precisions = [result['2']['precision'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_3_precisions = [result['3']['precision'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_4_precisions = [result['4']['precision'] for result in results if result['model_name'] == 'CatBoostClassifier']
class_5_precisions = [result['5']['precision'] for result in results if result['model_name'] == 'CatBoostClassifier']

average_precision_class_0 = sum(class_0_precisions) / len(class_0_precisions)
average_precision_class_1 = sum(class_1_precisions) / len(class_1_precisions)
average_precision_class_2 = sum(class_2_precisions) / len(class_2_precisions)
average_precision_class_3 = sum(class_3_precisions) / len(class_3_precisions)
average_precision_class_4 = sum(class_4_precisions) / len(class_4_precisions)
average_precision_class_5 = sum(class_5_precisions) / len(class_5_precisions)

overall_average_precision = average_precision_class_0 \
    + average_precision_class_1 \
    + average_precision_class_2 \
    + average_precision_class_3 \
    + average_precision_class_4 \
    + average_precision_class_5

print(f'CatBoost: Overall Average Precision: {overall_average_precision / 6:.4f}')


CatBoost: Overall Average Precision: 0.8466
