In [7]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, ShuffleSplit, cross_validate
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold


from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

import numpy as np
import pandas as pd

from pathlib import Path
import random
import itertools
import json

import sys
sys.path.append('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\')
from src.utils.results import create_results_file, append_results_to_json
from src.utils.data import balance_train_and_test, get_OOV_feats

In [8]:
RANDOM_SEED = 115
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [9]:
CLEAN_DATA_PATH = Path("../data/en_only")
EXPERIMENTS_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments')
RESULT_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments\\results\\exp3')

TRAIN_DATA_NAME = '48000_cyberbullying_tweets_basic_clean.csv' 
TEST_DATA_NAME = 'hatespeech_tweets_basic_clean.csv'

In [10]:
train_data_path = CLEAN_DATA_PATH / TRAIN_DATA_NAME
test_data_path = CLEAN_DATA_PATH / TEST_DATA_NAME

### Find Semi-Optimal Hyperparameters 

AUC of 0.5-0.7 (BAD), 0.7-0.8 (POOR), 0.8-0.9 (GOOD), 0.9-1.0 (EXCELLENT) 

In [11]:
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

# remove duplicates
train_df = train_df.drop_duplicates()
# remove rows with missing values
train_df = train_df.dropna()

train_df.reset_index(drop=True, inplace=True)

In [12]:
train_df['label'] = train_df['label'].apply(lambda label : 0 if label == 'notcb' else 1)

label2id = {'notcb': 0, 'cb': 1}
id2label = {0: 'notcb', 1: 'cb'}

In [13]:
# split data into train and validation sets
x_train, x_val, y_train, y_val = train_test_split(train_df['tweet'], train_df['label'], test_size=0.05, shuffle=True, random_state=RANDOM_SEED)

In [14]:
snowballer = SnowballStemmer('english')
porter = PorterStemmer()

def pp_SnowballStemmer(text):
    return ' '.join([snowballer.stem(word) for word in text.split()])

def pp_PorterStemmer(text):
    return ' '.join([porter.stem(word) for word in text.split()])

In [15]:
xgb_param_grid = {
    'vectorizer': [CountVectorizer, TfidfVectorizer],
    'vectorizer__ngram_range': [(1,1), (1,3)],
    'vectorizer__preprocessor': [pp_SnowballStemmer],
    'vectorizer__max_df': [0.5, 0.75],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__n_estimators': [100, 500],
    'classifier__colsample_bytree': [0.5, 0.75],
    'classifier__max_depth': [6, 12],
    'classifier': [XGBClassifier]
}

In [16]:
iters = (v for _, v in xgb_param_grid.items())
keys = xgb_param_grid.keys()
param_sets = [params for params in itertools.product(*iters)]

In [17]:
# calculate the scale_pos_weight
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f'The positive weight is {pos_weight:.4f}')

The positive weight is 0.1715


In [22]:
all_results = {'fit_time': [], 'score_time': [], 'test_f1_macro': [], 'test_f1_weighted': [], 'params': []}

N_SPLITS = 5

for idx, params in enumerate(param_sets):
    print(f'Starting run {idx}...\n')

    t = {k: v for k, v in zip(keys, params)}

    vect_params = {}
    classifier_params = {}

    for k in keys:
        if 'vectorizer__' in k:
            p_name = k.split('__')[1]
            vect_params[p_name] = t[k]
        if 'classifier__' in k:
            p_name = k.split('__')[1]
            classifier_params[p_name] = t[k]

    vect = t['vectorizer'](**vect_params)
    classifier = t['classifier'](early_stopping_rounds=7, eval_metric='logloss', scale_pos_weight=pos_weight, **classifier_params)

    x_train_copy = vect.fit_transform(x_train)
    x_val_copy = vect.transform(x_val)
    y_train_copy = y_train
    y_val_copy = y_val

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

    cv_results: dict = cross_validate(
        classifier,
        x_train_copy,
        y_train_copy,
        cv=skf,
        scoring=['f1_macro', 'f1_weighted'],
        verbose=10,
        return_estimator=False,
        params={'eval_set':[(x_val_copy, y_val_copy)]},
        n_jobs=-1
    )

    print(f'The cv results for run {idx} are:\n{pd.DataFrame(cv_results)}\n')

    cv_results['params'] = [t for _ in range(N_SPLITS)]
    for key, values in cv_results.items():
        all_results[key].extend(values)




Starting run 0...



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   38.4s remaining:   57.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   38.4s remaining:   25.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   38.6s finished


The cv results for run 0 are:
    fit_time  score_time  test_f1_macro  test_f1_weighted
0  32.812652    0.042157       0.600689          0.707024
1  32.807300    0.043509       0.597585          0.703347
2  32.677444    0.046913       0.589113          0.694266
3  32.844946    0.042316       0.597445          0.703624
4  33.048005    0.030066       0.607920          0.716048

Starting run 1...



KeyboardInterrupt: 

In [27]:
with open('cv_results_xgb_d1.json', 'w+') as f:
    json.dump(all_results, f, default=str)