In [84]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, ShuffleSplit, cross_validate
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold


from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer


import numpy as np
import pandas as pd

from pathlib import Path
import random
import itertools

import sys
sys.path.append('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\')
from src.utils.results import create_results_file, append_results_to_json
from src.utils.data import balance_train_and_test, get_OOV_feats

In [2]:
# One seed shared by every random number generator this notebook touches,
# so that shuffles and splits are repeatable from a fresh kernel.
RANDOM_SEED = 115
for seed_rng in (random.seed, np.random.seed):
    seed_rng(RANDOM_SEED)

In [3]:
# Cleaned datasets, located relative to the notebook's working directory.
CLEAN_DATA_PATH = Path("../data/en_only")

# Machine-specific checkout location, spelled out exactly once so that moving
# the project only requires editing this line.
# TODO: derive this from an environment variable or the notebook location
# instead of hardcoding a user-specific absolute path.
PROJECT_ROOT = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml')
EXPERIMENTS_PATH = PROJECT_ROOT / 'experiments'
RESULT_PATH = EXPERIMENTS_PATH / 'results' / 'exp3'

# File names of the training and test CSVs inside CLEAN_DATA_PATH.
TRAIN_DATA_NAME = '48000_cyberbullying_tweets_basic_clean.csv'
TEST_DATA_NAME = 'hatespeech_tweets_basic_clean.csv'

In [4]:
# Resolve each CSV file name against the cleaned-data directory.
train_data_path = CLEAN_DATA_PATH.joinpath(TRAIN_DATA_NAME)
test_data_path = CLEAN_DATA_PATH.joinpath(TEST_DATA_NAME)

### Find Semi-Optimal Hyperparameters 

Rule of thumb for interpreting the ROC AUC scores below: 0.5-0.7 (BAD), 0.7-0.8 (POOR), 0.8-0.9 (GOOD), 0.9-1.0 (EXCELLENT).

In [5]:
# Training tweets: read the CSV, drop duplicate rows, drop rows that contain
# any missing value, then renumber the index from 0 so it is contiguous again.
train_df = (
    pd.read_csv(train_data_path)
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

# The test tweets are only read here; no cleaning is applied in this cell.
test_df = pd.read_csv(test_data_path)

In [6]:
# Lookup tables between the binary class names and their integer ids.
label2id = {'notcb': 0, 'cb': 1}
id2label = {0: 'notcb', 1: 'cb'}

# Binarise the target: 'notcb' becomes 0 and every other label becomes 1.
# The dtype guard makes this cell safe to re-run: once the column holds
# integers, no value equals 'notcb' any more, so applying the mapping a second
# time would silently turn every label into 1.
if not pd.api.types.is_integer_dtype(train_df['label']):
    train_df['label'] = train_df['label'].apply(lambda label: 0 if label == 'notcb' else 1)

In [79]:
# Hold out 5% of the (shuffled) training frame as a validation set; the seed
# keeps the partition identical across runs.
x_train, x_val, y_train, y_val = train_test_split(
    train_df['tweet'],
    train_df['label'],
    test_size=0.05,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [13]:
# Module-level stemmer instances, created once and reused by the
# preprocessor functions below.
snowballer = SnowballStemmer('english')
porter = PorterStemmer()


def pp_SnowballStemmer(text):
    """Return ``text`` with every whitespace-separated token Snowball-stemmed.

    The stemmed tokens are re-joined with single spaces, so the original
    spacing between tokens is not preserved.
    """
    return ' '.join(map(snowballer.stem, text.split()))


def pp_PorterStemmer(text):
    """Return ``text`` with every whitespace-separated token Porter-stemmed.

    The stemmed tokens are re-joined with single spaces, so the original
    spacing between tokens is not preserved.
    """
    return ' '.join(map(porter.stem, text.split()))

In [14]:
# Two-step pipeline: the 'vectorizer' step starts as a 'passthrough'
# placeholder, followed by an XGBoost classifier with default settings.
xgb_pipeline = Pipeline(steps=[
    ('vectorizer', 'passthrough'),
    ('classifier', XGBClassifier()),
])

# Hyperparameter search space. Keys follow the '<step>__<param>' convention;
# the bare 'vectorizer' entry lists vectorizer *classes* (not instances).
xgb_param_grid = dict(
    vectorizer=[CountVectorizer, TfidfVectorizer],
    vectorizer__ngram_range=[(1, 1), (1, 3)],
    vectorizer__preprocessor=[pp_SnowballStemmer],
    vectorizer__max_df=[0.5, 0.75],
    classifier__learning_rate=[0.01, 0.1],
    classifier__n_estimators=[500, 1000],
    classifier__colsample_bytree=[0.5, 0.75],
    classifier__max_depth=[6, 12],
)

In [76]:
# Expand the grid into every combination of values (Cartesian product).
# Each tuple in `param_sets` is ordered like `keys`, so the two can be zipped
# back together into a {name: value} mapping.
keys = xgb_param_grid.keys()
iters = iter(xgb_param_grid.values())
param_sets = list(itertools.product(*iters))

In [83]:
# scale_pos_weight for XGBoost: number of negative (label 0) training samples
# divided by the number of positive (label 1) training samples.
pos_weight = y_train.eq(0).sum() / y_train.eq(1).sum()
print(f'The positive weight is {pos_weight:.4f}')

The positive weight is 0.1717


In [75]:
# Cross-validate every hyperparameter combination in `param_sets`.
#
# Fixes relative to the previous version of this cell:
#   * the classifier is built from XGBClassifier directly (the grid has no
#     bare 'classifier' key, so t['classifier'] raised KeyError);
#   * the scorer name is 'f1_macro' ('f1-macro' is not a valid scorer);
#   * cross_validate() has no `refit` argument, so it is no longer passed;
#   * the validation split is handed to XGBoost as fit-time `eval_set`, which
#     is what early stopping monitors, via a dict of fit parameters.

# The splitter does not depend on the hyperparameters, so build it once.
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=RANDOM_SEED)

for params in param_sets:
    # Re-attach the grid keys to this combination's values.
    t = dict(zip(keys, params))

    # Route each '<step>__<param>' entry to the step it configures. The bare
    # 'vectorizer' entry (the vectorizer class itself) has no '__' part and is
    # used separately below.
    vect_params = {}
    classifier_params = {}
    for k, v in t.items():
        step, sep, p_name = k.partition('__')
        if not sep:
            continue
        if step == 'vectorizer':
            vect_params[p_name] = v
        elif step == 'classifier':
            classifier_params[p_name] = v

    vect = t['vectorizer'](**vect_params)
    classifier = XGBClassifier(
        early_stopping_rounds=10,
        eval_metric='logloss',
        scale_pos_weight=pos_weight,
        **classifier_params,
    )

    # NOTE(review): the vectorizer is fitted on all of x_train before the CV
    # split, so each fold's held-out rows still contribute to the vocabulary.
    x_train_copy = vect.fit_transform(x_train)
    x_val_copy = vect.transform(x_val)

    # `params` forwards keyword arguments to the estimator's fit(); this
    # keyword needs scikit-learn >= 1.4 (older releases call it `fit_params`).
    cv_results = cross_validate(
        classifier,
        x_train_copy,
        y_train,
        cv=ss,
        scoring=['roc_auc', 'f1_macro'],
        params={'eval_set': [(x_val_copy, y_val)]},
        n_jobs=-1,
    )
    print(f'The cv results are {cv_results}')


[(CountVectorizer(),
  (1, 1),
  <function __main__.pp_PorterStemmer(text)>,
  0.5,
  0.01,
  500,
  0.5,
  6),
 (CountVectorizer(),
  (1, 1),
  <function __main__.pp_PorterStemmer(text)>,
  0.5,
  0.01,
  500,
  0.5,
  12),
 (CountVectorizer(),
  (1, 1),
  <function __main__.pp_PorterStemmer(text)>,
  0.5,
  0.01,
  500,
  0.75,
  6),
 (CountVectorizer(),
  (1, 1),
  <function __main__.pp_PorterStemmer(text)>,
  0.5,
  0.01,
  500,
  0.75,
  12),
 (CountVectorizer(),
  (1, 1),
  <function __main__.pp_PorterStemmer(text)>,
  0.5,
  0.01,
  1000,
  0.5,
  6),
 (CountVectorizer(),
  (1, 1),
  <function __main__.pp_PorterStemmer(text)>,
  0.5,
  0.01,
  1000,
  0.5,
  12),
 (CountVectorizer(),
  (1, 1),
  <function __main__.pp_PorterStemmer(text)>,
  0.5,
  0.01,
  1000,
  0.75,
  6),
 (CountVectorizer(),
  (1, 1),
  <function __main__.pp_PorterStemmer(text)>,
  0.5,
  0.01,
  1000,
  0.75,
  12),
 (CountVectorizer(),
  (1, 1),
  <function __main__.pp_PorterStemmer(text)>,
  0.5,
  0.1,
 

In [27]:
# Fit the search object on the training split and keep the fitted result.
# NOTE(review): `xgb_gs` is not defined in any cell visible in this notebook
# (presumably a GridSearchCV built in a since-deleted cell) -- this line fails
# with NameError on a fresh kernel; confirm and restore its definition.
est = xgb_gs.fit(x_train, y_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
[CV 1/5] END classifier__colsample_bytree=0.5, classifier__gamma=0, classifier__learning_rate=0.01, classifier__n_estimators=100, vectorizer=CountVectorizer(), vectorizer__max_df=0.5, vectorizer__ngram_range=(1, 1), vectorizer__preprocessor=<function pp_PorterStemmer at 0x000002DA16C64550>;, score=0.881 total time=   8.2s


KeyboardInterrupt: 

In [28]:
# FIXME: incomplete assignment -- there is no right-hand side, so this cell
# raises SyntaxError. Complete the expression or delete the cell.
top_25 = 

SyntaxError: invalid syntax (1305662073.py, line 1)