In [1]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk.stem.snowball import SnowballStemmer

import pandas as pd
import numpy as np

import random
import sys
from pathlib import Path

sys.path.append('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\cyberbullyingml\\cyberbullying-ml\\official\\src\\utils\\')

from params import get_topn_param_sets
from results import append_results, create_results_file



In [2]:
DATA_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\cyberbullyingml\\cyberbullying-ml\\data\\en_only')
RESULT_PATH = Path.cwd()

TRAIN_DATA_NAME = '48000_cyberbullying_tweets_basic_clean.csv'
TEST_DATA_NAME = 'hatespeech_tweets_basic_clean.csv'

RANDOM_SEED = 115
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
train_df = pd.read_csv(DATA_PATH / TRAIN_DATA_NAME)
test_df = pd.read_csv(DATA_PATH / TEST_DATA_NAME)

train_df.dropna(axis=0, inplace=True)
train_df.drop_duplicates(inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df.dropna(axis=0, inplace=True)
test_df.drop_duplicates(inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [4]:
train_df['label'] = train_df['label'].apply(lambda label: 0 if label =='notcb' else 1)
test_df['class'] = test_df['class'].apply(lambda label: 0 if label == 0 else 1)

In [5]:
x_train, x_val, y_train, y_val = train_test_split(
    train_df['tweet'], 
    train_df['label'], 
    test_size=0.1, 
    random_state=RANDOM_SEED
) # for early stopping

x_test = test_df['tweet']
y_test = test_df['class']

In [6]:
catb_param_results = get_topn_param_sets(algo='catb', dataset='d1', n=10, sort_condition='f1_macro_mean')
xgb_param_results = get_topn_param_sets(algo='xgb', dataset='d1', n=10, sort_condition='f1_macro_mean')

catb_param_sets = catb_param_results['params']
xgb_param_sets = xgb_param_results['params']

neg_to_pos_ratio = (train_df['label'] == 0).sum() / (train_df['label'] == 1).sum() 

In [7]:
snowballer = SnowballStemmer('english')

def pp_SnowballStemmer(text):
    return ' '.join([snowballer.stem(word) for word in text.split()])

In [8]:
results_file = RESULT_PATH/'exp1_catb_results.json'
create_results_file(results_file)

for i, params in enumerate(catb_param_sets):
    print(f'Starting run {i+1}...')

    vectorizer_cls = CountVectorizer if params['vectorizer'] ==  "<class 'sklearn.feature_extraction.text.CountVectorizer'>" else TfidfVectorizer
    classifier_cls = CatBoostClassifier

    vect_params = {}
    classifier_params = {}

    for k in params.keys():
        if 'vectorizer__' in k:
            p_name = k.split('__')[1]            
            if p_name == 'ngram_range':
                vect_params[p_name] = tuple(params[k])  
            elif p_name == 'preprocessor':
                vect_params[p_name] = pp_SnowballStemmer if isinstance(params[k], str) else None
            else:
                vect_params[p_name] = params[k]
            
        if 'classifier__' in k:
            p_name = k.split('__')[1]
            classifier_params[p_name] = params[k]

    vectorizer = vectorizer_cls(**vect_params)
    classifier = classifier_cls(**classifier_params, scale_pos_weight=neg_to_pos_ratio, early_stopping_rounds=10, eval_metric='Logloss')

    x_train_transformed = vectorizer.fit_transform(x_train)
    x_val_transformed = vectorizer.transform(x_val)
    x_test_transformed = vectorizer.transform(x_test)

    classifier.fit(x_train_transformed, y_train, eval_set=[(x_val_transformed, y_val)])
    preds = classifier.predict(x_test_transformed)

    val_scores = catb_param_results.iloc[i]
    
    result = {}
    result['report'] = (report:=classification_report(y_test, preds, output_dict=True))
    classifier_params['n_estimators'] = classifier.get_param('n_estimators')
    result['classifier_params'] = classifier_params
    result['vectorizer_params'] = vect_params
    result['val_f1_macro_mean'] = val_f1_macro_mean = val_scores['f1_macro_mean']
    result['val_f1_weighted_mean'] = val_f1_weighted_mean = val_scores['f1_weighted_mean']
    result['drop_in_f1_macro_mean'] = val_f1_macro_mean - report['macro avg']['f1-score']
    result['drop_in_f1_weighed_mean'] = val_f1_weighted_mean - report['weighted avg']['f1-score']
    
    append_results(result, results_file)

Results file already exists. Not creating new one
Starting run 1...
0:	learn: 0.6393071	test: 0.6376837	best: 0.6376837 (0)	total: 406ms	remaining: 6m 45s
1:	learn: 0.5973573	test: 0.5940863	best: 0.5940863 (1)	total: 671ms	remaining: 5m 34s
2:	learn: 0.5655698	test: 0.5620637	best: 0.5620637 (2)	total: 936ms	remaining: 5m 11s
3:	learn: 0.5409992	test: 0.5381645	best: 0.5381645 (3)	total: 1.2s	remaining: 4m 58s
4:	learn: 0.5207994	test: 0.5176550	best: 0.5176550 (4)	total: 1.47s	remaining: 4m 51s
5:	learn: 0.5018512	test: 0.4979294	best: 0.4979294 (5)	total: 1.72s	remaining: 4m 45s
6:	learn: 0.4864174	test: 0.4825861	best: 0.4825861 (6)	total: 1.98s	remaining: 4m 41s
7:	learn: 0.4726589	test: 0.4679495	best: 0.4679495 (7)	total: 2.24s	remaining: 4m 38s
8:	learn: 0.4638906	test: 0.4592282	best: 0.4592282 (8)	total: 2.5s	remaining: 4m 35s
9:	learn: 0.4535680	test: 0.4491060	best: 0.4491060 (9)	total: 2.77s	remaining: 4m 33s
10:	learn: 0.4458145	test: 0.4417897	best: 0.4417897 (10)	total:

KeyboardInterrupt: 

In [8]:
results_file = RESULT_PATH/'exp1_xgb_results.json'
create_results_file(results_file)

for i, params in enumerate(xgb_param_sets):
    print(f'Starting run {i+1}...')

    vectorizer_cls = CountVectorizer if params['vectorizer'] ==  "<class 'sklearn.feature_extraction.text.CountVectorizer'>" else TfidfVectorizer
    classifier_cls = XGBClassifier

    vect_params = {}
    classifier_params = {}

    for k in params.keys():
        if 'vectorizer__' in k:
            p_name = k.split('__')[1]            
            if p_name == 'ngram_range':
                vect_params[p_name] = tuple(params[k])  
            elif p_name == 'preprocessor':
                vect_params[p_name] = pp_SnowballStemmer if isinstance(params[k], str) else None
            else:
                vect_params[p_name] = params[k]
            
        if 'classifier__' in k:
            p_name = k.split('__')[1]
            classifier_params[p_name] = params[k]

    vectorizer = vectorizer_cls(**vect_params)
    classifier = classifier_cls(**classifier_params, scale_pos_weight=neg_to_pos_ratio, early_stopping_rounds=10, eval_metric='logloss')

    x_train_transformed = vectorizer.fit_transform(x_train)
    x_val_transformed = vectorizer.transform(x_val)
    x_test_transformed = vectorizer.transform(x_test)

    classifier.fit(x_train_transformed, y_train, eval_set=[(x_val_transformed, y_val)])
    preds = classifier.predict(x_test_transformed)

    val_scores = xgb_param_results.iloc[i]
    
    result = {}
    result['report'] = (report:=classification_report(y_test, preds, output_dict=True))
    classifier_params['n_estimators'] = classifier.get_params()['n_estimators']
    result['classifier_params'] = classifier_params
    result['vectorizer_params'] = vect_params
    result['val_f1_macro_mean'] = val_f1_macro_mean = val_scores['f1_macro_mean']
    result['val_f1_weighted_mean'] = val_f1_weighted_mean = val_scores['f1_weighted_mean']
    result['drop_in_f1_macro_mean'] = val_f1_macro_mean - report['macro avg']['f1-score']
    result['drop_in_f1_weighed_mean'] = val_f1_weighted_mean - report['weighted avg']['f1-score']
    
    append_results(result, results_file)

Results file already exists. Not creating new one
Starting run 1...
[0]	validation_0-logloss:0.65464
[1]	validation_0-logloss:0.62128
[2]	validation_0-logloss:0.59697
[3]	validation_0-logloss:0.57389
[4]	validation_0-logloss:0.55675
[5]	validation_0-logloss:0.54214
[6]	validation_0-logloss:0.52811
[7]	validation_0-logloss:0.51618
[8]	validation_0-logloss:0.50365
[9]	validation_0-logloss:0.49398
[10]	validation_0-logloss:0.48564
[11]	validation_0-logloss:0.47889
[12]	validation_0-logloss:0.47100
[13]	validation_0-logloss:0.46448
[14]	validation_0-logloss:0.45770
[15]	validation_0-logloss:0.45187
[16]	validation_0-logloss:0.44702
[17]	validation_0-logloss:0.44297
[18]	validation_0-logloss:0.43833
[19]	validation_0-logloss:0.43384
[20]	validation_0-logloss:0.43068
[21]	validation_0-logloss:0.42710
[22]	validation_0-logloss:0.42385
[23]	validation_0-logloss:0.42055
[24]	validation_0-logloss:0.41762
[25]	validation_0-logloss:0.41414
[26]	validation_0-logloss:0.41192
[27]	validation_0-loglos

TypeError: Object of type function is not JSON serializable