In [2]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk.stem.snowball import SnowballStemmer

import pandas as pd
import numpy as np

import random
import sys
from pathlib import Path

sys.path.append('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\cyberbullyingml\\cyberbullying-ml\\official\\src\\utils\\')

from params import get_topn_param_sets
from results import append_results, create_results_file

In [3]:
DATA_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\cyberbullyingml\\cyberbullying-ml\\data\\en_only')
RESULT_PATH = Path.cwd()
PATH_TO_PARAMS =  Path(r'C:\Users\rooty\UWEC\Research\CyberBullyingML\cyberbullyingml\cyberbullying-ml\official\hyp_search\results')

TRAIN_DATA_NAME = 'onlineHarassmentDataset_basic_clean.csv'
TEST_DATA_NAME = '48000_cyberbullying_tweets_basic_clean.csv'

RANDOM_SEED = 115
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [4]:
train_df = pd.read_csv(DATA_PATH / TRAIN_DATA_NAME)
test_df = pd.read_csv(DATA_PATH / TEST_DATA_NAME)

train_df.dropna(axis=0, inplace=True)
train_df.drop_duplicates(inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df.dropna(axis=0, inplace=True)
test_df.drop_duplicates(inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [5]:
train_df['Code'] = train_df['Code'].apply(lambda label: 0 if label == 'N' else 1)
test_df['label'] =  test_df['label'].apply(lambda label: 0 if label == 'notcb' else 1)

In [6]:
catb_param_results:pd.DataFrame = get_topn_param_sets(PATH_TO_PARAMS, algo='catb', dataset='d3', n=10, sort_condition='f1_macro_mean')

catb_param_sets = catb_param_results['params']

neg_to_pos_ratio = (train_df['Code'] == 0).sum() / (train_df['Code'] == 1).sum() 

print(f'Negative to positive ratio: {neg_to_pos_ratio}')

Negative to positive ratio: 2.905630134241635


In [7]:
catb_param_results.head()

Unnamed: 0,fit_time_mean,fit_time_std_dev,f1_macro_mean,f1_macro_std_dev,f1_weighted_mean,f1_weighted_std_dev,params
52,52.904682,2.745694,0.615708,0.013114,0.708859,0.008832,{'vectorizer': '<class 'sklearn.feature_extrac...
36,35.698422,3.244649,0.613851,0.008136,0.705908,0.005977,{'vectorizer': '<class 'sklearn.feature_extrac...
12,4.763915,1.508792,0.613478,0.007952,0.704033,0.0061,{'vectorizer': '<class 'sklearn.feature_extrac...
30,4.320346,0.805462,0.613366,0.006159,0.704986,0.006288,{'vectorizer': '<class 'sklearn.feature_extrac...
44,32.661887,6.609207,0.613299,0.005301,0.705349,0.005677,{'vectorizer': '<class 'sklearn.feature_extrac...


In [9]:
x_train, x_val, y_train, y_val = train_test_split(
    train_df['Tweet'], 
    train_df['Code'], 
    test_size=0.1, 
    random_state=RANDOM_SEED
) # for early stopping

x_test = test_df['tweet']
y_test = test_df['label']

In [10]:
snowballer = SnowballStemmer('english')

def pp_SnowballStemmer(text):
    return ' '.join([snowballer.stem(word) for word in text.split()])

In [11]:
results_file = RESULT_PATH/'exp1_catb_results.json'
create_results_file(results_file)

for i, params in enumerate(catb_param_sets):
    print(f'Starting run {i+1}...')

    vectorizer_cls = CountVectorizer if params['vectorizer'] ==  "<class 'sklearn.feature_extraction.text.CountVectorizer'>" else TfidfVectorizer
    classifier_cls = CatBoostClassifier

    vect_params = {}
    classifier_params = {}

    for k in params.keys():
        if 'vectorizer__' in k:
            p_name = k.split('__')[1]            
            if p_name == 'ngram_range':
                vect_params[p_name] = tuple(params[k])  
            elif p_name == 'preprocessor':
                vect_params[p_name] = pp_SnowballStemmer if isinstance(params[k], str) else None
            else:
                vect_params[p_name] = params[k]
            
        if 'classifier__' in k:
            p_name = k.split('__')[1]
            classifier_params[p_name] = params[k]

    vectorizer = vectorizer_cls(**vect_params)
    classifier = classifier_cls(**classifier_params, scale_pos_weight=neg_to_pos_ratio, early_stopping_rounds=10, eval_metric='Logloss')

    x_train_transformed = vectorizer.fit_transform(x_train)
    x_val_transformed = vectorizer.transform(x_val)
    x_test_transformed = vectorizer.transform(x_test)

    classifier.fit(x_train_transformed, y_train, eval_set=[(x_val_transformed, y_val)])
    preds = classifier.predict(x_test_transformed)

    val_scores = catb_param_results.iloc[i]
    
    result = {}
    result['report'] = (report:=classification_report(y_test, preds, output_dict=True))
    classifier_params['n_estimators'] = classifier.get_param('n_estimators')
    result['classifier_params'] = classifier_params
    result['vectorizer_params'] = vect_params
    result['val_f1_macro_mean'] = val_f1_macro_mean = val_scores['f1_macro_mean']
    result['val_f1_weighted_mean'] = val_f1_weighted_mean = val_scores['f1_weighted_mean']
    result['drop_in_f1_macro_mean'] = val_f1_macro_mean - report['macro avg']['f1-score']
    result['drop_in_f1_weighed_mean'] = val_f1_weighted_mean - report['weighted avg']['f1-score']
    
    append_results(result, results_file)

Starting run 1...
0:	learn: 0.6882005	test: 0.6881863	best: 0.6881863 (0)	total: 643ms	remaining: 10m 42s
1:	learn: 0.6848876	test: 0.6845728	best: 0.6845728 (1)	total: 1.17s	remaining: 9m 41s
2:	learn: 0.6816043	test: 0.6811764	best: 0.6811764 (2)	total: 1.66s	remaining: 9m 10s
3:	learn: 0.6793686	test: 0.6786413	best: 0.6786413 (3)	total: 2.17s	remaining: 8m 59s
4:	learn: 0.6777844	test: 0.6773273	best: 0.6773273 (4)	total: 2.66s	remaining: 8m 48s
5:	learn: 0.6763627	test: 0.6758237	best: 0.6758237 (5)	total: 3.15s	remaining: 8m 41s
6:	learn: 0.6756839	test: 0.6753501	best: 0.6753501 (6)	total: 3.66s	remaining: 8m 39s
7:	learn: 0.6751845	test: 0.6751363	best: 0.6751363 (7)	total: 4.15s	remaining: 8m 34s
8:	learn: 0.6747488	test: 0.6747810	best: 0.6747810 (8)	total: 4.64s	remaining: 8m 30s
9:	learn: 0.6734810	test: 0.6733525	best: 0.6733525 (9)	total: 5.16s	remaining: 8m 30s
10:	learn: 0.6730300	test: 0.6730713	best: 0.6730713 (10)	total: 5.68s	remaining: 8m 30s
11:	learn: 0.6721282	t

KeyboardInterrupt: 

In [14]:
x_train, x_test, y_train, y_test = train_test_split(
    train_df['Tweet'], 
    train_df['Code'], 
    test_size=0.3,
    stratify=train_df['Code'] ,
    random_state=RANDOM_SEED
) # for early stopping

x_test, x_val, y_test, y_val = train_test_split(
    x_test,
    y_test,
    test_size=1/3,
    stratify=y_test,
    random_state=RANDOM_SEED
)

In [18]:
neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum() 

print(f'Negative to positive ratio: {neg_to_pos_ratio}')

Negative to positive ratio: 2.905266170578134


In [19]:
snowballer = SnowballStemmer('english')

def pp_SnowballStemmer(text):
    return ' '.join([snowballer.stem(word) for word in text.split()])

In [20]:
results_file = RESULT_PATH/'catb_intra_dataset_results.json'
create_results_file(results_file)

for i, params in enumerate(catb_param_sets):
    print(f'Starting run {i+1}...')

    vectorizer_cls = CountVectorizer if params['vectorizer'] ==  "<class 'sklearn.feature_extraction.text.CountVectorizer'>" else TfidfVectorizer
    classifier_cls = CatBoostClassifier

    vect_params = {}
    classifier_params = {}

    for k in params.keys():
        if 'vectorizer__' in k:
            p_name = k.split('__')[1]            
            if p_name == 'ngram_range':
                vect_params[p_name] = tuple(params[k])  
            elif p_name == 'preprocessor':
                vect_params[p_name] = pp_SnowballStemmer if isinstance(params[k], str) else None
            else:
                vect_params[p_name] = params[k]
            
        if 'classifier__' in k:
            p_name = k.split('__')[1]
            classifier_params[p_name] = params[k]

    vectorizer = vectorizer_cls(**vect_params)
    classifier = classifier_cls(**classifier_params, scale_pos_weight=neg_to_pos_ratio, early_stopping_rounds=10, eval_metric='Logloss')

    x_train_transformed = vectorizer.fit_transform(x_train)
    x_val_transformed = vectorizer.transform(x_val)
    x_test_transformed = vectorizer.transform(x_test)

    classifier.fit(x_train_transformed, y_train, eval_set=[(x_val_transformed, y_val)])
    preds = classifier.predict(x_test_transformed)

    val_scores = catb_param_results.iloc[i]
    
    result = {}
    result['report'] = (report:=classification_report(y_test, preds, output_dict=True))
    classifier_params['n_estimators'] = classifier.get_param('n_estimators')
    result['classifier_params'] = classifier_params
    result['vectorizer_params'] = vect_params
    
    append_results(result, results_file)

Starting run 1...
0:	learn: 0.6888640	test: 0.6877489	best: 0.6877489 (0)	total: 405ms	remaining: 6m 44s
1:	learn: 0.6855687	test: 0.6835996	best: 0.6835996 (1)	total: 825ms	remaining: 6m 51s
2:	learn: 0.6817622	test: 0.6790500	best: 0.6790500 (2)	total: 1.2s	remaining: 6m 39s
3:	learn: 0.6797387	test: 0.6765750	best: 0.6765750 (3)	total: 1.58s	remaining: 6m 33s
4:	learn: 0.6772332	test: 0.6736835	best: 0.6736835 (4)	total: 1.96s	remaining: 6m 30s
5:	learn: 0.6757403	test: 0.6723740	best: 0.6723740 (5)	total: 2.39s	remaining: 6m 36s
6:	learn: 0.6747591	test: 0.6717045	best: 0.6717045 (6)	total: 2.78s	remaining: 6m 35s
7:	learn: 0.6736288	test: 0.6705574	best: 0.6705574 (7)	total: 3.17s	remaining: 6m 32s
8:	learn: 0.6723642	test: 0.6696218	best: 0.6696218 (8)	total: 3.55s	remaining: 6m 31s
9:	learn: 0.6715295	test: 0.6694583	best: 0.6694583 (9)	total: 3.94s	remaining: 6m 30s
10:	learn: 0.6706696	test: 0.6688645	best: 0.6688645 (10)	total: 4.46s	remaining: 6m 41s
11:	learn: 0.6696465	tes