In [1]:
import numpy as np
from scipy.stats import rv_continuous

In [43]:
class CustomDistribution(rv_continuous):
    def rvs(self, *args, **kwargs):
        if np.random.rand() < 0.5:
            return None
        else:
            return np.random.normal(*args, **kwargs)

custom_dist = CustomDistribution(name='custom_dist')

In [56]:

custom_dist.rvs(size=10,loc=2,scale=0.1)


array([2.04302104, 1.99185057, 2.01461519, 1.97196757, 1.82350311,
       2.03865219, 2.00428072, 2.05048621, 1.99431688, 1.96660916])

In [1]:
import numpy as np
from scipy.stats import rv_continuous

class RandomRatio(rv_continuous):
    def __init__(self, ratio, *args, **kwargs):
        self.low = kwargs.pop('low', 0)
        self.high = kwargs.pop('high', 1)
        self.ratio = ratio
        super().__init__(*args, **kwargs)

    def rvs(self, *args, **kwargs):
        if np.random.rand() < 0.1:
            return None
        else:
            return self.ratio + np.random.uniform(self.low, self.high, *args, **kwargs)

custom_dist = RandomRatio(ratio=0.5, low=-.5, high=.5)


In [2]:
import pandas as pd

def balance_train_and_test(notcb_train:pd.DataFrame, cb_train:pd.DataFrame, notcb_test:pd.DataFrame, cb_test:pd.DataFrame, random_state:int):
    """
    This functions does not relabel the data that is moved!
    """
    x1 = notcb_train.shape[0]
    x2 = cb_train.shape[0]
    y1 = notcb_test.shape[0]
    y2 = cb_test.shape[0]

    x_hat = (y2 * x1 - x2 * y1) / (y2 + x2)

    
    if x_hat < 0: # take from the test and add to training
        np.testing.assert_allclose(ratio:=(y1 + x_hat)/(y2), (x1 - x_hat)/(x2), atol=1e-5)
        notcb_test = notcb_test.sample(frac=1, random_state=random_state)
        x_hat *= -1 
        data_to_move = notcb_test[:int(x_hat)]
        notcb_test = notcb_test[int(x_hat):]
        train_df = pd.concat([notcb_train, cb_train, data_to_move])
        test_df = pd.concat([notcb_test, cb_test])

    else: # take from training, add to test
        np.testing.assert_allclose(ratio:=(y1 + x_hat)/(y2), (x1 - x_hat)/(x2), atol=1e-5)
        notcb_train = notcb_train.sample(frac=1, random_state=random_state)
        data_to_move = notcb_train[:int(x_hat)]
        notcb_train = notcb_train[int(x_hat):]
        train_df = pd.concat([notcb_train, cb_train])
        test_df = pd.concat([notcb_test, cb_test, data_to_move])
    
    return train_df, test_df, x_hat, ratio # the new ratio of noncb to cb (in both sets)

In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

from regex import B

path_to_data = Path('C:/Users/rooty/UWEC/Research/CyberBullyingML/venv/cyberbullying-ml/data/en_only')
TRAIN_DATA_NAME = '48000_cyberbullying_tweets_basic_clean.csv' 
TEST_DATA_NAME = 'hatespeech_tweets_basic_clean.csv'
RANDOM_SEED = 115

train_df = pd.read_csv(path_to_data / TRAIN_DATA_NAME)
test_df = pd.read_csv(path_to_data / TEST_DATA_NAME)
train_df.dropna(inplace=True)
train_df.drop_duplicates(inplace=True)
train_df.reset_index(drop=True, inplace=True)
test_df.dropna(inplace=True)
test_df.drop_duplicates(inplace=True)
test_df.reset_index(drop=True, inplace=True)
test_df.rename(columns={'class': 'label'}, inplace=True)
# relabel the data such that the offensive class is cyberbullying and the nonoffensive class is not cyberbullying (the hatespeech class is not included)
train_df['label'] = train_df['label'].apply(lambda x: 0 if x == 'notcb' else 1)
test_df['label'] = test_df['label'].map({0: 2, 1: 1, 2: 0})
# Balance the datasets
notcb_train = train_df[train_df['label'] == 0]
cb_train = train_df[train_df['label'] == 1]
notcb_test = test_df[test_df['label'] == 0]
cb_test = test_df[test_df['label'] == 1]
balanced_train, balanced_test, x_hat, ratio = balance_train_and_test(notcb_train, cb_train, notcb_test, cb_test, random_state=RANDOM_SEED)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

def pp(text):
    return ''.join([char for char in text if ord(char) < 128])

cv = CountVectorizer(analyzer='char_wb', ngram_range=(1,3), preprocessor=pp)
cv.fit(balanced_train['tweet'])
x_train = cv.transform(balanced_train['tweet'])
x_test = cv.transform(balanced_test['tweet'])
y_train = balanced_train['label']
y_test = balanced_test['label']

In [15]:
cv.get_feature_names_out()

array([' ', ' 0', ' 0 ', ..., 'zzw', 'zzy', 'zzz'], dtype=object)

In [16]:
import xgboost as xgb

m = xgb.XGBClassifier()
m.fit(x_train, y_train)

In [17]:
from sklearn.metrics import classification_report

# Perform predictions on x_test
y_pred = m.predict(x_test)

# Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.63      0.61      0.62      3243
           1       0.93      0.93      0.93     17476

    accuracy                           0.88     20719
   macro avg       0.78      0.77      0.77     20719
weighted avg       0.88      0.88      0.88     20719



In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from xgboost import XGBClassifier
import itertools


snowballer = SnowballStemmer('english')

def pp_SnowballStemmer(text):
    return ' '.join([snowballer.stem(word) for word in text.split()])


In [35]:


xgb_param_grid = {
    'vectorizer': [CountVectorizer],
    'vectorizer__ngram_range': [(1,1), (1,3)],
    'vectorizer__preprocessor': [pp_SnowballStemmer, None],
    'vectorizer__max_df': [0.5, 0.75],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__n_estimators': [100, 500],
    'classifier__colsample_bytree': [0.5, 0.75],
    'classifier__max_depth': [6, 12],
    'classifier': [XGBClassifier]
}
iters = (v for _, v in xgb_param_grid.items())
keys = xgb_param_grid.keys()
param_sets = [params for params in itertools.product(*iters)]

In [36]:
# best: 47, 31, 77
len(param_sets)

128

In [2]:
import pandas as pd
df = pd.read_csv('C:/Users/rooty/UWEC/Research/CyberBullyingML/cyberbullyingml/cyberbullying-ml/data/en_only/48000_cyberbullying_tweets_basic_clean.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df['label'] = df['label'].apply(lambda x: 0 if x == 'notcb' else 1)
df.head()


Unnamed: 0,tweet,label
0,Here at home Neighbors pick on my family and I...,1
1,Being bullied at school Highachieving boys use...,1
2,There was a girl in my class in 6th grade who ...,1
3,He is probably a white gay kid from some subur...,1
4,You are pushed ti resorting Treating thr bulli...,1


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(1,3), max_df=0.9)


In [7]:
x = cv.fit_transform(df['tweet'])
y = df['label']

In [1]:
from sklearn.base import clone
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold, cross_validate

import xgboost as xgb

X, y = load_breast_cancer(return_X_y=True)


def fit_and_score(estimator, X_train, X_test, y_train, y_test):
    """Fit the estimator on the train set and score it on both sets"""
    estimator.fit(X_train, y_train, eval_set=[(X_test, y_test)])

    train_score = estimator.score(X_train, y_train)
    test_score = estimator.score(X_test, y_test)

    return estimator, train_score, test_score


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=94)

clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=3)

results = {}

for train, test in cv.split(X, y):
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    y_test = y[test]
    est, train_score, test_score = fit_and_score(
        clone(clf), X_train, X_test, y_train, y_test
    )
    results[est] = (train_score, test_score)

[0]	validation_0-logloss:0.47435
[1]	validation_0-logloss:0.37961
[2]	validation_0-logloss:0.31334
[3]	validation_0-logloss:0.26883
[4]	validation_0-logloss:0.23208
[5]	validation_0-logloss:0.21337
[6]	validation_0-logloss:0.19236
[7]	validation_0-logloss:0.18360
[8]	validation_0-logloss:0.17274
[9]	validation_0-logloss:0.17064
[10]	validation_0-logloss:0.16569
[11]	validation_0-logloss:0.15752
[12]	validation_0-logloss:0.15063
[13]	validation_0-logloss:0.14838
[14]	validation_0-logloss:0.15396
[15]	validation_0-logloss:0.15414
[0]	validation_0-logloss:0.45802
[1]	validation_0-logloss:0.34767
[2]	validation_0-logloss:0.27301
[3]	validation_0-logloss:0.22877
[4]	validation_0-logloss:0.19318
[5]	validation_0-logloss:0.17110
[6]	validation_0-logloss:0.15218
[7]	validation_0-logloss:0.13440
[8]	validation_0-logloss:0.13049
[9]	validation_0-logloss:0.12544
[10]	validation_0-logloss:0.12489
[11]	validation_0-logloss:0.12767
[12]	validation_0-logloss:0.12369
[13]	validation_0-logloss:0.12046


In [5]:
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])

[0]	validation_0-logloss:0.45121
[1]	validation_0-logloss:0.34254
[2]	validation_0-logloss:0.26194
[3]	validation_0-logloss:0.22309
[4]	validation_0-logloss:0.18734
[5]	validation_0-logloss:0.15369
[6]	validation_0-logloss:0.13510
[7]	validation_0-logloss:0.12163
[8]	validation_0-logloss:0.10814
[9]	validation_0-logloss:0.09494
[10]	validation_0-logloss:0.08578
[11]	validation_0-logloss:0.07599
[12]	validation_0-logloss:0.07139
[13]	validation_0-logloss:0.06514
[14]	validation_0-logloss:0.06103
[15]	validation_0-logloss:0.05669
[16]	validation_0-logloss:0.05490
[17]	validation_0-logloss:0.05099
[18]	validation_0-logloss:0.04994
[19]	validation_0-logloss:0.04871
[20]	validation_0-logloss:0.04538
[21]	validation_0-logloss:0.04585
[22]	validation_0-logloss:0.04402
[23]	validation_0-logloss:0.04377
[24]	validation_0-logloss:0.04376
[25]	validation_0-logloss:0.04260
[26]	validation_0-logloss:0.04145
[27]	validation_0-logloss:0.04131
[28]	validation_0-logloss:0.04021
[29]	validation_0-loglos

In [6]:
booster = clf.get_booster()
print(booster.num_boosted_rounds())


43


In [13]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import sys
import pandas as pd

sys.path.append(r'C:\Users\rooty\UWEC\Research\CyberBullyingML\cyberbullyingml\cyberbullying-ml\official\src')
from utils.params import get_topn_param_sets

DATA_PATH = Path(r'C:\Users\rooty\UWEC\Research\CyberBullyingML\cyberbullyingml\cyberbullying-ml\data\en_only')


TRAIN_DATA_NAME = '48000_cyberbullying_tweets_basic_clean.csv'
TEST_DATA_NAME = 'hatespeech_tweets_basic_clean.csv'
PARAMS_PATH = Path(r'C:\Users\rooty\UWEC\Research\CyberBullyingML\cyberbullyingml\cyberbullying-ml\official\hyp_search\results')
RANDOM_SEED = 115

In [37]:
# train_df = pd.read_csv(DATA_PATH / TRAIN_DATA_NAME)
# test_df = pd.read_csv(DATA_PATH / TEST_DATA_NAME)
# train_df.dropna(axis=0, inplace=True)
# train_df.drop_duplicates(inplace=True)
# train_df.reset_index(drop=True, inplace=True)
# test_df.dropna(axis=0, inplace=True)
# test_df.drop_duplicates(inplace=True)
# test_df.reset_index(drop=True, inplace=True)
# train_df['label'] = train_df['label'].apply(lambda label: 0 if label =='notcb' else 1)
# test_df['class'] = test_df['class'].apply(lambda label: 1 if label == 0 else 0)
# x_train, x_val, y_train, y_val = train_test_split(
#     train_df['tweet'], 
#     train_df['label'], 
#     test_size=0.1, 
#     stratify=train_df['label'],
#     random_state=RANDOM_SEED
# ) # for early stopping
# x_test = test_df['tweet']
# y_test = test_df['class']
catb_param_results:pd.DataFrame = get_topn_param_sets(PARAMS_PATH, algo='catb', dataset='d1', n=10, sort_condition='f1_macro_mean')

In [38]:
len(catb_param_results)

10

In [36]:
pd.set_option('display.max_colwidth', None) 
for i in range(len(catb_param_results)):
    print(catb_param_results['params'].iloc[i])

{'vectorizer': "<class 'sklearn.feature_extraction.text.CountVectorizer'>", 'vectorizer__ngram_range': [1, 3], 'vectorizer__preprocessor': None, 'vectorizer__max_df': 0.5, 'classifier__learning_rate': 0.1, 'classifier__n_estimators': 1000, 'classifier__rsm': 1, 'classifier__depth': 10, 'classifier': "<class 'catboost.core.CatBoostClassifier'>"}
{'vectorizer': "<class 'sklearn.feature_extraction.text.CountVectorizer'>", 'vectorizer__ngram_range': [1, 3], 'vectorizer__preprocessor': None, 'vectorizer__max_df': 0.9, 'classifier__learning_rate': 0.1, 'classifier__n_estimators': 1000, 'classifier__rsm': 1, 'classifier__depth': 10, 'classifier': "<class 'catboost.core.CatBoostClassifier'>"}
{'vectorizer': "<class 'sklearn.feature_extraction.text.CountVectorizer'>", 'vectorizer__ngram_range': [1, 1], 'vectorizer__preprocessor': None, 'vectorizer__max_df': 0.9, 'classifier__learning_rate': 0.1, 'classifier__n_estimators': 1000, 'classifier__rsm': 0.75, 'classifier__depth': 10, 'classifier': "<

In [3]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load sample data
data = load_iris()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBClassifier
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', early_stopping_rounds=1)

# Fit the model on the training data
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Make predictions on the test data



[0]	validation_0-mlogloss:0.72604
[1]	validation_0-mlogloss:0.50842
[2]	validation_0-mlogloss:0.36744
[3]	validation_0-mlogloss:0.27203
[4]	validation_0-mlogloss:0.20487
[5]	validation_0-mlogloss:0.15866
[6]	validation_0-mlogloss:0.12503
[7]	validation_0-mlogloss:0.09808
[8]	validation_0-mlogloss:0.08104
[9]	validation_0-mlogloss:0.06622
[10]	validation_0-mlogloss:0.05690
[11]	validation_0-mlogloss:0.04823
[12]	validation_0-mlogloss:0.04289
[13]	validation_0-mlogloss:0.03730
[14]	validation_0-mlogloss:0.03349
[15]	validation_0-mlogloss:0.02955
[16]	validation_0-mlogloss:0.02698
[17]	validation_0-mlogloss:0.02506
[18]	validation_0-mlogloss:0.02271
[19]	validation_0-mlogloss:0.02059
[20]	validation_0-mlogloss:0.01938
[21]	validation_0-mlogloss:0.01889
[22]	validation_0-mlogloss:0.01871
[23]	validation_0-mlogloss:0.01803
[24]	validation_0-mlogloss:0.01780
[25]	validation_0-mlogloss:0.01736
[26]	validation_0-mlogloss:0.01704
[27]	validation_0-mlogloss:0.01727


Parameters: { "use_label_encoder" } are not used.



In [4]:
model.get_booster().num_boosted_rounds()

28

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

str(CountVectorizer(max_df=.2).__class__).split('.')[-1][:-2]


'CountVectorizer'

In [19]:
from pathlib import Path
PATH_TO_RESULTS = Path(r'C:\Users\rooty\UWEC\Research\CyberBullyingML\cyberbullyingml\cyberbullying-ml\official\experiments2.0\results')

import json

for file in PATH_TO_RESULTS.iterdir():
    with open(file, 'r') as f:
        data = json.load(f)
        results = data['results']
    for result in results:
        result['vectorizer'] = 'CountVectorizer' if 'CountVectorizer' in result['vectorizer'] else 'TfidfVectorizer'
        result['vectorizer_params']['preprocessor'] = None if result['vectorizer_params']['preprocessor'] == None else 'SnowballStemmer'




{'meta': {'name': 'exp1_catb_results.json', 'description': ''}, 'results': [{'report': {'0': {'precision': 0.9349391165229758, 'recall': 0.4947323864170821, 'f1-score': 0.6470642512225879, 'support': 21262.0}, '1': {'precision': 0.04566047792484676, 'recall': 0.41252006420545745, 'f1-score': 0.08222026713588738, 'support': 1246.0}, 'accuracy': 0.4901812688821752, 'macro avg': {'precision': 0.49029979722391126, 'recall': 0.4536262253112698, 'f1-score': 0.36464225917923765, 'support': 22508.0}, 'weighted avg': {'precision': 0.8857103452552813, 'recall': 0.4901812688821752, 'f1-score': 0.6157955643480532, 'support': 22508.0}}, 'classifier_params': {'learning_rate': 0.1, 'n_estimators': 173, 'rsm': 1, 'depth': 10}, 'vectorizer': 'CountVectorizer', 'vectorizer_params': {'ngram_range': [1, 3], 'preprocessor': None, 'max_df': 0.5}, 'val_f1_macro_mean': 0.7270263081559752, 'val_f1_weighted_mean': 0.8293618247450545, 'drop_in_f1_macro_mean': 0.3623840489767376, 'drop_in_f1_weighed_mean': 0.2135