In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
#@title Imports and Installs
# !pip install xgboost

import pandas as pd
import numpy as np
import torch

import re

from scipy.stats.distributions import uniform, randint
import sys

import sklearn.model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score

import time

import xgboost as xgboost
from xgboost import XGBClassifier

In [None]:
cleanDataPath = '/content/drive/MyDrive/UWEC/ML Research w Dr. Vanamala/Data/bullying_light_clean_data.csv'
cleanDataPath2 = '/content/drive/MyDrive/UWEC/ML Research w Dr. Vanamala/Data/hatespeech_light_clean_data.csv'

In [None]:
df = pd.read_csv(cleanDataPath)
df2 = pd.read_csv(cleanDataPath2)

In [None]:
df.dropna(axis=0, inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

df2.dropna(axis=0, inplace=True)
df2.drop_duplicates(inplace=True)
df2.reset_index(drop=True, inplace=True)

In [None]:
# 0 - hate_speech, 1 - offensive_language, 2 - neither

In [None]:
df.cyberbullying_type.value_counts()

In [None]:
undersample_value = min(df.cyberbullying_type.value_counts())
undersample_value

In [None]:
# split classes
religion = df[df['cyberbullying_type'] == 'religion']
age = df[df['cyberbullying_type'] == 'age']
gender = df[df['cyberbullying_type'] == 'gender']
ethnicity = df[df['cyberbullying_type'] == 'ethnicity']
other_cyberbullying = df[df['cyberbullying_type'] == 'other_cyberbullying']
not_cyberbullying = df[df['cyberbullying_type'] == 'not_cyberbullying']

In [None]:
religion_us = religion.sample(undersample_value)
age_us = age.sample(undersample_value)
gender_us = gender.sample(undersample_value)
ethnicity_us = ethnicity.sample(undersample_value)
other_cyberbullying_us = other_cyberbullying.sample(undersample_value)

In [None]:
df = pd.concat([religion_us, age_us, gender_us, ethnicity_us, other_cyberbullying_us, not_cyberbullying], axis=0)

In [None]:
df.cyberbullying_type.value_counts()

In [None]:
# convert str labels to integers
df['cyberbullying_type'] = df['cyberbullying_type'].replace({
        'religion': 0,
        'age': 1,
        'gender': 2,
        'ethnicity': 3,
        'other_cyberbullying': 4,
        'not_cyberbullying': 5
    })

In [None]:
# Bag of Words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.tweet_text)

# SBERT
# model = SentenceTransformer('all-MiniLM-L6-v2')
# X = model.encode(df.tweet_text)

# TF-IDF
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df.tweet_text)

In [None]:
# split data
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, df.cyberbullying_type, test_size=0.2, random_state=115)

In [None]:
# control overfitting
# https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html

# parameters to mitigate overfitting
ptmo = {
    'max_depth': 4,
    'learning_rate': 0.02,
    'n_estimators': 600,
    'gamma': 1.0,
    'min_child_weight': 1.5,
    'subsample': 0.9, # percent of training data to use
    'colsample_bytree': 0.8,
    'eval_metric': 'auc'
}

In [None]:
xgb = XGBClassifier(objective='multi:softmax', random_state=115)
# fit model
xgb.fit(X_train, y_train)

In [None]:
preds = xgb.predict(X_test)

In [None]:
model1Results = 'Model 1 Accuracy : {0:0.4f}\n'. format(accuracy_score(y_test, preds)) + '\nModel 1 Classification Report:\n' + classification_report(y_test, preds) + '\n\n'

In [None]:
print(model1Results)

In [None]:
# class 3 - not many other classes predicted to be class 3 but a notable amount of the true class 3 were missed (ie 80% recall)
# class 4 - Many other classes were predicted to be class 4 but 80% of class4 was correctly predicited as class4
# class 5 - Many other classes were predicted to be class 5 and ~50% of class5 was not properly predicted to be class5

In [None]:
# TODO: Setup up Grid Search to find optimal params

Test Model(s) on other dataset

In [None]:
# 0 - hate_speech, 1 - offensive_language, 2 - neither
df2['class'] = df2['class'].apply(lambda x: x if x == 0 else 1)

In [None]:
df2.value_counts('class')

In [None]:
x, y = df2['tweet'], df2['class']

In [None]:
preds = xgb.predict(vectorizer.transform(x))

In [None]:
# convert preds to labels that are conducive with the new datasets labels
preds = [1 if x in [5] else 0 for x in preds]

In [None]:
model1Results = 'Model 1 Accuracy : {0:0.4f}\n'. format(accuracy_score(y, preds)) + '\nModel 1 Classification Report:\n' + classification_report(y, preds) + '\n\n'

In [None]:
print(model1Results)

In [None]:
# an inital test on this dataset shows that the model is heavily biased towards cyberbullying (see precision of class 0)
# to account for this, we should

In [None]:
# Run further tests.
# Ajdust the test sizes to reflect the training class distribution (NOTE: this is likely NOT consistent with real world data)

In [None]:
# x, y = df2['tweet'], df2['class']
# preds = xgb.predict(vectorizer.transform(x))
# preds = [1 if x in [3, 4, 5] else 0 for x in preds]
# print('Model 1 Accuracy : {0:0.4f}\n'. format(accuracy_score(y, preds)) + '\nModel 1 Classification Report:\n' + classification_report(y, preds) + '\n\n')

#Test
---
The following test serves to evaluate the generalizability of XGBoost when trained upon the cyberbullying dataset curated in https://people.cs.vt.edu/ctlu/Publication/2020/IEEE-BD-SOSNet-Wang.pdf

The test data is from https://arxiv.org/pdf/1703.04009

1. Balance the original dataset such that each "cyberbullying" class has 5632 entrys.

2. Also balance the test dataset such that the split is 50/50.

3. Use the remaining "not cyberbullying" from the test set for model training. Thus the training data will match the distribution of the test data (50/50 split between "cyberbullying" and "not cyberbullying")

(This is not necessarily a realistic representation of real-world data but it is a good initial test for the model)

4. Try all encoding types and try training the model with 2 classes (rather than 6).**bold text**




In [1]:
cleanDataPath = 'data/bullying_light_clean_data.csv'
cleanDataPath2 = 'data/hatespeech_light_clean_data.csv'

In [4]:
# Read in the data
df = pd.read_csv(cleanDataPath)
df2 = pd.read_csv(cleanDataPath2)

df.dropna(axis=0, inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

df2.dropna(axis=0, inplace=True)
df2.drop_duplicates(inplace=True)
df2.reset_index(drop=True, inplace=True)

In [5]:
# split classes
religion = df[df['cyberbullying_type'] == 'religion']
age = df[df['cyberbullying_type'] == 'age']
gender = df[df['cyberbullying_type'] == 'gender']
ethnicity = df[df['cyberbullying_type'] == 'ethnicity']
other_cyberbullying = df[df['cyberbullying_type'] == 'other_cyberbullying']
not_cyberbullying = df[df['cyberbullying_type'] == 'not_cyberbullying']

In [6]:
undersample_value = 5150 # (20731 - 1219 + 6238) / 5

religion_us = religion.sample(undersample_value)
age_us = age.sample(undersample_value)
gender_us = gender.sample(undersample_value)
ethnicity_us = ethnicity.sample(undersample_value)
other_cyberbullying_us = other_cyberbullying.sample(undersample_value)

In [7]:
df = pd.concat([religion_us, age_us, gender_us, ethnicity_us, other_cyberbullying_us, not_cyberbullying], axis=0)

In [8]:
# convert str labels to integers
df['cyberbullying_type'] = df['cyberbullying_type'].replace({
        'religion': 0,
        'age': 1,
        'gender': 2,
        'ethnicity': 3,
        'other_cyberbullying': 4,
        'not_cyberbullying': 5
    })

  df['cyberbullying_type'] = df['cyberbullying_type'].replace({


In [9]:
df.cyberbullying_type.value_counts()

cyberbullying_type
5    6238
0    5150
1    5150
2    5150
3    5150
4    5150
Name: count, dtype: int64

In [13]:
len(df)

31988

In [14]:
# 0 - hate_speech, 1 - offensive_language, 2 - neither
df2['class'] = df2['class'].apply(lambda x: x if x == 0 else 1)

In [15]:
df2['class'].value_counts()

class
1    20731
0     1219
Name: count, dtype: int64

In [16]:
df2 = df2.sample(frac=1, random_state=115) # shuffle the data

# split by class
not_cyberbullying = df2[df2['class'] == 1]
cyberbullying = df2[df2['class'] == 0]

additional_not_cyberbullying = 19512 # 20731 - 1219

tmp = not_cyberbullying.iloc[:additional_not_cyberbullying] # 19512
not_cyberbullying = not_cyberbullying.iloc[additional_not_cyberbullying:] # 1219

# add the additional 'not cyberbullying' data to the training dataset
tmp.rename(columns={'tweet': 'tweet_text', 'class': 'cyberbullying_type'}, inplace=True)
tmp['cyberbullying_type'] = 5
df = pd.concat([df, tmp])

df2 = pd.concat([not_cyberbullying, cyberbullying])

- Sanity Check

In [17]:
df.cyberbullying_type.value_counts(sort=False) # 50/50 split between cyberbullying and not cyberbullying

cyberbullying_type
0     5150
1     5150
2     5150
3     5150
4     5150
5    25750
Name: count, dtype: int64

In [19]:
len(df)

51500

In [18]:
df2['class'].value_counts()

class
1    1219
0    1219
Name: count, dtype: int64

In [34]:
x, y = df2['tweet'], df2['class']

In [35]:
# Split training data
X_train, X_val, Y_train, Y_val = sklearn.model_selection.train_test_split(df.tweet_text, df.cyberbullying_type, test_size=0.2, random_state=115)

In [36]:
# Bag of Words
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)

# SBERT
# model = SentenceTransformer('all-MiniLM-L6-v2')
# X = model.encode(df.tweet_text)

# TF-IDF
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df.tweet_text)

y_train = Y_train
Y_val = Y_val

In [None]:
# Justification for random search : https://dl.acm.org/doi/pdf/10.5555/2188385.2188395

In [None]:
def tune_hyperparameters(base_model, parameters, n_iter, kfold, X, Y, X_val=None, Y_val=None, SEED=115):
    start_time = time.time()

    # Arrange data into folds with approx equal proportion of classes within each fold
    k = KFold(kfold)

    optimal_model = RandomizedSearchCV(
                            base_model,
                            param_distributions=parameters,
                            n_iter=n_iter,
                            cv=k,
                            random_state=SEED,
                            scoring='neg_log_loss',
                            n_jobs=1,
                            verbose=3,
                            error_score='raise'
                            )

    optimal_model.fit(X, Y)#,eval_set=zip(X_val, Y_val))

    stop_time = time.time()

    scores = cross_val_score(optimal_model, X, Y, cv=k, scoring="accuracy")

    print("Elapsed Time:", time.strftime("%H:%M:%S", time.gmtime(stop_time - start_time)))
    print("====================")
    print("Cross Val Mean: {:.3f}, Cross Val Stdev: {:.3f}".format(scores.mean(), scores.std()))
    print("Best Score: {:.3f}".format(optimal_model.best_score_))
    print("Best Parameters: {}".format(optimal_model.best_params_))

    return optimal_model

In [None]:
# Parameter Searching Guide : https://www.kaggle.com/code/willkoehrsen/intro-to-model-tuning-grid-and-random-search

In [38]:
N_FOLDS = 2

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=115)

def objective(hyperparameters, iteration):
  """Objective function for grid and random search. Returns
      the cross validation score from a set of hyperparameters."""
   
  cv_results = cross_val_score(m, x_train, y_train, cv=skf) # NOTE : use xgb's internal scoring method
  
  return [cv_results.mean(), cv_results.std(), hyperparameters, iteration]

In [39]:
# https://www.kaggle.com/code/willkoehrsen/intro-to-model-tuning-grid-and-random-search#Random-Search

import random

def random_search(param_grid, objective, max_evals=10):
    """Random search for hyperparameter optimization"""
    
    # Dataframe for results
    results = pd.DataFrame(columns = ['score (mean)', 'score (std)', 'params', 'iteration'],
                                  index = list(range(max_evals)))
    
    # Keep searching until reach max evaluations
    for i in range(max_evals):
        
        # Choose random hyperparameters
        hyperparameters = {}
        for k,v in param_grid.items():
            v = random.sample(v,1)[0] if isinstance(v, (list, tuple)) else v.rvs(1)[0]
            hyperparameters[k] = v        

        start_time = time.time()
        # Evaluate randomly selected hyperparameters
        eval_results = objective(hyperparameters, i)
        
        stop_time = time.time()
                
        results.loc[i, :] = eval_results        
        
        print("Elapsed Time: {i} fd {time.strftime("%H:%M:%S", time.gmtime(stop_time - start_time))}")
    
    # Sort with best score on top
    results.sort_values('score (mean)', ascending=False, inplace=True)
    results.reset_index(inplace = True, drop=True)
    return results 

In [40]:
# stats.uniform(0.5, 0.9)

parameters = {
    'booster'              : ['gbtree','dart'],
    'n_estimators'         : [100, 200, 300, 400, 500, 600],
    'learning_rate'        : uniform(0.0001, 0.1),
    # 'max_depth'            : randint(3, 100),
    # 'min_child_weight'     : randint(1, 50),
    # 'subsample'            : uniform(sys.float_info.min, 1),
    # 'colsample_bytree'     : uniform(0,1),
    # 'colsample_bylevel'    : uniform(0,1),
    # 'colsample_bynode'     : uniform(0,1),
    # 'n_estimators'         : [100,200,300,400,500,600],
    # 'alpha'                : uniform(0,10),
    # 'lambda'               : uniform(0,10),
    # 'gamma'                : uniform(0, 100),
    # 'eta'                  : uniform(0,1)
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'
m = XGBClassifier(objective='multi:softmax', num_class=6, device=device, n_jobs=-1, verbosity=1, random_state=115)

# Verify that the following are properly defined
print(f'Device: {device}\n')
print(f'Input Type: {type(x_train)}\n\nInput Sample:\n{x[:2]}\n')
print(f'Target Typep: {type(y_train)}\n\nTarget Sample:\n{y[:2]}\n')
print(f'Model: {m}\n')

Device: cpu

Input Type: <class 'scipy.sparse._csr.csr_matrix'>

Input Sample:
12189     twitter tipped its cap to say farewellcaptain...
15842     when you give all ya hoes up for that one gir...
Name: tweet, dtype: object

Target Typep: <class 'pandas.core.series.Series'>

Target Sample:
12189    1
15842    1
Name: class, dtype: int64

Model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_str

In [None]:
parameters.items()

In [41]:
result = random_search(parameters, objective, max_evals=2)

Elapsed Time: 00:00:10
Elapsed Time: 00:00:10


In [42]:
result

Unnamed: 0,score (mean),score (std),params,iteration
0,0.868495,0.000534,"{'booster': 'gbtree', 'n_estimators': 300, 'le...",0
1,0.868495,0.000534,"{'booster': 'gbtree', 'n_estimators': 500, 'le...",1


In [None]:
d = randint(1,5)
d.rvs(1)[0]

In [None]:
result

In [None]:
u = uniform(0,1)
u.rvs(1)

In [None]:
results.std()

In [16]:
import pandas as pd
results = pd.DataFrame(columns = ['score (mean)', 'score (std)', 'params'], index = list(range(10)))

In [18]:
import numpy as np
results.loc[0,:] = [1,2,3]

In [19]:
results

Unnamed: 0,score (mean),score (std),params
0,1.0,2.0,3.0
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,
7,,,
8,,,
9,,,
