In [1]:
import numpy as np
from scipy.stats import rv_continuous

In [43]:
class CustomDistribution(rv_continuous):
    """Sampler that returns either ``None`` or a normal draw, each with equal chance.

    Only ``rvs`` is overridden; no ``_pdf`` or ``_cdf`` is defined here, so the
    other inherited ``rv_continuous`` methods do not describe this sampler.
    """

    # Chance that a single ``rvs`` call yields ``None`` instead of a sample.
    none_probability = 0.5

    @staticmethod
    def _pick_rng(random_state):
        """Turn a ``random_state`` argument into an object exposing ``random``/``normal``.

        ``None`` selects NumPy's global legacy generator (so ``np.random.seed``
        keeps working), an existing ``RandomState``/``Generator`` is used as is,
        and anything else is treated as a seed for a fresh ``RandomState``.
        """
        if random_state is None:
            return np.random
        if isinstance(random_state, (np.random.RandomState, np.random.Generator)):
            return random_state
        return np.random.RandomState(random_state)

    def rvs(self, *args, **kwargs):
        """Return ``None`` or a draw from ``normal(*args, **kwargs)``.

        Parameters
        ----------
        *args, **kwargs
            Forwarded to the generator's ``normal`` method (``loc``, ``scale``,
            ``size``).
        random_state : None, int, RandomState or Generator, optional
            Source of randomness for this call. It is consumed here rather than
            forwarded, because ``normal`` does not accept that keyword.

        Returns
        -------
        None or float or ndarray
            ``None`` with probability ``none_probability``; otherwise the normal
            draw. The coin flip happens once per call, not once per element.
        """
        rng = self._pick_rng(kwargs.pop('random_state', None))
        if rng.random() < self.none_probability:
            return None
        return rng.normal(*args, **kwargs)

custom_dist = CustomDistribution(name='custom_dist')

In [56]:

# Request 10 draws around loc=2 with scale=0.1; the call as a whole may instead return None.
custom_dist.rvs(size=10,loc=2,scale=0.1)


array([2.04302104, 1.99185057, 2.01461519, 1.97196757, 1.82350311,
       2.03865219, 2.00428072, 2.05048621, 1.99431688, 1.96660916])

In [1]:
import numpy as np
from scipy.stats import rv_continuous

class RandomRatio(rv_continuous):
    """Sampler that returns ``None`` occasionally, otherwise ``ratio`` plus uniform noise.

    Only ``rvs`` is overridden; no ``_pdf`` or ``_cdf`` is defined here, so the
    other inherited ``rv_continuous`` methods do not describe this sampler.
    """

    # Chance that a single ``rvs`` call yields ``None`` instead of a sample.
    none_probability = 0.1

    def __init__(self, ratio, *args, **kwargs):
        """Store the centre value and the noise bounds.

        Parameters
        ----------
        ratio : float
            Value the uniform noise is added to.
        low, high : float, optional (keyword only)
            Bounds of the uniform noise; default to 0 and 1. They are removed
            from ``kwargs`` before the remaining arguments reach
            ``rv_continuous.__init__``.
        """
        self.low = kwargs.pop('low', 0)
        self.high = kwargs.pop('high', 1)
        self.ratio = ratio
        super().__init__(*args, **kwargs)

    @staticmethod
    def _pick_rng(random_state):
        """Turn a ``random_state`` argument into an object exposing ``random``/``uniform``.

        ``None`` selects NumPy's global legacy generator (so ``np.random.seed``
        keeps working), an existing ``RandomState``/``Generator`` is used as is,
        and anything else is treated as a seed for a fresh ``RandomState``.
        """
        if random_state is None:
            return np.random
        if isinstance(random_state, (np.random.RandomState, np.random.Generator)):
            return random_state
        return np.random.RandomState(random_state)

    def rvs(self, *args, **kwargs):
        """Return ``None`` or ``ratio + uniform(low, high, *args, **kwargs)``.

        Parameters
        ----------
        *args, **kwargs
            Forwarded to the generator's ``uniform`` method after ``low`` and
            ``high`` (in practice ``size``).
        random_state : None, int, RandomState or Generator, optional
            Source of randomness for this call. It is consumed here rather than
            forwarded, because ``uniform`` does not accept that keyword.

        Returns
        -------
        None or float or ndarray
            ``None`` with probability ``none_probability``; otherwise the shifted
            uniform draw. The coin flip happens once per call, not per element.
        """
        rng = self._pick_rng(kwargs.pop('random_state', None))
        if rng.random() < self.none_probability:
            return None
        return self.ratio + rng.uniform(self.low, self.high, *args, **kwargs)

custom_dist = RandomRatio(ratio=0.5, low=-.5, high=.5)


In [2]:
import pandas as pd

def balance_train_and_test(notcb_train:pd.DataFrame, cb_train:pd.DataFrame, notcb_test:pd.DataFrame, cb_test:pd.DataFrame, random_state:int):
    """
    Move not-cyberbullying rows between the splits so both share one notcb:cb ratio.

    The number of rows to move, ``x_hat``, solves
    ``(n_notcb_train - x_hat) / n_cb_train == (n_notcb_test + x_hat) / n_cb_test``.
    A negative solution means rows go from the test split to the training split.
    The donor frame is shuffled with ``random_state`` and its first ``int(x_hat)``
    rows are moved.

    Moved rows are NOT relabelled.

    Returns
    -------
    (train_df, test_df, x_hat, ratio)
        The concatenated splits, the non-negative (float) number of rows the
        formula asked to move, and the notcb:cb ratio implied by the formula,
        computed before ``x_hat`` is truncated to an integer.
    """
    n_notcb_train = len(notcb_train)
    n_cb_train = len(cb_train)
    n_notcb_test = len(notcb_test)
    n_cb_test = len(cb_test)

    x_hat = (n_cb_test * n_notcb_train - n_cb_train * n_notcb_test) / (n_cb_test + n_cb_train)

    # Sanity check: both splits must end up with the same ratio for this x_hat.
    ratio = (n_notcb_test + x_hat) / n_cb_test
    np.testing.assert_allclose(ratio, (n_notcb_train - x_hat) / n_cb_train, atol=1e-5)

    if x_hat < 0:
        # The test split has the surplus: shuffle it and hand rows to training.
        shuffled = notcb_test.sample(frac=1, random_state=random_state)
        x_hat = -x_hat
        n_move = int(x_hat)
        train_parts = [notcb_train, cb_train, shuffled[:n_move]]
        test_parts = [shuffled[n_move:], cb_test]
    else:
        # The training split has the surplus: shuffle it and hand rows to test.
        shuffled = notcb_train.sample(frac=1, random_state=random_state)
        n_move = int(x_hat)
        train_parts = [shuffled[n_move:], cb_train]
        test_parts = [notcb_test, cb_test, shuffled[:n_move]]

    return pd.concat(train_parts), pd.concat(test_parts), x_hat, ratio

In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

from regex import B

# Directory holding the English-only datasets, the file used for each split, and the shuffle seed.
path_to_data = Path('C:/Users/rooty/UWEC/Research/CyberBullyingML/venv/cyberbullying-ml/data/en_only')
TRAIN_DATA_NAME = '48000_cyberbullying_tweets_basic_clean.csv'
TEST_DATA_NAME = 'hatespeech_tweets_basic_clean.csv'
RANDOM_SEED = 115

# Load each file, drop rows with missing values and exact duplicates, and renumber the index.
train_df = (
    pd.read_csv(path_to_data / TRAIN_DATA_NAME)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df = (
    pd.read_csv(path_to_data / TEST_DATA_NAME)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'class': 'label'})  # align the label column name with the training frame
)

# Relabel so that 1 means cyberbullying and 0 means not cyberbullying in both frames.
# Training data: 'notcb' becomes 0, every other label becomes 1.
train_df['label'] = train_df['label'].map(lambda label: int(label != 'notcb'))
# Test data: original classes 0/1/2 become 2/1/0 (offensive -> 1, non-offensive -> 0);
# rows that end up as 2 (the hatespeech class) are left out by the 0/1 selections below.
test_df['label'] = test_df['label'].map({0: 2, 1: 1, 2: 0})

# Split each frame by class and balance the notcb:cb ratio across the two splits.
notcb_train = train_df.loc[train_df['label'].eq(0)]
cb_train = train_df.loc[train_df['label'].eq(1)]
notcb_test = test_df.loc[test_df['label'].eq(0)]
cb_test = test_df.loc[test_df['label'].eq(1)]
balanced_train, balanced_test, x_hat, ratio = balance_train_and_test(
    notcb_train,
    cb_train,
    notcb_test,
    cb_test,
    random_state=RANDOM_SEED,
)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

def pp(text):
    """Return ``text`` with every character whose code point is 128 or above removed."""
    return ''.join(filter(lambda ch: ord(ch) < 128, text))

# Character n-grams (1 to 3 characters, 'char_wb' analyzer) over ASCII-only text.
# NOTE: per the scikit-learn docs, supplying `preprocessor` overrides the built-in
# preprocessing stage (lowercasing and accent stripping), so case is preserved here.
cv = CountVectorizer(preprocessor=pp, analyzer='char_wb', ngram_range=(1, 3))

# Learn the vocabulary on the training tweets only, then reuse it for the test tweets.
x_train = cv.fit_transform(balanced_train['tweet'])
x_test = cv.transform(balanced_test['tweet'])
y_train, y_test = balanced_train['label'], balanced_test['label']

In [15]:
# Show the n-gram vocabulary held by the fitted vectorizer.
cv.get_feature_names_out()

array([' ', ' 0', ' 0 ', ..., 'zzw', 'zzy', 'zzz'], dtype=object)

In [16]:
import xgboost as xgb

# Fit an XGBoost classifier, built with default constructor arguments, on the vectorized training data.
m = xgb.XGBClassifier()
m.fit(x_train, y_train)

In [17]:
from sklearn.metrics import classification_report

# Predict labels for the vectorized test set with the fitted classifier.
y_pred = m.predict(x_test)

# Report per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.63      0.61      0.62      3243
           1       0.93      0.93      0.93     17476

    accuracy                           0.88     20719
   macro avg       0.78      0.77      0.77     20719
weighted avg       0.88      0.88      0.88     20719



In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from xgboost import XGBClassifier
import itertools


# One shared English Snowball stemmer, reused by every call to the preprocessor below.
snowballer = SnowballStemmer('english')

def pp_SnowballStemmer(text):
    """Stem each whitespace-separated token of ``text`` and rejoin them with single spaces."""
    return ' '.join(map(snowballer.stem, text.split()))


In [35]:


# Candidate values for every pipeline setting. The key order matters: it fixes the
# position of each value inside the tuples of `param_sets` and the order of the sets.
xgb_param_grid = {
    'vectorizer': [CountVectorizer],
    'vectorizer__ngram_range': [(1, 1), (1, 3)],
    'vectorizer__preprocessor': [pp_SnowballStemmer, None],
    'vectorizer__max_df': [0.5, 0.75],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__n_estimators': [100, 500],
    'classifier__colsample_bytree': [0.5, 0.75],
    'classifier__max_depth': [6, 12],
    'classifier': [XGBClassifier],
}
keys = xgb_param_grid.keys()
# One-shot iterator over the value lists, in key order; consumed by the product below.
iters = iter(xgb_param_grid.values())
# Cartesian product of all value lists: one tuple per parameter combination.
param_sets = list(itertools.product(*iters))

In [36]:
# best: 47, 31, 77
# NOTE(review): presumably these are indices into param_sets of the best-performing combinations; confirm.
# Number of parameter combinations produced by the grid.
len(param_sets)

128

In [2]:
import pandas as pd
# Load the cyberbullying tweets, drop rows with missing values and exact duplicates,
# and renumber the index.
df = (
    pd.read_csv('C:/Users/rooty/UWEC/Research/CyberBullyingML/cyberbullyingml/cyberbullying-ml/data/en_only/48000_cyberbullying_tweets_basic_clean.csv')
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
# Binary target: 'notcb' becomes 0, every other label becomes 1.
df['label'] = df['label'].map(lambda label: int(label != 'notcb'))
df.head()


Unnamed: 0,tweet,label
0,Here at home Neighbors pick on my family and I...,1
1,Being bullied at school Highachieving boys use...,1
2,There was a girl in my class in 6th grade who ...,1
3,He is probably a white gay kid from some subur...,1
4,You are pushed ti resorting Treating thr bulli...,1


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Count 1- to 3-grams with the default analyzer; per the scikit-learn docs, max_df=0.9
# ignores terms whose document frequency is above 90% of the documents.
cv = CountVectorizer(ngram_range=(1,3), max_df=0.9)


In [7]:
# Learn the vocabulary from the tweets and build their count matrix in one step.
x = cv.fit_transform(df['tweet'])
# Targets as stored in df['label'].
y = df['label']

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2820209 stored elements and shape (43675, 1092345)>