In [1]:
import numpy as np
import pandas as pd
from regex import P
from scipy.stats import randint, uniform, loguniform
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from nltk.stem import SnowballStemmer, WordNetLemmatizer, LancasterStemmer, PorterStemmer
import nltk
nltk.download('wordnet')

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import json
import random
import sys
import logging
from pathlib import Path

# GLOBALS
# Name of this experiment; presumably used to label run artifacts (the log file
# created later in the notebook uses the same string) -- TODO confirm.
RUN = 'exp3_hyp_search'
# File names of the training and test CSVs; they are joined onto the data
# directory when the data is loaded.
TRAIN_DATA_NAME = '48000_cyberbullying_tweets_basic_clean.csv' 
TEST_DATA_NAME = 'hatespeech_tweets_basic_clean.csv'
# Seed passed to the balancing step, the CV splitter and the randomized search.
RANDOM_SEED = 115

# Seed the global NumPy and Python RNGs as well, so code relying on them is repeatable.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# FUNCTIONS
def get_OOV_feats(train_data:pd.Series, test_data:pd.Series, print_oov_feats:bool=False):
    """
    Return the out-of-vocabulary (OOV) features of the test data.

    A default ``CountVectorizer`` is fitted on each input separately; the
    result is the set of feature names learned from ``test_data`` that were
    not learned from ``train_data``.

    Parameters
    ----------
    train_data : pd.Series
        Documents that define the reference vocabulary.
    test_data : pd.Series
        Documents whose vocabulary is compared against the reference.
    print_oov_feats : bool, default False
        When True, the OOV array is also printed.

    Returns
    -------
    np.ndarray
        Sorted, unique feature names present only in ``test_data``.
    """
    # Fit order (train first, then test) matches the two independent vocabularies.
    train_vocab = CountVectorizer().fit(train_data).get_feature_names_out()
    test_vocab = CountVectorizer().fit(test_data).get_feature_names_out()

    oov_feats = np.setdiff1d(test_vocab, train_vocab)
    if print_oov_feats:
        print(f"OOV features: {oov_feats}")

    return oov_feats

def balance_train_and_test(notcb_train:pd.DataFrame, cb_train:pd.DataFrame, notcb_test:pd.DataFrame, cb_test:pd.DataFrame, random_state:int):
    """
    Give the train and test sets the same not-cb : cb ratio.

    Not-cb rows are moved from one set to the other until both sets share
    the same ratio of not-cb to cb rows (up to the truncation of the row
    count to an integer). The donor frame is shuffled with ``random_state``
    and its leading rows are the ones that move.

    This function does not relabel the data that is moved!

    Parameters
    ----------
    notcb_train, cb_train : pd.DataFrame
        Not-cb and cb rows of the training set.
    notcb_test, cb_test : pd.DataFrame
        Not-cb and cb rows of the test set.
    random_state : int
        Seed for shuffling the frame that gives rows away.

    Returns
    -------
    train_df : pd.DataFrame
        Rebalanced training set.
    test_df : pd.DataFrame
        Rebalanced test set.
    x_hat : float
        Ideal (possibly fractional) number of not-cb rows to move, reported
        as a non-negative value when rows go from test to train; only its
        integer part is actually moved.
    ratio : float
        The new ratio of not-cb to cb (in both sets).
    """
    n_notcb_train = len(notcb_train)
    n_cb_train = len(cb_train)
    n_notcb_test = len(notcb_test)
    n_cb_test = len(cb_test)

    # Solve (n_notcb_train - x) / n_cb_train == (n_notcb_test + x) / n_cb_test for x.
    x_hat = (n_cb_test * n_notcb_train - n_cb_train * n_notcb_test) / (n_cb_test + n_cb_train)

    # Sanity check: with x_hat rows moved, both sets end up with the same ratio.
    ratio = (n_notcb_test + x_hat) / n_cb_test
    np.testing.assert_allclose(ratio, (n_notcb_train - x_hat) / n_cb_train, atol=1e-5)

    if x_hat < 0:
        # Negative solution: take from the test set and add to training.
        x_hat = -x_hat
        n_move = int(x_hat)
        donor = notcb_test.sample(frac=1, random_state=random_state)
        train_df = pd.concat([notcb_train, cb_train, donor.iloc[:n_move]])
        test_df = pd.concat([donor.iloc[n_move:], cb_test])
    else:
        # Non-negative solution: take from training and add to the test set.
        n_move = int(x_hat)
        donor = notcb_train.sample(frac=1, random_state=random_state)
        train_df = pd.concat([donor.iloc[n_move:], cb_train])
        test_df = pd.concat([notcb_test, cb_test, donor.iloc[:n_move]])

    return train_df, test_df, x_hat, ratio

def get_top_5_results(random_search: RandomizedSearchCV):
    """
    Summarise the five best-scoring candidates of a fitted search.

    Parameters
    ----------
    random_search : RandomizedSearchCV
        A search object exposing ``cv_results_``.

    Returns
    -------
    pd.DataFrame
        At most five rows, ordered by descending ``mean_test_score``, with
        the columns ``params``, ``mean_test_score`` and ``std_test_score``.
    """
    summary_columns = ['params', 'mean_test_score', 'std_test_score']
    ranked = pd.DataFrame(random_search.cv_results_).sort_values(by='mean_test_score', ascending=False)
    return ranked.head(5)[summary_columns]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rooty\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')
porter = PorterStemmer()
lancaster = LancasterStemmer()


# The preprocessors are named functions rather than lambdas so that the
# 'params' column of the search results shows which one was sampled
# (a lambda is always displayed as "<function <lambda> ...>").
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token with the WordNet lemmatizer."""
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())


def snowball_stem_text(text):
    """Stem each whitespace-separated token with the English Snowball stemmer."""
    return ' '.join(stemmer.stem(word) for word in text.split())


def porter_stem_text(text):
    """Stem each whitespace-separated token with the Porter stemmer."""
    return ' '.join(porter.stem(word) for word in text.split())


def lancaster_stem_text(text):
    """Stem each whitespace-separated token with the Lancaster stemmer."""
    return ' '.join(lancaster.stem(word) for word in text.split())


# Short aliases kept so that any code referring to pp / pp1 / pp2 / pp3 still works.
pp = lemmatize_text
pp1 = snowball_stem_text
pp2 = porter_stem_text
pp3 = lancaster_stem_text

# Search space for a ('vectorizer', 'classifier') pipeline.
# Reading guide: scipy's uniform(loc, scale) covers [loc, loc + scale],
# randint(low, high) covers the integers low .. high - 1, and a plain list is
# sampled uniformly, so repeating an entry makes it proportionally more likely.
default_xgb_param_dists = {
    'vectorizer': [CountVectorizer(), TfidfVectorizer()],
    'vectorizer__ngram_range': [(1,1), (1,2), (1,3)],
    # 'word' is listed five times: 5/7 of the draws use word n-grams.
    'vectorizer__analyzer': ['word', 'word', 'word', 'word', 'word', 'char', 'char_wb'],
    'vectorizer__max_df': uniform(0.299, 0.7),  # [0.299, 0.999]
    # NOTE(review): per the scikit-learn docs a custom preprocessor replaces the
    # built-in strip_accents/lowercase stage -- confirm the tweets are already lowercased.
    'vectorizer__preprocessor': [None, pp, pp1, pp2, pp3],
    'vectorizer__min_df': [1, 1, 2, 3],  # min_df=1 is twice as likely as 2 or 3
    'classifier__booster': ['gbtree', 'dart'],
    'classifier__max_depth': randint(3, 10),  # 3 .. 9
    'classifier__learning_rate': uniform(0.01, 0.3),  # [0.01, 0.31]
    'classifier__n_estimators': randint(50, 1000),  # 50 .. 999
    'classifier__min_child_weight': uniform(0.5, 9.5),  # [0.5, 10.0]
    'classifier__subsample': uniform(0.5, 0.5),  # [0.5, 1.0]
    'classifier__colsample_bytree': uniform(0.5, 0.5),  # [0.5, 1.0]
    'classifier__gamma': uniform(0, 1),  # [0, 1]
    'classifier__reg_alpha': loguniform(1e-5, 1),
    'classifier__reg_lambda': loguniform(1e-5, 1),
    'classifier__colsample_bylevel': uniform(0.5, 0.5),  # [0.5, 1.0]
    'classifier__colsample_bynode': uniform(0.5, 0.5),  # [0.5, 1.0]
    # The classifier's own seed is sampled as well (1 .. 9999).
    'classifier__random_state': randint(1, 10000)
}

In [3]:
from pathlib import Path

# NOTE(review): absolute, machine-specific path; the notebook only runs where this directory exists.
path_to_data = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\data\\en_only')

# Load both CSVs, dropping rows with missing values and duplicate rows, then renumber the index.
train_df = (
    pd.read_csv(path_to_data / TRAIN_DATA_NAME)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df = (
    pd.read_csv(path_to_data / TEST_DATA_NAME)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'class': 'label'})  # align the label column name with the training data
)

# Binarise the training labels: 'notcb' -> 0, anything else -> 1.
train_df['label'] = train_df['label'].apply(lambda label: int(label != 'notcb'))
# Re-code the test labels by swapping codes 0 and 2 (code 1 is unchanged).
# NOTE(review): rows that end up with label 2 fall into neither subset below -- confirm this is intended.
test_df['label'] = test_df['label'].map({0: 2, 1: 1, 2: 0})

# Balance the datasets
notcb_train = train_df.loc[train_df['label'] == 0]
cb_train = train_df.loc[train_df['label'] == 1]
notcb_test = test_df.loc[test_df['label'] == 0]
cb_test = test_df.loc[test_df['label'] == 1]
for subset in (notcb_train, cb_train, notcb_test, cb_test):
    print(subset.shape)

balanced_train, balanced_test, x_hat, ratio = balance_train_and_test(
    notcb_train,
    cb_train,
    notcb_test,
    cb_test,
    random_state=RANDOM_SEED,
)

(6377, 2)
(37298, 2)
(3786, 7)
(17476, 7)


In [11]:
# Candidate class weights around the not-cb : cb ratio produced by the balancing step.
# `ratio + 0.1` appears twice, so it is drawn twice as often as the other values
# -- NOTE(review): confirm the repetition is deliberate.
scale_pos_weight_choices = [None, ratio, ratio - 0.05, ratio + 0.1, ratio + 0.1, ratio + 0.2]

# Shallow copy of the default search space with the extra entry appended,
# leaving default_xgb_param_dists itself untouched.
param_dist = {**default_xgb_param_dists, 'classifier__scale_pos_weight': scale_pos_weight_choices}

In [12]:
from xgboost import XGBClassifier
# The vectorizer step is only a placeholder here; the search substitutes the
# vectorizer candidates listed under the 'vectorizer' key of param_dist.
pipeline = Pipeline(steps=[
    ('vectorizer', 'passthrough'),
    ('classifier', XGBClassifier(objective='binary:logistic')),
])

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=RANDOM_SEED)

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1_macro',
    cv=skf,
    n_jobs=10,
    random_state=RANDOM_SEED,
    verbose=2,
)

# NOTE(review): balanced_train is overwritten with its shuffled version, so
# re-running this cell shuffles the already-shuffled frame and the 200-row
# subset below changes between runs.
balanced_train = balanced_train.sample(frac=1,random_state=1)

# Fit on the first 200 shuffled rows only.
search_subset = balanced_train.head(200)
x = search_subset['tweet']
y = search_subset['label']

random_search.fit(x,y)

top_5 = get_top_5_results(random_search)
print(top_5)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


KeyboardInterrupt: 

In [15]:
# Start the run with an empty log file (the file name matches the RUN identifier
# defined with the globals). The parent directory is created first because
# opening a file for writing does not create missing directories and would
# otherwise raise FileNotFoundError on a fresh checkout.
log_path = Path('log/exp3_hyp_search.log')
log_path.parent.mkdir(parents=True, exist_ok=True)
# Opening in 'w' mode creates the file or truncates an existing one; nothing is written.
with log_path.open('w'):
    pass