In [11]:
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk.stem.snowball import SnowballStemmer

import pandas as pd
import numpy as np

import random
import sys
import os
import logging
from pathlib import Path

# Prepend the directory two levels above the current working directory to the
# import search path (presumably so the `src` package imported just below
# resolves -- confirm against the repository layout).
sys.path[:0] = [os.fspath(Path.cwd().parent.parent)]

from src.utils import params, results, data

In [None]:
# --- Experiment configuration ---------------------------------------------

# File names of the two CSVs; both are read from DATA_PATH.
TRAIN_DATA_NAME = 'onlineHarassmentDataset_basic_clean.csv'
TEST_DATA_NAME = 'hatespeech_tweets_basic_clean.csv'

# Base name of this experiment; the results file is named '<FNAME>.json'.
FNAME = 'exp6_xgb_oov'

# Every location is resolved relative to the current working directory, so
# the notebook must be started from its own folder for these to line up.
DATA_PATH = Path.cwd().parent.parent.parent.parent.joinpath('data', 'en_only')
PARAMS_PATH = Path.cwd().parent.parent.joinpath('hyp_search', 'results')
RESULT_PATH = Path.cwd().parent.joinpath('oov_results')

# Seed both the stdlib generator and NumPy's legacy global generator.
RANDOM_SEED = 115
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [None]:

# Read each CSV, then drop rows containing any missing value, drop fully
# duplicated rows, and renumber the index from zero.
train_df = (
    pd.read_csv(DATA_PATH / TRAIN_DATA_NAME)
    .dropna(axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df = (
    pd.read_csv(DATA_PATH / TEST_DATA_NAME)
    .dropna(axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Binarise the labels. Training set: 'N' -> 0, every other code -> 1.
train_df['Code'] = train_df['Code'].apply(lambda code: int(code != 'N'))
# Test set: class 0 -> 1, every other class -> 0.
test_df['class'] = test_df['class'].apply(lambda cls_id: int(cls_id == 0))

# Hold out a stratified 10% of the training tweets as a validation split
# (original author's note: "for early stopping").
x_train, x_val, y_train, y_val = train_test_split(
    train_df['Tweet'],
    train_df['Code'],
    stratify=train_df['Code'],
    test_size=0.1,
    random_state=RANDOM_SEED,
)

# The second dataset is used in full as the test set.
x_test, y_test = test_df['tweet'], test_df['class']

# Load stored hyper-parameter search results from PARAMS_PATH via the project
# helper (presumably the 10 best 'xgb' runs on dataset 'd3', ranked by
# 'f1_macro_mean' -- confirm against src.utils.params).
xgb_param_results: pd.DataFrame = params.get_topn_param_sets(
    PARAMS_PATH,
    algo='xgb',
    dataset='d3',
    n=10,
    sort_condition='f1_macro_mean',
)

# One parameter set per row, taken from the 'params' column.
# NOTE(review): the loop below reuses the name `params` as its loop variable,
# which rebinds the imported `params` module; re-running this cell without
# re-importing would then fail on the call above.
xgb_param_sets = xgb_param_results['params']

# Shared English Snowball stemmer used by the preprocessor below.
snowballer = SnowballStemmer('english')
def pp_SnowballStemmer(text):
    """Return ``text`` with every whitespace-separated token stemmed.

    Tokens are obtained with ``str.split()`` (so runs of whitespace are
    collapsed), each is passed through ``snowballer.stem``, and the results
    are re-joined with single spaces.
    """
    stemmed_tokens = (snowballer.stem(token) for token in text.split())
    return ' '.join(stemmed_tokens)

# This experiment's results go to <RESULT_PATH>/<FNAME>.json; the project
# helper is asked to create that file before any parameter set is evaluated.
results_file = RESULT_PATH.joinpath(f'{FNAME}.json')
results.create_results_file(results_file)

for i, params in enumerate(xgb_param_sets):

    vectorizer_cls = CountVectorizer if params['vectorizer'] == "<class 'sklearn.feature_extraction.text.CountVectorizer'>" else TfidfVectorizer

    vect_params = {}

    for k in params.keys():
        if 'vectorizer__' in k:
            p_name = k.split('__')[1]            
            if p_name == 'ngram_range':
                vect_params[p_name] = tuple(params[k])  
            elif p_name == 'preprocessor':
                vect_params[p_name] = pp_SnowballStemmer if isinstance(params[k], str) else None
            else:
                vect_params[p_name] = params[k]

    num_of_oov_feats = len(data.get_OOV_feats(x_train, x_test, print_oov_feats=False, **vect_params))
    