# Hyperparameter optimization

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import fasttext
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import optuna

import sys
from pathlib import Path
import os

In [2]:
path = Path(os.path.dirname(os.getcwd()))
path = str(path)
print(path)
sys.path.insert(1, path)

/Users/saideepbunny/Projects/HuffPost-News-classification


In [3]:
from utils.utils import preprocess_text, evaluate_model, get_embedding_df

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saideepbunny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saideepbunny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/saideepbunny/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Reading data

In [4]:
df = pd.read_json(f'{path}/data/train_data.json', orient='records')
df

Unnamed: 0,headline,category,short_description,authors,date,headline_length,short_description_length
0,Stephen Colbert Hits Trump With The Perfect 'S...,ENTERTAINMENT,"""Late Show"" host has a correction for the pres...",Ed Mazza,2018-01-12,63,52
1,Dear Colleagues: We SUCK!,WELLNESS,"I'm not saying we are wrong. In fact, we may a...","David Katz, M.D., ContributorFounder, True Hea...",2015-03-10,25,220
2,Emily Fletcher Shares Guided Meditation Techni...,WELLNESS,Meditation doesn't have to be complicated. In ...,,2013-11-28,57,120
3,HuffPost Rise: What You Need To Know On Februa...,POLITICS,Welcome to the HuffPost Rise Morning Newsbrief...,,2016-02-15,51,103
4,3 Ways To Fight Overwhelm And Add Joy To Your ...,WELLNESS,Working moms are juggling more than ever befor...,"Paula Jenkins, ContributorLife Coach and Host ...",2016-07-09,50,123
...,...,...,...,...,...,...,...
61970,Owning An Assault Weapon Is No Longer A Fundam...,POLITICS,An appeals court wiped out an earlier ruling t...,Cristian Farias,2016-03-06,68,94
61971,Blackfish: Rooting for Killer Whales and Kille...,ENTERTAINMENT,A debate ensued that resonates for yet another...,"Regina Weinreich, Contributor\nAuthor, 'Keroua...",2013-06-22,69,121
61972,Trevor Noah Mockingly Praises Trump's 'Right R...,ENTERTAINMENT,"""Now I know your first instinct is to be disgu...",Lee Moran,2017-11-28,73,125
61973,Elite 'Bundlers' Raise More Than $113 Million ...,POLITICS,"Big backers include Ben Affleck, George Lucas,...","Michael Beckel, Center for Public Integrity",2016-09-23,65,81


## Preprocessing data

In [5]:
df['content'] = df['headline'] + df['short_description']
df['content_preprocessed'] = df['content'].apply(lambda x: preprocess_text(x))

In [6]:
target_map = {'ENTERTAINMENT':0, 'WELLNESS':1, 'POLITICS':2}
df['category'] = df['category'].map(target_map).copy()

## HPO for XGBClassifier

In [7]:
def train_estimator(estimator, X_train, y_train, X_test, y_test):

    #Training the model
    estimator.fit(X_train, y_train)

    #evaluating metrics
    metrics = evaluate_model(estimator, X_train, y_train, X_test, y_test, display_metrics=False)

    return metrics

In [8]:
def optimize_xgb(data_df, embedding_model):

    #loading embedding model
    model = fasttext.load_model(f"{path}/embeddings/{embedding_model}")

    embedding_dim = int(embedding_model.split('_')[-1].split('.')[0])

    #generating embeddings
    data_df, embed_cols = get_embedding_df(data_df, embedding_dim, model)

    #Defining train and test data
    X = data_df[embed_cols]
    y = data_df['category']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Objective function for Optuna
    def objective(trial):
        # Define the hyperparameter space
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 2000),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'random_state': 42
        }

        # Initialize the XGBoost estimator with the sampled parameters
        estimator = XGBClassifier(**params)

        metrics = train_estimator(estimator, X_train, y_train, X_test, y_test)

        # Extract test F1 score
        score = metrics[1]['F1 Score']

        return score

    # Create Optuna study
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
    study.optimize(objective, n_trials=100)

    print('\n\nBest Parameters:', study.best_params)
    print('Best Score:', study.best_value)

    # Best hyperparameters
    return study.best_params, study.best_value

In [9]:
optimized_params = {}

### Fasttext 100 dimensions

In [10]:
optimized_params['fasttext_100'] = optimize_xgb(data_df=df,
                                               embedding_model='fasttext_model_100.bin')

[I 2024-12-19 09:52:59,037] A new study created in memory with name: no-name-87ded9a6-cd24-4559-9d94-5b98adece0a3
[I 2024-12-19 09:53:12,317] Trial 0 finished with value: 0.8929018901356397 and parameters: {'n_estimators': 1258, 'max_depth': 3, 'learning_rate': 0.05984922892483143, 'subsample': 0.6115743250907892, 'colsample_bytree': 0.5027875099720487, 'gamma': 2.6468488410263746, 'min_child_weight': 6}. Best is trial 0 with value: 0.8929018901356397.
[I 2024-12-19 09:53:27,722] Trial 1 finished with value: 0.8925962023178737 and parameters: {'n_estimators': 667, 'max_depth': 13, 'learning_rate': 0.050656077050676755, 'subsample': 0.8045136718083679, 'colsample_bytree': 0.9481221772767334, 'gamma': 2.630864419563432, 'min_child_weight': 8}. Best is trial 0 with value: 0.8929018901356397.
[I 2024-12-19 09:53:37,762] Trial 2 finished with value: 0.8917614216359858 and parameters: {'n_estimators': 1235, 'max_depth': 4, 'learning_rate': 0.21547494631772113, 'subsample': 0.9017605058094031



Best Parameters: {'n_estimators': 1950, 'max_depth': 8, 'learning_rate': 0.022077591497482247, 'subsample': 0.7634753736246074, 'colsample_bytree': 0.595895830410973, 'gamma': 0.5187973453257309, 'min_child_weight': 5}
Best Score: 0.8991325277909779


### Fasttext 200 dimensions

In [11]:
optimized_params['fasttext_200'] = optimize_xgb(data_df=df,
                                               embedding_model='fasttext_model_200.bin')

[I 2024-12-19 11:38:26,587] A new study created in memory with name: no-name-f82ecbe9-bcd1-49c4-bdd8-f61cd3f14b00
[I 2024-12-19 11:39:01,534] Trial 0 finished with value: 0.8914240727457409 and parameters: {'n_estimators': 257, 'max_depth': 11, 'learning_rate': 0.030201551372105673, 'subsample': 0.6542927001971175, 'colsample_bytree': 0.5733736605557086, 'gamma': 4.6674735076514215, 'min_child_weight': 10}. Best is trial 0 with value: 0.8914240727457409.
[I 2024-12-19 11:40:18,186] Trial 1 finished with value: 0.8892102697493368 and parameters: {'n_estimators': 1845, 'max_depth': 3, 'learning_rate': 0.020537460308575742, 'subsample': 0.9747737163773206, 'colsample_bytree': 0.9113335124713453, 'gamma': 4.260853852347495, 'min_child_weight': 2}. Best is trial 0 with value: 0.8914240727457409.
[I 2024-12-19 11:41:45,299] Trial 2 finished with value: 0.895350797965253 and parameters: {'n_estimators': 245, 'max_depth': 11, 'learning_rate': 0.05553670211813522, 'subsample': 0.814526252057958



Best Parameters: {'n_estimators': 1543, 'max_depth': 10, 'learning_rate': 0.08956455416161246, 'subsample': 0.515186619748011, 'colsample_bytree': 0.8809387466473658, 'gamma': 0.001962334672459482, 'min_child_weight': 7}
Best Score: 0.9003678706027675


### Fasttext 300 dimensions

In [12]:
optimized_params['fasttext_300'] = optimize_xgb(data_df=df,
                                               embedding_model='fasttext_model_300.bin')

[I 2024-12-19 13:42:21,654] A new study created in memory with name: no-name-f00ae812-81ee-4738-80bd-07edaca81175
[I 2024-12-19 13:45:33,776] Trial 0 finished with value: 0.8961509674157582 and parameters: {'n_estimators': 1276, 'max_depth': 19, 'learning_rate': 0.018821479220333257, 'subsample': 0.9199162108482205, 'colsample_bytree': 0.9608546783846963, 'gamma': 0.7610468950638599, 'min_child_weight': 9}. Best is trial 0 with value: 0.8961509674157582.
[I 2024-12-19 13:46:06,907] Trial 1 finished with value: 0.8859257647437015 and parameters: {'n_estimators': 566, 'max_depth': 4, 'learning_rate': 0.012767176267440316, 'subsample': 0.8187527590959722, 'colsample_bytree': 0.8187353193648133, 'gamma': 1.1055884826470452, 'min_child_weight': 6}. Best is trial 0 with value: 0.8961509674157582.
[I 2024-12-19 13:47:04,513] Trial 2 finished with value: 0.8937046929241919 and parameters: {'n_estimators': 1603, 'max_depth': 17, 'learning_rate': 0.0877216973040376, 'subsample': 0.53991791394318



Best Parameters: {'n_estimators': 1026, 'max_depth': 13, 'learning_rate': 0.07189768444047324, 'subsample': 0.7001442648881597, 'colsample_bytree': 0.5603789471330167, 'gamma': 0.007320763684901976, 'min_child_weight': 10}
Best Score: 0.9008312848695788


In [20]:
final_params = {}

for key, value in optimized_params.items():
    final_params[key] = value[0]
final_params

{'fasttext_100': {'n_estimators': 1950,
  'max_depth': 8,
  'learning_rate': 0.022077591497482247,
  'subsample': 0.7634753736246074,
  'colsample_bytree': 0.595895830410973,
  'gamma': 0.5187973453257309,
  'min_child_weight': 5},
 'fasttext_200': {'n_estimators': 1543,
  'max_depth': 10,
  'learning_rate': 0.08956455416161246,
  'subsample': 0.515186619748011,
  'colsample_bytree': 0.8809387466473658,
  'gamma': 0.001962334672459482,
  'min_child_weight': 7},
 'fasttext_300': {'n_estimators': 1026,
  'max_depth': 13,
  'learning_rate': 0.07189768444047324,
  'subsample': 0.7001442648881597,
  'colsample_bytree': 0.5603789471330167,
  'gamma': 0.007320763684901976,
  'min_child_weight': 10}}

In [21]:
params_df = pd.DataFrame(final_params).T
params_df

Unnamed: 0,n_estimators,max_depth,learning_rate,subsample,colsample_bytree,gamma,min_child_weight
fasttext_100,1950.0,8.0,0.022078,0.763475,0.595896,0.518797,5.0
fasttext_200,1543.0,10.0,0.089565,0.515187,0.880939,0.001962,7.0
fasttext_300,1026.0,13.0,0.071898,0.700144,0.560379,0.007321,10.0


In [None]:
params_df.to_json(f'{path}/data/xgb_optimized_params.json', index=True, orient='index')
