In [1]:
import psycopg2
import pandas as pd
import pickle
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import optuna

pd.set_option('display.max_columns', None)
random_state = 42

#### Query data from PostgreSQL table, save to csv

In [105]:
conn = psycopg2.connect(host="localhost", dbname='cs', user='postgres', password='123456', port=5432)
cur = conn.cursor()

query = """SELECT * FROM side_dataset where id>=684"""
cur.execute(query)

rows = cur.fetchall()
columns = [desc[0] for desc in cur.description]
df = pd.DataFrame(rows, columns=columns)

cur.close()
conn.close()

df.to_csv('df_642.csv')

#### Begin

In [3]:
data = pd.read_csv('df_642.csv')
#First 4 rows are not needed
df = data.iloc[:,5:]

#### While parsing the data, I was storing this particular stat (average rounds lost(won) on a map) as 0's in case of missing data. So, I convert them to NULL, to deal with them further. Also deelting the rows with incorrect data

In [4]:
df.loc[(df['t1_rounds_lost']>0) & (df['t1_rounds_won'] == 0), 't1_rounds_won'] = None
df.loc[(df['t1_rounds_lost']==0) & (df['t1_rounds_won'] > 0), 't1_rounds_lost'] = None
df.loc[(df['t2_rounds_lost']==0) & (df['t2_rounds_won'] > 0), 't2_rounds_lost'] = None
df.loc[(df['t2_rounds_lost']>0) & (df['t2_rounds_won'] == 0), 't2_rounds_won'] = None

df.drop(df[(df['t2_5v4']==0) & (df['t2_played']>0)].index, axis=0, inplace=True)
df.drop(df[(df['t1_5v4']==0) & (df['t1_played']>0)].index, axis=0, inplace=True)

#### Try filling NULL values with 13's

In [5]:
df['t1_rounds_lost'].fillna(13, inplace=True)
df['t1_rounds_won'].fillna(13, inplace=True)
df['t2_rounds_lost'].fillna(13, inplace=True)
df['t2_rounds_won'].fillna(13, inplace=True)

### Experiment 1. The differences between team stats will be the predicting features, so creating the features

In [77]:
cols_name = [df.columns[i]+'_dif' for i in range(0,len(df.columns[:22]),2)] + [df.columns[i]+'_dif' for i in range(22, 24)] + [df.columns[i]+'_dif' for i in range(25, 33)]

dataset = pd.DataFrame(columns=cols_name)

for index, name in enumerate(cols_name[:11]):
    dataset[name] = df[df.columns[index*2]] - df[df.columns[index*2+1]]

for index, name in enumerate(cols_name[11:13]):
    dataset[name] = df[df.columns[22+index]] - df[df.columns[33+index]]

for index, name in enumerate(cols_name[13:]):
    dataset[name] = df[df.columns[25+index]] - df[df.columns[36+index]]

dataset['t1_fp'] = df['t1_fp']
dataset['t2_fp'] = df['t2_fp']
dataset['result'] = df['result']

#### Preparing data

In [78]:
scaler = StandardScaler()

columns = ['t1_winstreak_dif','t1_h2h_dif','t1_ranking_dif','t1_pluses_dif','t1_minuses_dif','t1_coef_dif','t1_rating_dif','t1_event_rating_dif', \
            't1_num_maps_dif','t1_avg_lost_dif','t1_avg_won_dif','t1_rounds_lost_dif','t1_rounds_won_dif','t1_fp_percent_dif','t1_winrate_dif', \
            't1_played_dif','t1_map_winstreak_dif','t1_map_losestreak_dif','t1_5v4_dif','t1_4v5_dif','t1_pistol_dif']

scaler.fit(dataset.loc[:,columns])
dataset.loc[:,columns] = scaler.transform(dataset.loc[:,columns])

with open('scaler.pkl','wb') as f:
    pickle.dump(scaler, f)

dataset.to_csv('just_scaled.csv', index=False)

In [2]:
dataset = pd.read_csv('just_scaled.csv')

Y = dataset.loc[:,['result']]
X = dataset.drop('result',axis=1)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=100, shuffle=True, random_state=42)
# x_val, x_test, y_val, y_test = train_test_split(x_rest, y_rest, test_size=100, shuffle=True, random_state=42)
print(x_train.shape)
# print(x_val.shape)
print(x_test.shape)

(519, 23)
(100, 23)


#### Modelling

In [82]:
defaults = pd.DataFrame([['XGBoost',0],['CatBoost',0],['LightGBM',0]],columns=['clf_name', 'clf_accuracy'])
models = [XGBClassifier(random_state=random_state), CatBoostClassifier(random_state=random_state, verbose=0), LGBMClassifier(random_state=random_state)]

cv = KFold(n_splits=5)

for index, model in enumerate(models):
    accuracies = []  # List to store the accuracy of each fold
    for train_index, test_index in cv.split(x_train):
        fx_train, fx_test = x_train.iloc[train_index], x_train.iloc[test_index]
        fy_train, fy_test = y_train.iloc[train_index], y_train.iloc[test_index]

        model.fit(fx_train, fy_train.values.ravel())

        preds = model.predict(fx_test)

        if index == 1:
            preds = [True if item=='True' else False for item in model.predict(fx_test)]

        accuracies.append(accuracy_score(fy_test, preds))
    
    mean_accuracy = np.mean(accuracies)
    defaults.loc[index, 'clf_accuracy'] = mean_accuracy

defaults.sort_values(by='clf_accuracy', ascending=False).head()

Unnamed: 0,clf_name,clf_accuracy
1,CatBoost,0.564442
2,LightGBM,0.549057
0,XGBoost,0.527903


#### Finetuning

In [None]:
def cb_objective(trial, X, y):
    param_grid = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("max_depth", 3, 12),
        #"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300, step=1),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 0, 20, step=1),
        "colsample_bylevel": trial.suggest_float(
            "colsample_bylevel", 0.5, 1, step=0.1
        ),
        "subsample": trial.suggest_float("subsample", 0.5, 1, step=0.1),
        "random_strength": trial.suggest_float("random_strength", 0, 1, step=0.1),
        #"grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree"]),
        "bagging_temperature": trial.suggest_float('bagging_temperature', 0, 20, step=0.25)
    }

    cv = KFold(n_splits=5)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = CatBoostClassifier(
                              **param_grid,
                              iterations=10000,
                              loss_function='Logloss',
                              random_seed=42,
                              early_stopping_rounds=50,
                              eval_metric='Accuracy',
                              verbose=0
                              )
 
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            verbose=False,
        )
        preds = [True if item=='True' else False for item in model.predict(X_test)]
        cv_scores[idx] = accuracy_score(y_test, preds)

    return np.mean(cv_scores)

study = optuna.create_study(direction="maximize", study_name="CB Classifier")
func = lambda trial: cb_objective(trial, x_train, np.ravel(y_train.values))
study.optimize(func, n_trials=100)

print(f"\tBest value : {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

Best value : 0.65317
	Best params:
		learning_rate: 0.13803526386053072
		max_depth: 4
		l2_leaf_reg: 19
		colsample_bylevel: 1.0
		subsample: 0.7
		random_strength: 1.0
		bagging_temperature: 4.5

In [None]:
def xgb_objective(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "eta": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 10, step=1),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 20, step=2),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 20, step=2),
        "gamma": trial.suggest_int("gamma", 0, 10, step=1),
        "colsample_bytree": trial.suggest_float(
            "colsample_bytree", 0.5, 1, step=0.1
        ),
        "subsample": trial.suggest_float("subsample", 0.5, 1, step=0.1),
    }

    cv = KFold(n_splits=5)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = XGBClassifier(objective='binary:logistic', **param_grid, early_stopping_rounds=50, eval_metric="error", verbosity=0, n_jobs=4, random_state=42)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            verbose=False,
        )
        preds = model.predict(X_test)
        cv_scores[idx] = accuracy_score(y_test, preds)

    return np.mean(cv_scores)

study = optuna.create_study(direction="maximize", study_name="XGB Classifier")
func = lambda trial: xgb_objective(trial, x_train, np.ravel(y_train.values))
study.optimize(func, n_trials=200)

print(f"\tBest value: {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

Best value: 0.66288
	Best params:
		n_estimators: 10000
		learning_rate: 0.22707789895194652
		max_depth: 3
		min_child_weight: 2
		reg_lambda: 14
		reg_alpha: 8
		gamma: 0
		colsample_bytree: 0.9
		subsample: 0.5

In [None]:
def lgb_objective(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 30),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 20),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 20),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.6, 1, step=0.1
        ),
        "bagging_freq": trial.suggest_int("bagging_freq", 5, 50, step=5),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.6, 1, step=0.1
        ),
        "subsample": trial.suggest_float("subsample", 0.6, 1, step=0.1)
    }

    cv = KFold(n_splits=5)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = LGBMClassifier(objective="binary", **param_grid, random_state=42, verbosity=-1, early_stopping_rounds=50)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="binary_error",
            verbose=False
        )
        preds = model.predict(X_test)
        cv_scores[idx] = accuracy_score(y_test, preds)

    return np.mean(cv_scores)

study = optuna.create_study(direction="maximize", study_name="LGBM Classifier")
func = lambda trial: lgb_objective(trial, x_train, np.ravel(y_train.values))
study.optimize(func, n_trials=100)

print(f"\tBest value: {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

Best value: 0.63010
	Best params:
		n_estimators: 10000
		learning_rate: 0.2314223134242193
		num_leaves: 2020
		max_depth: 3
		min_data_in_leaf: 4
		lambda_l1: 0
		lambda_l2: 5
		min_gain_to_split: 0.6299949073662396
		bagging_fraction: 1.0
		bagging_freq: 45
		feature_fraction: 0.8
		subsample: 0.6

#### Finding the optimal number of n_estimators for the chosen hyperparameters

In [22]:
# XGBOOST
model = XGBClassifier(
        objective='binary:logistic', 
        n_estimators=10000,
		learning_rate=0.22707789895194652,
		max_depth=3,
		min_child_weight=2,
		reg_lambda=14,
		reg_alpha=8,
		gamma=0,
		colsample_bytree=0.9,
		subsample=0.5,
        early_stopping_rounds=50,
        eval_metric="error",
        verbosity=0,
        n_jobs=4,
        random_state=42
    )

cv = KFold(n_splits=5)
iters = np.empty(5)

for idx, (train_idx, test_idx) in enumerate(cv.split(x_train, y_train)):
    fx_train, fx_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
    fy_train, fy_test = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model.fit(
        fx_train,
        fy_train,
        eval_set=[(fx_test, fy_test)],
        verbose=False,
    )
    iters[idx] = model.best_iteration

xgb_iters = np.mean(iters)
print(xgb_iters)

33.6


In [3]:
#CatBoost
model = CatBoostClassifier(
        learning_rate= 0.13803526386053072,
		max_depth= 4,
		l2_leaf_reg= 19,
		colsample_bylevel= 1.0,
		subsample= 0.7,
		random_strength= 1.0,
		bagging_temperature= 4.5,
        iterations=10000,
        loss_function='Logloss',
        random_seed=42,
        early_stopping_rounds=50,
        eval_metric='Accuracy',
        verbose=0
        )

cv = KFold(n_splits=5)
iters = np.empty(5)

for idx, (train_idx, test_idx) in enumerate(cv.split(x_train, y_train)):
    fx_train, fx_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
    fy_train, fy_test = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model.fit(
        fx_train,
        fy_train,
        eval_set=[(fx_test, fy_test)],
        verbose=False,
    )
    iters[idx] = model.get_best_iteration()

cb_iters = np.mean(iters)
print(cb_iters)

19.6


In [None]:
#LGB
model = LGBMClassifier(
        objective="binary", 
        random_state=42, 
        verbosity=-1, 
        early_stopping_rounds=50,
        n_estimators=10000,
		learning_rate=0.2314223134242193,
		num_leaves=2020,
		max_depth=3,
		min_data_in_leaf=4,
		lambda_l1=0,
		lambda_l2=5,
		min_gain_to_split=0.6299949073662396,
		bagging_fraction=1.0,
		bagging_freq=45,
		feature_fraction=0.8,
		subsample=0.6
        )

cv = KFold(n_splits=5)
iters = np.empty(5)

for idx, (train_idx, test_idx) in enumerate(cv.split(x_train, y_train)):
    fx_train, fx_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
    fy_train, fy_test = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model.fit(
        fx_train,
        fy_train,
        eval_set=[(fx_test, fy_test)],
        eval_metric="binary_error",
        verbose=False,
    )
    iters[idx] = model.best_iteration_

lgb_iters = np.mean(iters)
print(lgb_iters)

In [7]:
print(lgb_iters)

8.0


#### Testing on test set

In [24]:
xgb = XGBClassifier(
        objective='binary:logistic', 
        n_estimators=34,
		learning_rate=0.22707789895194652,
		max_depth=3,
		min_child_weight=2,
		reg_lambda=14,
		reg_alpha=8,
		gamma=0,
		colsample_bytree=0.9,
		subsample=0.5,
        verbosity=0,
        n_jobs=4,
        random_state=42
    )

xgb.fit(x_train, y_train, verbose=False)
preds = xgb.predict(x_test)
print(accuracy_score(y_test, preds))

0.52


In [8]:
cb = CatBoostClassifier(
        learning_rate= 0.13803526386053072,
		max_depth= 4,
		l2_leaf_reg= 19,
		colsample_bylevel= 1.0,
		subsample= 0.7,
		random_strength= 1.0,
		bagging_temperature= 4.5,
        iterations=20,
        loss_function='Logloss',
        random_seed=42,
        verbose=0
)   

cb.fit(x_train, y_train, verbose=False)
preds = [True if item=='True' else False for item in cb.predict(x_test)]
print(accuracy_score(y_test, preds))

0.56


In [None]:
lgb = LGBMClassifier(
        objective="binary", 
        random_state=42, 
        verbosity=-1, 
        n_estimators=8,
		learning_rate=0.2314223134242193,
		num_leaves=2020,
		max_depth=3,
		min_data_in_leaf=4,
		lambda_l1=0,
		lambda_l2=5,
		min_gain_to_split=0.6299949073662396,
		bagging_fraction=1.0,
		bagging_freq=45,
		feature_fraction=0.8,
		subsample=0.6
)

lgb.fit(x_train, y_train.values.ravel())
preds = lgb.predict(x_test)
print(accuracy_score(y_test, preds))

### Experiment 2. All features + differences

#### Prepare data

In [6]:
cols_name = [df.columns[i]+'_dif' for i in range(0,len(df.columns[:22]),2)] + [df.columns[i]+'_dif' for i in range(22, 24)] + [df.columns[i]+'_dif' for i in range(25, 33)]

for index, name in enumerate(cols_name[:11]):
    df[name] = df[df.columns[index*2]] - df[df.columns[index*2+1]]

for index, name in enumerate(cols_name[11:13]):
    df[name] = df[df.columns[22+index]] - df[df.columns[33+index]]

for index, name in enumerate(cols_name[13:]):
    df[name] = df[df.columns[25+index]] - df[df.columns[36+index]]

df.to_csv('all+diffs.csv',index=False)

In [17]:
dataset = pd.read_csv('all+diffs.csv')

Y = dataset.loc[:,'result']
X = dataset.drop('result',axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=100, shuffle=True, random_state=42)
print(x_train.shape)
print(x_test.shape)

(519, 65)
(100, 65)


#### Modelling

In [18]:
defaults = pd.DataFrame([['XGBoost',0],['CatBoost',0],['LightGBM',0]],columns=['clf_name', 'clf_accuracy'])
models = [XGBClassifier(random_state=random_state), CatBoostClassifier(random_state=random_state, verbose=0), LGBMClassifier(random_state=random_state)]

cv = KFold(n_splits=5)

for index, model in enumerate(models):
    accuracies = []  # List to store the accuracy of each fold
    for train_index, test_index in cv.split(x_train):
        fx_train, fx_test = x_train.iloc[train_index], x_train.iloc[test_index]
        fy_train, fy_test = y_train.iloc[train_index], y_train.iloc[test_index]

        model.fit(fx_train, fy_train.values.ravel())

        preds = model.predict(fx_test)

        if index == 1:
            preds = [True if item=='True' else False for item in model.predict(fx_test)]

        accuracies.append(accuracy_score(fy_test, preds))
    
    mean_accuracy = np.mean(accuracies)
    defaults.loc[index, 'clf_accuracy'] = mean_accuracy

defaults.sort_values(by='clf_accuracy', ascending=False).head()

Unnamed: 0,clf_name,clf_accuracy
1,CatBoost,0.591561
2,LightGBM,0.54916
0,XGBoost,0.53764


#### Fine-tuning

In [None]:
#CatBoost
def cb_objective(trial, X, y):
    param_grid = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 50, step=1),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 0, 20, step=1),
        "colsample_bylevel": trial.suggest_float(
            "colsample_bylevel", 0.5, 1, step=0.1
        ),
        "subsample": trial.suggest_float("subsample", 0.5, 1, step=0.1),
        "random_strength": trial.suggest_float("random_strength", 0, 1, step=0.1),
        #"grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree"]),
        "bagging_temperature": trial.suggest_float('bagging_temperature', 0, 20, step=0.25)
    }

    cv = KFold(n_splits=5)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = CatBoostClassifier(
                              **param_grid,
                              iterations=10000,
                              loss_function='Logloss',
                              random_seed=42,
                              early_stopping_rounds=50,
                              eval_metric='Accuracy',
                              verbose=0
                              )
 
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            verbose=False,
        )
        preds = [True if item=='True' else False for item in model.predict(X_test)]
        cv_scores[idx] = accuracy_score(y_test, preds)

    return np.mean(cv_scores)

study = optuna.create_study(direction="maximize", study_name="CB Classifier")
func = lambda trial: cb_objective(trial, x_train, np.ravel(y_train.values))
study.optimize(func, n_trials=200)

print(f"\tBest value : {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

Best value : 0.65902
	Best params:
		learning_rate: 0.05351839411162072
		max_depth: 4
		min_data_in_leaf: 31
		l2_leaf_reg: 3
		colsample_bylevel: 1.0
		subsample: 0.5
		random_strength: 0.8
		bagging_temperature: 10.25

In [None]:
#XGBoost
def xgb_objective(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "eta": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 10, step=1),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 20, step=2),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 20, step=2),
        "gamma": trial.suggest_int("gamma", 0, 10, step=1),
        "colsample_bytree": trial.suggest_float(
            "colsample_bytree", 0.5, 1, step=0.1
        ),
        "subsample": trial.suggest_float("subsample", 0.5, 1, step=0.1),
    }

    cv = KFold(n_splits=5)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = XGBClassifier(objective='binary:logistic', **param_grid, early_stopping_rounds=50, eval_metric="error", verbosity=0, n_jobs=4, random_state=42)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            verbose=False,
        )
        preds = model.predict(X_test)
        cv_scores[idx] = accuracy_score(y_test, preds)

    return np.mean(cv_scores)

study = optuna.create_study(direction="maximize", study_name="XGB Classifier")
func = lambda trial: xgb_objective(trial, x_train, np.ravel(y_train.values))
study.optimize(func, n_trials=200)

print(f"\tBest value: {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

Best value: 0.64554
	Best params:
		n_estimators: 10000
		learning_rate: 0.060726041010120094
		max_depth: 4
		min_child_weight: 9
		reg_lambda: 2
		reg_alpha: 0
		gamma: 2
		colsample_bytree: 0.6
		subsample: 0.5

In [None]:
#LGBClassifier
def lgb_objective(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 30),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 20),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 20),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.6, 1, step=0.1
        ),
        "bagging_freq": trial.suggest_int("bagging_freq", 5, 50, step=5),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.6, 1, step=0.1
        ),
        "subsample": trial.suggest_float("subsample", 0.6, 1, step=0.1)
    }

    cv = KFold(n_splits=5)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = LGBMClassifier(objective="binary", **param_grid, random_state=42, verbosity=-1, early_stopping_rounds=50)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="binary_error",
            verbose=False
        )
        preds = model.predict(X_test)
        cv_scores[idx] = accuracy_score(y_test, preds)

    return np.mean(cv_scores)

study = optuna.create_study(direction="maximize", study_name="LGBM Classifier")
func = lambda trial: lgb_objective(trial, x_train, np.ravel(y_train.values))
study.optimize(func, n_trials=200)

print(f"\tBest value: {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

Best value: 0.62431
	Best params:
		n_estimators: 10000
		learning_rate: 0.09554692701030004
		num_leaves: 2860
		max_depth: 3
		min_data_in_leaf: 9
		lambda_l1: 0
		lambda_l2: 18
		min_gain_to_split: 4.111913891598791
		bagging_fraction: 1.0
		bagging_freq: 15
		feature_fraction: 0.6
		subsample: 1.0

#### Finding optimal number of iterations

In [26]:
# XGBOOST
model = XGBClassifier(
        objective='binary:logistic', 
        n_estimators=10000,
		learning_rate=0.060726041010120094,
		max_depth=4,
		min_child_weight=9,
		reg_lambda=2,
		reg_alpha=0,
		gamma=2,
		colsample_bytree=0.6,
		subsample=0.5,
        early_stopping_rounds=50,
        eval_metric="error",
        verbosity=0,
        n_jobs=4,
        random_state=42
    )

cv = KFold(n_splits=5)
iters = np.empty(5)

for idx, (train_idx, test_idx) in enumerate(cv.split(x_train, y_train)):
    fx_train, fx_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
    fy_train, fy_test = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model.fit(
        fx_train,
        fy_train,
        eval_set=[(fx_test, fy_test)],
        verbose=False,
    )
    iters[idx] = model.best_iteration

xgb_iters = np.mean(iters)
print(xgb_iters)

28.4


In [27]:
#CatBoost
model = CatBoostClassifier(
        learning_rate= 0.05351839411162072,
		max_depth= 4,
        min_data_in_leaf= 31,
		l2_leaf_reg= 3,
		colsample_bylevel= 1.0,
		subsample= 0.5,
		random_strength= 0.8,
		bagging_temperature= 10.25,
        iterations=10000,
        loss_function='Logloss',
        random_seed=42,
        early_stopping_rounds=50,
        eval_metric='Accuracy',
        verbose=0
        )

cv = KFold(n_splits=5)
iters = np.empty(5)

for idx, (train_idx, test_idx) in enumerate(cv.split(x_train, y_train)):
    fx_train, fx_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
    fy_train, fy_test = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model.fit(
        fx_train,
        fy_train,
        eval_set=[(fx_test, fy_test)],
        verbose=False,
    )
    iters[idx] = model.get_best_iteration()

cb_iters = np.mean(iters)
print(cb_iters)

9.0


In [None]:
#LGB
model = LGBMClassifier(
        objective="binary", 
        random_state=42, 
        verbosity=-1, 
        early_stopping_rounds=50,
        n_estimators=10000,
		learning_rate=0.09554692701030004,
		num_leaves=2860,
		max_depth=3,
		min_data_in_leaf=9,
		lambda_l1=0,
		lambda_l2=18,
		min_gain_to_split=4.111913891598791,
		bagging_fraction=1.0,
		bagging_freq=15,
		feature_fraction=0.6,
		subsample=1.0
        )

cv = KFold(n_splits=5)
iters = np.empty(5)

for idx, (train_idx, test_idx) in enumerate(cv.split(x_train, y_train)):
    fx_train, fx_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
    fy_train, fy_test = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model.fit(
        fx_train,
        fy_train,
        eval_set=[(fx_test, fy_test)],
        eval_metric="binary_error",
        verbose=False,
    )
    iters[idx] = model.best_iteration_

lgb_iters = np.mean(iters)
print(lgb_iters)

#### Testing on test set

In [29]:
xgb = XGBClassifier(
        objective='binary:logistic', 
		learning_rate=0.060726041010120094,
		max_depth=4,
		min_child_weight=9,
		reg_lambda=2,
		reg_alpha=0,
		gamma=2,
		colsample_bytree=0.6,
		subsample=0.5,
        verbosity=0,
        n_jobs=4,
        random_state=42,
        n_estimators=28
    )

xgb.fit(x_train, y_train, verbose=False)
preds = xgb.predict(x_test)
print(accuracy_score(y_test, preds))

0.54


In [30]:
cb = CatBoostClassifier(
        learning_rate= 0.05351839411162072,
		max_depth= 4,
        min_data_in_leaf= 31,
		l2_leaf_reg= 3,
		colsample_bylevel= 1.0,
		subsample= 0.5,
		random_strength= 0.8,
		bagging_temperature= 10.25,
        iterations=9,
        loss_function='Logloss',
        random_seed=42,
        verbose=0
)   

cb.fit(x_train, y_train, verbose=False)
preds = [True if item=='True' else False for item in cb.predict(x_test)]
print(accuracy_score(y_test, preds))

0.52


In [31]:
lgb = LGBMClassifier(
        objective="binary", 
        random_state=42, 
        verbosity=-1, 
        n_estimators=6,
		learning_rate=0.09554692701030004,
		num_leaves=2860,
		max_depth=3,
		min_data_in_leaf=9,
		lambda_l1=0,
		lambda_l2=18,
		min_gain_to_split=4.111913891598791,
		bagging_fraction=1.0,
		bagging_freq=15,
		feature_fraction=0.6,
		subsample=1.0
)

lgb.fit(x_train, y_train.values.ravel())
preds = lgb.predict(x_test)
print(accuracy_score(y_test, preds))

0.55


### Experiment 3. Different Data split (Same 100 for test, 10 fold -> 50 for validation) --> No particular effect (Deleted code cells since mainly was just copypasting from previous experiments with minor code changes)

### Experiment 4. Tried to change the eval_metric to logloss --> No particular effect (Deleted code cells since mainly was just copypasting from previous experiments with minor code changes)