In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../../buckets/b1/exp/downsampled_dataset_final.csv')
df.drop(columns=['azar'], inplace=True)

In [3]:
train_data = df[df['fold_train'] ==1]
valid_data = df[df['fold_validate'] ==1]
test_data = df[df['fold_test'] ==1]

In [4]:
train_data['clase_ternaria'].value_counts()

clase_ternaria
0    14042
2     7219
1     6823
Name: count, dtype: int64

In [5]:
import random
import pandas as pd

import time
import lightgbm as lgb
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier


def ganancia_custom(y_true, y_pred):
    conditions = [
        (y_pred == 1) & (y_true == 2),
        (y_pred == 1) & (y_true != 2),
        (y_pred == 0)
    ]
    choices = [117000, -3000, 0]
    gan = np.select(conditions, choices)
    total_gan = np.sum(gan)
    return total_gan


def rolling_window_df(df, window_size=0.3, step_size=0.10, max_datasets=6):
    num_cols = len(df.columns)
    window_cols = int(num_cols * window_size)
    step_cols = int(num_cols * step_size)
    
    result = []
    
    for start_col in range(0, num_cols, step_cols):
        if max_datasets and len(result) >= max_datasets:
            break
            
        end_col = start_col + window_cols
        if end_col > num_cols:
            break
            
        result.append(df.iloc[:, start_col:end_col])        
    return result


def lgb_objective(trial, X_train, y_train):
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'custom',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'force_row_wise': True,
        'verbosity': -100,
        'max_depth': -1,
        'min_gain_to_split': 0.0,
        'min_sum_hessian_in_leaf': 0.001,
        'lambda_l1': 0.0,
        'lambda_l2': 0.0,
        'max_bin': 31,
        #'num_iterations': 9999,
        'bagging_fraction': 1.0,
        'pos_bagging_fraction': 1.0,
        'neg_bagging_fraction': 1.0,
        'is_unbalance': False,
        'scale_pos_weight': 1.0,
        'drop_rate': 0.1,
        'max_drop': 50,
        'skip_drop': 0.5,
        'extra_trees': True,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.01, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 4, 1024),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 10000),
    }
    scores = cross_val_score(lgb.LGBMClassifier(**params), X_train, y_train, cv=3, scoring='roc_auc')
    return scores.mean()

def etc_objective(trial, X_train, y_train):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 15),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }
    scores = cross_val_score(ExtraTreesClassifier(**params), X_train, y_train, cv=3, scoring='roc_auc')
    return scores.mean()

def experiments(X_train, y_train, X_test, y_test, seeds, n_trials=1, n_top_range=range(5000, 15001, 1000)):
    results_df = pd.DataFrame(columns=['seed', 'model', 'training_time', 'f1_score', 'mcc', 'auc', 'ganancia_custom'])
    imputer = SimpleImputer(strategy='mean')   
    
    y_test_baja = y_test.copy()
    y_train = y_train.replace({2:1})      
    y_test = y_test.replace({2:1})
    
    X_test = X_test[X_train.columns]    

    for seed in seeds:    
        np.random.seed(seed)
        print(f'Experiment with seed {seed}')
        
        
        print('LightGBM HPO')
        start_time = time.time()
        lgb_study = optuna.create_study(direction='maximize')
        lgb_study.optimize(lambda trial: lgb_objective(trial, X_train, y_train), n_trials=n_trials)
        lgb_model = lgb.LGBMClassifier(**lgb_study.best_params)
        lgb_model.fit(X_train, y_train)
        training_time = time.time() - start_time     

        ganancia_custom_values = []
        for n_top in n_top_range:
            y_pred_proba = lgb_model.predict_proba(X_test)[:, 1]
            threshold = np.sort(y_pred_proba)[-n_top]
            y_pred = (y_pred_proba >= threshold).astype(int)

            ganancia_custom_values.append(ganancia_custom(y_test_baja, y_pred))

        results = pd.DataFrame({
            'seed': [seed],
            'model': ['LightGBM HPO'],
            'training_time': [training_time],
            'f1_score': [f1_score(y_test, y_pred)],
            'mcc': [matthews_corrcoef(y_test, y_pred)],
            'auc': [roc_auc_score(y_test, y_pred_proba)],
            'ganancia_custom': [ganancia_custom_values]
        })
        
        print(f'LightGBM HPO con trials={n_trials} tardo : {training_time}')

        results_df = pd.concat([results_df, results], ignore_index=True)
        
        ## fill the nulls
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)
        
        
        
        print('ExtraTreesDefault')
      

        start_time = time.time()
        etc_model_default = ExtraTreesClassifier(random_state=seed,n_jobs=-1)
        etc_model_default.fit(X_train, y_train)
        training_time = time.time() - start_time

        ganancia_custom_values = []
        for n_top in n_top_range:
            y_pred_proba = etc_model_default.predict_proba(X_test)[:, 1]
            threshold = np.sort(y_pred_proba)[-n_top]
            y_pred = (y_pred_proba >= threshold).astype(int)

            ganancia_custom_values.append(ganancia_custom(y_test_baja, y_pred))

        results = pd.DataFrame({
            'seed': [seed],
            'model': ['ExtraTreesDefault'],
            'training_time': [training_time],
            'f1_score': [f1_score(y_test, y_pred)],
            'mcc': [matthews_corrcoef(y_test, y_pred)],
            'auc': [roc_auc_score(y_test, y_pred_proba)],
            'ganancia_custom': [ganancia_custom_values]
        })
        print(f'ExtraTreesDefault tardo : {training_time}')

        results_df = pd.concat([results_df, results], ignore_index=True)        
        
        print('RandomForestDefault')
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        start_time = time.time()
        etc_model_default = RandomForestClassifier(random_state=seed,n_jobs=-1)
        etc_model_default.fit(X_train, y_train)
        training_time = time.time() - start_time

        ganancia_custom_values = []
        for n_top in n_top_range:
            y_pred_proba = etc_model_default.predict_proba(X_test)[:, 1]
            threshold = np.sort(y_pred_proba)[-n_top]
            y_pred = (y_pred_proba >= threshold).astype(int)

            ganancia_custom_values.append(ganancia_custom(y_test_baja, y_pred))

        results = pd.DataFrame({
            'seed': [seed],
            'model': ['RandomForestDefault'],
            'training_time': [training_time],
            'f1_score': [f1_score(y_test, y_pred)],
            'mcc': [matthews_corrcoef(y_test, y_pred)],
            'auc': [roc_auc_score(y_test, y_pred_proba)],
            'ganancia_custom': [ganancia_custom_values]
        })
        print(f'RandomForestDefault tardo : {training_time}')

        results_df = pd.concat([results_df, results], ignore_index=True)
        
    
    return results_df


In [6]:
X_valid = valid_data.drop(columns='clase_ternaria')
y_valid = valid_data['clase_ternaria']

X_test = test_data.drop(columns='clase_ternaria')
y_test = test_data['clase_ternaria']

X_train = train_data.drop(columns='clase_ternaria')
y_train = train_data['clase_ternaria']

## get feature importance

In [7]:
lgbm_model = LGBMClassifier()
lgbm_model.fit(X_train, y_train)
importances_lgbm = lgbm_model.feature_importances_
feature_importances_df = pd.DataFrame({'feature': X_train.columns, 'importance': importances_lgbm})
feature_importances_df.sort_values('importance', ascending=False, inplace=True)
X_train.columns = feature_importances_df['feature'].values

## build datasets

In [8]:
## datasets
dataset_1 = X_train.iloc[:,:int(X_train.shape[1]*0.3)]
dataset_2 = X_train.iloc[:,int(X_train.shape[1]*0.6):]
dataset_3 = X_train[random.sample(X_train.columns.to_list(), k=int(X_train.shape[1]*0.3))]

dataset_rolling = rolling_window_df(X_train)
dataset_4 = dataset_rolling[1]
dataset_5 = dataset_rolling[2]
dataset_6 = dataset_rolling[3]
dataset_7 = dataset_rolling[4]
dataset_8 = dataset_rolling[5]

dataset_9 = pd.concat([X_train.iloc[:,:int(X_train.shape[1]*0.15)],X_train.iloc[:,int(X_train.shape[1]*0.85):]])
dataset_10 = X_train[list(set(X_train.columns) - set(dataset_9.columns))]
dataset_11 = X_train

# experiments

In [16]:
seeds = [677213, 727817,311237, 660719, 106427]

In [17]:
seeds = [x*2 for x in seeds] + [x*3 for x in seeds] + [x*4 for x in seeds] 

In [19]:
len(seeds)

15

In [None]:
datasets = [dataset_1, dataset_2,dataset_3,dataset_4,dataset_5,dataset_6,dataset_7,dataset_8,dataset_9,dataset_10,dataset_11]
results = []

for i, dataset in enumerate(datasets, start=1):
    result = experiments(dataset, y_train, X_test, y_test, seeds, n_trials=50)
    result.to_csv(f'../../buckets/b1/exp/results_exp/exp_seeds_{i}.csv')
    results.append(result)

[32m[I 2023-05-23 00:42:05,113][0m A new study created in memory with name: no-name-6eaf84a2-ea7e-4ac3-a8db-2a2279a72f96[0m


Experiment with seed 1354426
LightGBM HPO


[32m[I 2023-05-23 00:42:05,669][0m Trial 0 finished with value: 0.8750204562979396 and parameters: {'learning_rate': 0.03050654674153503, 'feature_fraction': 0.9595511943484935, 'num_leaves': 148, 'min_data_in_leaf': 8337}. Best is trial 0 with value: 0.8750204562979396.[0m




[32m[I 2023-05-23 00:42:06,310][0m Trial 1 finished with value: 0.878649323236754 and parameters: {'learning_rate': 0.2446365032149521, 'feature_fraction': 0.12348467500490463, 'num_leaves': 383, 'min_data_in_leaf': 6717}. Best is trial 1 with value: 0.878649323236754.[0m




[32m[I 2023-05-23 00:42:06,956][0m Trial 2 finished with value: 0.8944718754893523 and parameters: {'learning_rate': 0.13652152803326276, 'feature_fraction': 0.04887373447080844, 'num_leaves': 417, 'min_data_in_leaf': 2772}. Best is trial 2 with value: 0.8944718754893523.[0m




[32m[I 2023-05-23 00:42:07,689][0m Trial 3 finished with value: 0.8385575748287627 and parameters: {'learning_rate': 0.15162597122593674, 'feature_fraction': 0.3932000338844786, 'num_leaves': 535, 'min_data_in_leaf': 9158}. Best is trial 2 with value: 0.8944718754893523.[0m




[32m[I 2023-05-23 00:42:08,702][0m Trial 4 finished with value: 0.9170774510805842 and parameters: {'learning_rate': 0.17864379476974088, 'feature_fraction': 0.4447013145189686, 'num_leaves': 609, 'min_data_in_leaf': 2372}. Best is trial 4 with value: 0.9170774510805842.[0m




[32m[I 2023-05-23 00:42:09,708][0m Trial 5 finished with value: 0.9201228619621932 and parameters: {'learning_rate': 0.19101075559490843, 'feature_fraction': 0.07712355985826164, 'num_leaves': 569, 'min_data_in_leaf': 451}. Best is trial 5 with value: 0.9201228619621932.[0m




[32m[I 2023-05-23 00:42:10,308][0m Trial 6 finished with value: 0.8886411961457984 and parameters: {'learning_rate': 0.19504962289190145, 'feature_fraction': 0.8488730069642415, 'num_leaves': 865, 'min_data_in_leaf': 6708}. Best is trial 5 with value: 0.9201228619621932.[0m




[32m[I 2023-05-23 00:42:10,915][0m Trial 7 finished with value: 0.884346674882086 and parameters: {'learning_rate': 0.08128043523338452, 'feature_fraction': 0.8538633115144444, 'num_leaves': 1017, 'min_data_in_leaf': 7491}. Best is trial 5 with value: 0.9201228619621932.[0m




[32m[I 2023-05-23 00:42:11,503][0m Trial 8 finished with value: 0.8281812670064235 and parameters: {'learning_rate': 0.10492335556716148, 'feature_fraction': 0.041850415171505596, 'num_leaves': 525, 'min_data_in_leaf': 8383}. Best is trial 5 with value: 0.9201228619621932.[0m




[32m[I 2023-05-23 00:42:12,217][0m Trial 9 finished with value: 0.5 and parameters: {'learning_rate': 0.032486452268975057, 'feature_fraction': 0.5773358228451937, 'num_leaves': 466, 'min_data_in_leaf': 9598}. Best is trial 5 with value: 0.9201228619621932.[0m




[32m[I 2023-05-23 00:42:13,074][0m Trial 10 finished with value: 0.9238793524609963 and parameters: {'learning_rate': 0.29889737558768287, 'feature_fraction': 0.22551186936103967, 'num_leaves': 7, 'min_data_in_leaf': 996}. Best is trial 10 with value: 0.9238793524609963.[0m




[32m[I 2023-05-23 00:42:14,107][0m Trial 11 finished with value: 0.9289553965849495 and parameters: {'learning_rate': 0.29493176107058006, 'feature_fraction': 0.23443727612654666, 'num_leaves': 16, 'min_data_in_leaf': 174}. Best is trial 11 with value: 0.9289553965849495.[0m




[32m[I 2023-05-23 00:42:15,490][0m Trial 12 finished with value: 0.9295138382117402 and parameters: {'learning_rate': 0.29627269707329484, 'feature_fraction': 0.26574891009386015, 'num_leaves': 33, 'min_data_in_leaf': 153}. Best is trial 12 with value: 0.9295138382117402.[0m




[32m[I 2023-05-23 00:42:16,362][0m Trial 13 finished with value: 0.9065374835521779 and parameters: {'learning_rate': 0.2997016398723544, 'feature_fraction': 0.26783670999220627, 'num_leaves': 186, 'min_data_in_leaf': 4102}. Best is trial 12 with value: 0.9295138382117402.[0m




In [48]:
results[0]

Unnamed: 0,seed,model,training_time,f1_score,mcc,auc,ganancia_custom
0,677213,LightGBM HPO,3.700269,0.013718,-0.016869,0.525293,"[-12252000, -14880000, -17208000, -19683000, -..."
1,677213,ExtraTreesDefault,0.604618,0.038049,0.020715,0.582855,"[-6594000, -6594000, -9942000, -9942000, -1381..."
2,677213,RandomForestDefault,1.094883,0.039401,0.022939,0.646106,"[-10401000, -10401000, -16665000, -16665000, -..."
