In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../../buckets/b1/exp/TS5410/dataset_training.csv.gz')

In [3]:
df['clase_ternaria'].replace({'CONTINUA':0, 'BAJA+2':1, 'BAJA+1':1}, inplace=True)

In [20]:
import random
import pandas as pd

import time
import lightgbm as lgb
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, roc_auc_score
from sklearn.metrics import make_scorer


def undersample_majority(df, target_column):
    rus = RandomUnderSampler(random_state=42)
    X_undersampled, y_undersampled = rus.fit_resample(df.drop(target_column, axis=1), df[target_column])
    df_undersampled = pd.DataFrame(X_undersampled, columns=df.columns.drop(target_column))
    df_undersampled[target_column] = y_undersampled
    return df_undersampled

def rolling_window_df(df, window_size=0.3, step_size=0.10, max_datasets=6):
    num_cols = len(df.columns)
    window_cols = int(num_cols * window_size)
    step_cols = int(num_cols * step_size)
    
    result = []
    
    for start_col in range(0, num_cols, step_cols):
        if max_datasets and len(result) >= max_datasets:
            break
            
        end_col = start_col + window_cols
        if end_col > num_cols:
            break
            
        result.append(df.iloc[:, start_col:end_col])        
    return result


def lgb_objective(trial):
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'custom',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'force_row_wise': True,
        'verbosity': -100,
        'max_depth': -1,
        'min_gain_to_split': 0.0,
        'min_sum_hessian_in_leaf': 0.001,
        'lambda_l1': 0.0,
        'lambda_l2': 0.0,
        'max_bin': 31,
        'num_iterations': 9999,
        'bagging_fraction': 1.0,
        'pos_bagging_fraction': 1.0,
        'neg_bagging_fraction': 1.0,
        'is_unbalance': False,
        'scale_pos_weight': 1.0,
        'drop_rate': 0.1,
        'max_drop': 50,
        'skip_drop': 0.5,
        'extra_trees': True,
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.01, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 4, 1024),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50000),
    }

    scores = cross_val_score(lgb.LGBMClassifier(**params), X, y, cv=5, scoring='roc_auc')
    return scores.mean()

def etc_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 15),
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
    }

    scores = cross_val_score(ExtraTreesClassifier(**params), X, y, cv=5, scoring='roc_auc')
    return scores.mean()


def experiments(X, y, seeds):
    results_df = pd.DataFrame(columns=['seed', 'model', 'training_time', 'f1_score', 'mcc', 'auc', 'custom_metric'])

    for seed in seeds:    
        np.random.seed(seed)

        start_time = time.time()
        lgb_study = optuna.create_study(direction='maximize')
        lgb_study.optimize(lgb_objective, n_trials=50)
        lgb_model = lgb.LGBMClassifier(**lgb_study.best_params)
        lgb_model.fit(X, y)
        training_time = time.time() - start_time

 
        y_pred = lgb_model.predict(X)
        results_df = results_df.append({
            'seed': seed,
            'model': 'LightGBM',
            'training_time': training_time,
            'f1_score': f1_score(y, y_pred),
            'mcc': matthews_corrcoef(y, y_pred),
            'auc': roc_auc_score(y, lgb_model.predict_proba(X)[:, 1]),
            'custom_metric': custom_metric(y, y_pred)
        }, ignore_index=True)

      
        start_time = time.time()
        etc_study = optuna.create_study(direction='maximize')
        etc_study.optimize(etc_objective, n_trials=50)
        etc_model = ExtraTreesClassifier(**etc_study.best_params)
        etc_model.fit(X, y)
        training_time = time.time() - start_time

      
        y_pred = etc_model.predict(X)
        results_df = results_df.append({
            'seed': seed,
            'model': 'ExtraTrees',
            'training_time': training_time,
            'f1_score': f1_score(y, y_pred),
            'mcc': matthews_corrcoef(y, y_pred),
            'auc': roc_auc_score(y, etc_model.predict_proba(X)[:, 1]),
            'custom_metric': custom_metric(y, y_pred)
        }, ignore_index=True)

        
        start_time = time.time()
        etc_model_default = ExtraTreesClassifier(random_state=seed)
        etc_model_default.fit(X, y)
        training_time = time.time() - start_time

  
        y_pred = etc_model_default.predict(X)
        results_df = results_df.append({
            'seed': seed,
            'model': 'ExtraTreesDefault',
            'training_time': training_time,
            'f1_score': f1_score(y, y_pred),
            'mcc': matthews_corrcoef(y, y_pred),
            'auc': roc_auc_score(y, etc_model_default.predict_proba(X)[:, 1]),
            'custom_metric': custom_metric(y, y_pred)
        }, ignore_index=True)

    return results_df

In [5]:
df_undersampled = undersample_majority(df, 'clase_ternaria')

train_data = df_undersampled[df_undersampled['fold_test'] !=1]
test_data = df_undersampled[df_undersampled['fold_test'] ==1]

  df_undersampled[target_column] = y_undersampled


In [6]:
# train_data = df[df['fold_test'] !=1]
# test_data = df[df['fold_test'] ==1]

In [7]:
X = train_data.drop(columns='clase_ternaria')
y = train_data['clase_ternaria']

In [None]:
## get feature importance

In [9]:
lgbm_model = LGBMClassifier()
lgbm_model.fit(X, y)
importances_lgbm = lgbm_model.feature_importances_
feature_importances_df = pd.DataFrame({'feature': X.columns, 'importance': importances_lgbm})
feature_importances_df.sort_values('importance', ascending=False, inplace=True)
X.columns = feature_importances_df['feature'].values

## build datasets

In [22]:
## datasets
dataset_1 = X.iloc[:,:int(X.shape[1]*0.3)]
dataset_2 = X.iloc[:,int(X.shape[1]*0.6):]
dataset_3 = X[random.sample(X.columns.to_list(), k=int(X.shape[1]*0.3))]

dataset_rolling = rolling_window_df(X)
dataset_4 = dataset_rolling[1]
dataset_5 = dataset_rolling[2]
dataset_6 = dataset_rolling[3]
dataset_7 = dataset_rolling[4]
dataset_8 = dataset_rolling[5]

dataset_9 = pd.concat([X.iloc[:,:int(X.shape[1]*0.15)],X.iloc[:,int(X.shape[1]*0.85):]])
dataset_10 = X[list(set(X.columns) - set(dataset_9.columns))]

# experiments

In [24]:
def custom_metric(y_true, y_pred):
    return np.mean(y_true == y_pred)

In [27]:
seeds = [677213, 727817, 311237, 660719, 106427]

In [None]:
results_experiment_1 = experiments(dataset_1, y, seeds)

[32m[I 2023-05-19 14:33:30,873][0m A new study created in memory with name: no-name-846faa70-c1d6-435c-a9c6-4122f8136ee9[0m
  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.01, 1.0),














