In [1]:
import pandas as pd

In [2]:
# Load the training dataset from a gzip-compressed CSV; the path is relative to
# the directory the notebook is run from.
df = pd.read_csv('../../buckets/b1/exp/TS5410/dataset_training.csv.gz')

In [3]:
# Recode the target: 'CONTINUA' -> 0, 'BAJA+1' / 'BAJA+2' -> 1.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['clase_ternaria']`: an in-place method on
# a column selection is a chained assignment, which pandas warns about and which
# does not modify `df` once Copy-on-Write is enabled.
df['clase_ternaria'] = df['clase_ternaria'].replace({'CONTINUA':0, 'BAJA+2':1, 'BAJA+1':1})

In [4]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

def undersample_majority(df, target_column):
    """Return a randomly undersampled copy of ``df``.

    The rows are resampled with imblearn's ``RandomUnderSampler`` (default
    sampling strategy, ``random_state=42``), using ``target_column`` as the
    class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing the features and the target column.
    target_column : str
        Name of the column holding the class labels.

    Returns
    -------
    pandas.DataFrame
        The resampled feature columns in their original order, followed by
        ``target_column`` as the last column.
    """
    feature_columns = df.columns.drop(target_column)

    rus = RandomUnderSampler(random_state=42)
    X_undersampled, y_undersampled = rus.fit_resample(df.drop(target_column, axis=1), df[target_column])

    features = pd.DataFrame(X_undersampled, columns=feature_columns)
    target = pd.Series(y_undersampled, index=features.index, name=target_column)

    # Assemble the result with a single concat rather than inserting the target
    # column into an existing wide frame, which is what emitted the warning on
    # the `df_undersampled[target_column] = ...` line.
    return pd.concat([features, target], axis=1)

def rolling_window_df(df, window_size=0.3, step_size=0.10, max_datasets=6):
    """Slice ``df`` into overlapping column windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are windowed (rows are kept as they are).
    window_size : float
        Window width as a fraction of the number of columns.
    step_size : float
        Distance between consecutive window starts, as a fraction of the
        number of columns.
    max_datasets : int or None
        Maximum number of windows to return; a falsy value means no limit.

    Returns
    -------
    list of pandas.DataFrame
        Column slices ``df.iloc[:, start:start + window]`` in left-to-right
        order. Only windows that fit entirely inside ``df`` are returned.
    """
    num_cols = len(df.columns)
    # Clamp to at least one column: for narrow frames the fractions truncate to
    # 0, which would make range() raise (step 0) or produce empty windows.
    window_cols = max(1, int(num_cols * window_size))
    step_cols = max(1, int(num_cols * step_size))

    result = []

    for start_col in range(0, num_cols, step_cols):
        if max_datasets and len(result) >= max_datasets:
            break

        end_col = start_col + window_cols
        if end_col > num_cols:
            # Remaining windows would run past the last column.
            break

        result.append(df.iloc[:, start_col:end_col])
    return result

In [5]:
# Randomly undersample the full frame with undersample_majority, using
# 'clase_ternaria' as the class label.
df_undersampled = undersample_majority(df, 'clase_ternaria')

# Split on the 'fold_test' flag: rows with fold_test == 1 become the test set,
# every other row is training data.
# NOTE(review): the undersampling above runs before this split, so the test
# rows are resampled as well — confirm that is intended.
train_data = df_undersampled[df_undersampled['fold_test'] !=1]
test_data = df_undersampled[df_undersampled['fold_test'] ==1]

  df_undersampled[target_column] = y_undersampled


In [None]:
# train_data = df[df['fold_test'] !=1]
# test_data = df[df['fold_test'] ==1]

In [6]:
# Feature matrix and target for the training rows.
# NOTE(review): only 'clase_ternaria' is dropped, so the 'fold_test' split flag
# is still a column of X — confirm it should be used as a feature.
X = train_data.drop(columns='clase_ternaria')
y = train_data['clase_ternaria']

In [7]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer

# Median-impute missing values so that every model below can be fitted.
# The imputed frame keeps X's index (not a fresh RangeIndex) so its rows stay
# label-aligned with y.
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

# Seed the models (same seed as the undersampler): the importances computed
# here decide the column order used by every dataset further down, so they
# must be reproducible across kernel restarts.
rf_model = RandomForestClassifier(random_state=42)
lgbm_model = LGBMClassifier(random_state=42)
et_model = ExtraTreesClassifier(random_state=42)

# The random-forest fit is currently disabled; only LightGBM and ExtraTrees
# importances are produced.
#rf_model.fit(X_imputed, y)
lgbm_model.fit(X_imputed, y)
et_model.fit(X_imputed, y)

#importances_rf = rf_model.feature_importances_
importances_lgbm = lgbm_model.feature_importances_
importances_et = et_model.feature_importances_

In [None]:
# Average the importances of the models that were actually fitted.
# `importances_rf` is not included: the random-forest fit is commented out, so
# that name does not exist and referencing it raises NameError.
# Each vector is scaled to sum to 1 first, because LightGBM and ExtraTrees
# report importances on different scales and a raw average would be dominated
# by the larger one.
importances_mean = (
    importances_lgbm / importances_lgbm.sum()
    + importances_et / importances_et.sum()
) / 2

In [10]:
# Rank the features by their LightGBM importance, most important first.
feature_importances_df = pd.DataFrame(
    {'feature': X_imputed.columns, 'importance': importances_lgbm}
).sort_values('importance', ascending=False)

In [25]:
# Reorder X's columns from most to least important feature.
# Selecting by name moves each column together with its data; assigning to
# `X.columns` would only overwrite the labels and leave every column holding
# another feature's values.
X = X.loc[:, feature_importances_df['feature'].to_numpy()]

In [100]:
## datasets
# Column subsets of X used as separate experiment inputs.
# NOTE(review): `random_values` and `dataset_rolling` are not defined in any
# cell visible here; `dataset_rolling` is presumably rolling_window_df(X) —
# confirm and define both before this cell so Restart & Run All works.
dataset_1 = X.iloc[:,:int(X.shape[1]*0.3)]    # first 30% of the columns
dataset_2 = X.iloc[:,int(X.shape[1]*0.6):]    # last 40% of the columns
dataset_3 = X[random_values]
dataset_4 = dataset_rolling[1]
dataset_5 = dataset_rolling[2]
dataset_6 = dataset_rolling[3]
dataset_7 = dataset_rolling[4]
dataset_8 = dataset_rolling[5]
# First 15% and last 15% of the columns, side by side. axis=1 is required:
# the default axis=0 stacks the two slices as rows, doubling the row count and
# filling the non-shared columns with NaN.
dataset_9 = pd.concat(
    [X.iloc[:,:int(X.shape[1]*0.15)], X.iloc[:,int(X.shape[1]*0.85):]],
    axis=1,
)
# Every column not in dataset_9, kept in X's order (going through a set would
# make the column order non-deterministic).
dataset_10 = X.loc[:, ~X.columns.isin(dataset_9.columns)]

# experiments

In [101]:
import time
import lightgbm as lgb
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, roc_auc_score
from sklearn.metrics import make_scorer

In [102]:
def custom_metric(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted to NumPy arrays before comparing, so the
    comparison is element-wise and positional for lists, arrays and pandas
    Series alike (plain lists would otherwise compare to a single bool, and
    Series with different indexes would fail to compare).
    """
    # Replace with your own metric calculation
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))


In [None]:
def lgb_objective(trial):
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'custom',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'force_row_wise': True,
        'verbosity': -100,
        'max_depth': -1,
        'min_gain_to_split': 0.0,
        'min_sum_hessian_in_leaf': 0.001,
        'lambda_l1': 0.0,
        'lambda_l2': 0.0,
        'max_bin': 31,
        'num_iterations': 9999,
        'bagging_fraction': 1.0,
        'pos_bagging_fraction': 1.0,
        'neg_bagging_fraction': 1.0,
        'is_unbalance': False,
        'scale_pos_weight': 1.0,
        'drop_rate': 0.1,
        'max_drop': 50,
        'skip_drop': 0.5,
        'extra_trees': True,
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.01, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 4, 1024),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50000),
    }

    scores = cross_val_score(lgb.LGBMClassifier(**params), X, y, cv=5, scoring='roc_auc')
    return scores.mean()

def etc_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 15),
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
    }

    scores = cross_val_score(ExtraTreesClassifier(**params), X, y, cv=5, scoring='roc_auc')
    return scores.mean()

In [104]:
seeds = [0, 1, 2, 3, 4]

In [107]:
def experiments(X, y):
    results_df = pd.DataFrame(columns=['seed', 'model', 'training_time', 'f1_score', 'mcc', 'auc', 'custom_metric'])

    for seed in seeds:    
        np.random.seed(seed)

        start_time = time.time()
        lgb_study = optuna.create_study(direction='maximize')
        lgb_study.optimize(lgb_objective, n_trials=50)
        lgb_model = lgb.LGBMClassifier(**lgb_study.best_params)
        lgb_model.fit(X, y)
        training_time = time.time() - start_time

        # Compute metrics
        y_pred = lgb_model.predict(X)
        results_df = results_df.append({
            'seed': seed,
            'model': 'LightGBM',
            'training_time': training_time,
            'f1_score': f1_score(y, y_pred),
            'mcc': matthews_corrcoef(y, y_pred),
            'auc': roc_auc_score(y, lgb_model.predict_proba(X)[:, 1]),
            'custom_metric': custom_metric(y, y_pred)
        }, ignore_index=True)

        # Train and optimize Extremely Randomized Trees model
        start_time = time.time()
        etc_study = optuna.create_study(direction='maximize')
        etc_study.optimize(etc_objective, n_trials=50)
        etc_model = ExtraTreesClassifier(**etc_study.best_params)
        etc_model.fit(X, y)
        training_time = time.time() - start_time

        # Compute metrics
        y_pred = etc_model.predict(X)
        results_df = results_df.append({
            'seed': seed,
            'model': 'ExtraTrees',
            'training_time': training_time,
            'f1_score': f1_score(y, y_pred),
            'mcc': matthews_corrcoef(y, y_pred),
            'auc': roc_auc_score(y, etc_model.predict_proba(X)[:, 1]),
            'custom_metric': custom_metric(y, y_pred)
        }, ignore_index=True)

        # Train Extremely Randomized Trees model with default parameters
        start_time = time.time()
        etc_model_default = ExtraTreesClassifier(random_state=seed)
        etc_model_default.fit(X, y)
        training_time = time.time() - start_time

        # Compute metrics
        y_pred = etc_model_default.predict(X)
        results_df = results_df.append({
            'seed': seed,
            'model': 'ExtraTreesDefault',
            'training_time': training_time,
            'f1_score': f1_score(y, y_pred),
            'mcc': matthews_corrcoef(y, y_pred),
            'auc': roc_auc_score(y, etc_model_default.predict_proba(X)[:, 1]),
            'custom_metric': custom_metric(y, y_pred)
        }, ignore_index=True)

    return results_df

In [None]:
# Run the per-seed experiment sweep on dataset_1 (the first 30% of X's columns)
# and keep the resulting metrics table.
results_experiment_1 = experiments(dataset_1, y)

[32m[I 2023-05-19 13:45:31,169][0m A new study created in memory with name: no-name-4ae75666-3c01-49ea-93eb-b5997d2f4652[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),




[32m[I 2023-05-19 13:45:44,083][0m Trial 0 finished with value: 0.837015578773633 and parameters: {'lambda_l1': 8.130494584029792, 'lambda_l2': 0.00012840869774737097, 'num_leaves': 20, 'feature_fraction': 0.44685446179702293, 'bagging_fraction': 0.9646990168267559, 'bagging_freq': 3, 'min_child_samples': 40}. Best is trial 0 with value: 0.837015578773633.[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),




[32m[I 2023-05-19 13:46:13,266][0m Trial 1 finished with value: 0.8449213956450817 and parameters: {'lambda_l1': 0.00042994168530016545, 'lambda_l2': 2.018559578612217e-06, 'num_leaves': 160, 'feature_fraction': 0.8684065690253492, 'bagging_fraction': 0.4466508459700563, 'bagging_freq': 3, 'min_child_samples': 79}. Best is trial 1 with value: 0.8449213956450817.[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),




[32m[I 2023-05-19 13:46:51,544][0m Trial 2 finished with value: 0.8345574080516946 and parameters: {'lambda_l1': 1.1842293057475537e-06, 'lambda_l2': 0.12120758403564824, 'num_leaves': 236, 'feature_fraction': 0.8055161041151906, 'bagging_fraction': 0.6973374544557829, 'bagging_freq': 6, 'min_child_samples': 72}. Best is trial 1 with value: 0.8449213956450817.[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),




[32m[I 2023-05-19 13:48:18,044][0m Trial 3 finished with value: 0.8266837326750153 and parameters: {'lambda_l1': 0.0014611489766776238, 'lambda_l2': 0.0003031203578432794, 'num_leaves': 214, 'feature_fraction': 0.9045130843753239, 'bagging_fraction': 0.8810524933406793, 'bagging_freq': 1, 'min_child_samples': 16}. Best is trial 1 with value: 0.8449213956450817.[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),




[32m[I 2023-05-19 13:49:19,859][0m Trial 4 finished with value: 0.8518034796730648 and parameters: {'lambda_l1': 3.663684225200074e-06, 'lambda_l2': 2.7804772048350913e-06, 'num_leaves': 150, 'feature_fraction': 0.9005295030172201, 'bagging_fraction': 0.5439622105314873, 'bagging_freq': 1, 'min_child_samples': 15}. Best is trial 4 with value: 0.8518034796730648.[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),




[32m[I 2023-05-19 13:49:37,420][0m Trial 5 finished with value: 0.8157583711378982 and parameters: {'lambda_l1': 2.4442713928430295e-06, 'lambda_l2': 4.480591492933593e-06, 'num_leaves': 21, 'feature_fraction': 0.9345306889851862, 'bagging_fraction': 0.7784729085447559, 'bagging_freq': 3, 'min_child_samples': 32}. Best is trial 4 with value: 0.8518034796730648.[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),




[32m[I 2023-05-19 13:50:11,939][0m Trial 6 finished with value: 0.8596182060142453 and parameters: {'lambda_l1': 1.1844942681194566e-08, 'lambda_l2': 0.019038574715506126, 'num_leaves': 220, 'feature_fraction': 0.5670557015644967, 'bagging_fraction': 0.43891690997526206, 'bagging_freq': 1, 'min_child_samples': 41}. Best is trial 6 with value: 0.8596182060142453.[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),




[32m[I 2023-05-19 13:50:39,885][0m Trial 7 finished with value: 0.8469525278504282 and parameters: {'lambda_l1': 0.00022239225679499932, 'lambda_l2': 0.573147404656244, 'num_leaves': 97, 'feature_fraction': 0.7667639028475179, 'bagging_fraction': 0.5001645695915331, 'bagging_freq': 3, 'min_child_samples': 62}. Best is trial 6 with value: 0.8596182060142453.[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),




[32m[I 2023-05-19 13:51:04,396][0m Trial 8 finished with value: 0.8200412617601118 and parameters: {'lambda_l1': 1.5024513433804834e-05, 'lambda_l2': 4.162951374814741, 'num_leaves': 45, 'feature_fraction': 0.7440264732541004, 'bagging_fraction': 0.8526132460817514, 'bagging_freq': 3, 'min_child_samples': 52}. Best is trial 6 with value: 0.8596182060142453.[0m
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),


