In [1]:
import numpy as np
import pandas as pd
import tqdm 
import cProfile
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import optuna
import logging
import time
from scipy.stats import pearsonr

data = pd.read_feather("data/processed/massBV.feather")

In [2]:
def prep_data_before_train(data: pd.DataFrame, phenotype: str) -> tuple:
    """
    prepare data for training, returns target vector and covariate-matrix and ringnrs for grouping
    :param data: all data from dataloader script
    :param phenotype: the phenotype to be predicted
    :return: X, Y, ringnrs, X contain covariates and Y contains the phenotype
    """
    ringnrs = data.ringnr
    mean_pheno = data.mean_pheno
    X = data.drop(
        columns=[
            "ID",
            "mass",
            "tarsus",
            "ringnr",
            "mean_pheno",
            "IID",
            "FID",
            "MAT",
            "PAT",
            "SEX",
            "PHENOTYPE",
        ],
        errors="ignore",
    )

    try:
        Y = data.loc[:, phenotype]
    except KeyError:
        try:
            Y = data.ID
        except AttributeError:
            Y = data.mass
    del data
    try:
        X.hatchyear = X.hatchyear.astype("int")
        X.island_current = X.island_current.astype("int")
    except AttributeError:
        pass
    Y = (Y - np.mean(Y))/np.std(Y)
    X = X.fillna(0)
    X = X.T
    X = X.astype("int")
    X = X.T 
    return X, Y, ringnrs, mean_pheno

In [3]:
X, y, ringnrs, mean_pheno = prep_data_before_train(data, "mass")
del data
X.drop(columns = ["hatchisland"], inplace = True)
X["ringnr"] = ringnrs   

target = pd.DataFrame(y)
target["mean_pheno"] = mean_pheno
target["ringnr"] = ringnrs

In [4]:
def random_split(pheno:str = "mass", num_folds:int = 5, seed: int = 42) -> None:
    data = pd.read_feather(f"data/processed/{pheno}BV.feather")  
    x = data[["ringnr","hatchisland"]]
    del data
    np.random.seed(seed)
    #Split the data into num_folds folds with random sampling, each fold should have the same number of individuals
    #The folds should be saved in a new column called "fold" in the data frame

    x = x.sample(frac=1, random_state=seed).reset_index(drop=True)
    # Number of folds
    n_folds = num_folds

    # Create an array with fold numbers
    folds = np.array([i % n_folds for i in range(len(x))])

    # Shuffle the fold assignment array
    #np.random.shuffle(folds)

    # Assign the fold numbers to a new column in the dataframe
    x['fold'] = folds

    return x[['ringnr','fold']]

splits = random_split("mass", 10, 42)

In [5]:
X = pd.merge(X,splits, on = "ringnr", how = "inner") 
X = pd.merge(X,target, on = "ringnr", how = "inner")  

In [6]:
def subset(X:pd.DataFrame, seed:int = 42, snp_prop:float = 0.2) -> pd.DataFrame:
    """
    Function to subset the data into a smaller dataset for testing purposes, 
    sampling only a subset of the features
    """
    np.random.seed(seed)
    sample_columns = np.random.choice(X.columns[:-4], int(len(X.columns[:-4])*snp_prop), replace = False)
    sample_columns = np.append(sample_columns, ["ringnr","fold", "ID", "mean_pheno"])

    return X[sample_columns]

In [7]:
X = subset(X, snp_prop = 0.1)

In [8]:
X

Unnamed: 0,SNPa352024_T,SNPa51204_C,SNPa371349_T,SNPa86522_T,SNPa395916_T,SNPa302259_A,SNPa524601_A,SNPa483891_T,SNPa207156_T,SNPa77054_G,...,SNPa437479_G,SNPa179086_T,SNPa392716_T,SNPa415615_G,SNPa280074_G,SNPa220371_G,ringnr,fold,ID,mean_pheno
0,0,0,1,2,1,0,0,1,0,0,...,1,1,1,1,0,0,8118424,8,0.519577,32.600000
1,0,0,1,2,0,0,0,1,0,1,...,1,0,1,1,1,0,8118425,4,-1.044296,30.300000
2,0,0,0,1,0,0,0,0,0,1,...,2,1,1,0,0,1,8118426,7,0.918158,33.316667
3,0,0,0,0,0,0,0,1,0,1,...,1,1,1,1,0,0,8118429,1,-0.093239,31.300000
4,0,0,0,2,1,0,0,1,0,0,...,1,2,1,1,0,0,8118430,9,-1.010615,29.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,0,0,0,0,0,0,0,0,0,1,...,1,1,0,0,1,0,8N87368,5,0.523029,32.900000
1914,0,0,0,1,0,0,1,0,1,1,...,0,2,0,0,1,0,8N87369,5,-0.241355,30.900000
1915,0,0,0,0,1,1,0,0,1,2,...,1,1,0,0,0,1,8N87371,7,0.515544,32.500000
1916,0,0,0,1,0,0,0,0,1,1,...,0,1,0,0,0,1,8N87372,3,0.494145,32.800000


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# Replace with your model

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Assuming your dataframe 'df' has a column 'island' with values 1 to 8 representing the islands
# and a 'fold' column representing the 8 folds, and a 'target' column for the target variable

# Function to perform the nested cross-validation
def nested_cv(df):
    
    #outer_results = []

    
    # Define the Optuna objective function
    def objective(trial):
        logger.info(f"Starting trial {trial.number}")

        # Define hyperparameter search space
        n_estimators = trial.suggest_int('n_estimators', 300, 1000)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        max_depth = trial.suggest_int('max_depth', 3, 13)
        num_leaves = trial.suggest_int('num_leaves', 20, 3000, step = 20)
        reg_lambda = trial.suggest_int('reg_lambda', 0, 100, step = 5)
        reg_alpha = trial.suggest_int('reg_alpha', 0 , 100, step = 5)
        bagging_fraction = trial.suggest_float('bagging_fraction', 0.2, 0.9, step = 0.1)
        bagging_freq = trial.suggest_int('bagging_freq', 1, 10, step = 1)
        feature_fraction = trial.suggest_float('feature_fraction', 0.2, 0.9, step = 0.1)

        #parameters
        params = {
            'objective': 'regression',
            'metric': 'mae',    
            #'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'max_depth': max_depth,
            'num_leaves': num_leaves,
            'reg_lambda': reg_lambda,
            'reg_alpha': reg_alpha,
            'bagging_fraction': bagging_fraction,
            'bagging_freq': bagging_freq,
            'feature_fraction': feature_fraction,
            'verbose': -1
        }

        outer_results = []  
        folds = np.sort(df['fold'].unique())    
        for outer_fold in folds:
            #print(f"Outer CV: Test island {outer_fold}")
        
            # Split data into outer train and test sets
            outer_train_df = df[df['fold'] != outer_fold]
            outer_test_df = df[df['fold'] == outer_fold]

            # Extract features and target for outer training and test sets
            X_train_outer, y_train_outer = outer_train_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), outer_train_df['ID']
            X_test_outer, y_test_outer = outer_test_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), outer_test_df['mean_pheno']

            #split X_train_outer and y_train_outer into train and validation set
            X_train_outer, X_val_outer, y_train_outer, y_val_outer = train_test_split(X_train_outer, y_train_outer, test_size = 0.1, random_state = 42)

            # Train on the outer train set
            dtrain = lgb.Dataset(X_train_outer, label=y_train_outer)
            dval = lgb.Dataset(X_val_outer, label=y_val_outer, reference=dtrain)

            gbm = lgb.train(params, dtrain, num_boost_round=n_estimators, valid_sets=[dtrain,dval], valid_names=['train', 'valid'], callbacks=[lgb.early_stopping(stopping_rounds=100)])

            # Validate model
            y_pred_outer = gbm.predict(X_test_outer, num_iteration = gbm.best_iteration)
            corr, _ = pearsonr(y_test_outer, y_pred_outer)
            #print(y_pred_outer)
            #print()
            #print(y_test_outer)

            #print(f"Test correlation for island {outer_fold}: {outer_corr}")
            outer_results.append(corr)
        
        # Final results
        print(f"Results: {outer_results}")
        print(f"Average correlation across all outer folds: {np.mean(outer_results)}")
        return np.mean(outer_results)
    
    # Run Optuna for hyperparameter optimization
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=15, n_jobs=4)

    return study


s = nested_cv(X)

            # Perform 7-fold CV on the 7 remaining islands
        #    inner_corr = []
        #    start_time = time.time()
        #    for inner_fold in folds:
        #        if inner_fold == outer_fold:
        #            continue  # Skip the current test island
        #        
        #        # Split inner train/validation data
        #        train_df = outer_train_df[outer_train_df['fold'] != inner_fold]
        #        val_df = outer_train_df[outer_train_df['fold'] == inner_fold]
        #        
        #        # Extract features and target
        #        X_train, y_train = train_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), train_df['ID']
        #        X_val, y_val = val_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), val_df['mean_pheno'] #Use this for computing Pearson correlation, this will be the performance metric for the inner loop#
        #
        #        # Train model
        #        dtrain = lgb.Dataset(X_train, label=y_train)
        #        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        #        gbm = lgb.train(params, dtrain, num_boost_round=n_estimators, valid_sets=[dval], callbacks=[lgb.early_stopping(stopping_rounds=20)])
                # Validate model
        #        y_pred = gbm.predict(X_val)
        #        corr = np.corrcoef(y_val, y_pred)[0, 1]
        #        inner_corr.append(corr)
                
        #        end_time = time.time()
        #    logger.info(f"Finished trial {trial.number} in {end_time - start_time:.2f} seconds")
            # Return the average MAE for this set of hyperparameters
        #    return np.mean(inner_corr)
        
        # Run Optuna for hyperparameter optimization
        #study = optuna.create_study(direction='maximize')
        #study.optimize(objective, n_trials=10, n_jobs=4)
        
        # Train the final model on the outer train set with the best hyperparameters
        #best_params = study.best_params

        #n_estimators = best_params['n_estimators']
        #best_params.pop('n_estimators')
        #best_params['objective'] = 'regression'
        #best_params['metric'] = 'mae'
        #best_params['verbose'] = -1
        
        # Extract features and target for outer training and test sets
        #X_train_outer, y_train_outer = outer_train_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), outer_train_df['ID']
        #X_test_outer, y_test_outer = outer_test_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), outer_test_df['mean_pheno']
        
        # Train on the outer train set
        #dtrain = lgb.Dataset(X_train_outer, label=y_train_outer)
        #dval = lgb.Dataset(X_test_outer, label=y_test_outer, reference=dtrain)  
        #gbm = lgb.train(best_params, dtrain, num_boost_round=n_estimators, valid_sets=[dval], callbacks=[lgb.early_stopping(stopping_rounds=20)])
        
        # Test on the outer test set
        #y_pred_outer = gbm.predict(X_test_outer)
        #outer_corr = np.corrcoef(y_test_outer, y_pred_outer)[0, 1]
        
        #print(f"Test correlation for island {outer_fold}: {outer_corr}")
        #outer_results.append(outer_corr)
    
    # Final results
    #print(f"Average correlation across all outer folds: {np.mean(outer_results)}")
    #return outer_results

# Example dataframe with 'island', 'fold', 'target', and other features



[I 2024-09-26 11:51:18,844] A new study created in memory with name: no-name-2e9c6589-7056-44f7-96ff-1052c4d23999
INFO:__main__:Starting trial 0
INFO:__main__:Starting trial 1
INFO:__main__:Starting trial 2
INFO:__main__:Starting trial 3


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[16]	train's l1: 0.770693	valid's l1: 0.798822
Early stopping, best iteration is:
[1]	train's l1: 0.771664	valid's l1: 0.799257


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[173]	train's l1: 0.585095	valid's l1: 0.757215
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.766692	valid's l1: 0.74104


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[570]	train's l1: 0.693655	valid's l1: 0.76943
Early stopping, best iteration is:
[8]	train's l1: 0.707099	valid's l1: 0.725577
Early stopping, best iteration is:
[536]	train's l1: 0.758255	valid's l1: 0.735442
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[67]	train's l1: 0.780211	valid's l1: 0.716737
Early stopping, best iteration is:
[16]	train's l1: 0.745656	valid's l1: 0.733299
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[50]	train's l1: 0.618251	valid's l1: 0.688111
Early stopping, best iteration is:
[1]	train's l1: 0.782413	valid's l1: 0.

  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[327]	train's l1: 0.772354	valid's l1: 0.711113
Early stopping, best iteration is:
[240]	train's l1: 0.699313	valid's l1: 0.689385
Early stopping, best iteration is:
[1]	train's l1: 0.779909	valid's l1: 0.72832


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[226]	train's l1: 0.563166	valid's l1: 0.683792
Early stopping, best iteration is:
[1]	train's l1: 0.778605	valid's l1: 0.785872


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[513]	train's l1: 0.698668	valid's l1: 0.687998
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[397]	train's l1: 0.770466	valid's l1: 0.710121
Early stopping, best iteration is:
[1]	train's l1: 0.787401	valid's l1: 0.667295


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[17]	train's l1: 0.682372	valid's l1: 0.704406
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.755971	valid's l1: 0.815445


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[316]	train's l1: 0.708864	valid's l1: 0.694954
Early stopping, best iteration is:
[26]	train's l1: 0.777028	valid's l1: 0.72553
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.770481	valid's l1: 0.771553


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[155]	train's l1: 0.721878	valid's l1: 0.743235
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[201]	train's l1: 0.771957	valid's l1: 0.779583
Early stopping, best iteration is:
[1]	train's l1: 0.771604	valid's l1: 0.774018


  corr, _ = pearsonr(y_test_outer, y_pred_outer)
[W 2024-09-26 11:56:52,797] Trial 0 failed with parameters: {'n_estimators': 631, 'learning_rate': 0.17207605555757832, 'max_depth': 4, 'num_leaves': 760, 'reg_lambda': 75, 'reg_alpha': 75, 'bagging_fraction': 0.2, 'bagging_freq': 8, 'feature_fraction': 0.4} because of the following error: The value nan is not acceptable.
[W 2024-09-26 11:56:52,800] Trial 0 failed with value nan.
INFO:__main__:Starting trial 4


Results: [nan, nan, -0.006431058712651164, nan, nan, nan, nan, nan, nan, nan]
Average correlation across all outer folds: nan
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[409]	train's l1: 0.569164	valid's l1: 0.687824
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[255]	train's l1: 0.712102	valid's l1: 0.6462
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[153]	train's l1: 0.781099	valid's l1: 0.663924
Early stopping, best iteration is:
[238]	train's l1: 0.768099	valid's l1: 0.797173
Early stopping, best iteration is:
[67]	train's l1: 0.607602	valid's l1: 0.632316
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation sc

[I 2024-09-26 11:59:48,204] Trial 3 finished with value: 0.09956891916550838 and parameters: {'n_estimators': 857, 'learning_rate': 0.18252607689412392, 'max_depth': 11, 'num_leaves': 2520, 'reg_lambda': 25, 'reg_alpha': 90, 'bagging_fraction': 0.9, 'bagging_freq': 3, 'feature_fraction': 0.2}. Best is trial 3 with value: 0.09956891916550838.
INFO:__main__:Starting trial 5


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[281]	train's l1: 0.707304	valid's l1: 0.732063


[I 2024-09-26 12:00:09,935] Trial 2 finished with value: 0.1714361555153729 and parameters: {'n_estimators': 804, 'learning_rate': 0.26269868952390557, 'max_depth': 6, 'num_leaves': 1620, 'reg_lambda': 85, 'reg_alpha': 65, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 2 with value: 0.1714361555153729.
INFO:__main__:Starting trial 6


Results: [0.19310182554384653, 0.09277574803582715, 0.05987497621568082, 0.12440156466918639, 0.2820890293241878, 0.2659333227756377, 0.08056893628349371, 0.14081619561202088, 0.23941086565126224, 0.23538909104258599]
Average correlation across all outer folds: 0.1714361555153729
Early stopping, best iteration is:
[1]	train's l1: 0.771664	valid's l1: 0.799257


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[134]	train's l1: 0.766853	valid's l1: 0.719335
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[92]	train's l1: 0.445935	valid's l1: 0.765527
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.766692	valid's l1: 0.74104


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[269]	train's l1: 0.582816	valid's l1: 0.703063


[I 2024-09-26 12:00:37,842] Trial 1 finished with value: 0.16854168703494252 and parameters: {'n_estimators': 773, 'learning_rate': 0.2878367209147297, 'max_depth': 12, 'num_leaves': 2040, 'reg_lambda': 40, 'reg_alpha': 45, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 2 with value: 0.1714361555153729.
INFO:__main__:Starting trial 7


Results: [0.20156695258525695, 0.13380045710090635, 0.03558873813868666, 0.1477064658688511, 0.16915883869057394, 0.25379607119089476, 0.2337346631559402, 0.19489454904670572, 0.06802642582120201, 0.2471437087504076]
Average correlation across all outer folds: 0.16854168703494252
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[210]	train's l1: 0.773124	valid's l1: 0.782537
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[29]	train's l1: 0.614748	valid's l1: 0.722049
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.771664	valid's l1: 0.799257


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[137]	train's l1: 0.783063	valid's l1: 0.716877
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[140]	train's l1: 0.785354	valid's l1: 0.666681
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[79]	train's l1: 0.469675	valid's l1: 0.666589
Early stopping, best iteration is:
[1]	train's l1: 0.782413	valid's l1: 0.717614


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[102]	train's l1: 0.766249	valid's l1: 0.740342
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	train's l1: 0.558008	valid's l1: 0.684671
Early stopping, best iteration is:
[1]	train's l1: 0.779909	valid's l1: 0.72832


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[225]	train's l1: 0.751922	valid's l1: 0.811818
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[65]	train's l1: 0.783356	valid's l1: 0.716957
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[25]	train's l1: 0.634987	valid's l1: 0.695709
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.778605	valid's l1: 0.785872
Training until validation scores don't improve for 100 rounds


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[129]	train's l1: 0.767431	valid's l1: 0.769612
Early stopping, best iteration is:
[1]	train's l1: 0.787401	valid's l1: 0.667295


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[213]	train's l1: 0.781451	valid's l1: 0.717123
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[55]	train's l1: 0.764546	valid's l1: 0.763123


[I 2024-09-26 12:03:28,488] Trial 4 finished with value: 0.08900123227520369 and parameters: {'n_estimators': 804, 'learning_rate': 0.19181087820624088, 'max_depth': 7, 'num_leaves': 260, 'reg_lambda': 90, 'reg_alpha': 60, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 10, 'feature_fraction': 0.2}. Best is trial 2 with value: 0.1714361555153729.
INFO:__main__:Starting trial 8


Results: [0.12596299633922714, 0.009233103074093363, 0.10247000968061586, 0.17295577805255785, 0.030760626231919802, 0.10107701229167529, -0.005524579277001382, 0.1086489630265533, 0.1253242604736549, 0.1191041528587409]
Average correlation across all outer folds: 0.08900123227520369
Early stopping, best iteration is:
[237]	train's l1: 0.275394	valid's l1: 0.730553
Early stopping, best iteration is:
[1]	train's l1: 0.755971	valid's l1: 0.815445


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[222]	train's l1: 0.777492	valid's l1: 0.725426
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.770481	valid's l1: 0.771553


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[40]	train's l1: 0.593019	valid's l1: 0.639583
Early stopping, best iteration is:
[11]	train's l1: 0.778061	valid's l1: 0.785193
Early stopping, best iteration is:
[70]	train's l1: 0.77157	valid's l1: 0.799195
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[141]	train's l1: 0.77113	valid's l1: 0.773175


[W 2024-09-26 12:04:56,755] Trial 5 failed with parameters: {'n_estimators': 760, 'learning_rate': 0.06640034400752207, 'max_depth': 8, 'num_leaves': 1300, 'reg_lambda': 30, 'reg_alpha': 75, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 10, 'feature_fraction': 0.2} because of the following error: The value nan is not acceptable.
[W 2024-09-26 12:04:56,756] Trial 5 failed with value nan.
INFO:__main__:Starting trial 9


Results: [nan, nan, 0.019770847879658673, nan, nan, nan, nan, nan, nan, 0.038465737160171376]
Average correlation across all outer folds: nan
Early stopping, best iteration is:
[41]	train's l1: 0.786743	valid's l1: 0.666786
Early stopping, best iteration is:
[1]	train's l1: 0.766692	valid's l1: 0.74104


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[158]	train's l1: 0.331564	valid's l1: 0.723011
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[70]	train's l1: 0.771508	valid's l1: 0.799017
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	train's l1: 0.753815	valid's l1: 0.813404
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.766692	valid's l1: 0.74104


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[1]	train's l1: 0.784068	valid's l1: 0.717139


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[10]	train's l1: 0.704225	valid's l1: 0.757069
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.784068	valid's l1: 0.717139


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[1]	train's l1: 0.770481	valid's l1: 0.771553


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.782413	valid's l1: 0.717614
Early stopping, best iteration is:
[1]	train's l1: 0.782413	valid's l1: 0.717614


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[65]	train's l1: 0.500932	valid's l1: 0.709021


[I 2024-09-26 12:06:35,401] Trial 6 finished with value: 0.1695157344920812 and parameters: {'n_estimators': 569, 'learning_rate': 0.291648834908549, 'max_depth': 3, 'num_leaves': 2700, 'reg_lambda': 90, 'reg_alpha': 10, 'bagging_fraction': 0.7, 'bagging_freq': 5, 'feature_fraction': 0.2}. Best is trial 2 with value: 0.1714361555153729.


Results: [0.2109192955435168, 0.1836249873479279, 0.025170425736400937, 0.14994445972926185, 0.22519243217000978, 0.17061624372916342, 0.08827025086998741, 0.17178081390419067, 0.15602160138680687, 0.3136168345035466]
Average correlation across all outer folds: 0.1695157344920812


INFO:__main__:Starting trial 10


Early stopping, best iteration is:
[20]	train's l1: 0.771218	valid's l1: 0.773629
Results: [nan, -0.08903492254911413, 0.1609192835617937, 0.013573517246596521, 0.1159280445991793, 0.009927415680493726, -0.025501835202927096, 0.009567517687456572, nan, 0.09113155561487678]
Average correlation across all outer folds: nan


[W 2024-09-26 12:06:45,415] Trial 7 failed with parameters: {'n_estimators': 534, 'learning_rate': 0.2706861340241903, 'max_depth': 5, 'num_leaves': 520, 'reg_lambda': 55, 'reg_alpha': 100, 'bagging_fraction': 0.7, 'bagging_freq': 4, 'feature_fraction': 0.2} because of the following error: The value nan is not acceptable.
[W 2024-09-26 12:06:45,426] Trial 7 failed with value nan.
INFO:__main__:Starting trial 11


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.779909	valid's l1: 0.72832


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.779909	valid's l1: 0.72832


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.778605	valid's l1: 0.785872


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.771664	valid's l1: 0.799257


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.787401	valid's l1: 0.667295


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[108]	train's l1: 0.585543	valid's l1: 0.747791
Early stopping, best iteration is:
[1]	train's l1: 0.778605	valid's l1: 0.785872
Training until validation scores don't improve for 100 rounds


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.755971	valid's l1: 0.815445


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.770481	valid's l1: 0.771553


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[38]	train's l1: 0.787391	valid's l1: 0.667289
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.771604	valid's l1: 0.774018


  corr, _ = pearsonr(y_test_outer, y_pred_outer)
[W 2024-09-26 12:08:24,023] Trial 9 failed with parameters: {'n_estimators': 694, 'learning_rate': 0.10700268996906293, 'max_depth': 9, 'num_leaves': 1380, 'reg_lambda': 40, 'reg_alpha': 70, 'bagging_fraction': 0.2, 'bagging_freq': 7, 'feature_fraction': 0.9} because of the following error: The value nan is not acceptable.
[W 2024-09-26 12:08:24,026] Trial 9 failed with value nan.
INFO:__main__:Starting trial 12


Results: [0.09296440785335841, nan, nan, nan, nan, nan, nan, nan, nan, nan]
Average correlation across all outer folds: nan
Early stopping, best iteration is:
[113]	train's l1: 0.575303	valid's l1: 0.709527
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[473]	train's l1: 0.765398	valid's l1: 0.739625
Early stopping, best iteration is:
[1]	train's l1: 0.755971	valid's l1: 0.815445


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[322]	train's l1: 0.75726	valid's l1: 0.789487
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.770481	valid's l1: 0.771553


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Did not meet early stopping. Best iteration is:
[315]	train's l1: 0.754841	valid's l1: 0.735232
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[256]	train's l1: 0.5214	valid's l1: 0.686796
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[321]	train's l1: 0.75797	valid's l1: 0.704642
Early stopping, best iteration is:
[17]	train's l1: 0.771283	valid's l1: 0.773481
Results: [0.1937511882441927, nan, nan, nan, nan, nan, -0.036712550253615334, nan, nan, 0.08722926766691816]
Average correlation across all outer folds: nan


[W 2024-09-26 12:09:49,821] Trial 8 failed with parameters: {'n_estimators': 679, 'learning_rate': 0.0369080404080608, 'max_depth': 11, 'num_leaves': 1960, 'reg_lambda': 30, 'reg_alpha': 85, 'bagging_fraction': 0.4, 'bagging_freq': 6, 'feature_fraction': 0.30000000000000004} because of the following error: The value nan is not acceptable.
[W 2024-09-26 12:09:49,826] Trial 8 failed with value nan.
INFO:__main__:Starting trial 13


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[300]	train's l1: 0.781265	valid's l1: 0.716473
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[49]	train's l1: 0.66649	valid's l1: 0.68845
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[289]	train's l1: 0.759629	valid's l1: 0.704455
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	train's l1: 0.680598	valid's l1: 0.764848
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	train's l1: 0.781555	valid's l1: 0.717095
Early stopping, best iteration is:
[143]	train's l1: 0.767669	valid's l1: 0.718854
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 round

[I 2024-09-26 12:12:46,738] Trial 12 finished with value: 0.11803940863904958 and parameters: {'n_estimators': 325, 'learning_rate': 0.29484726791890475, 'max_depth': 11, 'num_leaves': 780, 'reg_lambda': 25, 'reg_alpha': 90, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 2 with value: 0.1714361555153729.
INFO:__main__:Starting trial 14


Results: [0.06461222155923535, 0.06530317918713653, 0.041621658472177636, 0.08712728569348871, 0.19010691424427684, 0.2564519359105868, 0.08200935480168417, 0.14591694225846485, 0.09131269151222035, 0.15593190275122445]
Average correlation across all outer folds: 0.11803940863904958
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[78]	train's l1: 0.654019	valid's l1: 0.689546
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[74]	train's l1: 0.771492	valid's l1: 0.798989
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[72]	train's l1: 0.615375	valid's l1: 0.761788
Early stopping, best iteration is:
[69]	train's l1: 0.753411	valid's l1: 0.812662
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.766692	valid's l1: 0.7410

  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[92]	train's l1: 0.65111	valid's l1: 0.752526
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.784068	valid's l1: 0.717139


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.770481	valid's l1: 0.771553


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[9]	train's l1: 0.741336	valid's l1: 0.766132
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[20]	train's l1: 0.710995	valid's l1: 0.659442
Early stopping, best iteration is:
[1]	train's l1: 0.782413	valid's l1: 0.717614


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[59]	train's l1: 0.770557	valid's l1: 0.773031


[W 2024-09-26 12:15:04,171] Trial 11 failed with parameters: {'n_estimators': 499, 'learning_rate': 0.18047397026766285, 'max_depth': 11, 'num_leaves': 460, 'reg_lambda': 70, 'reg_alpha': 100, 'bagging_fraction': 0.7, 'bagging_freq': 10, 'feature_fraction': 0.30000000000000004} because of the following error: The value nan is not acceptable.
[W 2024-09-26 12:15:04,171] Trial 11 failed with value nan.


Results: [nan, -0.025191855580038395, 0.05422465025967745, 0.00327289262327694, 0.08598645372140945, -0.046035222618752356, 0.035872225804872857, 0.12608791914723552, nan, 0.12314348524361778]
Average correlation across all outer folds: nan
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.779909	valid's l1: 0.72832


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.778605	valid's l1: 0.785872


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Early stopping, best iteration is:
[75]	train's l1: 0.635483	valid's l1: 0.768645
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.787401	valid's l1: 0.667295


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[243]	train's l1: 0.515799	valid's l1: 0.696258


[I 2024-09-26 12:15:52,516] Trial 10 finished with value: 0.1891943373020865 and parameters: {'n_estimators': 475, 'learning_rate': 0.11992938346161178, 'max_depth': 12, 'num_leaves': 1300, 'reg_lambda': 20, 'reg_alpha': 30, 'bagging_fraction': 0.5, 'bagging_freq': 9, 'feature_fraction': 0.30000000000000004}. Best is trial 10 with value: 0.1891943373020865.


Results: [0.22664483569516033, 0.22399165779772662, 0.11334832454877948, 0.14839753713396076, 0.295948730271338, 0.19445442530763604, 0.09357333794254538, 0.1762294922002863, 0.14535545265027208, 0.27399957947316034]
Average correlation across all outer folds: 0.1891943373020865
Early stopping, best iteration is:
[26]	train's l1: 0.704891	valid's l1: 0.76164
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.755971	valid's l1: 0.815445


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	train's l1: 0.770481	valid's l1: 0.771553


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[53]	train's l1: 0.683199	valid's l1: 0.731023


[I 2024-09-26 12:16:31,996] Trial 13 finished with value: 0.19093623705651203 and parameters: {'n_estimators': 970, 'learning_rate': 0.21648811107927657, 'max_depth': 8, 'num_leaves': 2340, 'reg_lambda': 10, 'reg_alpha': 40, 'bagging_fraction': 0.5, 'bagging_freq': 9, 'feature_fraction': 0.6000000000000001}. Best is trial 13 with value: 0.19093623705651203.


Results: [0.24395886727101251, 0.20053764576967664, 0.13816775965693556, 0.12056817361086034, 0.23817623221255418, 0.14713461827405977, 0.15620906118736821, 0.11793592983394259, 0.23105331025778333, 0.31562077249092707]
Average correlation across all outer folds: 0.19093623705651203
Early stopping, best iteration is:
[1]	train's l1: 0.771604	valid's l1: 0.774018


  corr, _ = pearsonr(y_test_outer, y_pred_outer)


Results: [0.09296440785335845, nan, nan, nan, nan, nan, nan, nan, nan, nan]
Average correlation across all outer folds: nan


[W 2024-09-26 12:16:33,010] Trial 14 failed with parameters: {'n_estimators': 455, 'learning_rate': 0.08749284596778963, 'max_depth': 8, 'num_leaves': 2000, 'reg_lambda': 35, 'reg_alpha': 70, 'bagging_fraction': 0.2, 'bagging_freq': 10, 'feature_fraction': 0.9} because of the following error: The value nan is not acceptable.
[W 2024-09-26 12:16:33,011] Trial 14 failed with value nan.
