In [1]:
import numpy as np
import pandas as pd
import tqdm 
import cProfile
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import optuna
import logging
import time
from scipy.stats import pearsonr

data = pd.read_feather("data/processed/massBV.feather")

In [2]:
def prep_data_before_train(data: pd.DataFrame, phenotype: str) -> tuple:
    """
    prepare data for training, returns target vector and covariate-matrix and ringnrs for grouping
    :param data: all data from dataloader script
    :param phenotype: the phenotype to be predicted
    :return: X, Y, ringnrs, X contain covariates and Y contains the phenotype
    """
    ringnrs = data.ringnr
    mean_pheno = data.mean_pheno
    X = data.drop(
        columns=[
            "ID",
            "mass",
            "tarsus",
            "ringnr",
            "mean_pheno",
            "IID",
            "FID",
            "MAT",
            "PAT",
            "SEX",
            "PHENOTYPE",
        ],
        errors="ignore",
    )

    try:
        Y = data.loc[:, phenotype]
    except KeyError:
        try:
            Y = data.ID
        except AttributeError:
            Y = data.mass
    del data
    try:
        X.hatchyear = X.hatchyear.astype("int")
        X.island_current = X.island_current.astype("int")
    except AttributeError:
        pass
    Y = (Y - np.mean(Y))/np.std(Y)
    X = X.fillna(0)
    X = X.T
    X = X.astype("int")
    X = X.T 
    return X, Y, ringnrs, mean_pheno

In [3]:
X, y, ringnrs, mean_pheno = prep_data_before_train(data, "mass")
del data
X.drop(columns = ["hatchisland"], inplace = True)
X["ringnr"] = ringnrs   

target = pd.DataFrame(y)
target["mean_pheno"] = mean_pheno
target["ringnr"] = ringnrs

In [4]:
def island_split(pheno:str = "mass") -> pd.DataFrame:
    data = pd.read_feather(f"data/processed/{pheno}BV.feather")  

    table = pd.crosstab(data["hatchisland"],data["hatchisland"])
    diag = np.diag(table)

    # get the indexes of the 8 highest values in the diagonal
    idx = np.argpartition(diag, -8)[-8:]

    #islands with highest number of individuals
    islands = table.index[idx]

    # get the column hatchisland and ringnr for the 8 islands with the highest number of individuals
    x = data.loc[data["hatchisland"].isin(islands),["hatchisland","ringnr"]]
    del data

    #Now we will manually make folds for a 8-fold cross validation based on the hatchisland column
    #The idea is to make 8 folds where each fold contains all individuals from one of the 8 islands with the highest number of individuals.
    #This way we can make sure that the model is trained on all islands and tested on all islands. The size of the folds will be different.

    #We will make a new column in the data frame called "fold" which will contain the fold number for each individual.
    #The fold number will be between 1 and 8.


    #initialize the fold column with zeros
    x["fold"] = 0

    #initialize the fold number
    fold = 1

    #loop over the 8 islands with the highest number of individuals
    for island in islands:
        #get the indexes of the individuals from the current island
        idx = x.loc[x["hatchisland"] == island].index
        #set the fold number for the individuals from the current island
        x.loc[idx,"fold"] = fold
        #increment the fold number
        fold += 1

    return x[["ringnr","fold"]]

splits = island_split()

In [5]:
X = pd.merge(X,splits, on = "ringnr", how = "inner") 
X = pd.merge(X,target, on = "ringnr", how = "inner")  

In [6]:
def subset(X:pd.DataFrame, seed:int = 42, snp_prop:float = 0.2) -> pd.DataFrame:
    """
    Function to subset the data into a smaller dataset for testing purposes, 
    sampling only a subset of the features
    """
    np.random.seed(seed)
    sample_columns = np.random.choice(X.columns[:-4], int(len(X.columns[:-4])*snp_prop), replace = False)
    sample_columns = np.append(sample_columns, ["ringnr","fold", "ID", "mean_pheno"])

    return X[sample_columns]

In [7]:
X = subset(X, snp_prop = 0.1)

In [8]:
X

Unnamed: 0,SNPa352024_T,SNPa51204_C,SNPa371349_T,SNPa86522_T,SNPa395916_T,SNPa302259_A,SNPa524601_A,SNPa483891_T,SNPa207156_T,SNPa77054_G,...,SNPa437479_G,SNPa179086_T,SNPa392716_T,SNPa415615_G,SNPa280074_G,SNPa220371_G,ringnr,fold,ID,mean_pheno
0,0,0,1,2,1,0,0,1,0,0,...,1,1,1,1,0,0,8118424,8,0.519577,32.600000
1,0,0,1,2,0,0,0,1,0,1,...,1,0,1,1,1,0,8118425,8,-1.044296,30.300000
2,0,0,0,1,0,0,0,0,0,1,...,2,1,1,0,0,1,8118426,8,0.918158,33.316667
3,0,0,0,0,0,0,0,1,0,1,...,1,1,1,1,0,0,8118429,8,-0.093239,31.300000
4,0,0,0,2,1,0,0,1,0,0,...,1,2,1,1,0,0,8118430,8,-1.010615,29.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,0,0,0,0,0,0,0,0,0,1,...,1,1,0,0,1,0,8N87368,5,0.523029,32.900000
1881,0,0,0,1,0,0,1,0,1,1,...,0,2,0,0,1,0,8N87369,5,-0.241355,30.900000
1882,0,0,0,0,1,1,0,0,1,2,...,1,1,0,0,0,1,8N87371,5,0.515544,32.500000
1883,0,0,0,1,0,0,0,0,1,1,...,0,1,0,0,0,1,8N87372,5,0.494145,32.800000


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor  # Replace with your model

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Assuming your dataframe 'df' has a column 'island' with values 1 to 8 representing the islands
# and a 'fold' column representing the 8 folds, and a 'target' column for the target variable

# Function to perform the nested cross-validation
def nested_cv(df):
    
    outer_results = []
    
    folds = np.sort(df['fold'].unique())    
    for outer_fold in folds:
        print(f"Outer CV: Test island {outer_fold}")
        
        # Split data into outer train and test sets
        outer_train_df = df[df['fold'] != outer_fold]
        outer_test_df = df[df['fold'] == outer_fold]
        
        # Define the Optuna objective function
        def objective(trial):
            logger.info(f"Starting trial {trial.number}")

            # Define hyperparameter search space
            n_estimators = trial.suggest_int('n_estimators', 300, 1000)
            learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
            max_depth = trial.suggest_int('max_depth', 3, 13)
            num_leaves = trial.suggest_int('num_leaves', 20, 3000, step = 20)
            reg_lambda = trial.suggest_int('reg_lambda', 0, 100, step = 5)
            reg_alpha = trial.suggest_int('reg_alpha', 0 , 100, step = 5)
            bagging_fraction = trial.suggest_float('bagging_fraction', 0.2, 0.9, step = 0.1)
            bagging_freq = trial.suggest_int('bagging_freq', 1, 10, step = 1)
            feature_fraction = trial.suggest_float('feature_fraction', 0.2, 0.9, step = 0.1)

            #parameters
            params = {
                'objective': 'regression',
                'metric': 'mae',    
                #'n_estimators': n_estimators,
                'learning_rate': learning_rate,
                'max_depth': max_depth,
                'num_leaves': num_leaves,
                'reg_lambda': reg_lambda,
                'reg_alpha': reg_alpha,
                'bagging_fraction': bagging_fraction,
                'bagging_freq': bagging_freq,
                'feature_fraction': feature_fraction,
                'verbose': -1
            }

            # Perform 7-fold CV on the 7 remaining islands
            inner_corr = []
            start_time = time.time()
            for inner_fold in folds:
                if inner_fold == outer_fold:
                    continue  # Skip the current test island
                
                # Split inner train/validation data
                train_df = outer_train_df[outer_train_df['fold'] != inner_fold]
                val_df = outer_train_df[outer_train_df['fold'] == inner_fold]
                
                # Extract features and target
                X_train, y_train = train_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), train_df['ID']
                X_val, y_val = val_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), val_df['ID'] 

                # Train model
                dtrain = lgb.Dataset(X_train, label=y_train)
                dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

                gbm = lgb.train(params, dtrain, num_boost_round=n_estimators, valid_sets=[dval], callbacks=[lgb.early_stopping(stopping_rounds=20)])
                # Validate model
                y_pred = gbm.predict(X_val)
                corr, _ = pearsonr(y_val, y_pred)
                inner_corr.append(corr)
                
                end_time = time.time()

            logger.info(f"Finished trial {trial.number} in {end_time - start_time:.2f} seconds")
            # Return the average MAE for this set of hyperparameters
            return np.mean(inner_corr)
        
        # Run Optuna for hyperparameter optimization
        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, n_jobs=4)
        
        # Train the final model on the outer train set with the best hyperparameters
        best_params = study.best_params

        n_estimators = best_params['n_estimators']
        best_params.pop('n_estimators')
        best_params['objective'] = 'regression'
        best_params['metric'] = 'mae'
        best_params['verbose'] = -1
        
        # Extract features and target for outer training and test sets
        X_train_outer, y_train_outer = outer_train_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), outer_train_df['ID']
        X_test_outer, y_test_outer = outer_test_df.drop(columns=['ringnr', 'fold','ID','mean_pheno']), outer_test_df['mean_pheno']
        
        # Train on the outer train set
        dtrain = lgb.Dataset(X_train_outer, label=y_train_outer)  
        gbm = lgb.train(best_params, dtrain, num_boost_round=n_estimators)
        
        # Test on the outer test set
        y_pred_outer = gbm.predict(X_test_outer)
        outer_corr = np.corrcoef(y_test_outer, y_pred_outer)[0, 1]
        
        print(f"Test correlation for island {outer_fold}: {outer_corr}")
        outer_results.append(outer_corr)
    
    # Final results
    print(f"Average correlation across all outer folds: {np.mean(outer_results)}")
    return outer_results

# Example dataframe with 'island', 'fold', 'target', and other features
# df = pd.DataFrame(...) 

# Run the nested cross-validation
results = nested_cv(X)


[I 2024-09-25 16:02:19,076] A new study created in memory with name: no-name-3a0c19d1-5c6f-4cae-bd8c-d4495e2ee272
INFO:__main__:Starting trial 0
INFO:__main__:Starting trial 1
INFO:__main__:Starting trial 2
INFO:__main__:Starting trial 3


Outer CV: Test island 1
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[16]	valid_0's l1: 30.918
Early stopping, best iteration is:
[26]	valid_0's l1: 30.9004
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 30.9132
Early stopping, best iteration is:
[22]	valid_0's l1: 30.9034
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[38]	valid_0's l1: 32.2624
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 32.2789
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[12]	valid_0's l1: 32.2767
Early stopping, best iteration is:
[39]	valid_0's l1: 32.

INFO:__main__:Finished trial 1 in 100.95 seconds
[I 2024-09-25 16:04:00,277] Trial 1 finished with value: 0.0263793986742906 and parameters: {'n_estimators': 871, 'learning_rate': 0.08151246561117508, 'max_depth': 5, 'num_leaves': 1580, 'reg_lambda': 45, 'reg_alpha': 40, 'bagging_fraction': 0.2, 'bagging_freq': 8, 'feature_fraction': 0.7}. Best is trial 1 with value: 0.0263793986742906.
INFO:__main__:Starting trial 4


Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 31.5264
Early stopping, best iteration is:
[1]	valid_0's l1: 31.5269
Early stopping, best iteration is:
[81]	valid_0's l1: 32.0591


INFO:__main__:Finished trial 0 in 107.24 seconds
[I 2024-09-25 16:04:06,499] Trial 0 finished with value: 0.021597741964949117 and parameters: {'n_estimators': 628, 'learning_rate': 0.12880717336396239, 'max_depth': 7, 'num_leaves': 2700, 'reg_lambda': 100, 'reg_alpha': 0, 'bagging_fraction': 0.2, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 1 with value: 0.0263793986742906.
INFO:__main__:Starting trial 5


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[6]	valid_0's l1: 30.9153
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 32.1473


  c /= stddev[:, None]
  c /= stddev[None, :]
INFO:__main__:Finished trial 3 in 119.60 seconds
[W 2024-09-25 16:04:19,118] Trial 3 failed with parameters: {'n_estimators': 699, 'learning_rate': 0.23238097139525118, 'max_depth': 4, 'num_leaves': 2440, 'reg_lambda': 45, 'reg_alpha': 55, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 6, 'feature_fraction': 0.2} because of the following error: The value nan is not acceptable.
[W 2024-09-25 16:04:19,119] Trial 3 failed with value nan.
INFO:__main__:Starting trial 6


Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[7]	valid_0's l1: 30.9151
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[63]	valid_0's l1: 32.1336


INFO:__main__:Finished trial 2 in 129.07 seconds
[I 2024-09-25 16:04:28,505] Trial 2 finished with value: -0.01797896137977426 and parameters: {'n_estimators': 385, 'learning_rate': 0.04330199180737844, 'max_depth': 10, 'num_leaves': 240, 'reg_lambda': 85, 'reg_alpha': 65, 'bagging_fraction': 0.4, 'bagging_freq': 3, 'feature_fraction': 0.6000000000000001}. Best is trial 1 with value: 0.0263793986742906.
INFO:__main__:Starting trial 7


Early stopping, best iteration is:
[1]	valid_0's l1: 32.2826
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[14]	valid_0's l1: 30.8894
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[11]	valid_0's l1: 32.2766
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[20]	valid_0's l1: 30.9007
Early stopping, best iteration is:
[52]	valid_0's l1: 31.7894
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[16]	valid_0's l1: 32.2786
Early stopping, best iteration is:
[15]	valid_0's l1: 31.7876
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[3]	valid_0's l1: 32.2738
Training until val

  c /= stddev[:, None]
  c /= stddev[None, :]


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[18]	valid_0's l1: 32.1288


INFO:__main__:Finished trial 5 in 135.53 seconds
[I 2024-09-25 16:06:22,146] Trial 5 finished with value: 0.033521431454540444 and parameters: {'n_estimators': 686, 'learning_rate': 0.039304553728825055, 'max_depth': 10, 'num_leaves': 1340, 'reg_lambda': 35, 'reg_alpha': 55, 'bagging_fraction': 0.8, 'bagging_freq': 3, 'feature_fraction': 0.30000000000000004}. Best is trial 5 with value: 0.033521431454540444.
INFO:__main__:Starting trial 8


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 31.5261
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[519]	valid_0's l1: 31.3003
Early stopping, best iteration is:
[1]	valid_0's l1: 32.1473


  c /= stddev[:, None]
  c /= stddev[None, :]
INFO:__main__:Finished trial 6 in 130.31 seconds
[W 2024-09-25 16:06:29,688] Trial 6 failed with parameters: {'n_estimators': 942, 'learning_rate': 0.15584878330557325, 'max_depth': 8, 'num_leaves': 1660, 'reg_lambda': 5, 'reg_alpha': 85, 'bagging_fraction': 0.5, 'bagging_freq': 9, 'feature_fraction': 0.6000000000000001} because of the following error: The value nan is not acceptable.
[W 2024-09-25 16:06:29,688] Trial 6 failed with value nan.
INFO:__main__:Starting trial 9


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	valid_0's l1: 30.9085
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	valid_0's l1: 32.1321


INFO:__main__:Finished trial 7 in 141.81 seconds
[I 2024-09-25 16:06:50,519] Trial 7 finished with value: 0.03703586978230163 and parameters: {'n_estimators': 538, 'learning_rate': 0.20993867449441478, 'max_depth': 8, 'num_leaves': 2940, 'reg_lambda': 85, 'reg_alpha': 70, 'bagging_fraction': 0.8, 'bagging_freq': 7, 'feature_fraction': 0.30000000000000004}. Best is trial 7 with value: 0.03703586978230163.


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 31.527
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[16]	valid_0's l1: 30.9056
Early stopping, best iteration is:
[3]	valid_0's l1: 32.2738
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[19]	valid_0's l1: 32.1325


INFO:__main__:Finished trial 4 in 190.90 seconds
[I 2024-09-25 16:07:11,333] Trial 4 finished with value: 0.06602702429521179 and parameters: {'n_estimators': 561, 'learning_rate': 0.020008315073835763, 'max_depth': 13, 'num_leaves': 1500, 'reg_lambda': 55, 'reg_alpha': 30, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 4 with value: 0.06602702429521179.


Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	valid_0's l1: 32.2721
Early stopping, best iteration is:
[12]	valid_0's l1: 31.7885
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[20]	valid_0's l1: 31.7698
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[2]	valid_0's l1: 31.7124
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[32]	valid_0's l1: 31.7022
Early stopping, best iteration is:
[44]	valid_0's l1: 31.3292
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[6]	valid_0's l1: 31.5217
Early stopping, best iteration is:
[116]	valid_0's l1: 31.3018
Training until val

INFO:__main__:Finished trial 8 in 109.32 seconds
[I 2024-09-25 16:08:11,576] Trial 8 finished with value: 0.07203503530506836 and parameters: {'n_estimators': 676, 'learning_rate': 0.20272035567927704, 'max_depth': 12, 'num_leaves': 780, 'reg_lambda': 45, 'reg_alpha': 65, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 8 with value: 0.07203503530506836.


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[2]	valid_0's l1: 31.5311
Training until validation scores don't improve for 20 rounds


INFO:__main__:Finished trial 9 in 111.59 seconds


Early stopping, best iteration is:
[2]	valid_0's l1: 32.1334


[I 2024-09-25 16:08:21,408] Trial 9 finished with value: 0.06770483463815472 and parameters: {'n_estimators': 619, 'learning_rate': 0.08704589653258768, 'max_depth': 5, 'num_leaves': 1280, 'reg_lambda': 75, 'reg_alpha': 40, 'bagging_fraction': 0.5, 'bagging_freq': 4, 'feature_fraction': 0.6000000000000001}. Best is trial 8 with value: 0.07203503530506836.


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 31.7575


[I 2024-09-25 16:08:29,547] A new study created in memory with name: no-name-278db0be-799b-4d5f-81c6-2d41a65199dc


Test correlation for island 1: -0.004547059821239054
Outer CV: Test island 2


INFO:__main__:Starting trial 0
INFO:__main__:Starting trial 1
INFO:__main__:Starting trial 2
INFO:__main__:Starting trial 3


Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds


In [12]:


# Define objective function for Optuna
def objective(trial):
    
    logger.info(f"Starting trial {trial.number}")

    # Define hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 300, 1000)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    max_depth = trial.suggest_int('max_depth', 3, 13)
    num_leaves = trial.suggest_int('num_leaves', 20, 3000, step = 20)
    reg_lambda = trial.suggest_int('reg_lambda', 0, 100, step = 5)
    reg_alpha = trial.suggest_int('reg_alpha', 0 , 100, step = 5)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.2, 0.9, step = 0.1)
    bagging_freq = trial.suggest_int('bagging_freq', 1, 10, step = 1)
    feature_fraction = trial.suggest_float('feature_fraction', 0.2, 0.9, step = 0.1)

    
    #parameters
    params = {
        'objective': 'regression',
        'metric': 'mse',    
        #'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'num_leaves': num_leaves,
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': bagging_freq,
        'feature_fraction': feature_fraction,
        'verbose': -1
    }
    
    # Cross-validation setup
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = [] # Store the mean squared error for each fold

    start_time = time.time()
    # Evaluate with cross-validation
    for train_idx, val_idx in kfold.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # Train model
        dtrain = lgb.Dataset(X_train_fold, label=y_train_fold)
        dval = lgb.Dataset(X_val_fold, label=y_val_fold, reference=dtrain)
        
        gbm = lgb.train(params, dtrain, num_boost_round=n_estimators, valid_sets=[dval], callbacks=[lgb.early_stopping(stopping_rounds=20)])
        
        # Predict on validation set
        y_pred = gbm.predict(X_val_fold)
        mse = mean_squared_error(y_val_fold, y_pred)
        mse_scores.append(mse)

        mean_mse = np.mean(mse_scores)
        end_time = time.time()
        logger.info(f"Finished trial {trial.number} in {end_time - start_time:.2f} seconds")
        return mean_mse




if __name__ == '__main__':

    # Create Optuna study
    study = optuna.create_study(direction='minimize')

    n_jobs = 4
    
    study.optimize(objective, n_trials=10, n_jobs=n_jobs)

    # Best hyperparameters
    print('Best hyperparameters:', study.best_params)

[I 2024-09-12 08:40:22,058] A new study created in memory with name: no-name-057c49f5-a05c-4c3c-ad81-9a6011c6248b
INFO:__main__:Starting trial 0
INFO:__main__:Starting trial 1
INFO:__main__:Starting trial 2
INFO:__main__:Starting trial 3


Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[26]	valid_0's l2: 0.930644


INFO:__main__:Finished trial 3 in 11.49 seconds
[I 2024-09-12 08:40:33,733] Trial 3 finished with value: 0.9306442219195172 and parameters: {'n_estimators': 696, 'learning_rate': 0.16172110694705055, 'max_depth': 7, 'num_leaves': 2240, 'reg_lambda': 65, 'reg_alpha': 40, 'bagging_fraction': 0.8, 'bagging_freq': 10, 'feature_fraction': 0.9}. Best is trial 3 with value: 0.9306442219195172.
INFO:__main__:Starting trial 4


Early stopping, best iteration is:
[55]	valid_0's l2: 0.907236


INFO:__main__:Finished trial 0 in 16.63 seconds
[I 2024-09-12 08:40:38,862] Trial 0 finished with value: 0.9072361723012389 and parameters: {'n_estimators': 703, 'learning_rate': 0.20497929861378913, 'max_depth': 4, 'num_leaves': 1840, 'reg_lambda': 30, 'reg_alpha': 45, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 8, 'feature_fraction': 0.4}. Best is trial 0 with value: 0.9072361723012389.
INFO:__main__:Starting trial 5


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[81]	valid_0's l2: 0.908149


INFO:__main__:Finished trial 1 in 22.34 seconds
[I 2024-09-12 08:40:44,583] Trial 1 finished with value: 0.908149447512883 and parameters: {'n_estimators': 584, 'learning_rate': 0.03715123826964573, 'max_depth': 8, 'num_leaves': 2440, 'reg_lambda': 20, 'reg_alpha': 15, 'bagging_fraction': 0.9, 'bagging_freq': 8, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: 0.9072361723012389.
INFO:__main__:Starting trial 6


Early stopping, best iteration is:
[6]	valid_0's l2: 0.931097


INFO:__main__:Finished trial 4 in 11.70 seconds
[I 2024-09-12 08:40:45,500] Trial 4 finished with value: 0.9310974113108552 and parameters: {'n_estimators': 563, 'learning_rate': 0.21669255927439596, 'max_depth': 4, 'num_leaves': 520, 'reg_lambda': 45, 'reg_alpha': 0, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 2, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: 0.9072361723012389.
INFO:__main__:Starting trial 7


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[156]	valid_0's l2: 0.954593


INFO:__main__:Finished trial 2 in 31.01 seconds
[I 2024-09-12 08:40:53,297] Trial 2 finished with value: 0.9549118418941217 and parameters: {'n_estimators': 977, 'learning_rate': 0.03577702408861337, 'max_depth': 4, 'num_leaves': 1000, 'reg_lambda': 0, 'reg_alpha': 70, 'bagging_fraction': 0.7, 'bagging_freq': 5, 'feature_fraction': 0.5}. Best is trial 0 with value: 0.9072361723012389.
INFO:__main__:Starting trial 8


Training until validation scores don't improve for 20 rounds
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[44]	valid_0's l2: 0.951257


INFO:__main__:Finished trial 7 in 11.70 seconds
[I 2024-09-12 08:40:57,311] Trial 7 finished with value: 0.9515447543380327 and parameters: {'n_estimators': 470, 'learning_rate': 0.25767811485427683, 'max_depth': 12, 'num_leaves': 160, 'reg_lambda': 55, 'reg_alpha': 55, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 5, 'feature_fraction': 0.7}. Best is trial 0 with value: 0.9072361723012389.
INFO:__main__:Starting trial 9


Early stopping, best iteration is:
[14]	valid_0's l2: 0.941653


INFO:__main__:Finished trial 6 in 13.23 seconds
[I 2024-09-12 08:40:57,898] Trial 6 finished with value: 0.9416530360874679 and parameters: {'n_estimators': 795, 'learning_rate': 0.2939669271797528, 'max_depth': 9, 'num_leaves': 1860, 'reg_lambda': 75, 'reg_alpha': 35, 'bagging_fraction': 0.8, 'bagging_freq': 7, 'feature_fraction': 0.9}. Best is trial 0 with value: 0.9072361723012389.


Early stopping, best iteration is:
[46]	valid_0's l2: 0.891645


INFO:__main__:Finished trial 5 in 21.35 seconds
[I 2024-09-12 08:41:00,308] Trial 5 finished with value: 0.8916446573487868 and parameters: {'n_estimators': 493, 'learning_rate': 0.19900495462488169, 'max_depth': 13, 'num_leaves': 2440, 'reg_lambda': 80, 'reg_alpha': 35, 'bagging_fraction': 0.9, 'bagging_freq': 5, 'feature_fraction': 0.4}. Best is trial 5 with value: 0.8916446573487868.


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[19]	valid_0's l2: 0.932187


INFO:__main__:Finished trial 8 in 9.26 seconds
[I 2024-09-12 08:41:02,694] Trial 8 finished with value: 0.9321866002080017 and parameters: {'n_estimators': 591, 'learning_rate': 0.19778543649602615, 'max_depth': 8, 'num_leaves': 2660, 'reg_lambda': 35, 'reg_alpha': 20, 'bagging_fraction': 0.2, 'bagging_freq': 3, 'feature_fraction': 0.7}. Best is trial 5 with value: 0.8916446573487868.


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[28]	valid_0's l2: 0.95802


INFO:__main__:Finished trial 9 in 10.14 seconds
[I 2024-09-12 08:41:07,557] Trial 9 finished with value: 0.9579288303155713 and parameters: {'n_estimators': 743, 'learning_rate': 0.29287839443495634, 'max_depth': 9, 'num_leaves': 720, 'reg_lambda': 45, 'reg_alpha': 85, 'bagging_fraction': 0.9, 'bagging_freq': 10, 'feature_fraction': 0.2}. Best is trial 5 with value: 0.8916446573487868.


Best hyperparameters: {'n_estimators': 493, 'learning_rate': 0.19900495462488169, 'max_depth': 13, 'num_leaves': 2440, 'reg_lambda': 80, 'reg_alpha': 35, 'bagging_fraction': 0.9, 'bagging_freq': 5, 'feature_fraction': 0.4}


In [8]:
plot_slice = optuna.visualization.plot_slice(study)
plot_slice.show()

In [13]:


print(f"Best params: {best_params}")

# Train model with best hyperparameters
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_test, label=y_test, reference=dtrain)

gbm = lgb.train(best_params, dtrain, num_boost_round=n_estimators, valid_sets=[dval], callbacks=[lgb.early_stopping(stopping_rounds=20)])

# Predict on test set
y_pred = gbm.predict(X_test)

# Calculate the pearson correlation between the predicted values and the maen phenotype of the test set
correlation = np.corrcoef(y_pred, mean_pheno_test)[0,1]



Best params: {'learning_rate': 0.19900495462488169, 'max_depth': 13, 'num_leaves': 2440, 'reg_lambda': 80, 'reg_alpha': 35, 'bagging_fraction': 0.9, 'bagging_freq': 5, 'feature_fraction': 0.4, 'objective': 'regression', 'metric': 'mse', 'verbose': -1}
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[77]	valid_0's l2: 0.927379


In [14]:
np.corrcoef(y_pred, mean_pheno_test)

array([[1.        , 0.23637552],
       [0.23637552, 1.        ]])

In [None]:
core