In [1]:
import numpy as np
import pandas as pd
import tqdm 
import cProfile
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import optuna
import logging
import time
from scipy.stats import pearsonr

from split import prep_data_before_train, random_split, subset

# Load the pre-processed table and hand it to the project helper, which
# returns the pieces used below (X, y, ringnrs, mean_pheno).
data = pd.read_feather("data/processed/massBV.feather")

X, y, ringnrs, mean_pheno = prep_data_before_train(data, "mass")
del data  # the raw frame is not used again in this cell

# Reassign rather than drop in place: the result is the same, but the object
# returned by the helper is left untouched.
X = X.drop(columns=["hatchisland"])
X["ringnr"] = ringnrs

# Small frame holding both response columns, keyed by ringnr for the merge.
target = pd.DataFrame(y).assign(mean_pheno=mean_pheno, ringnr=ringnrs)

folds = random_split("mass", num_folds=10, seed=42)

# Inner joins on "ringnr" silently drop unmatched rows and multiply rows when
# a key is duplicated, so report any change in row count instead of letting it
# pass unnoticed.
n_rows_before_merge = len(X)
X = pd.merge(X, folds, on="ringnr", how="inner")
X = pd.merge(X, target, on="ringnr", how="inner")
if len(X) != n_rows_before_merge:
    logging.warning(
        "Merging folds/targets on 'ringnr' changed the row count from %d to %d",
        n_rows_before_merge,
        len(X),
    )

X = subset(X, num_snps=20000)

First, we do hyperparameter optimization using Optuna; for this we use only a single train/validation split of the data. Then we train the model with the best hyperparameters on the full data. For this part we use grouped k-fold cross-validation, so that every fold is used as a test set once.

In [2]:
# Single hold-out split used only for the hyperparameter search:
# identifier, fold and response columns are removed from the predictors,
# and both response columns are kept together as the target frame.
y_hyp = X.loc[:, ["ID", "mean_pheno"]]
X_hyp = X.drop(columns=["ringnr", "fold", "ID", "mean_pheno"])

X_train, X_val, y_train, y_val = train_test_split(
    X_hyp,
    y_hyp,
    test_size=0.2,
    random_state=42,
)

In [3]:
def objective(trial):
    """Optuna objective: score one sampled XGBoost configuration.

    Fits an ``XGBRegressor`` on the training split using the ``ID`` column as
    the response, predicts the validation split, and returns the Pearson
    correlation between those predictions and the validation ``mean_pheno``.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    # Sample the tunable hyperparameters; the request order is kept as
    # learning rate, depth, row subsampling, column subsampling, child weight.
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 10)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 30)

    regressor = xgb.XGBRegressor(
        objective="reg:pseudohubererror",
        n_estimators=1000,
        verbosity=0,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
    )
    regressor.fit(X_train, y_train["ID"])

    val_predictions = regressor.predict(X_val)
    correlation, _pvalue = pearsonr(val_predictions, y_val["mean_pheno"])
    return correlation

# Seed the sampler so the hyperparameter search is not driven by an unseeded
# random generator. TPESampler is what create_study uses by default for a
# single-objective study, so only the seed changes.
# NOTE: with n_jobs > 1 trials complete in a non-deterministic order, so
# results can still differ between runs; use n_jobs=1 for exact repeatability.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30, n_jobs=4)

[I 2024-09-29 18:19:57,908] A new study created in memory with name: no-name-82f1675e-60eb-4264-b0a2-c66cfb962fdd
[I 2024-09-29 18:33:21,061] Trial 0 finished with value: 0.28661326378838714 and parameters: {'learning_rate': 0.0026575889484098413, 'max_depth': 4, 'subsample': 0.9758238997316266, 'colsample_bytree': 0.7360547823605588, 'min_child_weight': 5}. Best is trial 0 with value: 0.28661326378838714.
[I 2024-09-29 18:33:40,509] Trial 1 finished with value: 0.19185539204851193 and parameters: {'learning_rate': 0.03556141109065721, 'max_depth': 9, 'subsample': 0.5144214507766243, 'colsample_bytree': 0.9286454663284888, 'min_child_weight': 14}. Best is trial 0 with value: 0.28661326378838714.
[I 2024-09-29 18:33:42,676] Trial 3 finished with value: 0.28076753209965877 and parameters: {'learning_rate': 0.003986621892265073, 'max_depth': 8, 'subsample': 0.8686463607475218, 'colsample_bytree': 0.5183450572339015, 'min_child_weight': 21}. Best is trial 0 with value: 0.28661326378838714.

In [5]:
# Optuna slice plot of the finished study, rendered inline.
slice_fig = optuna.visualization.plot_slice(study)
slice_fig.show()

In [6]:
# Display the hyperparameters of the best trial in the study; these are the
# values unpacked into the model in the cross-validation cell.
study.best_params

{'learning_rate': 0.0013765521522519826,
 'max_depth': 7,
 'subsample': 0.593876716521791,
 'colsample_bytree': 0.5593209061489248,
 'min_child_weight': 10}

In [8]:
# Evaluate the tuned hyperparameters with grouped 10-fold cross-validation:
# rows sharing a value in the "fold" column are always held out together.
group_kfold = GroupKFold(n_splits=10)
correlation_results = []

# Build the predictor and target frames once. They do not change between
# folds, so dropping the columns inside the loop only re-copied the full
# feature frame twice per fold.
cv_features = X.drop(columns=['ringnr', 'fold', 'ID', 'mean_pheno'])
cv_targets = X[["ID", "mean_pheno"]]

for i, (train_index, test_index) in enumerate(group_kfold.split(X, groups=X['fold'])):
    print(f"Fold {i}")
    # Fold-specific names: the hyperparameter-search split (X_train, y_train)
    # that `objective` reads must not be overwritten by this loop, otherwise
    # re-running the search after this cell would use the last CV fold.
    X_fold_train, X_fold_test = cv_features.iloc[train_index], cv_features.iloc[test_index]
    y_fold_train, y_fold_test = cv_targets.iloc[train_index], cv_targets.iloc[test_index]

    model = xgb.XGBRegressor(
        objective="reg:pseudohubererror",
        n_estimators=1000,
        verbosity=0,
        **study.best_params
    )
    # Fit on the "ID" column and score against "mean_pheno", the same
    # train/evaluate pairing used in the hyperparameter search.
    model.fit(X_fold_train, y_fold_train["ID"])

    preds = model.predict(X_fold_test)
    corr, _ = pearsonr(preds, y_fold_test["mean_pheno"])
    print(f"Fold {i} correlation: {corr}")
    correlation_results.append(corr)

print(correlation_results)

Fold 0
Fold 0 correlation: 0.4447325169641788
Fold 1
Fold 1 correlation: 0.22528727876288612
Fold 2
Fold 2 correlation: 0.2380093886010157
Fold 3
Fold 3 correlation: 0.19558930593096285
Fold 4
Fold 4 correlation: 0.23161099665906018
Fold 5
Fold 5 correlation: 0.20023949414613387
Fold 6
Fold 6 correlation: 0.3650968079901169
Fold 7
Fold 7 correlation: 0.29604622344118814
Fold 8
Fold 8 correlation: 0.2938050850066555
Fold 9
Fold 9 correlation: 0.24788837659812338
[0.4447325169641788, 0.22528727876288612, 0.2380093886010157, 0.19558930593096285, 0.23161099665906018, 0.20023949414613387, 0.3650968079901169, 0.29604622344118814, 0.2938050850066555, 0.24788837659812338]


In [10]:
# Mean Pearson correlation across the cross-validation folds.
np.asarray(correlation_results).mean()

0.2738305474100321

In [8]:
# NOTE(review): ad-hoc scratch arithmetic; the result is not assigned to
# anything, and the execution count is out of sequence — consider removing.
5/18

0.2777777777777778