# Machine Learning Approach using LightGBM

## Prepare Model Input Data

In [None]:
# Load libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

### load input data

# Phenotype table (means and BLUEs); the sample column is renamed so that it
# matches the key used by the PCA table.
pheno = pd.read_csv(
    'data/AUSPAK_phenotypes_means_BLUEs.csv',
    index_col=0,
).rename(columns={'SampleName': 'sample.id'})

# Principal components per sample
pca = pd.read_csv('quinoa_551_AUSPAK_PCs.csv')

## Merge and prepare model input data

# Join PCs and phenotypes on the shared sample identifier
model_input = pca.merge(pheno, on='sample.id')


### ONE-HOT ENCODE CATEGORICAL VARIABLES

# Location and environment become binary indicator columns for the ML models
cat_encoder = OneHotEncoder(sparse_output=False)
categorical_columns = model_input[['location', 'environment']]
encoded_features = cat_encoder.fit_transform(categorical_columns)
encoded_df = pd.DataFrame(
    encoded_features,
    columns=cat_encoder.get_feature_names_out(),
)

# Append the indicator columns to the merged table, then drop the year column
# (there is nothing linking year across locations, so it is not useful for prediction)
model_input_final = pd.concat(
    [model_input, encoded_df],
    axis=1,
).drop(columns=['year'])


### Save prepared data

model_input_final.to_pickle('model_inputs/model_input.pkl')

n_observations, n_columns = model_input_final.shape
print(f"Model input prepared: {n_observations} observations, {n_columns} features")
print("Saved to: model_inputs/model_input.pkl")

## Machine Learning Genomic Prediction Functions

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ndcg_score
from lightgbm import LGBMRegressor

### Evaluation metrics
def pearson_corr(y_true, y_pred):
    """Return the Pearson correlation coefficient of observed vs. predicted values."""
    correlation_matrix = np.corrcoef(y_true, y_pred)
    # The off-diagonal entry of the 2x2 matrix is the cross-correlation.
    return correlation_matrix[0, 1]

def calculate_ndcg(y_true, y_pred, k=10, lower_is_better=False):
    """
    Calculate NDCG@k for genomic prediction.

    Measures ranking quality by comparing the ranking implied by the
    predictions to the ideal ranking implied by the observed values.

    Args:
        y_true: Observed values (any array-like: list, numpy array, pandas Series).
        y_pred: Predicted values, same length as ``y_true``.
        k: Number of top-ranked entries to score; capped at the number of samples.
        lower_is_better: If True, both inputs are negated so that small values
            rank first.

    Returns:
        The NDCG@k score returned by ``sklearn.metrics.ndcg_score``.
    """
    # Coerce to flat float arrays so lists and pandas Series are accepted;
    # the arithmetic and ``reshape`` below require ndarray semantics.
    y_true_arr = np.asarray(y_true, dtype=float).ravel()
    y_pred_arr = np.asarray(y_pred, dtype=float).ravel()

    k = min(k, len(y_true_arr))  # handle case where less than k samples

    # Flip values if lower is better
    if lower_is_better:
        y_true_arr = -y_true_arr
        y_pred_arr = -y_pred_arr

    # Shift both arrays by the same offset so neither contains negative values
    # (the small epsilon keeps the minimum strictly positive).
    min_val = min(y_true_arr.min(), y_pred_arr.min())
    if min_val < 0:
        offset = -min_val + 1e-6
        y_true_arr = y_true_arr + offset
        y_pred_arr = y_pred_arr + offset

    # sklearn expects 2D input: one row per query, one column per item
    return ndcg_score(y_true_arr.reshape(1, -1), y_pred_arr.reshape(1, -1), k=k)

def evaluate_predictions(y_true, y_pred, trait_name=None):
    """
    Compute Pearson correlation and NDCG@10 for one set of predictions.

    Returns a dict with keys 'pearson' and 'ndcg_at_10'. Both values are NaN
    when fewer than two observations are supplied or when either metric raises.
    """
    if len(y_true) < 2:
        return {'pearson': np.nan, 'ndcg_at_10': np.nan}

    # Traits for which a smaller value is the desirable one
    lower_is_better_traits = (
        'DTF_blue', 'DTH_blue', 'PtHt_blue',
        'DTF_mean', 'DTH_mean', 'PtHt_mean',
    )
    lower_is_better = bool(trait_name) and trait_name in lower_is_better_traits

    try:
        correlation = pearson_corr(y_true, y_pred)
        ranking_score = calculate_ndcg(
            y_true, y_pred, k=10, lower_is_better=lower_is_better
        )
    except Exception as err:
        # Best-effort: report the problem and fall back to NaN metrics
        print(f"    Warning: Error calculating metrics: {str(err)}")
        return {'pearson': np.nan, 'ndcg_at_10': np.nan}

    return {'pearson': correlation, 'ndcg_at_10': ranking_score}

### data preprocessing

def apply_environment_scaling(data: pd.DataFrame, traits: list) -> pd.DataFrame:
    """
    Standardize each trait within each environment.

    Works on a copy of ``data``. A trait with fewer than 50 non-missing values
    is left untouched, as is any environment with fewer than two non-missing
    values for the trait. Missing values are never modified.
    """
    data_scaled = data.copy()

    for trait in traits:
        has_value = data_scaled[trait].notna()
        if has_value.sum() < 50:
            continue

        print(f"Applying z-score transformation by environment for {trait}...")

        environments = data_scaled.loc[has_value, 'environment'].unique()
        for env in environments:
            env_mask = (data_scaled['environment'] == env) & has_value
            n_in_env = env_mask.sum()

            # A standard deviation needs at least two points
            if n_in_env <= 1:
                continue

            column = data_scaled.loc[env_mask, trait].values.reshape(-1, 1)
            standardized = StandardScaler().fit_transform(column)
            data_scaled.loc[env_mask, trait] = standardized.flatten()
            print(f"  Environment {env}: n={n_in_env}")

    return data_scaled

### cross-validation

def run_location_specific_cv(data: pd.DataFrame, traits: list, model_class=LGBMRegressor,
                           model_params: dict = None, n_iterations: int = 15, 
                           k_folds: int = 5, n_jobs: int = 10, apply_zscore: bool = True) -> tuple:
    """
    Run grouped k-fold cross-validation for genomic prediction with location-specific evaluation

    Strategy:
    - Split genotypes (not observations) into folds to prevent data leakage
    - Train on all observations from training genotypes across all environments
    - Predict for test genotypes and evaluate separately by location
    - Repeat with different random seeds for robust estimates

    Args:
        data: Model input with 'sample.id', 'location' and 'environment' columns,
            the trait columns, and feature columns whose names start with 'PC',
            'location_' or 'environment_'.
        traits: Names of the trait columns to predict.
        model_class: Regressor class instantiated as
            ``model_class(random_state=seed, **model_params)``.
        model_params: Keyword arguments for ``model_class``; a default set is
            used when None.
        n_iterations: Number of repeated cross-validation rounds.
        k_folds: Number of genotype folds per round.
        n_jobs: Currently unused; kept so existing callers keep working.
        apply_zscore: If True, traits are standardized within environment
            before cross-validation.

    Returns:
        A tuple ``(cv_results, summary_stats)``: one row per
        iteration/fold/trait/location, and the per trait/location summary.
        Two empty DataFrames are returned when no evaluation succeeded.

    Raises:
        ValueError: If ``k_folds`` is smaller than 2 or larger than the number
            of unique genotypes.
    """

    if model_params is None:
        model_params = {'max_depth': 3, 'learning_rate': 0.05, 'n_estimators': 500}

    # Column layout of the detailed results table
    result_columns = [
        'iteration', 'fold', 'trait', 'location', 'pearson',
        'ndcg_at_10', 'seed', 'n_observations', 'n_test_genotypes'
    ]
    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating onto an empty frame inside the loop is quadratic and makes
    # the resulting dtypes depend on how pandas treats empty frames.
    result_rows = []

    # Define feature columns (PCs + encoded location/env)
    feature_columns = [col for col in data.columns
                       if col.startswith(('PC', 'location_', 'environment_'))]

    # Get unique genotypes and locations
    genotypes = np.asarray(data['sample.id'].unique())
    locations = data['location'].unique()

    if k_folds < 2 or k_folds > len(genotypes):
        raise ValueError(
            f"k_folds must be between 2 and the number of genotypes "
            f"({len(genotypes)}), got {k_folds}"
        )

    print(f"Starting cross-validation with {len(genotypes)} genotypes")
    print(f"Traits: {', '.join(traits)}")
    print(f"Locations: {', '.join(map(str, locations))}")
    print(f"Feature columns: {len(feature_columns)} (PCs + encoded variables)")
    print(f"K-folds: {k_folds} | Iterations: {n_iterations}")
    print(f"Z-score transformation: {'ENABLED' if apply_zscore else 'DISABLED'}")
    print("Evaluation metrics: Pearson, NDCG@10 (higher is better)\n")

    # Apply environment scaling upfront if enabled
    if apply_zscore:
        data = apply_environment_scaling(data, traits)

    ### cross validation loop
    # Loop through iterations with different seeds
    for iter_num in range(1, n_iterations + 1):
        current_seed = 1000 + iter_num
        # Global seed kept for model classes that draw from numpy's global RNG
        np.random.seed(current_seed)

        print(f"Iteration {iter_num} of {n_iterations} (seed: {current_seed})")

        # Assign genotypes to folds with a seeded shuffle. GroupKFold without
        # shuffling is deterministic and ignores np.random.seed, so it would
        # produce the same folds in every iteration.
        rng = np.random.default_rng(current_seed)
        genotype_folds = np.array_split(rng.permutation(genotypes), k_folds)

        # Loop through folds
        for fold_idx, test_genotypes in enumerate(genotype_folds, 1):
            print(f"  Fold {fold_idx} of {k_folds}")

            # Loop through traits
            for trait in traits:
                try:
                    # Check if trait has enough data
                    trait_data = data[data[trait].notna()]
                    if len(trait_data) < 50:
                        print(f"    Warning: Not enough data for trait {trait}")
                        continue

                    # Split observations by whether their genotype is held out
                    is_test = trait_data['sample.id'].isin(test_genotypes)
                    train_data = trait_data[~is_test]
                    test_data = trait_data[is_test]

                    # Check if we have enough training data
                    if len(train_data) < 50:
                        print(f"    Warning: Not enough training data for trait {trait} (N={len(train_data)})")
                        continue

                    # train model
                    model = model_class(random_state=current_seed, **model_params)
                    model.fit(train_data[feature_columns], train_data[trait])

                    ### location-specific evaluation
                    for location in locations:
                        # Get test data for this location
                        test_data_loc = test_data[test_data['location'] == location].copy()

                        if len(test_data_loc) < 3:
                            continue

                        # Make predictions
                        test_data_loc['predictions'] = model.predict(test_data_loc[feature_columns])

                        # Average predictions and observations by genotype
                        # this accounts for multiple observations per genotype within location
                        genotype_averages = test_data_loc.groupby('sample.id').agg({
                            trait: 'mean',
                            'predictions': 'mean'
                        }).reset_index()

                        if len(genotype_averages) < 3:
                            print(f"    {trait} - {location} - Insufficient data (N={len(genotype_averages)})")
                            continue

                        # get Pearson and NDCG@10
                        eval_results = evaluate_predictions(
                            genotype_averages[trait].values,
                            genotype_averages['predictions'].values,
                            trait_name=trait
                        )

                        # Store results if valid
                        if np.isnan(eval_results['pearson']):
                            print(f"    {trait} - {location} - Invalid results")
                            continue

                        result_rows.append({
                            'iteration': iter_num,
                            'fold': fold_idx,
                            'trait': trait,
                            'location': location,
                            'pearson': eval_results['pearson'],
                            'ndcg_at_10': eval_results['ndcg_at_10'],
                            'seed': current_seed,
                            # number of test observations before genotype averaging
                            'n_observations': len(test_data_loc),
                            'n_test_genotypes': len(genotype_averages)
                        })

                        print(f"    {trait} - {location} - Pearson: {eval_results['pearson']:.3f}, "
                              f"NDCG@10: {eval_results['ndcg_at_10']:.3f} | N: {len(genotype_averages)}")

                except Exception as e:
                    # Best-effort: a failure in one trait/fold must not abort the run
                    print(f"    Error in trait {trait}, iteration {iter_num}, fold {fold_idx}: {str(e)}")
                    continue

    ### summarize results
    if not result_rows:
        print("No successful cross-validation results obtained")
        return pd.DataFrame(), pd.DataFrame()

    cv_results = pd.DataFrame(result_rows, columns=result_columns)

    metric_cols = ['pearson', 'ndcg_at_10']
    summary_stats = cv_results.groupby(['trait', 'location']).agg({
        **{col: ['mean', 'std', 'min', 'max'] for col in metric_cols},
        'n_test_genotypes': 'mean'
    }).round(4)

    # Flatten column names: metrics become '<metric>_<stat>', the genotype
    # count keeps its plain name
    summary_stats.columns = [f"{col}_{stat}" if col in metric_cols else col
                             for col, stat in summary_stats.columns]
    summary_stats = summary_stats.reset_index()

    print("\n=== Cross-Validation Summary by Location ===")
    print(summary_stats[['trait', 'location', 'pearson_mean', 'ndcg_at_10_mean']])

    return cv_results, summary_stats

## Execute Model Training and Evaluation

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from file_with_functions import run_location_specific_cv   # import the function from .py file if above code is saved in a .py file
import pandas as pd

# LightGBM hyperparameters
model_params = dict(max_depth=3, learning_rate=0.05, n_estimators=500)

# Prepared model input (PCs, phenotypes and one-hot encoded columns)
model_input = pd.read_pickle('../model_inputs/model_input.pkl')

# Traits to evaluate: trait means first, then the corresponding BLUEs
trait_names = [
    'PtHt_mean', 'PcleLng_mean', 'SdW_z_mean', 'TGW_mean',
    'SdLen_mean', 'DTF_mean', 'DTH_mean',
    'PtHt_blue', 'PcleLng_blue', 'SdW_z_blue', 'TGW_blue',
    'SdLen_blue', 'DTF_blue', 'DTH_blue',
]

# Run cross-validation with the specified traits
results, summary = run_location_specific_cv(
    data=model_input,
    traits=trait_names,
    model_class=LGBMRegressor,
    model_params=model_params,
    n_iterations=15,
    k_folds=5,
    apply_zscore=True,
)

# Persist the detailed per-fold results
results.to_pickle('../data/LightGBM_results.pkl')

print("\nResults saved:")
print("- Detailed results: ../data/LightGBM_results.pkl")
