# BLUEs estimation for all numerical traits

The two locations (Australia and Pakistan) were analysed separately.

# Australia

In [1]:
# Load libraries
library(dplyr)    # Data manipulation
library(asreml)   # Mixed model analysis


# Display options for output
options(repr.matrix.max.rows = 1000, repr.matrix.max.cols = 200)


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Loading required package: Matrix



Online License checked out Sun Oct 26 11:48:54 2025


Loading ASReml-R version 4.2




In [2]:
# Read the raw Australian phenotype table.
# Empty cells and the literal string "NA" are both treated as missing values.
aus <- read.csv(
  "../data/AUS_phenotypes_raw.csv",
  header = TRUE,
  na.strings = c("", "NA"),
  stringsAsFactors = FALSE
)

# year and accession are used as categorical model terms, so store them as factors
aus_factor_cols <- c("year", "accession")
aus[aus_factor_cols] <- lapply(aus[aus_factor_cols], as.factor)
rm(aus_factor_cols)

## Functions

In [3]:
# Report whether a fitted model converged and whether its parameters were stable.
#
# Arguments:
#   model  a fitted model object exposing `$converge` and, optionally,
#          `$warn.list` (a collection of warning strings)
#
# Returns a list with:
#   converged  the value of `model$converge`
#   stable     FALSE if any warning contains the text "changed by more than",
#              TRUE otherwise (including when there are no warnings at all)
check_model_status <- function(model) {
  instability_marker <- "changed by more than"
  model_warnings <- model$warn.list

  # One logical per warning: does it mention a late change in the parameters?
  flagged <- !is.null(model_warnings) &&
    any(vapply(
      model_warnings,
      function(w) grepl(instability_marker, w, fixed = TRUE),
      logical(1)
    ))

  list(converged = model$converge, stable = !flagged)
}

# Assess residual normality through the correlation of the normal Q-Q points.
# (Used instead of Shapiro-Wilk, which is too sensitive for large datasets.)
#
# Arguments:
#   residuals                 numeric residuals; NA values are dropped
#   qq_correlation_threshold  correlations below this value are flagged as a
#                             severe deviation (default 0.95)
#
# Returns a list with:
#   correlation       Q-Q correlation, or NA_real_ when it cannot be computed
#   severe_deviation  TRUE when the correlation is below the threshold, or when
#                     the check could not be carried out
#   interpretation    text label; the label cut-offs (0.90 / 0.95 / 0.98) are
#                     fixed and do not depend on `qq_correlation_threshold`
check_residual_normality <- function(residuals, qq_correlation_threshold = 0.95) {
  # Common result shape for every situation in which no correlation is available
  unavailable <- function(reason) {
    list(
      correlation = NA_real_,
      severe_deviation = TRUE,
      interpretation = reason
    )
  }

  usable <- residuals[!is.na(residuals)]
  if (length(usable) < 3) {
    return(unavailable("Insufficient non-NA residuals for normality check"))
  }

  tryCatch(
    {
      qq_points <- qqnorm(usable, plot.it = FALSE)
      r <- cor(qq_points$x, qq_points$y, use = "complete.obs")

      if (is.na(r)) {
        unavailable("Unable to calculate Q-Q correlation")
      } else {
        # findInterval() gives 0..3 for the left-closed bands
        # (<0.90, [0.90, 0.95), [0.95, 0.98), >=0.98)
        band_labels <- c(
          "Substantial deviation from normality",
          "Moderate deviation from normality",
          "Acceptable normality",
          "Excellent normality"
        )
        list(
          correlation = r,
          severe_deviation = r < qq_correlation_threshold,
          interpretation = band_labels[findInterval(r, c(0.90, 0.95, 0.98)) + 1L]
        )
      }
    },
    error = function(e) {
      unavailable(paste("Error in normality check:", e$message))
    }
  )
}

# Function to calculate generalized heritability
#
# Computes H2 = 1 - mean(PEV) / sigma2_g, where PEV is the prediction error
# variance (squared standard error) of each accession random effect and
# sigma2_g is the accession variance component.
#
# Arguments:
#   model  fitted model with `accession` as a random term. The function reads
#          summary(model)$varcomp, coefficients(model)$random, model$sigma2
#          and model$vcoeff$random.
#
# Returns a single numeric value.
# Raises an error when there is not exactly one `accession` variance component
# or when no `accession_` random effects are found; callers that wrap this in
# tryCatch() then get a clean failure instead of a zero-length result.
gen_heritability <- function(model) {
  varcomp <- summary(model)$varcomp
  accession_row <- which(rownames(varcomp) == "accession")
  if (length(accession_row) != 1L) {
    stop("Expected exactly one 'accession' variance component, found ",
         length(accession_row), call. = FALSE)
  }
  # First column of the variance component table holds the component estimate
  sigma2_g <- varcomp[accession_row, 1]

  random_effects <- data.frame(coefficients(model)$random)
  random_effects$std.error <- sqrt(model$sigma2 * model$vcoeff$random)

  # Anchor the pattern so only effects of the `accession` term itself are kept
  is_accession <- grepl("^accession_", rownames(random_effects))
  if (!any(is_accession)) {
    stop("No random effects named 'accession_*' found in the model", call. = FALSE)
  }

  # Average the variances (se^2), not the standard errors: squaring the mean
  # standard error understates the mean PEV and so overstates heritability.
  mean_pev <- mean(random_effects$std.error[is_accession]^2)

  1 - mean_pev / sigma2_g
}

## Analyze traits for AUS

In [None]:
# Function to analyze multiple traits across years using a single model
#
# For every trait in `traits`, two ASReml models are fitted to the rows where
# the trait is not missing:
#   * BLUEs model:        response ~ accession + year (fixed), residual ~ units
#   * heritability model: response ~ year (fixed), accession random
# Predictions for year:accession are taken from the BLUEs model and restricted
# to the combinations actually observed for that trait.
#
# Arguments:
#   data                      data frame with columns `accession`, `year` and
#                             one column per trait
#   traits                    character vector of trait column names
#   output_dir                directory for the CSV results; diagnostic PDFs go
#                             to <output_dir>/diagnostic_plots
#   max_iterations            maximum number of update() refits per model while
#                             it is not converged or not stable
#   qq_correlation_threshold  forwarded to check_residual_normality()
#
# Returns a list with:
#   blues         data frame of predictions for all traits (NULL if none)
#   heritability  data frame with one row per trait whose heritability model
#                 was fitted
#
# Side effects: creates directories, writes trait_BLUEs_results.csv (only when
# there are results), trait_heritability_results.csv and one PDF per trait.
# An error in one trait is reported with message() and the loop moves on.
analyze_traits_combined_aus <- function(data, traits, output_dir = "results", 
                                  max_iterations = 20, qq_correlation_threshold = 0.95) {

  # Print data summary
  message("Data summary before analysis:")
  message(sprintf("Total rows: %d", nrow(data)))
  message("Rows per year:")
  print(table(data$year))
  
  # Create output directories (recursive, so output_dir is created as well)
  plot_dir <- file.path(output_dir, "diagnostic_plots")
  dir.create(plot_dir, recursive = TRUE, showWarnings = FALSE)
  
  # Initialize results containers
  all_results <- list()
  heritability_results <- data.frame(
    Trait = character(),
    Heritability = numeric(),
    Model_Converged = logical(),
    stringsAsFactors = FALSE
  )
  
  # Process each trait
  for (trait in traits) {
    tryCatch({
      # Prepare data for modeling
      message(sprintf("Processing trait: %s", trait))
      model_data <- data %>%
        select(all_of(c("accession", "year", trait))) %>%
        # Remove rows with NA in the trait column
        filter(!is.na(.data[[trait]]))
      
      # Skip if no data
      if (nrow(model_data) == 0) {
        message(sprintf("No data available for trait: %s. Skipping.", trait))
        next
      }
      
      # Store observed year:accession combinations for filtering predictions
      observed_combinations <- model_data %>%
        select(year, accession) %>%
        distinct()
      
      message(sprintf("Number of observed year:accession combinations: %d", nrow(observed_combinations)))
      
      # Rename trait column to 'response' for generic modeling
      names(model_data)[names(model_data) == trait] <- "response"
      
      # Fit model for BLUEs estimation (year and accession as fixed effects)
      model_blues <- asreml(
        fixed = response ~ accession + year,
        residual = ~ units,
        data = model_data,
        na.action = na.method(y = "exclude", x = "exclude"),
        trace = FALSE,
        workspace = "1gb"
      )
      
      # Refit until converged and stable, or until max_iterations refits were done
      iteration <- 1
      status_blues <- check_model_status(model_blues)
      
      while (iteration <= max_iterations && (!status_blues$converged || !status_blues$stable)) {
        model_blues <- update(model_blues)
        status_blues <- check_model_status(model_blues)
        iteration <- iteration + 1
      }
      
      # Fit separate model for heritability calculation (accession as random effect)
      model_h2 <- asreml(
        fixed = response ~ year,
        random = ~ accession,
        residual = ~ units,
        data = model_data,
        na.action = na.method(y = "exclude", x = "exclude"),
        trace = FALSE,
        workspace = "1gb"
      )
      
      # Same refit loop for the heritability model
      iteration_h2 <- 1
      status_h2 <- check_model_status(model_h2)
      
      while (iteration_h2 <= max_iterations && (!status_h2$converged || !status_h2$stable)) {
        model_h2 <- update(model_h2)
        status_h2 <- check_model_status(model_h2)
        iteration_h2 <- iteration_h2 + 1
      }
      
      # Calculate heritability using the random effects model; a failure here
      # is recorded as NA rather than aborting the trait
      h2 <- tryCatch({
        gen_heritability(model_h2)
      }, error = function(e) {
        message(sprintf("Error calculating heritability for trait %s: %s", 
                      trait, e$message))
        return(NA)
      })
      
      # Add heritability to results
      heritability_results <- rbind(heritability_results, 
                                  data.frame(Trait = trait,
                                           Heritability = h2,
                                           Model_Converged = status_h2$converged && status_h2$stable))
      
      # Extract residuals and check normality (using BLUEs model)
      residuals <- resid(model_blues)
      normality_check <- check_residual_normality(residuals, qq_correlation_threshold)
      
      # Use classify approach instead of newdata for predictions
      pred <- predict(model_blues, 
                     classify = "year:accession", 
                     vcov = TRUE,
                     sed = TRUE,
                     pworkspace = 64e6)
      
      # Extract prediction dataframe
      pred_df <- pred$pvals
      
      # Check if predictions are available
      if (is.null(pred_df) || nrow(pred_df) == 0) {
        message(sprintf("No predictions available for trait: %s. Skipping.", trait))
        next
      }
      
      message(sprintf("Total predictions generated: %d", nrow(pred_df)))
      
      # Keep only the year:accession combinations that were actually observed
      pred_df_filtered <- pred_df %>%
        inner_join(observed_combinations, by = c("year", "accession"))
      
      message(sprintf("Predictions after filtering to observed combinations: %d", nrow(pred_df_filtered)))
      
      # Scalar condition, so a plain if/else is used rather than ifelse()
      convergence_label <- if (status_blues$converged && status_blues$stable) {
        "Fully converged"
      } else {
        "Check convergence"
      }
      
      # Create results dataframe using filtered predictions
      blues_df <- data.frame(
        trait = trait,
        year = pred_df_filtered$year,
        accession = pred_df_filtered$accession,
        BLUE = pred_df_filtered$predicted.value,
        SE = pred_df_filtered$std.error,
        Convergence_Status = convergence_label,
        # `iteration` starts at 1, so this is the number of update() refits
        Iterations = iteration - 1,
        QQ_Correlation = normality_check$correlation,
        Normality_Status = normality_check$interpretation
      )
      
      # Generate diagnostic plots. The device is closed in `finally` so that a
      # failing plot call cannot leave the PDF device open for later traits.
      pdf(file.path(plot_dir, paste0(trait, "_diagnostics.pdf")))
      tryCatch({
        par(mfrow = c(2, 2))
        
        # Residuals vs Fitted
        plot(fitted(model_blues), residuals,
             main = paste0(trait, "\nResiduals vs Fitted"),
             xlab = "Fitted values", ylab = "Residuals")
        abline(h = 0, col = "red", lty = 2)
        
        # Q-Q plot
        qqnorm(residuals, 
               main = paste0("Normal Q-Q Plot\nCorrelation: ", 
                           round(normality_check$correlation, 3)))
        qqline(residuals, col = "red")
        
        # Histogram of residuals
        hist(residuals, 
             main = paste0("Histogram of residuals\n", 
                          normality_check$interpretation),
             breaks = 30)
        
        # Box plot by year
        boxplot(response ~ year, data = model_data,
                main = "Distribution by Year",
                xlab = "Year", ylab = trait)
      }, finally = dev.off())
      
      # Store results
      all_results[[trait]] <- blues_df
      
    }, error = function(e) {
      # Nothing was stored for this trait, so reporting the failure is enough
      message(sprintf("Error in trait %s: %s", trait, e$message))
    })
  }
  
  # Combine all results into a single dataframe (if any)
  if (length(all_results) > 0) {
    final_results <- do.call(rbind, all_results)
    
    # Write results to CSV files
    write.csv(final_results, 
              file = file.path(output_dir, "trait_BLUEs_results.csv"), 
              row.names = FALSE)
  } else {
    final_results <- NULL
    message("No results were generated for any traits.")
  }
  
  write.csv(heritability_results,
            file = file.path(output_dir, "trait_heritability_results.csv"),
            row.names = FALSE)
  
  return(list(
    blues = final_results,
    heritability = heritability_results
  ))
}

In [5]:
# List the column names of the Australian data frame (to pick the trait columns)
names(aus)

In [None]:
# Run the BLUEs / heritability pipeline on the seven Australian traits.
# CSV results and diagnostic plots are written under ./BLUEs_results_AUS.
results_aus <- analyze_traits_combined_aus(
  data = aus,
  traits = c(
    'DTF',
    'DTH',
    'PtHt',
    'PcleLng',
    'SdLen',
    'TGW',
    'SdWpPlot_z'
  ),
  output_dir = './BLUEs_results_AUS'
)

Data summary before analysis:

Total rows: 2441

Rows per year:




2017 2018 2019 
 953 1051  437 


Processing trait: DTF

Number of observed year:accession combinations: 1138

Total predictions generated: 1668

Predictions after filtering to observed combinations: 1138

Processing trait: DTH

Number of observed year:accession combinations: 1121

Total predictions generated: 1650

Predictions after filtering to observed combinations: 1121

Processing trait: PtHt

Number of observed year:accession combinations: 1908

Total predictions generated: 2868

Predictions after filtering to observed combinations: 1908

Processing trait: PcleLng

Number of observed year:accession combinations: 1136

Total predictions generated: 1664

Predictions after filtering to observed combinations: 1136

Processing trait: SdLen

Number of observed year:accession combinations: 1104

Total predictions generated: 1632

Predictions after filtering to observed combinations: 1104

Processing trait: TGW

Number of observed year:accession combinations: 1101

Total predictions generated: 1630

Predictions after fil

# Pakistan

In [7]:
# Read the raw Pakistani phenotype table.
# Empty cells and the literal string "NA" are both treated as missing values.
pak <- read.csv(
  "../data/PAK_phenotypes_raw.csv",
  header = TRUE,
  na.strings = c("", "NA"),
  stringsAsFactors = FALSE
)

# accession, trial_replicate and year are categorical model terms: store as factors
pak_factor_cols <- c("accession", "trial_replicate", "year")
pak[pak_factor_cols] <- lapply(pak[pak_factor_cols], as.factor)
rm(pak_factor_cols)

In [10]:
# same as AUS, but adding year:trial_replicate in the model

# Function to analyze multiple traits across years using a single model
#
# For every trait in `traits`, two ASReml models are fitted to the rows where
# the trait is not missing:
#   * BLUEs model:        response ~ accession + year (fixed),
#                         year:trial_replicate random, residual ~ units
#   * heritability model: response ~ year (fixed),
#                         accession + year:trial_replicate random
# Predictions for year:accession are taken from the BLUEs model and restricted
# to the combinations actually observed for that trait.
#
# Arguments:
#   data                      data frame with columns `accession`, `year`,
#                             `trial_replicate` and one column per trait
#   traits                    character vector of trait column names
#   output_dir                directory for the CSV results; diagnostic PDFs go
#                             to <output_dir>/diagnostic_plots
#   max_iterations            maximum number of update() refits per model while
#                             it is not converged or not stable
#   qq_correlation_threshold  forwarded to check_residual_normality()
#
# Returns a list with:
#   blues         data frame of predictions for all traits (NULL if none)
#   heritability  data frame with one row per trait whose heritability model
#                 was fitted
#
# Side effects: creates directories, writes trait_BLUEs_results.csv (only when
# there are results), trait_heritability_results.csv and one PDF per trait.
# An error in one trait is reported with message() and the loop moves on.
analyze_traits_combined_pak <- function(data, traits, output_dir = "results", 
                                  max_iterations = 20, qq_correlation_threshold = 0.95) {
  
  # Print data summary
  message("Data summary before analysis:")
  message(sprintf("Total rows: %d", nrow(data)))
  message("Rows per year:")
  print(table(data$year))
  
  # Create output directories (recursive, so output_dir is created as well)
  plot_dir <- file.path(output_dir, "diagnostic_plots")
  dir.create(plot_dir, recursive = TRUE, showWarnings = FALSE)
  
  # Initialize results containers
  all_results <- list()
  heritability_results <- data.frame(
    Trait = character(),
    Heritability = numeric(),
    Model_Converged = logical(),
    stringsAsFactors = FALSE
  )
  
  # Process each trait
  for (trait in traits) {
    tryCatch({
      # Prepare data for modeling
      message(sprintf("Processing trait: %s", trait))
      model_data <- data %>%
        select(all_of(c("accession", "year", "trial_replicate", trait))) %>%
        # Remove rows with NA in the trait column
        filter(!is.na(.data[[trait]]))
      
      # Skip if no data
      if (nrow(model_data) == 0) {
        message(sprintf("No data available for trait: %s. Skipping.", trait))
        next
      }
      
      # Store observed year:accession combinations for filtering predictions
      observed_combinations <- model_data %>%
        select(year, accession) %>%
        distinct()
      
      message(sprintf("Number of observed year:accession combinations: %d", nrow(observed_combinations)))

      # Rename trait column to 'response' for generic modeling
      names(model_data)[names(model_data) == trait] <- "response"
      
      # Fit model for BLUEs estimation (year and accession as fixed effects)
      model_blues <- asreml(
        fixed = response ~ accession + year,
        random = ~ year:trial_replicate, 
        residual = ~ units,
        data = model_data,
        na.action = na.method(y = "exclude", x = "exclude"),
        trace = FALSE,
        workspace = "1gb"
      )
      
      # Refit until converged and stable, or until max_iterations refits were done
      iteration <- 1
      status_blues <- check_model_status(model_blues)
      
      while (iteration <= max_iterations && (!status_blues$converged || !status_blues$stable)) {
        model_blues <- update(model_blues)
        status_blues <- check_model_status(model_blues)
        iteration <- iteration + 1
      }
      
      # Fit separate model for heritability calculation (accession as random effect)
      model_h2 <- asreml(
        fixed = response ~ year,
        random = ~ accession + year:trial_replicate,
        residual = ~ units,
        data = model_data,
        na.action = na.method(y = "exclude", x = "exclude"),
        trace = FALSE,
        workspace = "1gb"
      )
      
      # Same refit loop for the heritability model
      iteration_h2 <- 1
      status_h2 <- check_model_status(model_h2)
      
      while (iteration_h2 <= max_iterations && (!status_h2$converged || !status_h2$stable)) {
        model_h2 <- update(model_h2)
        status_h2 <- check_model_status(model_h2)
        iteration_h2 <- iteration_h2 + 1
      }
      
      # Calculate heritability using the random effects model; a failure here
      # is recorded as NA rather than aborting the trait
      h2 <- tryCatch({
        gen_heritability(model_h2)
      }, error = function(e) {
        message(sprintf("Error calculating heritability for trait %s: %s", 
                      trait, e$message))
        return(NA)
      })
      
      # Add heritability to results
      heritability_results <- rbind(heritability_results, 
                                  data.frame(Trait = trait,
                                           Heritability = h2,
                                           Model_Converged = status_h2$converged && status_h2$stable))
      
      # Extract residuals and check normality (using BLUEs model)
      residuals <- resid(model_blues)
      normality_check <- check_residual_normality(residuals, qq_correlation_threshold)
      
      # Use classify approach instead of newdata for predictions
      pred <- predict(model_blues, 
                     classify = "year:accession", 
                     vcov = TRUE,
                     sed = TRUE,
                     pworkspace = 64e6)
      
      # Extract prediction dataframe
      pred_df <- pred$pvals
      
      # Check if predictions are available
      if (is.null(pred_df) || nrow(pred_df) == 0) {
        message(sprintf("No predictions available for trait: %s. Skipping.", trait))
        next
      }

      message(sprintf("Total predictions generated: %d", nrow(pred_df)))
      
      # Keep only the year:accession combinations that were actually observed
      pred_df_filtered <- pred_df %>%
        inner_join(observed_combinations, by = c("year", "accession"))
      
      message(sprintf("Predictions after filtering to observed combinations: %d", nrow(pred_df_filtered)))
      
      # Scalar condition, so a plain if/else is used rather than ifelse()
      convergence_label <- if (status_blues$converged && status_blues$stable) {
        "Fully converged"
      } else {
        "Check convergence"
      }
      
      # Create results dataframe using filtered predictions
      blues_df <- data.frame(
        trait = trait,
        year = pred_df_filtered$year,
        accession = pred_df_filtered$accession,
        BLUE = pred_df_filtered$predicted.value,
        SE = pred_df_filtered$std.error,
        Convergence_Status = convergence_label,
        # `iteration` starts at 1, so this is the number of update() refits
        Iterations = iteration - 1,
        QQ_Correlation = normality_check$correlation,
        Normality_Status = normality_check$interpretation
      )
      
      # Generate diagnostic plots. The device is closed in `finally` so that a
      # failing plot call cannot leave the PDF device open for later traits.
      pdf(file.path(plot_dir, paste0(trait, "_diagnostics.pdf")))
      tryCatch({
        par(mfrow = c(2, 2))
        
        # Residuals vs Fitted
        plot(fitted(model_blues), residuals,
             main = paste0(trait, "\nResiduals vs Fitted"),
             xlab = "Fitted values", ylab = "Residuals")
        abline(h = 0, col = "red", lty = 2)
        
        # Q-Q plot
        qqnorm(residuals, 
               main = paste0("Normal Q-Q Plot\nCorrelation: ", 
                           round(normality_check$correlation, 3)))
        qqline(residuals, col = "red")
        
        # Histogram of residuals
        hist(residuals, 
             main = paste0("Histogram of residuals\n", 
                          normality_check$interpretation),
             breaks = 30)
        
        # Box plot by year
        boxplot(response ~ year, data = model_data,
                main = "Distribution by Year",
                xlab = "Year", ylab = trait)
      }, finally = dev.off())
      
      # Store results
      all_results[[trait]] <- blues_df
      
    }, error = function(e) {
      # Nothing was stored for this trait, so reporting the failure is enough
      message(sprintf("Error in trait %s: %s", trait, e$message))
    })
  }
  
  # Combine all results into a single dataframe (if any)
  if (length(all_results) > 0) {
    final_results <- do.call(rbind, all_results)
    
    # Write results to CSV files
    write.csv(final_results, 
              file = file.path(output_dir, "trait_BLUEs_results.csv"), 
              row.names = FALSE)
  } else {
    final_results <- NULL
    message("No results were generated for any traits.")
  }
  
  write.csv(heritability_results,
            file = file.path(output_dir, "trait_heritability_results.csv"),
            row.names = FALSE)
  
  return(list(
    blues = final_results,
    heritability = heritability_results
  ))
}

In [11]:
# Run the BLUEs / heritability pipeline on the seven Pakistani traits.
# CSV results and diagnostic plots are written under
# ./BLUEs_results/BLUEs_results_PAK.
results_pak <- analyze_traits_combined_pak(
  data = pak,
  traits = c(
    'DTF',
    'DTHarvM',
    'PtHt',
    'PcleLng',
    'SdLen',
    'TGW',
    'SdWpPt_z'
  ),
  output_dir = './BLUEs_results/BLUEs_results_PAK'
)

Data summary before analysis:

Total rows: 1167

Rows per year:




2019-20 2020-21 2021-22 
    269     347     551 


Processing trait: DTF

Number of observed year:accession combinations: 843

"Some components changed by more than 1% on the last iteration"
Total predictions generated: 1032

Predictions after filtering to observed combinations: 843

Processing trait: DTHarvM

Number of observed year:accession combinations: 793

"Some components changed by more than 1% on the last iteration"
Total predictions generated: 987

Predictions after filtering to observed combinations: 793

Processing trait: PtHt

Number of observed year:accession combinations: 823

Total predictions generated: 1032

Predictions after filtering to observed combinations: 823

Processing trait: PcleLng

Number of observed year:accession combinations: 837

Total predictions generated: 1029

Predictions after filtering to observed combinations: 837

Processing trait: SdLen

Number of observed year:accession combinations: 773

Total predictions generated: 969

Predictions after filtering to observed combinations: 773

Processing tr