# INTRO

**Author:** Stephan Cordogan

This notebook uses GenomicSEM to perform LD score regression (LDSC) between two Finngen summary statistics files [(a)](#Download-Finngen-Summary-Statistics), or between modified Finngen summary statistics [(b)](#Prepare-Modified-Finngen-File-if-desired) (Notebook 5), AllofUs summary statistics [(c)](#Prepare-Modified-AllofUs-European-file-if-desired) (Notebook 2.12), or AllofUs meta-analyzed summary statistics [(d)](#Prepare-Modified-AllofUs-Meta-Analysis-file-if-desired) (Notebook 2.12).  **Enter file names and other parameters when prompted** [(e)](#Download-and-Prepare-Necessary-Files).  Running LDSC directly is not possible on the AllofUs Researcher Workbench because it does not support the necessary Python environment, but it can be run locally.

# Install necessary packages

In [None]:
install.packages("devtools")
library(devtools)
install_github("GenomicSEM/GenomicSEM")

In [None]:
# Attach GenomicSEM. library() stops with an error when the package is not
# installed, whereas require() only returns FALSE and lets later cells fail.
library(GenomicSEM)

In [None]:
# Set the thread-count environment variables of the common numeric backends
# (OpenBLAS, OpenMP, MKL, numexpr, vecLib) to 1 in a single call.
Sys.setenv(
  OPENBLAS_NUM_THREADS = 1,
  OMP_NUM_THREADS = 1,
  MKL_NUM_THREADS = 1,
  NUMEXPR_NUM_THREADS = 1,
  VECLIB_MAXIMUM_THREADS = 1
)

In [None]:
library(data.table)

# Download and Prepare Necessary Files

## Download Finngen Summary Statistics

Sample summary statistics

In [None]:
# Copy the two FinnGen R12 example summary-statistics files (meniscus
# derangements and arthropathies) into the working directory, one gsutil
# call per file.
finngen_copy_cmds <- c(
  "gsutil cp gs://finngen-public-data-r12/summary_stats/release/finngen_R12_M13_MENISCUSDERANGEMENTS.gz .",
  "gsutil cp gs://finngen-public-data-r12/summary_stats/release/finngen_R12_M13_ARTHROPATHIES.gz ."
)
for (copy_cmd in finngen_copy_cmds) {
  system(copy_cmd)
}

In [None]:
# Copy the FinnGen R12 BMI (IRN) summary statistics into the working directory.
bmi_copy_cmd <- "gsutil cp gs://finngen-public-data-r12/summary_stats/release/finngen_R12_BMI_IRN.gz ."
system(bmi_copy_cmd)

## Download SNP list per author recommendation

In [None]:
# Download the gzipped HapMap3 SNP list and decompress it to w_hm3.snplist,
# the file later passed to munge() as `hm3`.
url <- "https://zenodo.org/record/7773502/files/w_hm3.snplist.gz"
destfile <- "w_hm3.snplist.gz"

# mode = "wb" keeps the gzip archive byte-exact on every platform.
download_status <- download.file(url, destfile, mode = "wb")
if (download_status != 0) {
  stop("Download of ", url, " failed with status ", download_status)
}

# -f makes the cell re-runnable: without it gzip refuses to overwrite an
# existing w_hm3.snplist and the stale copy would be used instead.
system("gzip -d -f w_hm3.snplist.gz")


## Download European LD Matrices 

This uses a downloaded LD matrix, although the LD matrix generated in notebook 4.1 could also be used.  

In [None]:
# Fetch the European reference-LD and weight-LD archives from the
# requester-pays bucket, then unpack both. Commands run in the same order as
# before: both downloads first, then both extractions.
ld_download_cmds <- c(
  "gsutil -u terra-vpc-sc-5934923d cp gs://broad-alkesgroup-public-requester-pays/LDSCORE/eur_ref_ld_chr.tar.bz2 .",
  "gsutil -u terra-vpc-sc-5934923d cp gs://broad-alkesgroup-public-requester-pays/LDSCORE/eur_w_ld_chr.tar.bz2 ."
)
ld_extract_cmds <- c(
  "tar -xjf eur_ref_ld_chr.tar.bz2",
  "tar -xjf eur_w_ld_chr.tar.bz2"
)

for (shell_cmd in c(ld_download_cmds, ld_extract_cmds)) {
  system(shell_cmd)
}

## Prepare Modified Finngen File if desired

In [None]:
# Copy data/processed_finn.tsv from the workspace bucket (taken from the
# WORKSPACE_BUCKET environment variable) into the working directory.
bucket <- Sys.getenv("WORKSPACE_BUCKET")
finn_source <- file.path(bucket, "data/processed_finn.tsv")
system(paste("gsutil cp", finn_source, "."))

In [None]:
# Load the modified FinnGen table and show its first rows.
processed_finn <- fread(input = "processed_finn.tsv")
head(processed_finn, n = 6L)

In [None]:
# Load the FinnGen meniscus-derangement summary statistics and show the
# first rows.
meniscus_data <- fread(input = "finngen_R12_M13_MENISCUSDERANGEMENTS.gz")
head(meniscus_data, n = 6L)

In [None]:
# Rename "#chrom" so the column can serve as a plain join key.
setnames(meniscus_data, "#chrom", "chrom")

# Attach rsids to processed_finn by matching on chromosome, position and
# both alleles. all.x = TRUE keeps every processed_finn row, so variants
# without a match in meniscus_data get NA in rsids.
variant_key <- c("chrom", "pos", "ref", "alt")
rsid_lookup <- meniscus_data[, c(variant_key, "rsids"), with = FALSE]
processed_finn <- merge(
  processed_finn,
  rsid_lookup,
  by = variant_key,
  all.x = TRUE
)

head(processed_finn)
head(meniscus_data)

In [None]:
# Map the modified-FinnGen column names (left) onto the FinnGen-style names
# (right) that process_file() expects. skip_absent = FALSE makes a missing
# source column an error.
finn_renames <- c(
  standard_error = "sebeta",
  af_alt_arthro = "af_alt",
  pval_m_fixed = "pval",
  chrom = "#chrom"
)
setnames(
  processed_finn,
  old = names(finn_renames),
  new = unname(finn_renames),
  skip_absent = FALSE
)

head(processed_finn)

# Write a tab-separated copy and gzip it (gzip appends ".gz" to the name).
output_file <- "processed_finn_1.tsv"
fwrite(processed_finn, file = output_file, sep = "\t")

system(paste("gzip", output_file))

## Prepare Modified AllofUs European file if desired

In [None]:
# Copy data/merged_eur.tsv from the workspace bucket (taken from the
# WORKSPACE_BUCKET environment variable) into the working directory.
bucket <- Sys.getenv("WORKSPACE_BUCKET")
eur_source <- file.path(bucket, "data/merged_eur.tsv")
system(paste("gsutil cp", eur_source, "."))

In [None]:
# Load the AllofUs European table and show its first rows.
aou_eur <- fread(input = "merged_eur.tsv")
head(aou_eur, n = 6L)

In [None]:
# Add three columns in one in-place update:
#   af_alt - the constant 0.5 for every variant (presumably a placeholder
#            because the file carries no allele frequency - TODO confirm)
#   chrom  - digits following "chr" at the start of `position`
#   pos    - digits following the first ":" in `position`
# sub() returns the input unchanged when the pattern does not match.
aou_eur[, `:=`(
  af_alt = 0.5,
  chrom = sub("^chr(\\d+):.*", "\\1", position),
  pos = sub("^chr\\d+:(\\d+).*", "\\1", position)
)]

head(aou_eur)

In [None]:
# Rename the standard-error and p-value columns to the FinnGen-style names;
# skip_absent = TRUE silently ignores either column if it is not present.
aou_renames <- c(standard_error = "sebeta", p_value = "pval")
setnames(
  aou_eur,
  old = names(aou_renames),
  new = unname(aou_renames),
  skip_absent = TRUE
)

Import rsids

In [None]:
# Reload the FinnGen meniscus file; it serves only as a lookup table for rsids.
meniscus_data <- fread(input = "finngen_R12_M13_MENISCUSDERANGEMENTS.gz")

# Rename "#chrom" so the column can serve as a plain join key.
setnames(meniscus_data, "#chrom", "chrom")

# chrom and pos were extracted as text; convert both to integer before
# joining.
aou_eur[, `:=`(chrom = as.integer(chrom), pos = as.integer(pos))]

# Attach rsids by matching on chromosome, position and both alleles.
# all.x = TRUE keeps every aou_eur row; unmatched variants get NA in rsids.
variant_key <- c("chrom", "pos", "ref", "alt")
rsid_lookup <- meniscus_data[, c(variant_key, "rsids"), with = FALSE]
aou_eur <- merge(
  aou_eur,
  rsid_lookup,
  by = variant_key,
  all.x = TRUE
)

# Restore the FinnGen-style header name that process_file() looks for.
setnames(aou_eur, "chrom", "#chrom")

head(aou_eur)


In [None]:
# Show column names and types of the merged AllofUs table.
utils::str(aou_eur)

In [None]:
# Show column names and types of the FinnGen lookup table for comparison.
utils::str(meniscus_data)

In [None]:
# Write aou_eur as a tab-separated file and gzip it (gzip appends ".gz").
output_file <- "aou_eur.tsv"
fwrite(aou_eur, file = output_file, sep = "\t")

system(paste("gzip", output_file))

## Prepare Modified AllofUs Meta-Analysis file if desired

In [None]:
# Copy the meta-analysis results from the workspace bucket, load them and
# show the first rows.
bucket <- Sys.getenv("WORKSPACE_BUCKET")
meta_source <- file.path(bucket, "data/meta_all1_GC.tsv.bgz")
system(paste("gsutil cp", meta_source, "."))
final_meta <- fread(input = "meta_all1_GC.tsv.bgz")
head(final_meta, n = 6L)

In [None]:
# Add five columns in one in-place update. The patterns read MarkerName as
# "chr<chrom>:<pos>_<ref>_<alt>":
#   af_alt - the constant 0.5 for every variant (presumably a placeholder
#            because the file carries no allele frequency - TODO confirm)
#   chrom, pos, ref, alt - the four pieces of MarkerName
# sub() returns the input unchanged when the pattern does not match.
final_meta[, `:=`(
  af_alt = 0.5,
  chrom = sub("^chr(\\d+):.*", "\\1", MarkerName),
  pos = sub("^chr\\d+:(\\d+)_.*", "\\1", MarkerName),
  ref = sub("^chr\\d+:\\d+_([A-Z]+)_.+", "\\1", MarkerName),
  alt = sub("^chr\\d+:\\d+_[A-Z]+_([A-Z]+)", "\\1", MarkerName)
)]

head(final_meta)

In [None]:
# Rename the standard-error and p-value columns to the FinnGen-style names;
# skip_absent = TRUE silently ignores either column if it is not present.
meta_renames <- c(StdErr = "sebeta", `P-value` = "pval")
setnames(
  final_meta,
  old = names(meta_renames),
  new = unname(meta_renames),
  skip_absent = TRUE
)

In [None]:
# Reload the FinnGen meniscus file; it serves only as a lookup table for rsids.
meniscus_data <- fread(input = "finngen_R12_M13_MENISCUSDERANGEMENTS.gz")

# Rename "#chrom" so the column can serve as a plain join key.
setnames(meniscus_data, "#chrom", "chrom")

# chrom and pos were extracted as text; convert both to integer before
# joining.
final_meta[, `:=`(chrom = as.integer(chrom), pos = as.integer(pos))]

# Attach rsids by matching on chromosome, position and both alleles.
# all.x = TRUE keeps every final_meta row; unmatched variants get NA in rsids.
variant_key <- c("chrom", "pos", "ref", "alt")
rsid_lookup <- meniscus_data[, c(variant_key, "rsids"), with = FALSE]
final_meta <- merge(
  final_meta,
  rsid_lookup,
  by = variant_key,
  all.x = TRUE
)

# Restore the FinnGen-style header name that process_file() looks for.
setnames(final_meta, "chrom", "#chrom")

head(final_meta)

In [None]:
# Write final_meta as a tab-separated file and gzip it (gzip appends ".gz").
output_file <- "final_meta.tsv"
fwrite(final_meta, file = output_file, sep = "\t")

system(paste("gzip", output_file))

## Function to calculate effective sample size for logistic regression 

In [None]:
#' Effective sample size for case/control studies
#'
#' @param cases_controls Vector of interleaved counts:
#'   c(cases_1, controls_1, cases_2, controls_2, ...).
#' @return Numeric vector with one value per (cases, controls) pair, computed
#'   as 4 / (1 / cases + 1 / controls). An empty input returns an empty
#'   numeric vector.
#' @details Stops with an error when the input has an odd number of elements.
calculate_effective_sample_size <- function(cases_controls) {
  n_values <- length(cases_controls)
  if (n_values %% 2 != 0) {
    stop("cases_controls must contain an even number of elements (pairs of cases and controls).")
  }

  # seq_len() yields an empty index for empty input, where seq(1, 0, by = 2)
  # would fail with a "wrong sign in 'by'" error.
  pair_index <- seq_len(n_values / 2)
  cases <- cases_controls[2 * pair_index - 1]
  controls <- cases_controls[2 * pair_index]

  4 / (1 / cases + 1 / controls)
}

# Identify desired GWAS summary statistics and set parameters

In [None]:
# USER INPUT: edit the four vectors in this cell. Entries at the same position
# describe the same trait, so all vectors must list the traits in one order.

# Gzipped summary-statistics files to analyse (names must end in ".gz").
input_files <- c("processed_finn_1.tsv.gz", "finngen_R12_BMI_IRN.gz")

# One label per trait; also used to build the "<name>.sumstats.gz" file names.
trait.names <- c("Meniscus_Derangements_finn", "BMI_IRN_finn")

# Interleaved (cases, controls) counts, one pair per trait.
cases_controls <- c(31840, 468508, 300000, 0)

# "logistic" or "linear" per trait. For a linear trait, put the total sample
# size in the cases slot and 0 in the controls slot.
type <- c("logistic", "linear")  # define the type- for linear, put the sample size in cases and 0 for controls


In [None]:
# Derive, for every trait, the sample size and the sample/population
# prevalences from `type` and `cases_controls`, and build the
# "<trait>.sumstats.gz" file names.

# Fail early on misaligned inputs; otherwise indexing past the end of
# cases_controls would silently fill the results with NA.
n_traits <- length(type)
if (length(trait.names) != n_traits || length(cases_controls) != 2 * n_traits) {
  stop("trait.names, type and cases_controls disagree: expected one type and one (cases, controls) pair per trait.")
}

traits <- paste0(trait.names, ".sumstats.gz")
sample_sizes <- numeric(n_traits)
sample_prev <- numeric(n_traits)
population_prev <- numeric(n_traits)

for (i in seq_len(n_traits)) {
  n_cases <- cases_controls[2 * i - 1]
  n_controls <- cases_controls[2 * i]

  if (type[i] == "logistic") {
    # Case/control trait: effective sample size plus the observed case fraction.
    # NOTE(review): GenomicSEM guidance is reportedly to use a sample
    # prevalence of 0.5 when N is an effective sample size, and the population
    # prevalence is simply set equal to the sample prevalence here - confirm
    # both choices before interpreting liability-scale estimates.
    sample_sizes[i] <- calculate_effective_sample_size(c(n_cases, n_controls))
    sample_prev[i] <- n_cases / (n_cases + n_controls)
    population_prev[i] <- sample_prev[i]
  } else if (type[i] == "linear") {
    # Continuous trait: the "cases" slot holds the total N; prevalences are NA.
    sample_sizes[i] <- n_cases
    sample_prev[i] <- NA
    population_prev[i] <- NA
  } else {
    stop("Unknown type specified. Use 'logistic' or 'linear'.")
  }
}

In [None]:
# Echo the file list, the derived sample sizes and the sample prevalences.
for (setting in list(input_files, sample_sizes, sample_prev)) {
  print(setting)
}

# Prepare files for Munging

In [None]:
#' Reformat one FinnGen-style summary-statistics file for munge()
#'
#' Reads `input_file`, adds N, Z and INFO columns, renames the FinnGen-style
#' columns, drops rows whose allele frequency is below 0.01 and writes the
#' result next to the input as "<input without .gz>_2.gz".
#'
#' @param input_file Path to a gzipped summary-statistics file ending in ".gz"
#'   with columns #chrom, pos, ref, alt, rsids, pval, beta, sebeta, af_alt.
#' @param sample_size Sample size stored in the N column of every row.
#' @return The exit status of the final gzip call (0 on success).
process_file <- function(input_file, sample_size) {
  # Without a ".gz" suffix the output name would equal the input name and
  # fwrite() would overwrite the source file. The pattern is kept identical to
  # the one the munge cell uses to locate the "_2.gz" files.
  if (!grepl(".gz$", input_file)) {
    stop("input_file must end in '.gz': ", input_file)
  }

  data <- fread(input_file)

  data[, N := sample_size]    # per-file sample size
  data[, Z := beta / sebeta]  # Z score from effect size and standard error
  data[, INFO := 0.99]        # constant INFO; not present in these sumstats

  # NOTE(review): ref is mapped to A1 and alt to A2 - confirm this matches the
  # effect-allele convention munge() expects for these files.
  setnames(data,
           old = c("#chrom", "pos", "ref", "alt", "rsids", "pval", "beta", "sebeta", "af_alt"),
           new = c("CHR", "POS", "A1", "A2", "SNP", "P", "effect", "se", "MAF"))

  # MAF holds af_alt as-is (not folded to the minor allele), so only variants
  # with a low alt-allele frequency are dropped here.
  data <- data[MAF >= 0.01]

  # "<name>.gz" -> "<name>_2"; gzip then produces "<name>_2.gz".
  uncompressed_file <- sub(".gz$", "_2", input_file)
  fwrite(data, uncompressed_file, sep = "\t")

  # -f lets a re-run replace an existing "_2.gz" instead of keeping the stale
  # one; shQuote() protects paths containing spaces or shell metacharacters.
  system(paste("gzip -f", shQuote(uncompressed_file)))
}


In [None]:
# Process both files with respective sample sizes
# Run process_file() once per (file, sample size) pair; the result is named
# by input file.
mapply(
  FUN = process_file,
  input_file = input_files,
  sample_size = sample_sizes
)


# Munge Files

In [None]:
# Point munge() at the "_2.gz" files written by process_file().
files <- sub(".gz$", "_2.gz", input_files)

# HapMap3 SNP list produced by the download cell.
hm3_snplist <- "w_hm3.snplist"

munge(
  files = files,
  hm3 = hm3_snplist,
  trait.names = trait.names,
  N = sample_sizes,
  info.filter = 0.8,
  maf.filter = 0.01,
  overwrite = TRUE,
  parallel = FALSE,
  cores = 1
)




# LDSC

In [None]:
# Directories unpacked from the European LD archives downloaded earlier.
ld <- "eur_ref_ld_chr/"
wld <- "eur_w_ld_chr/"

# Run LDSC on the munged files. The prevalence vectors are named sample_prev
# and population_prev where they are computed; passing the undefined
# sample.prev / population.prev objects made this call fail with
# "object not found".
ldsc_results <- ldsc(
  traits = traits,
  sample.prev = sample_prev,
  population.prev = population_prev,
  ld = ld,
  wld = wld,
  trait.names = trait.names,
  chr = 22,
  n.blocks = 200,
  stand = TRUE
)

print(ldsc_results$S)  # Genetic covariance matrix
print(ldsc_results$V)  # Variance-covariance matrix of genetic covariance
print(ldsc_results$I)  # Cross-trait intercepts
print(ldsc_results$N)  # Sample size matrix
print(ldsc_results$m)  # Number of SNPs used
