# INTRO

**Author:** Stephan Cordogan

This notebook begins with a Python 3 kernel in the Hail environment, and should then be switched to an R kernel in the general environment, with at least 500 GB of disk space and 200 GB of RAM. It generates PLINK files for the cohort created in notebook 1, filtered to include only variants present in your summary statistics file in order to limit their size, and then uses the summary statistics file together with a computed LD matrix to generate a PRS, which is tested on your cohort. **Specify the desired minor allele frequency of the genotype files and the maximum tolerable frequency of unsequenced alleles [(a)](#Specify-parameters-of-PLINK-files), the name and path of your summary statistics [(b)](#Filter-by-presence-in-sumstats-to-decrease-size-of-PLINK-files) [(c)](#Load-in-Necessary-Files) [(d)](#Load-and-modify-base-summary-statistics), and the cases and controls of your summary statistics [(e)](#Enter-cases-and-controls-in-n_eff).**

# Create necessary files

## Import Necessary Packages 

In [None]:
from datetime import datetime
import os
import pandas as pd
import hail as hl
hl.init(default_reference = "GRCh38")

In [None]:
# start = datetime.now()
bucket = os.getenv('WORKSPACE_BUCKET')
bucket

In [None]:
!gsutil ls $WORKSPACE_BUCKET/data

## Prepare ld panel files

In [None]:
mt_path = os.getenv("WGS_ACAF_THRESHOLD_SPLIT_HAIL_PATH")
mt = hl.read_matrix_table(mt_path)

In [None]:
# Drop every sample listed in the relatedness-flagged samples file.
related_samples_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv"

# Read `sample_id` as a string and key the table by it so that it can be
# joined against the MatrixTable column key.
related_remove = hl.import_table(
    related_samples_path,
    key="sample_id",
    types={"sample_id": "tstr"},
)

# related_remove.count()  # optional: how many samples are flagged
# anti_join_cols keeps only the columns whose key is NOT in `related_remove`.
mt = mt.anti_join_cols(related_remove)

In [None]:
# Restrict the MatrixTable to samples that have a row in the phenotype table,
# then attach that row to each sample as the `pheno` column field.
phenotype_filename = f'{bucket}/data/genomics_phenotypes.tsv'
phenotype_filename
phenotypes = hl.import_table(
    phenotype_filename,
    key='person_id',
    impute=True,  # infer column types; person_id is forced to string below
    types={'person_id': hl.tstr},
)
mt = mt.semi_join_cols(phenotypes)
mt = mt.annotate_cols(pheno=phenotypes[mt.s])

In [None]:
mt.describe()

In [None]:
mt.count()

In [None]:
# Sample-level fields later passed to hl.export_plink.
# `affection` uses 2 when has_pheno == 1, 1 for any other defined value,
# and -9 when has_pheno is missing.
has_pheno = mt.pheno.has_pheno
case_control_code = hl.if_else(has_pheno == 1, 2, 1)

mt = mt.annotate_cols(
    family_id=mt.s,
    sample_id=mt.s,
    paternal_id='0',
    maternal_id='0',
    affection=hl.if_else(hl.is_defined(has_pheno), case_control_code, -9),
)


## Specify parameters of PLINK files

In [None]:
# ---- Parameters to adjust for your analysis ----
# Variants are kept when the smallest entry of their allele-frequency array
# exceeds this value.
maf_threshold = 0.01
# Variants are kept when the fraction of missing genotype calls is at most
# this value.
threshold = 0.05

# ---- Keep samples whose predicted ancestry is "eur" ----
ancestry_pred_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv"
ancestry_pred = hl.import_table(
    ancestry_pred_path,
    key="research_id",
    impute=True,
    types={"research_id": "tstr", "pca_features": hl.tarray(hl.tfloat)},
)
mt = mt.annotate_cols(ancestry_pred=ancestry_pred[mt.s])
mt = mt.filter_cols(mt.ancestry_pred.ancestry_pred == "eur")

# ---- Allele-frequency filter (computed on the remaining samples) ----
mt = mt.annotate_rows(info=hl.agg.call_stats(mt.GT, mt.alleles))
mt = mt.filter_rows(hl.min(mt.info.AF) > maf_threshold, keep=True)

print("MatrixTable count:", mt.count())

# ---- Missingness filter ----
PRSICE2_mt = mt

# Proportion of missing genotype entries per variant (row)
PRSICE2_mt = PRSICE2_mt.annotate_rows(
    missingness=hl.agg.mean(hl.is_missing(PRSICE2_mt.GT))
)
PRSICE2_mt = PRSICE2_mt.filter_rows(PRSICE2_mt.missingness <= threshold)

print("MatrixTable count:", PRSICE2_mt.count())

# ---- Persist the filtered MatrixTable to the workspace bucket ----
mt_save_path = f'{bucket}/data/PRSICE2_mt.mt'
PRSICE2_mt.write(mt_save_path)


In [None]:
mt_save_path = f'{bucket}/data/PRSICE2_mt.mt'
PRSICE2_mt = hl.read_matrix_table(mt_save_path)

In [None]:
# first_locus = PRSICE2_mt.rows().take(1)
# print(first_locus[0].locus)

In [None]:
PRSICE2_mt.describe()

## Filter by presence in sumstats to decrease size of PLINK files

In [None]:
log_reg_finn_path = f'{bucket}/data/processed_finn.tsv'

In [None]:
import os
print(os.getcwd())

In [None]:
variant_table = hl.import_table(
    log_reg_finn_path, 
    no_header=False,  
    impute=True  
)

In [None]:
variant_table.show(5)


In [None]:
# replace 23 with X
# The comparison is done on the string form of `chrom` so that it works
# whether impute=True typed the column as an integer (all-numeric codes) or
# as a string (e.g. when the file already contains "X"); comparing a string
# column with the integer 23 would raise a type error.
chrom_str = hl.str(variant_table.chrom)
variant_table = variant_table.annotate(
    chrom=hl.if_else(chrom_str == "23", "X", chrom_str)
)



In [None]:
variant_table.show(5)

## Annotate columns, filter for presence in summary statistics, write to PLINK files

In [None]:
# Key the summary-statistics table by (locus, ref, alt), keep only the
# biallelic MatrixTable rows that appear in it, and export the result as
# PLINK files.

# Free the name `locus` for the Hail locus built below; the original column
# is kept as `full_locus`.
variant_table = variant_table.rename({'locus': 'full_locus'})

# Build a GRCh38 locus from the chrom/pos columns ('chr' prefix is added here).
variant_table = variant_table.annotate(
    locus=hl.locus('chr' + hl.str(variant_table.chrom), hl.int32(variant_table.pos), reference_genome='GRCh38')
)

variant_table = variant_table.key_by('locus', 'ref', 'alt')

# Keep biallelic rows only, so alleles[0]/alleles[1] below are ref/alt.
PRSICE2_mt = PRSICE2_mt.filter_rows(hl.len(PRSICE2_mt.alleles) == 2)

# Re-annotate the sample fields used by export_plink and add `is_female`
# (True / False, or missing when pheno.is_female is missing).
PRSICE2_mt = PRSICE2_mt.annotate_cols(
    family_id=PRSICE2_mt.family_id,     
    sample_id=PRSICE2_mt.sample_id,     
    paternal_id=PRSICE2_mt.paternal_id, 
    maternal_id=PRSICE2_mt.maternal_id, 
    is_female=hl.if_else(               
        PRSICE2_mt.pheno.is_female, True,
        hl.if_else(hl.is_defined(PRSICE2_mt.pheno.is_female), False, hl.null(hl.tbool))
    )
)

#Filter Rows based on Presence in Variant Table
# A row is kept when its (locus, ref, alt) key has a match in variant_table.

PRSICE2_mt = PRSICE2_mt.filter_rows(
    hl.is_defined(variant_table[hl.struct(
        locus=PRSICE2_mt.locus,
        ref=PRSICE2_mt.alleles[0],
        alt=PRSICE2_mt.alleles[1]
    )])
)


print("MatrixTable count:", PRSICE2_mt.count())

# PRSICE2_mt.count()

# Prefix of the exported PLINK files in the workspace bucket.
output_path = f'{bucket}/data/ld_panel/ld_panel_files'

hl.export_plink(
    dataset=PRSICE2_mt,  # Use the fully re-annotated MatrixTable
    output=output_path,
    fam_id=PRSICE2_mt.family_id,       
    ind_id=PRSICE2_mt.sample_id,       
    pat_id=PRSICE2_mt.paternal_id,     
    mat_id=PRSICE2_mt.maternal_id,     
    is_female=PRSICE2_mt.is_female,    
    pheno=PRSICE2_mt.affection            
)

In [None]:
!gsutil ls {bucket}/data/ld_panel

## Prepare covariate file with age if desired

In [None]:
# Promote age to a top-level column field BEFORE extracting the column table.
# Hail objects are immutable, so a cols() snapshot taken before this
# annotation would not contain `age_yrs`, and the covariate export reads
# cols_table['age_yrs'].
mt = mt.annotate_cols(age_yrs=mt.pheno.age_yrs)

cols_table = mt.cols()  # Extract the column annotations into a Table

mt.describe()

In [None]:
# Covariate table: FID and IID both carry the sample ID `s`, and `age`
# carries the `age_yrs` column field. The result is exported as a TSV to the
# workspace bucket.
sample_id = cols_table.s
covariates_table = cols_table.select(
    FID=sample_id,
    IID=sample_id,
    age=cols_table.age_yrs,
)

cov_save_path = f'{bucket}/data/covariates_table.tsv'
covariates_table.export(cov_save_path)

# !gsutil cp ld_panel_files.* {panel_save_path}

# Run PRS across files

## Switch to R Environment

Using Instructions from https://privefl.github.io/bigsnpr-extdoc/polygenic-scores-pgs.html

## Install Packages

In [None]:
# Install system libraries (presumably the build dependencies of the R
# packages installed below).
# `-y` answers apt's confirmation prompt: system() runs non-interactively, so
# without it apt-get aborts as soon as it has to ask "Do you want to continue?".
system("sudo apt-get install -y libblas-dev liblapack-dev")
system("sudo apt-get install -y gfortran")
system("sudo apt-get install -y build-essential")
system("sudo apt-get install -y libcurl4-openssl-dev libssl-dev")

In [None]:
system('dpkg -l | grep -E "libblas-dev|liblapack-dev|gfortran|build-essential|libcurl4-openssl-dev|libssl-dev"')

In [None]:
system('dpkg -l | grep -E "libblas-dev|liblapack-dev|gfortran|build-essential|libcurl4-openssl-dev|libssl-dev"', intern = TRUE)

In [None]:
install.packages("bigsnpr", verbose = TRUE)

In [None]:
library(bigsnpr)

## Load in Necessary Files

In [None]:
# Copy the summary statistics, the LD-panel PLINK files and the covariate
# table from the workspace bucket into the working directory.
# The bucket is read from WORKSPACE_BUCKET so the notebook works in any
# workspace; the previously hard-coded bucket remains the fallback when the
# variable is not set.
bucket <- Sys.getenv(
  "WORKSPACE_BUCKET",
  unset = "gs://fc-secure-f8b6c774-d7fc-411b-90cf-bc425b194d81"
)

system(paste0("gsutil cp ", bucket, "/data/processed_finn.tsv ."))
system(paste0("gsutil cp -r ", bucket, "/data/ld_panel/ ."))
system(paste0("gsutil cp -r ", bucket, "/data/covariates_table.tsv ."))

## Begin workflow

### Designate components of target files

In [None]:
# Restrict BLAS to one thread and disable bigstatsr's parallel-BLAS check
# (presumably so that BLAS threading does not compete with the explicit
# `ncores` parallelism used below -- confirm against the bigstatsr docs).
bigparallelr::set_blas_ncores(1)
options(bigstatsr.check.parallel.blas = FALSE)
# Core count passed as `ncores` to the bigsnpr/bigstatsr calls in this notebook.
NCORES <- nb_cores()

In [None]:
# Paths of the PLINK bed file and of its bigSNP (.rds) counterpart
bedfile <- "ld_panel/ld_panel_files.bed"
rdsfile <- "ld_panel/ld_panel_files.rds"

# Convert the bed file only when the .rds file is not already on disk;
# snp_readBed2() returns the path of the .rds file it creates.
if (!file.exists(rdsfile)) {
  rdsfile <- snp_readBed2(bedfile, ncores = NCORES)
}

# Attach the bigSNP object and show its top-level structure
obj.bigsnp <- snp_attach(rdsfile)
str(obj.bigsnp, max.level = 2)

# Shortcuts to its three components
G <- obj.bigsnp[["genotypes"]]  # genotype matrix
map <- obj.bigsnp[["map"]]      # variant information
fam <- obj.bigsnp[["fam"]]      # sample information

In [None]:
# NCORES <- nb_cores()
# Keep only the four variant columns used for matching, under the names
# expected by snp_match(): chr, pos, a0 (from allele2) and a1 (from allele1).
map <- dplyr::select(
  obj.bigsnp$map,
  chr = chromosome,
  pos = physical.pos,
  a0 = allele2,
  a1 = allele1
)

### Load and modify base summary statistics

In [None]:
# file_path <- "finn_formatted_sumstats.tsv"
file_path <- "processed_finn.tsv"

# Show the header line and the first two records of the file
readLines(file_path, n = 3)

# Columns to read: names in the file (left) -> names used downstream (right)
sumstats_columns <- c(
  chrom          = "chr",
  pos            = "pos",
  ref            = "a0",
  alt            = "a1",
  beta           = "beta",
  standard_error = "beta_se",
  af_alt_arthro  = "freq"
)

sumstats <- bigreadr::fread2(
  file_path,
  sep = "\t",
  select = names(sumstats_columns),
  col.names = unname(sumstats_columns)
)

head(sumstats)

### Optionally, filter for p-value

Computational cost increases steeply with the number of SNPs, so filtering the summary statistics can shorten the run considerably.

In [None]:
# sumstats_old <- sumstats

# Per-variant z-score, two-sided p-value and 1-df chi-squared statistic
z_score <- sumstats$beta / sumstats$beta_se
sumstats$p_value <- 2 * pnorm(-abs(z_score))
sumstats$chi_squared <- z_score^2

# Genomic inflation factor: observed median chi-squared divided by the
# median of a chi-squared distribution with 1 degree of freedom
median_obs <- median(sumstats$chi_squared, na.rm = TRUE)
expected_median <- qchisq(0.5, df = 1)
lambda_gc <- median_obs / expected_median

print(lambda_gc)

# Filter the dataset to keep only rows with desired characteristics
# sumstats <- sumstats[sumstats$p_value <= 0.25, ]
# Rows with a missing frequency are dropped explicitly: indexing with a
# logical vector that contains NA would insert all-NA rows instead.
keep_freq <- !is.na(sumstats$freq) & sumstats$freq >= 0.01
sumstats <- sumstats[keep_freq, ]



In [None]:
# Check the first few rows of the filtered dataset
sumstats <- subset(sumstats, select = -p_value)

head(sumstats)

In [None]:
nrow(sumstats)

### Enter cases and controls in n_eff

In [None]:
# Effective sample size of a case-control study,
#   n_eff = 4 / (1 / cases + 1 / controls).
# Raises an error unless both `cases` and `controls` are numeric.
calculate_effective_sample_size <- function(cases, controls) {
  inputs_are_numeric <- is.numeric(cases) && is.numeric(controls)
  if (!inputs_are_numeric) {
    stop("Both cases and controls must be numeric.")
  }
  4 / (1 / cases + 1 / controls)
}

# Case and control counts of the GWAS behind the summary statistics
# (edit these two values for your own summary statistics).
cases <- 31840
controls <- 468508

n_eff <- calculate_effective_sample_size(cases, controls)

# The same effective sample size is assigned to every variant.
sumstats$n_eff <- n_eff 

In [None]:
library(dplyr)  # Load the dplyr package for %>%

In [None]:
# Recode chromosome labels to integers: "chrX" becomes 23, "chrY" is dropped,
# and every other "chrN" label becomes N.
map <- map %>%
  mutate(chr = case_when(
    chr == "chrX" ~ "chr23",
    chr == "chrY" ~ NA_character_,  # mark 'chrY' as NA to filter it out below
    TRUE ~ chr  # Keep other chromosomes as-is
  ))

# Rows of `map` correspond one-to-one to columns of G, and snp_match() reports
# matches as row numbers of `map` (`_NUM_ID_`), which are later used as column
# indices of G. Dropping a row that is followed by a kept row would shift
# those indices and silently pair weights with the wrong variants, so refuse
# to continue in that case.
dropped_rows <- which(is.na(map$chr))
kept_rows <- which(!is.na(map$chr))
if (length(dropped_rows) > 0 && length(kept_rows) > 0 &&
    min(dropped_rows) < max(kept_rows)) {
  stop(
    "Rows to be removed from 'map' are not all at the end; removing them ",
    "would misalign 'map' with the columns of G.",
    call. = FALSE
  )
}

map <- map %>%
  filter(!is.na(chr)) %>%  # Remove rows with NA in 'chr'
  mutate(chr = as.integer(gsub("chr", "", chr)))

In [None]:
unique(map$chr)


### Match SNPs between both files

In [None]:
# Alternative kept for reference: also return the flip/reverse indicators and
# use `_REV_` to mirror the allele frequency of reversed variants.
#info_snp <- snp_match(sumstats, map, return_flip_and_rev = TRUE) %>% 
#  mutate(freq = ifelse(`_REV_`, 1 - freq, freq), 
#         `_REV_` = NULL, `_FLIP_`= NULL) %>% 
#  print()

# Match summary statistics to genetic map without flipping or frequency operations
# NOTE(review): return_flip_and_rev = FALSE only suppresses the indicator
# columns; `freq` is therefore left as read from the file. Whether alleles are
# still flipped/reversed internally depends on snp_match()'s other defaults --
# confirm in the bigsnpr documentation.
df_beta <- snp_match(sumstats, map, return_flip_and_rev = FALSE)

In [None]:
unique(df_beta$chr)

In [None]:
# df_beta = df_beta1

In [None]:
install.packages("R.utils")
library(R.utils)

In [None]:
# Drop rows without a chromosome from both tables.
# NOTE(review): `map` rows are used by position (via `_NUM_ID_`) further
# down; removing rows here is only safe if the removed rows come after every
# matched variant -- confirm.
map_chr_known <- !is.na(map$chr)
map <- map[map_chr_known, ]

beta_chr_known <- !is.na(df_beta$chr)
df_beta <- df_beta[beta_chr_known, ]

In [None]:
map_has_na <- any(is.na(map))
cat("NAs in map", map_has_na, "\n")
df_beta_has_na <- any(is.na(df_beta))
cat("NAs in df_beta", df_beta_has_na, "\n")

In [None]:
head(map)
head(sumstats)


In [None]:
dir.create("tmp-data", showWarnings = FALSE)


### Reconfigure environment for better deliberate parallelization

In [None]:
# Impute missing genotypes with method "mean2".
# The worker count is capped at NCORES (set earlier with nb_cores()) so the
# call does not request more cores than the machine provides; on machines with
# at least 8 usable cores this is the same as the previous fixed value of 8.
G <- snp_fastImputeSimple(G, method = "mean2", ncores = min(8, NCORES))

In [None]:
# df_beta1 <-df_beta

In [None]:
# Build the per-chromosome LD (correlation) matrices of the matched variants
# and stack them into one on-disk sparse matrix `corr`, accumulating the
# per-variant LD scores `ld` in the same order.
#
# The download of the tutorial's precomputed genetic positions was removed:
# it was never used here (positions come from snp_asGeneticPos() below) and
# its ordering belongs to the tutorial dataset, not to this one.

# Autosomal variants only; df_beta1 is the table used by the PRS steps below.
df_beta1 <- dplyr::filter(df_beta, chr %in% 1:22)

# Start empty so that the first chromosome that actually has matched variants
# initialises `ld` and `corr`, whichever chromosome that is.
ld <- NULL
corr <- NULL

for (chr in 1:22) {

  print(chr)

  ## indices in 'df_beta1'
  ind.chr <- which(df_beta1$chr == chr)
  if (length(ind.chr) == 0) {
    # Nothing to correlate on this chromosome; skipping keeps `corr`/`ld`
    # aligned with the rows of df_beta1.
    message("No matched variants on chromosome ", chr, "; skipping.")
    next
  }
  ## indices in 'G'
  ind.chr2 <- df_beta1$`_NUM_ID_`[ind.chr]

  # save_run() stores the result in the .rds file given by `file`
  # (NOTE(review): an existing file is presumably reused on re-runs, so delete
  # tmp-data/corr_chr*.rds after changing df_beta1 -- confirm in runonce docs).
  corr0 <- runonce::save_run({

    # genetic positions (in cM)
    POS2 <- snp_asGeneticPos(map$chr[ind.chr2], map$pos[ind.chr2], dir = "tmp-data")

    # compute the banded correlation matrix in sparse matrix format
    # (window `size` is expressed relative to the genetic positions in POS2)
    snp_cor(G, ind.col = ind.chr2, size = 3 / 1000, infos.pos = POS2,
            ncores = NCORES)

  }, file = paste0("tmp-data/corr_chr", chr, ".rds"))

  # transform to SFBM (on-disk format) on the fly
  if (is.null(corr)) {
    ld <- Matrix::colSums(corr0^2)
    corr <- as_SFBM(corr0, "tmp-data/corr", compact = TRUE)
  } else {
    ld <- c(ld, Matrix::colSums(corr0^2))
    corr$add_columns(corr0, nrow(corr))
  }
}

In [None]:
print(dim(corr))
print(nrow(df_beta1))

In [None]:
summary(ld)  
head(ld)     
length(ld)   

### Heritability Estimate

In [None]:
# LD score regression on the matched variants: LD scores `ld`, number of
# variants length(ld), chi-squared statistics (beta / beta_se)^2 and the
# per-variant effective sample size n_eff. blocks = NULL is passed explicitly
# (NOTE(review): presumably this skips the block-jackknife standard errors --
# confirm). The outer parentheses print the result.
(ldsc <- with(df_beta1, snp_ldsc(ld, length(ld), chi2 = (beta / beta_se)^2,
                                 sample_size = n_eff, blocks = NULL)))

In [None]:
ldsc_h2_est <- ldsc[["h2"]]

In [None]:
colnames(obj.bigsnp$fam)

In [None]:
# Recode the .fam columns to 0/1: affection 2 -> 1 (case), 1 -> 0 (control);
# sex 2 -> 1, anything else -> 0.
#
# Guard: only the codes 1 and 2 are expected for affection. Any other value
# means either a missing phenotype (exported as -9 by the Hail cell) or that
# this cell has already been run; in both cases the plain recode would
# silently turn those samples into controls.
affection_code <- obj.bigsnp$fam$affection
if (!all(affection_code %in% c(1, 2))) {
  stop(
    "Unexpected values in fam$affection (expected only 1 = control, 2 = case). ",
    "Remove samples with missing phenotype, or re-attach the data if this ",
    "cell was already run.",
    call. = FALSE
  )
}
obj.bigsnp$fam$affection <- ifelse(affection_code == 2, 1, 0)

# NOTE(review): a sex code other than 1 or 2 (e.g. unknown sex) ends up as 0
# here, i.e. in the same group as code 1 -- confirm this is intended.
obj.bigsnp$fam$sex <- ifelse(obj.bigsnp$fam$sex == 2, 1, 0)

In [None]:
unique(obj.bigsnp$fam$sex)       
unique(obj.bigsnp$fam$affection)      

### Run PRS

In [None]:
# LDpred2-inf: effect sizes from the LD matrix, the matched summary
# statistics and the heritability estimate
matched_cols <- df_beta1[["_NUM_ID_"]]
beta_inf <- snp_ldpred2_inf(corr, df_beta1, ldsc_h2_est)

# Score every individual with those effect sizes, then report the AUC
# against case/control status
pred_inf <- big_prodVec(G, beta_inf, ind.col = matched_cols)
AUCBoot(pred_inf, obj.bigsnp$fam$affection)

In [None]:
# NOTE(review): this replaces the LDSC heritability estimate computed above
# with a fixed value of 0.05 for the grid and auto models below. Remove this
# line to use the estimated value, or set it to the heritability you expect.
ldsc_h2_est <- 0.05

In [None]:
# Hyper-parameter grid for LDpred2-grid: 16 log-spaced values of p between
# 1e-4 and 0.5 (rounded to 2 significant digits), a single h2 value, and
# sparse = TRUE. The outer parentheses print the grid.
p_grid <- signif(seq_log(1e-4, 0.5, length.out = 16), 2)
h2_grid <- round(ldsc_h2_est, 4)
(params <- expand.grid(p = p_grid, h2 = h2_grid, sparse = TRUE))

In [None]:
beta_grid <- snp_ldpred2_grid(corr, df_beta1, params, ncores = NCORES)
params$sparsity <- colMeans(beta_grid == 0)

In [None]:
library(ggplot2)

In [None]:
# One score per individual (rows) for every grid model (columns)
pred_grid <- big_prodMat(G, beta_grid,
                         ncores = NCORES,
                         ind.col = df_beta1[["_NUM_ID_"]])

# Third coefficient-table entry for the score `x` in a logistic regression
# of affection on x + sex; NA when the model produced no usable score.
glm_z_score <- function(x) {
  if (all(is.na(x))) return(NA)  # models that diverged substantially
  fit <- glm(affection ~ x + sex, data = obj.bigsnp$fam, family = "binomial")
  summary(fit)$coefficients["x", 3]
}
params$score <- apply(pred_grid, 2, glm_z_score)

# Score of each grid model against p, one line per h2, faceted by `sparse`
ggplot(params, aes(x = p, y = score, color = as.factor(h2))) +
  theme_bigstatsr() +
  geom_point() +
  geom_line() +
  scale_x_log10(breaks = 10^(-5:0), minor_breaks = params$p) +
  facet_wrap(~ sparse, labeller = label_both) +
  labs(y = "GLM Z-Score", color = "h2") +
  theme(legend.position = "top", panel.spacing = unit(1, "lines"))

In [None]:
library(dplyr)
# Effect sizes of the grid model with the highest score: order the scores
# in decreasing order (missing scores last, ties in original order) and take
# the matching column of beta_grid.
best_grid_id <- head(order(-params$score), 1)
best_beta_grid <- beta_grid[, best_grid_id]

In [None]:
# LDpred2-auto
# Runs one chain per initial value of p: 30 log-spaced values between 1e-4
# and 0.2, all starting from h2_init = ldsc_h2_est. allow_jump_sign = FALSE
# and shrink_corr = 0.95 are passed explicitly (NOTE(review): presumably
# chosen for more robust convergence -- confirm in the bigsnpr docs).
# The result holds one element per chain.
multi_auto <- snp_ldpred2_auto(
  corr, df_beta1, h2_init = ldsc_h2_est,
  vec_p_init = seq_log(1e-4, 0.2, 30),        
  allow_jump_sign = FALSE,
  shrink_corr = 0.95,
  ncores = NCORES)

In [None]:
# `range` should be between 0 and 2
# Spread (max - min) of corr_est for every chain. vapply() guarantees a plain
# numeric vector with one value per chain, whatever the chains contain.
(range <- vapply(
  multi_auto,
  function(auto) diff(range(auto$corr_est)),
  numeric(1)
))

In [None]:
# Indices of the chains to keep: those whose range exceeds 95% of the 95th
# percentile of all ranges. na.rm = TRUE and which() make chains with a
# missing range (e.g. diverged chains) drop out instead of breaking
# quantile() or putting NA into the selection.
(keep <- which(range > (0.95 * quantile(range, 0.95, na.rm = TRUE))))

In [None]:
# Final LDpred2-auto effect sizes: per-variant average of beta_est over the
# retained chains (one column per chain, then row means).
kept_chains <- multi_auto[keep]
chain_betas <- sapply(kept_chains, function(auto) auto$beta_est)
final_beta_auto <- rowMeans(chain_betas)

In [None]:
# Score every individual with the averaged LDpred2-auto effect sizes and
# report the AUC against case/control status.
auto_cols <- df_beta1[["_NUM_ID_"]]
final_pred_auto <- big_prodVec(
  G, final_beta_auto,
  ind.col = auto_cols,
  ncores = NCORES
)
AUCBoot(final_pred_auto, obj.bigsnp$fam$affection)

In [None]:
str(final_beta_auto)

In [None]:
library(ggplot2)

# Raw LDpred2-auto score and case/control label for every individual
prs_data_unstd <- data.frame(
  PRS = final_pred_auto,
  Status = ifelse(obj.bigsnp$fam$affection == 1, "Cases", "Controls")  # 1 for cases, 0 for controls
)

# Standardize PRS (as.numeric() turns scale()'s one-column matrix into a
# plain vector column)
prs_data <- prs_data_unstd %>%
  mutate(Standardized_PRS = as.numeric(scale(PRS)))

mean_cases <- mean(prs_data$Standardized_PRS[prs_data$Status == "Cases"])
mean_controls <- mean(prs_data$Standardized_PRS[prs_data$Status == "Controls"])

# P-value shown on the plot: Welch two-sample t-test of the standardized PRS
# between cases and controls. It is computed from the data instead of being
# written as a fixed "P < 0.05" label.
prs_p_value <- t.test(Standardized_PRS ~ Status, data = prs_data)$p.value
prs_p_label <- if (prs_p_value < 0.001) {
  "P < 0.001"
} else {
  paste("P =", signif(prs_p_value, 2))
}

# Plot: density per group, dashed vertical lines at the group means
p <- ggplot(prs_data, aes(x = Standardized_PRS, fill = Status)) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = c("Cases" = "blue", "Controls" = "yellow")) +
  labs(
    title = "Distribution of Polygenic Risk Scores for Cases and Controls",
    x = "Standardized PRS",
    y = "Density"
  ) +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    text = element_text(size = 12),
    plot.background = element_rect(fill = "white", color = NA),  # Set background to white
    panel.background = element_rect(fill = "white", color = NA) # Set panel background to white
  ) +
  geom_vline(xintercept = mean_cases, color = "blue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = mean_controls, color = "yellow", linetype = "dashed", size = 1) +
  annotate("text", x = 2, y = 0.4, label = prs_p_label, size = 5, color = "black", hjust = 1.1)

print(p)
ggsave("prs_density_plot.png", plot = p, width = 8, height = 6, dpi = 300)

Examine a portion of data

In [None]:
# filtered_snps_chr6 <- subset(
#   used_snps,
#   chr == 6 & pos >= 56000000 & pos <= 57000000
# )

# print(filtered_snps_chr6)

OR per SD Increase

In [None]:
# Odds ratio per one-standard-deviation increase of the LDpred2-auto score.

#Standardize PRS by SD
# (divides by the SD only; the score is not centred)
prs_std <- final_pred_auto / sd(final_pred_auto)
#Run Regression
# Logistic regression of case/control status on the scaled score alone
model <- glm(obj.bigsnp$fam$affection ~ prs_std, family = "binomial")
summary(model)
#Compute OR per SD increase
# exp() of the log-odds coefficient gives the odds ratio
or_per_sd <- exp(coef(model)["prs_std"])
or_per_sd
#Compute CI for OR
# confint() bounds of the coefficient, exponentiated onto the OR scale
ci_per_sd <- exp(confint(model)["prs_std", ])
ci_per_sd

Save files

In [None]:
# Write the unstandardized scores with their case/control label to a local
# tab-separated file (header row, no row names, no quoting) ...
write.table(
  prs_data_unstd,
  file = "prs_data.tsv",
  quote = FALSE,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)

# ... and copy that file to the data folder of the workspace bucket.
bucket <- Sys.getenv("WORKSPACE_BUCKET")
meta_save_path <- paste(bucket, "data", "prs_data.tsv", sep = "/")
system(paste("gsutil cp prs_data.tsv", meta_save_path))

In [None]:
# Table of the variants that carry a non-zero LDpred2-auto weight.

# Matched variants (rows with a `_NUM_ID_`) plus their final weight
has_num_id <- !is.na(df_beta1[["_NUM_ID_"]])
valid_df_beta <- df_beta1[has_num_id, ]
valid_df_beta[["final_beta"]] <- final_beta_auto

# Keep the identifying columns and the weight, drop zero weights, then drop
# any row that still contains a missing value
weight_columns <- c("chr", "pos", "a0", "a1", "final_beta")
used_snps <- valid_df_beta[, weight_columns]
nonzero_weight <- used_snps[["final_beta"]] != 0
used_snps <- na.omit(used_snps[nonzero_weight, ])
nrow(used_snps)

In [None]:
head(used_snps)

In [None]:
# Write the weight table as a tab-separated file and compress it.
# `-f` lets gzip replace an existing .gz from an earlier run; without it gzip
# refuses to overwrite and the old archive would be kept.
write.table(used_snps, file = "used_snps_with_weights.txt", sep = "\t", row.names = FALSE, quote = FALSE)
system("gzip -9 -f used_snps_with_weights.txt")

## Lassosum2

In [None]:
# lassosum2 effect sizes for its default hyper-parameter grid, from the same
# LD matrix and matched summary statistics as LDpred2.
beta_lassosum2 <- snp_lassosum2(corr, df_beta1, ncores = NCORES)

In [None]:
# One score per individual (rows) for every lassosum2 model (columns)
pred_grid2 <- big_prodMat(G, beta_lassosum2,
                          ncores = NCORES,
                          ind.col = df_beta1[["_NUM_ID_"]])

# Hyper-parameters of each model, stored as an attribute of the effect sizes
params2 <- attr(beta_lassosum2, "grid_param")

# Third coefficient-table entry for the score `x` in a logistic regression
# of affection on x + sex; NA when the model produced no usable score.
lassosum2_z_score <- function(x) {
  if (all(is.na(x))) return(NA)  # models that diverged substantially
  fit <- glm(affection ~ x + sex, data = obj.bigsnp$fam, family = "binomial")
  summary(fit)$coefficients["x", 3]
}
params2$score <- apply(pred_grid2, 2, lassosum2_z_score)

# Score of each model against lambda, one line per delta
ggplot(params2, aes(x = lambda, y = score, color = as.factor(delta))) +
  theme_bigstatsr() +
  geom_point() +
  geom_line() +
  scale_x_log10(breaks = 10^(-5:0)) +
  labs(y = "GLM Z-Score", color = "delta")

In [None]:
# Effect sizes of the lassosum2 model with the highest score: order the
# scores in decreasing order (missing scores last, ties in original order)
# and take the matching column of beta_lassosum2.
best_lassosum2_id <- head(order(-params2$score), 1)
best_grid_lassosum2 <- beta_lassosum2[, best_lassosum2_id]

In [None]:
# Score every individual with the best lassosum2 effect sizes and report the
# AUC against case/control status.
lassosum2_cols <- df_beta1[["_NUM_ID_"]]
best_pred_lassosum2 <- big_prodVec(G, best_grid_lassosum2,
                                   ind.col = lassosum2_cols,
                                   ncores = NCORES)

AUCBoot(best_pred_lassosum2, obj.bigsnp$fam$affection)

In [None]:
# Raw lassosum2 score and case/control label for every individual
prs_data_unstd <- data.frame(
  PRS = best_pred_lassosum2,
  Status = ifelse(obj.bigsnp$fam$affection == 1, "Cases", "Controls")  # 1 for cases, 0 for controls
)

# Standardize PRS (as.numeric() turns scale()'s one-column matrix into a
# plain vector column)
prs_data <- prs_data_unstd %>%
  mutate(Standardized_PRS = as.numeric(scale(PRS)))

mean_cases <- mean(prs_data$Standardized_PRS[prs_data$Status == "Cases"])
mean_controls <- mean(prs_data$Standardized_PRS[prs_data$Status == "Controls"])

# P-value shown on the plot: Welch two-sample t-test of the standardized PRS
# between cases and controls. It is computed from the data instead of being
# written as a fixed "P < 0.05" label.
prs_p_value <- t.test(Standardized_PRS ~ Status, data = prs_data)$p.value
prs_p_label <- if (prs_p_value < 0.001) {
  "P < 0.001"
} else {
  paste("P =", signif(prs_p_value, 2))
}

# Plot: density per group, dashed vertical lines at the group means
p <- ggplot(prs_data, aes(x = Standardized_PRS, fill = Status)) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = c("Cases" = "blue", "Controls" = "yellow")) +
  labs(
    title = "Distribution of Polygenic Risk Scores for Cases and Controls",
    x = "Standardized PRS",
    y = "Density"
  ) +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    text = element_text(size = 12),
    plot.background = element_rect(fill = "white", color = NA),  # Set background to white
    panel.background = element_rect(fill = "white", color = NA) # Set panel background to white
  ) +
  geom_vline(xintercept = mean_cases, color = "blue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = mean_controls, color = "yellow", linetype = "dashed", size = 1) +
  annotate("text", x = 2, y = 0.4, label = prs_p_label, size = 5, color = "black", hjust = 1.1)

print(p)
# Saved under its own name (matching the other *_lasso outputs) so that it
# does not overwrite the LDpred2-auto figure prs_density_plot.png.
ggsave("prs_density_plot_lasso.png", plot = p, width = 8, height = 6, dpi = 300)

In [None]:
# Odds ratio per one-standard-deviation increase of the lassosum2 score.

#Standardize PRS by SD
# (divides by the SD only; the score is not centred)
prs_std <- best_pred_lassosum2 / sd(best_pred_lassosum2)
#Run Regression
# Logistic regression of case/control status on the scaled score alone
model <- glm(obj.bigsnp$fam$affection ~ prs_std, family = "binomial")
summary(model)
#Compute OR per SD increase
# exp() of the log-odds coefficient gives the odds ratio
or_per_sd <- exp(coef(model)["prs_std"])
or_per_sd
#Compute CI for OR
# confint() bounds of the coefficient, exponentiated onto the OR scale
ci_per_sd <- exp(confint(model)["prs_std", ])
ci_per_sd

In [None]:
# Write the unstandardized lassosum2 scores with their case/control label to
# a local tab-separated file (header row, no row names, no quoting) ...
write.table(
  prs_data_unstd,
  file = "prs_data_lasso.tsv",
  quote = FALSE,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)

# ... and copy that file to the data folder of the workspace bucket.
bucket <- Sys.getenv("WORKSPACE_BUCKET")
meta_save_path <- paste(bucket, "data", "prs_data_lasso.tsv", sep = "/")
system(paste("gsutil cp prs_data_lasso.tsv", meta_save_path))

In [None]:
# Table of the variants that carry a non-zero lassosum2 weight.

# Matched variants (rows with a `_NUM_ID_`) plus their final weight
has_num_id <- !is.na(df_beta1[["_NUM_ID_"]])
valid_df_beta <- df_beta1[has_num_id, ]
valid_df_beta[["final_beta"]] <- best_grid_lassosum2

# Keep the identifying columns and the weight, drop zero weights, then drop
# any row that still contains a missing value
weight_columns <- c("chr", "pos", "a0", "a1", "final_beta")
used_snps <- valid_df_beta[, weight_columns]
nonzero_weight <- used_snps[["final_beta"]] != 0
used_snps <- na.omit(used_snps[nonzero_weight, ])
nrow(used_snps)

In [None]:
head(used_snps)

In [None]:
# Write the lassosum2 weight table as a tab-separated file and compress it.
# `-f` lets gzip replace an existing .gz from an earlier run; without it gzip
# refuses to overwrite and the old archive would be kept.
write.table(used_snps, file = "used_snps_with_weights_lasso.txt", sep = "\t", row.names = FALSE, quote = FALSE)
system("gzip -9 -f used_snps_with_weights_lasso.txt")