# Intro

**Author:** Stephan Cordogan

This notebook performs a logistic regression across All of Us genomic data for each of 5 ancestry sub-populations, using principal components calculated in notebooks 1.11 and 1.12.  The resulting summary statistics are saved to the workspace bucket.  For ease of computation (this can cost hundreds or thousands of dollars), the GWAS can be split up by chromosomes, run over each set of chromosomes, and recombined in the next notebook. This works better than splitting up only by ancestry, as memory requirements are drastically higher for the ancestries with larger populations.  Additionally, the overall memory requirement is lower for the same volume of data when split by ancestry because each ancestry is iteratively saved to a file.  **Simply specify the desired test_intervals** [(a)](#Split-up-by-chromosomes-if-desired) **and change the numerical suffix in the save path; you may also change the overall and ancestry-specific minor allele frequency thresholds** [(b)](#Run-GWASes).  I recommend running chromosomes in these clusters: ["1", "2", "3", "45", "67", "89", "101112", "131415", "161718", "19202122"].

# Set up GWASes

## Set up Notebook

Import Necessary Packages

In [None]:
from datetime import datetime
import os
import pandas as pd
import hail as hl


In [None]:
# Wall-clock time at which this notebook run started.
start = datetime.now()

# Workspace bucket URI from the environment; it prefixes every input and
# output path used below.
bucket = os.getenv('WORKSPACE_BUCKET')
# A bare expression in the middle of a cell is not displayed, so print it.
print(bucket)

# hl.init must actually be called; a bare `hl.init` only names the function.
# idempotent=True lets this cell be re-run without raising if Hail is
# already initialized.
hl.init(idempotent=True)
hl.default_reference("GRCh38")


In [None]:
# Shell escape: list what is stored under the workspace bucket's data/ prefix.
!gsutil ls $WORKSPACE_BUCKET/data


## Load Hail MatrixTable containing the variants

In [None]:
# Location of the Hail MatrixTable to analyse, read from the environment.
# Other datasets can be selected by swapping in one of these variables:
#   WGS_CLINVAR_SPLIT_HAIL_PATH
#   WGS_ACAF_THRESHOLD_MULTI_HAIL_PATH
mt_path = os.environ.get("WGS_ACAF_THRESHOLD_SPLIT_HAIL_PATH")
mt_path

### Split up by chromosomes if desired

In [None]:
# Open the MatrixTable stored at mt_path; every later cell refines `mt`.
mt = hl.read_matrix_table(mt_path)
# Uncomment to print the schema:
# mt.describe()

In [None]:
# Genomic regions included in this run. Alternatives that can be swapped in:
# test_intervals = ['chr6:54000000-57000000']
# test_intervals = ['chr3','chr4','chr5','chr6', 'chr7', 'chr8']
# test_intervals = ['chr6']
# test_intervals = ['chr22']
test_intervals = ['chr19', 'chr20', 'chr21', 'chr22']

# Parse each interval string, then keep only the rows that fall inside them.
locus_intervals = list(map(hl.parse_locus_interval, test_intervals))
mt = hl.filter_intervals(mt, locus_intervals)

# Same restriction for a separate quality MatrixTable, if one is in use:
# mt_qual = hl.filter_intervals(
#     mt_qual,
#     [hl.parse_locus_interval(x,)
#      for x in test_intervals])
# mt.count()

In [None]:
# mt.count()

## Load phenotypic data and link with genomic data

In [None]:
# The phenotype table is stored in the workspace bucket. person_id is read as
# a string and used as the key; all other column types are imputed.
phenotype_filename = f'{bucket}/data/genomics_phenotypes.tsv'
phenotypes = hl.import_table(
    phenotype_filename,
    key='person_id',
    impute=True,
    types={'person_id': hl.tstr},
)

# Keep only samples that have a phenotype row, then attach that row to each
# sample under the `pheno` column field.
mt = mt.semi_join_cols(phenotypes)
mt = mt.annotate_cols(pheno=phenotypes[mt.s])

## Pre-process the genomic data

Remove related samples

In [None]:
# Table of samples flagged for relatedness, keyed by sample_id (read as a
# string).
related_samples_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv"
related_remove = hl.import_table(
    related_samples_path,
    key="sample_id",
    types={"sample_id": "tstr"},
)

# Drop every sample whose key appears in the flagged table.
# related_remove.count()
mt = mt.anti_join_cols(related_remove)
# mt.count()

Link predicted ancestry for filtering

In [None]:
# Ancestry predictions, keyed by research_id. research_id is read as a string
# and pca_features as an array of floats; remaining columns are imputed.
ancestry_pred_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv"
ancestry_pred = hl.import_table(
    ancestry_pred_path,
    types={
        "research_id": "tstr",
        "pca_features": hl.tarray(hl.tfloat),
    },
    impute=True,
    key="research_id",
)

# Attach each sample's prediction row under the `ancestry_pred` column field.
mt = mt.annotate_cols(ancestry_pred=ancestry_pred[mt.s])

Optionally generate summary statistics of each group

In [None]:
# col_table = mt.cols()
# result_table = (
#     col_table
#     .group_by(col_table.ancestry_pred.ancestry_pred)  # Group by ancestry prediction
#     .aggregate(
#         cases=hl.agg.sum(col_table.pheno.has_pheno == 1),  # Count of cases
#         controls=hl.agg.sum(col_table.pheno.has_pheno == 0),  # Count of controls
#         num_males=hl.agg.sum(col_table.pheno.is_male == 1),  # Count of males
#         num_females=hl.agg.sum(col_table.pheno.is_female),  # Count of females
#         mean_age=hl.agg.mean(col_table.pheno.age_yrs)  # Mean age
#     )
# )

# result_table.show()

# Run GWASes

In [None]:
# Suffix identifying the chromosome set of this run. It is embedded in every
# output file name, so change it whenever test_intervals changes.
chromosome_suffix = "19202122"

# Principal components used as covariates: PC1 .. PC10.
n_pcs = 10
pc_names = [f'PC{i}' for i in range(1, n_pcs + 1)]

# Keep only samples whose predicted ancestry is one of the analysed groups.
used_ancestries = hl.literal({"eur", "afr", "amr", "eas", "sas"})
mt = mt.filter_cols(used_ancestries.contains(mt.ancestry_pred.ancestry_pred))

# OVERALL MINOR ALLELE FREQUENCY THRESHOLD
mt = mt.filter_rows(hl.min(mt.info.AF) > 0.001, keep=True)

# PCA score file for each ancestry; the keys also drive the loop below.
pca_files = {
    "eur": f'{bucket}/data/mt_eur_pcs.tsv.bgz',
    "afr": f'{bucket}/data/mt_afr_pcs.tsv.bgz',
    "amr": f'{bucket}/data/mt_amr_pcs.tsv.bgz',
    "eas": f'{bucket}/data/mt_eas_pcs.tsv.bgz',
    "sas": f'{bucket}/data/mt_sas_pcs.tsv.bgz',
}

# POPULATION SPECIFIC MINOR ALLELE FREQUENCY THRESHOLD
allele_freq_thresholds = {
    "eur": 0.001,
    "afr": 0.005,
    "amr": 0.005,
    "eas": 0.01,
    "sas": 0.01,
}

# Run one logistic regression per ancestry and export each result as soon as
# it is computed.
for ancestry, pca_file in pca_files.items():
    # Restrict to this ancestry and recompute allele statistics within it;
    # this overwrites the `info` row field with the call_stats result.
    mt_ancestry = mt.filter_cols(mt.ancestry_pred.ancestry_pred == ancestry)
    mt_ancestry = mt_ancestry.annotate_rows(
        info=hl.agg.call_stats(mt_ancestry.GT, mt_ancestry.alleles)
    )

    # Apply the ancestry-specific frequency threshold.
    min_af = allele_freq_thresholds[ancestry]
    mt_ancestry = mt_ancestry.filter_rows(
        hl.min(mt_ancestry.info.AF) > min_af, keep=True
    )

    # Import the PCA scores; `s` is converted to a string so it can be used
    # as the key when joining against the sample IDs.
    mt_ancestry_pcs = hl.import_table(pca_file, impute=True)
    mt_ancestry_pcs = mt_ancestry_pcs.annotate(s=hl.str(mt_ancestry_pcs.s))
    mt_ancestry_pcs = mt_ancestry_pcs.key_by('s')

    # Index the PC table once per sample and reuse that row for every PC,
    # instead of building a separate join expression for each component.
    pcs_row = mt_ancestry_pcs[mt_ancestry.s]
    mt_ancestry = mt_ancestry.annotate_cols(
        pca_scores=hl.struct(**{name: pcs_row[name] for name in pc_names})
    )

    # Covariates: intercept, sex, age, then the principal components.
    covariates = [
        1.0,
        mt_ancestry.pheno.is_male,
        mt_ancestry.pheno.age_yrs,
    ] + [mt_ancestry.pca_scores[name] for name in pc_names]

    # Perform logistic regression (Wald test) of the phenotype on the number
    # of alternate alleles.
    log_reg = hl.logistic_regression_rows(
        test='wald',
        y=mt_ancestry.pheno.has_pheno,
        x=mt_ancestry.GT.n_alt_alleles(),
        covariates=covariates,
    )

    # Flatten nested fields and export the result table.
    log_reg_flat = log_reg.flatten()
    log_reg_save_path = f'{bucket}/data/log_reg_{ancestry}_{chromosome_suffix}.tsv.bgz'
    log_reg_flat.export(log_reg_save_path)

    print(f"Logistic regression for {ancestry} completed and saved to {log_reg_save_path}")
