## INTRO

**Author:** Stephan Cordogan

This notebook projects previously computed PC loadings onto the samples defined in notebook 1.11 to obtain their principal component (PC) scores. This is useful if you do not wish to recompute the PCs from scratch after All of Us has released more WGS samples.

## Import Necessary Packages

In [None]:
from datetime import datetime
import os
import pandas as pd
import hail as hl

In [None]:
# Timestamp taken at the start of the run.
start = datetime.now()

# Workspace bucket prefix that every input/output path in this notebook is built from.
bucket = os.getenv('WORKSPACE_BUCKET')
if not bucket:
    # os.getenv returns None when the variable is unset, which would otherwise
    # turn every f'{bucket}/data/...' path into 'None/data/...'.
    raise EnvironmentError(
        "WORKSPACE_BUCKET is not set; cannot locate the notebook's input and output files."
    )

# Start Hail with GRCh38 as the default reference genome.
hl.init(default_reference="GRCh38")

# Kept as the last expression so the resolved bucket is actually displayed by the cell.
bucket

## Project PC loadings onto samples in genotype matrix tables created in 1.11

In [None]:
# EUR: read the stored loadings table and key it by variant (locus, alleles).
# Its `loadings` and `af` fields are what gets passed to the projection below.
loadings_ht_eur = hl.read_table(
    f'{bucket}/data/mt_eur_loadings_with_af.ht'
).key_by('locus', 'alleles')

# EUR: read the filtered matrix table; rows keyed by variant, columns keyed by `s`.
mt_eur = (
    hl.read_matrix_table(f'{bucket}/data/mt_eur_filtered.mt')
    .key_rows_by('locus', 'alleles')
    .key_cols_by('s')
)

# Project the genotypes onto the loadings; the result is looked up per sample via `s`.
projected_scores = hl.experimental.pc_project(
    mt_eur.GT,
    loadings_ht_eur.loadings,
    loadings_ht_eur.af,
)

# Attach the projected score array to every column (sample).
mt_eur = mt_eur.annotate_cols(pca_scores=projected_scores[mt_eur.s].scores)

# Unpack the first ten entries of the score array into fields PC1 ... PC10.
mt_eur = mt_eur.annotate_cols(
    **{f'PC{idx}': mt_eur.pca_scores[idx - 1] for idx in range(1, 11)}
)

# Keep only PC1 ... PC10 (plus the column key) and export them.
mt_eur_pcs_save_path = f'{bucket}/data/mt_eur_pcs.tsv.bgz'
eur_pcs_table = mt_eur.cols().select(*(f'PC{idx}' for idx in range(1, 11)))
eur_pcs_table.export(mt_eur_pcs_save_path)

In [None]:
# AFR: read the stored loadings table and key it by variant (locus, alleles).
# Its `loadings` and `af` fields are what gets passed to the projection below.
loadings_ht_afr = hl.read_table(
    f'{bucket}/data/mt_afr_loadings_with_af.ht'
).key_by('locus', 'alleles')

# AFR: read the filtered matrix table; rows keyed by variant, columns keyed by `s`.
mt_afr = (
    hl.read_matrix_table(f'{bucket}/data/mt_afr_filtered.mt')
    .key_rows_by('locus', 'alleles')
    .key_cols_by('s')
)

# Project the genotypes onto the loadings; the result is looked up per sample via `s`.
projected_scores = hl.experimental.pc_project(
    mt_afr.GT,
    loadings_ht_afr.loadings,
    loadings_ht_afr.af,
)

# Attach the projected score array to every column (sample).
mt_afr = mt_afr.annotate_cols(pca_scores=projected_scores[mt_afr.s].scores)

# Unpack the first ten entries of the score array into fields PC1 ... PC10.
mt_afr = mt_afr.annotate_cols(
    **{f'PC{idx}': mt_afr.pca_scores[idx - 1] for idx in range(1, 11)}
)

# Keep only PC1 ... PC10 (plus the column key) and export them.
mt_afr_pcs_save_path = f'{bucket}/data/mt_afr_pcs.tsv.bgz'
afr_pcs_table = mt_afr.cols().select(*(f'PC{idx}' for idx in range(1, 11)))
afr_pcs_table.export(mt_afr_pcs_save_path)


In [None]:
# AMR: read the stored loadings table and key it by variant (locus, alleles).
# Its `loadings` and `af` fields are what gets passed to the projection below.
loadings_ht_amr = hl.read_table(
    f'{bucket}/data/mt_amr_loadings_with_af.ht'
).key_by('locus', 'alleles')

# AMR: read the filtered matrix table; rows keyed by variant, columns keyed by `s`.
mt_amr = (
    hl.read_matrix_table(f'{bucket}/data/mt_amr_filtered.mt')
    .key_rows_by('locus', 'alleles')
    .key_cols_by('s')
)

# Project the genotypes onto the loadings; the result is looked up per sample via `s`.
projected_scores = hl.experimental.pc_project(
    mt_amr.GT,
    loadings_ht_amr.loadings,
    loadings_ht_amr.af,
)

# Attach the projected score array to every column (sample).
mt_amr = mt_amr.annotate_cols(pca_scores=projected_scores[mt_amr.s].scores)

# Unpack the first ten entries of the score array into fields PC1 ... PC10.
mt_amr = mt_amr.annotate_cols(
    **{f'PC{idx}': mt_amr.pca_scores[idx - 1] for idx in range(1, 11)}
)

# Keep only PC1 ... PC10 (plus the column key) and export them.
mt_amr_pcs_save_path = f'{bucket}/data/mt_amr_pcs.tsv.bgz'
amr_pcs_table = mt_amr.cols().select(*(f'PC{idx}' for idx in range(1, 11)))
amr_pcs_table.export(mt_amr_pcs_save_path)


In [None]:
# EAS: read the stored loadings table and key it by variant (locus, alleles).
# Its `loadings` and `af` fields are what gets passed to the projection below.
loadings_ht_eas = hl.read_table(
    f'{bucket}/data/mt_eas_loadings_with_af.ht'
).key_by('locus', 'alleles')

# EAS: read the filtered matrix table; rows keyed by variant, columns keyed by `s`.
mt_eas = (
    hl.read_matrix_table(f'{bucket}/data/mt_eas_filtered.mt')
    .key_rows_by('locus', 'alleles')
    .key_cols_by('s')
)

# Project the genotypes onto the loadings; the result is looked up per sample via `s`.
projected_scores = hl.experimental.pc_project(
    mt_eas.GT,
    loadings_ht_eas.loadings,
    loadings_ht_eas.af,
)

# Attach the projected score array to every column (sample).
mt_eas = mt_eas.annotate_cols(pca_scores=projected_scores[mt_eas.s].scores)

# Unpack the first ten entries of the score array into fields PC1 ... PC10.
mt_eas = mt_eas.annotate_cols(
    **{f'PC{idx}': mt_eas.pca_scores[idx - 1] for idx in range(1, 11)}
)

# Keep only PC1 ... PC10 (plus the column key) and export them.
mt_eas_pcs_save_path = f'{bucket}/data/mt_eas_pcs.tsv.bgz'
eas_pcs_table = mt_eas.cols().select(*(f'PC{idx}' for idx in range(1, 11)))
eas_pcs_table.export(mt_eas_pcs_save_path)


In [None]:
# SAS: read the stored loadings table and key it by variant (locus, alleles).
# Its `loadings` and `af` fields are what gets passed to the projection below.
loadings_ht_sas = hl.read_table(
    f'{bucket}/data/mt_sas_loadings_with_af.ht'
).key_by('locus', 'alleles')

# SAS: read the filtered matrix table; rows keyed by variant, columns keyed by `s`.
mt_sas = (
    hl.read_matrix_table(f'{bucket}/data/mt_sas_filtered.mt')
    .key_rows_by('locus', 'alleles')
    .key_cols_by('s')
)

# Project the genotypes onto the loadings; the result is looked up per sample via `s`.
projected_scores = hl.experimental.pc_project(
    mt_sas.GT,
    loadings_ht_sas.loadings,
    loadings_ht_sas.af,
)

# Attach the projected score array to every column (sample).
mt_sas = mt_sas.annotate_cols(pca_scores=projected_scores[mt_sas.s].scores)

# Unpack the first ten entries of the score array into fields PC1 ... PC10.
mt_sas = mt_sas.annotate_cols(
    **{f'PC{idx}': mt_sas.pca_scores[idx - 1] for idx in range(1, 11)}
)

# Keep only PC1 ... PC10 (plus the column key) and export them.
mt_sas_pcs_save_path = f'{bucket}/data/mt_sas_pcs.tsv.bgz'
sas_pcs_table = mt_sas.cols().select(*(f'PC{idx}' for idx in range(1, 11)))
sas_pcs_table.export(mt_sas_pcs_save_path)
