In [2]:
import hail as hl
import pandas as pd

In [3]:
# Start Hail, using a GCS bucket for temporary/scratch files.
hl.init(tmp_dir="gs://wes-bipolar-tmp-4day/")
# Passing `default_reference` to hl.init is deprecated (Hail emits a warning);
# the default reference genome is set after initialization instead.
hl.default_reference('GRCh38')


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SPARKMONITOR_LISTENER: Started SparkListener for Jupyter Notebook
SPARKMONITOR_LISTENER: Port obtained from environment: 37731
SPARKMONITOR_LISTENER: Application Started: application_1718122941138_0004 ...Start Time: 1718129946799


Running on Apache Spark version 3.3.2
SparkUI available at http://rye-m.c.wes-bipolar.internal:42135
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.128-eead8100a1c1
LOGGING: writing to /home/hail/hail-20240611-1819-0.2.128-eead8100a1c1.log


In [4]:
# Sample manifest, loaded in full (before any sample filtering).
# The table is keyed by the sample ID column `s`; column types are imputed.
MANIFEST = 'gs://2024-wgspd/files/20240523_WGSPD_final-qcd-manifest.tsv'
manifest = hl.import_table(
    MANIFEST,
    key="s",
    impute=True,
    delimiter='\t',
)
# Print the schema, then display the number of rows in the manifest.
manifest.describe()
manifest.count()

2024-06-11 18:19:43.488 Hail: INFO: Reading table to impute column types 1) / 1]
2024-06-11 18:19:45.301 Hail: INFO: Finished type imputation
  Loading field 's' as type str (imputed)
  Loading field 'sex_new' as type str (imputed)
  Loading field 'sex_old' as type str (imputed)
  Loading field 'SEX' as type str (imputed)
  Loading field 'primary_disease_new' as type str (imputed)
  Loading field 'primary_disease_new_fixed' as type str (imputed)
  Loading field 'primary_disease_old' as type str (imputed)
  Loading field 'primary_disease_old_fixed' as type str (imputed)
  Loading field 'PRIMARY_DISEASE' as type str (imputed)
  Loading field 'CASECON' as type str (imputed)
  Loading field 'population_inference.pop' as type str (imputed)


----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    's': str 
    'sex_new': str 
    'sex_old': str 
    'SEX': str 
    'primary_disease_new': str 
    'primary_disease_new_fixed': str 
    'primary_disease_old': str 
    'primary_disease_old_fixed': str 
    'PRIMARY_DISEASE': str 
    'CASECON': str 
    'population_inference.pop': str 
----------------------------------------
Key: ['s']
----------------------------------------


28554

In [5]:
# Dense subset matrix table, before variant filtering (by gnomAD).
MT = 'gs://gnomad-subsets-2024/gnomad-v3/202403/20240328_subset_dense-callstats.mt'
mt = hl.read_matrix_table(MT)

# Keep only the samples that have a row in the manifest (the passing samples).
sample_in_manifest = hl.is_defined(manifest[mt.s])
mt = mt.filter_cols(sample_in_manifest)

# Display (n_variants, n_samples).
mt.count()

(588713326, 28554)

In [6]:
# Attach each sample's phenotype label (the manifest's CASECON column) as the
# column field `pheno`. The manifest also has a PRIMARY_DISEASE column that
# could be annotated the same way; it is not used here.
manifest_row = manifest[mt.s]
mt = mt.annotate_cols(pheno=manifest_row.CASECON)

### Sites of interest

In [7]:
# Restrict the matrix table to the window chr12:120291700-120292000 and
# checkpoint the result to GCS (overwriting any previous copy).
gene_range = hl.parse_locus_interval('chr12:120291700-120292000')
f = hl.filter_intervals(mt, [gene_range])
f = f.checkpoint(
    "gs://2024-wgspd/NDD_RNU4-2/20240611_NDD_RNU4-2_SNV_gene-range_passing-samples.mt",
    overwrite=True,
)

2024-06-11 18:21:14.550 Hail: INFO: wrote matrix table with 366 rows and 28554 columns in 1 partition to gs://2024-wgspd/NDD_RNU4-2/20240611_NDD_RNU4-2_SNV_gene-range_passing-samples.mt


In [14]:
# Read the checkpointed gene-range matrix table back from GCS
# (presumably so the analysis can be resumed from this cell).
GENE_RANGE_MT = "gs://2024-wgspd/NDD_RNU4-2/20240611_NDD_RNU4-2_SNV_gene-range_passing-samples.mt"
f = hl.read_matrix_table(GENE_RANGE_MT)
# Print the schema, then display (n_variants, n_samples).
f.describe()
f.count()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pheno': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'a_index': int32
    'was_split': bool
    'AC_raw': int32
    'AN_raw': int32
    'AF_raw': float32
    'AC': int32
    'AN': int32
    'AF': float32
----------------------------------------
Entry fields:
    'GT': call
    'DP': int32
    'GQ': int32
    'MIN_DP': int32
    'RGQ': int32
    'END': int32
    'PID': str
    'SB': array<int32>
    'gvcf_info': struct {
        ClippingRankSum: float64, 
        BaseQRankSum: float64, 
        MQ: float64, 
        MQRankSum: float64, 
        MQ_DP: int32, 
        QUALapprox: int32, 
        RAW_MQ: float64, 
        ReadPosRankSum: float64, 
        VarDP: int32
    }
    'PGT': call
    'AD': array<int32>
    'PL': array<int32>
    'adj': bool
------------------

(366, 28554)

In [15]:
# Drop samples whose phenotype label is "OTHER".
not_other = f.pheno != "OTHER"
f = f.filter_cols(not_other)

# Keep only sites whose `AC` row field is below 50.
low_allele_count = f.AC < 50
f = f.filter_rows(low_allele_count)

# Flag, per genotype, whether the call is non-reference.
f = f.annotate_entries(non_ref=f.GT.is_non_ref())

# Display (n_variants, n_samples) after filtering.
f.count()

[Stage 20:>                                                         (0 + 1) / 1]

(336, 28554)

In [16]:
# Tally manifest samples per CASECON label.
casecon_tally = hl.agg.counter(manifest.CASECON)
manifest.aggregate(casecon_tally)

{'CASE': 7700, 'CTRL': 20854}

In [17]:
# Per-variant, carrier-based Fisher's exact test comparing cases with controls.

# Group sizes are read from the columns of `f` itself instead of being
# hard-coded, so the 2x2 tables stay correct if the sample set changes.
pheno_counts = f.aggregate_cols(hl.agg.counter(f.pheno))
n_case = pheno_counts.get("CASE", 0)
n_ctrl = pheno_counts.get("CTRL", 0)

# Carriers: samples with a non-reference genotype at the site, per group.
f_case_con = f.annotate_rows(
    case_non_ref=hl.agg.count_where(f.non_ref & (f.pheno == "CASE")),
    con_non_ref=hl.agg.count_where(f.non_ref & (f.pheno == "CTRL")),
)

# Non-carriers are computed as "group size minus carriers", so a sample whose
# genotype is missing at a site is counted as a non-carrier.
f_case_con = f_case_con.annotate_rows(
    case_ref=n_case - f_case_con.case_non_ref,
    con_ref=n_ctrl - f_case_con.con_non_ref,
)

# fisher_exact_test takes int32 arguments; the counts above are int64.
f_case_con = f_case_con.annotate_rows(
    fisher=hl.fisher_exact_test(
        hl.int32(f_case_con.case_non_ref),
        hl.int32(f_case_con.case_ref),
        hl.int32(f_case_con.con_non_ref),
        hl.int32(f_case_con.con_ref),
    )
)

# Rank variants by p-value and display every row of the (small) result table.
h = f_case_con.rows()
h = h.order_by(h.fisher.p_value)
h.show(n=h.count())

2024-06-11 18:29:51.794 Hail: INFO: Ordering unsorted dataset with network shuffle
[Stage 23:>                                                         (0 + 1) / 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,fisher,fisher,fisher,fisher
locus,alleles,rsid,a_index,was_split,AC_raw,AN_raw,AF_raw,AC,AN,AF,case_non_ref,con_non_ref,case_ref,con_ref,p_value,odds_ratio,ci_95_lower,ci_95_upper
locus<GRCh38>,array<str>,str,int32,bool,int32,int32,float32,int32,int32,float32,int64,int64,int64,int64,float64,float64,float64,float64
chr12:120291824,"[""T"",""G""]",,2,True,44,70818,0.000621,41,70462,0.000582,18,2,7682,20852,5.91e-09,24.4,5.85,216.0
chr12:120291865,"[""T"",""A""]",,1,True,6,71038,8.45e-05,6,70988,8.45e-05,5,0,7695,20854,0.00142,inf,2.48,inf
chr12:120291970,"[""C"",""G""]",,2,True,34,71042,0.000479,34,70978,0.000479,15,16,7685,20838,0.0132,2.54,1.17,5.49
chr12:120291875,"[""T"",""C""]",,1,True,9,71044,0.000127,8,70964,0.000113,5,3,7695,20851,0.0375,4.52,0.878,29.1
chr12:120291817,"[""G"",""A""]",,1,True,13,71030,0.000183,13,70954,0.000183,7,6,7693,20848,0.053,3.16,0.909,11.4
chr12:120291706,"[""C"",""A""]",,1,True,4,71036,5.63e-05,4,70974,5.64e-05,3,1,7697,20853,0.0626,8.13,0.652,426.0
chr12:120291870,"[""C"",""T""]",,1,False,5,71038,7.04e-05,5,70946,7.05e-05,3,1,7697,20853,0.0626,8.13,0.652,426.0
chr12:120291944,"[""A"",""C""]",,1,True,9,71024,0.000127,1,70908,1.41e-05,5,4,7695,20850,0.0659,3.39,0.729,17.1
chr12:120291976,"[""G"",""C""]",,1,False,10,71020,0.000141,10,70932,0.000141,0,10,7700,20844,0.0717,0.0,0.0,1.21
chr12:120291748,"[""CTT"",""C""]",,1,True,3,70944,4.23e-05,3,70798,4.24e-05,2,0,7698,20854,0.0727,inf,0.509,inf


In [18]:
# Flatten nested struct fields (e.g. `fisher`) into top-level columns, then
# write the table as a tab-delimited file on GCS.
FISHER_TSV = "gs://2024-wgspd/NDD_RNU4-2/20240611_NDD_RNU4-2_gene-range_passing-samples_fisher.tsv"
h = h.flatten()
h.export(FISHER_TSV, delimiter="\t")

2024-06-11 18:30:09.717 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-06-11 18:30:12.006 Hail: INFO: merging 2 files totalling 44.4K... + 1) / 1]
2024-06-11 18:30:12.300 Hail: INFO: while writing:
    gs://2024-wgspd/NDD_RNU4-2/20240611_NDD_RNU4-2_gene-range_passing-samples_fisher.tsv
  merge time: 293.582ms


In [19]:
# One-row lookup table holding the variant of interest, keyed by (locus, alleles).
variant_df = pd.DataFrame({"locus" : ["chr12:120291865"], "alleles": [["T","A"]]})
variant_ht = hl.Table.from_pandas(variant_df)
# The locus arrives as a string; parse it into a locus value before keying.
variant_ht = variant_ht.annotate(locus=hl.parse_locus(variant_ht.locus))
variant_ht = variant_ht.key_by("locus", "alleles")
variant_ht.show()


# Pull the window 'chr12:120291865-120291866' out of the full matrix table and
# persist it, then narrow to the exact (locus, alleles) in `variant_ht`.
variant_window = hl.parse_locus_interval('chr12:120291865-120291866')
i = hl.filter_intervals(mt, [variant_window]).persist()
is_target_variant = hl.is_defined(variant_ht[i.locus, i.alleles])
i = i.filter_rows(is_target_variant)

# Drop samples whose phenotype label is "OTHER".
i = i.filter_cols(i.pheno != "OTHER")

# Per sample, count non-reference genotypes across the remaining rows, then
# tally how many samples have each count.
i = i.annotate_entries(non_ref=i.GT.is_non_ref())
i = i.annotate_cols(carrier=hl.agg.count_where(i.non_ref))
i.aggregate_cols(hl.agg.counter(i.carrier))

locus,alleles
locus<GRCh38>,array<str>
chr12:120291865,"[""T"",""A""]"


2024-06-11 18:32:40.388 Hail: INFO: wrote matrix table with 2 rows and 28554 columns in 1 partition to gs://wes-bipolar-tmp-4day/persist_MatrixTableZDM10J25Zs
2024-06-11 18:32:42.255 Hail: WARN: aggregate_cols(): Aggregates over cols ordered by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2024-06-11 18:32:43.818 Hail: INFO: Coerced sorted dataset


{0: 28549, 1: 5}

In [20]:
# Column (sample) table of `i`, restricted to carriers.
ic = i.cols()
# `carrier` is a count of non-reference genotypes per sample, so test for
# "at least one" rather than "exactly one"; this keeps every carrier even if
# more than one variant row is present.
ic = ic.filter(ic.carrier > 0)
# Look up the carriers' manifest rows.
# NOTE(review): this output shows sample-level IDs and phenotype data; clear
# the cell output before sharing the notebook.
manifest[ic.s].show()

2024-06-11 18:32:47.013 Hail: INFO: Coerced sorted dataset
2024-06-11 18:32:48.668 Hail: INFO: Coerced sorted dataset
2024-06-11 18:32:49.297 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,<expr>,<expr>,<expr>,<expr>,<expr>,<expr>,<expr>,<expr>,<expr>,<expr>
s,sex_new,sex_old,SEX,primary_disease_new,primary_disease_new_fixed,primary_disease_old,primary_disease_old_fixed,PRIMARY_DISEASE,CASECON,population_inference.pop
str,str,str,str,str,str,str,str,str,str,str
"""10C114599""","""Female""","""Female""","""Female""","""Schizophrenia, Paranoid""","""SCZ""","""Schizophrenia, Paranoid""","""SCZ""","""SCZ""","""CASE""","""afr"""
"""11C122337""","""Male""","""Male""","""Male""","""Bipolar I with psychosis""","""BD1""","""Bipolar I with psychosis""","""BD1""","""BD1""","""CASE""","""afr"""
"""MH0188521""","""Female""","""Female""","""Female""","""Schizoaffective disorder, depressive type""","""SCZ""","""Schizoaffective disorder, depressive type""","""SCZ""","""SCZ""","""CASE""","""afr"""
"""MH0192991""","""Female""","""Female""","""Female""","""Schizophrenia, Undifferentiated""","""SCZ""","""Schizophrenia, Undifferentiated""","""SCZ""","""SCZ""","""CASE""","""afr"""
"""MH0196843""","""Female""","""Female""","""Female""","""Bipolar I with psychosis""","""BD1""","""Bipolar I with psychosis""","""BD1""","""BD1""","""CASE""","""afr"""


In [21]:
# Collect the carriers' sample IDs into a local Python list and display it.
carrier_ids = ic.s.collect()
carrier_ids

2024-06-11 18:32:52.007 Hail: INFO: Coerced sorted dataset


['10C114599', '11C122337', 'MH0188521', 'MH0192991', 'MH0196843']