In [2]:
import hail as hl
import pandas as pd

In [3]:
# Start Hail, using a GCS bucket as scratch space for temporary files.
hl.init(tmp_dir="gs://wes-bipolar-tmp-4day/")

# Passing `default_reference` to hl.init is deprecated; set the default
# reference genome after initialisation instead.
hl.default_reference("GRCh38")


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SPARKMONITOR_LISTENER: Started SparkListener for Jupyter Notebook
SPARKMONITOR_LISTENER: Port obtained from environment: 40323
SPARKMONITOR_LISTENER: Application Started: application_1718034285666_0003 ...Start Time: 1718040682589


Running on Apache Spark version 3.3.2
SparkUI available at http://rye-m.c.wes-bipolar.internal:45189
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.128-eead8100a1c1
LOGGING: writing to /home/hail/hail-20240610-1731-0.2.128-eead8100a1c1.log


In [4]:
# Merged manifest covering all samples, i.e. before any sample filtering.
MANIFEST = 'gs://2024-wgspd/files/2024_WGSPD_merged-manifest.tsv'

# Tab-separated file; column types are imputed and the table is keyed by `s`.
manifest = hl.import_table(
    MANIFEST,
    key = "s",
    impute = True,
    delimiter='\t',
)

# Number of rows in the manifest.
manifest.count()

2024-06-10 17:31:43.263 Hail: INFO: Reading table to impute column types 1) / 1]
2024-06-10 17:31:45.276 Hail: INFO: Finished type imputation
  Loading field 's' as type str (imputed)
  Loading field 'sex_new' as type str (imputed)
  Loading field 'sex_old' as type str (imputed)
  Loading field 'SEX' as type str (imputed)
  Loading field 'primary_disease_new' as type str (imputed)
  Loading field 'primary_disease_new_fixed' as type str (imputed)
  Loading field 'primary_disease_old' as type str (imputed)
  Loading field 'primary_disease_old_fixed' as type str (imputed)
  Loading field 'PRIMARY_DISEASE' as type str (imputed)
  Loading field 'CASECON' as type str (imputed)


35527

In [5]:
# Print the manifest schema (global fields, row fields and key).
manifest.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    's': str 
    'sex_new': str 
    'sex_old': str 
    'SEX': str 
    'primary_disease_new': str 
    'primary_disease_new_fixed': str 
    'primary_disease_old': str 
    'primary_disease_old_fixed': str 
    'PRIMARY_DISEASE': str 
    'CASECON': str 
----------------------------------------
Key: ['s']
----------------------------------------


In [11]:
# Subset dense MT before variant-filtering (by gnomad)
MT = 'gs://gnomad-subsets-2024/gnomad-v3/202403/20240328_subset_dense-callstats.mt'
mt = hl.read_matrix_table(MT)
# Dimensions of the MatrixTable as (number of rows, number of columns).
mt.count()

(588713326, 35527)

In [12]:
# Attach phenotype columns by looking each column key `s` up in the manifest.
sample_info = manifest[mt.s]
mt = mt.annotate_cols(
    # True when the manifest labels the sample "CASE".
    is_case = sample_info.CASECON == "CASE",
    primary_disease = sample_info.PRIMARY_DISEASE,
)

### Sites of interest

In [13]:
# Sites of interest on chr12 as (position, [ref, alt]) pairs, in a fixed order.
sites_of_interest = [
    # Single base insertions
    (120291839, ["T", "TA"]),
    (120291839, ["T", "TC"]),
    (120291826, ["T", "TA"]),
    (120291827, ["T", "TA"]),
    (120291835, ["G", "GT"]),
    (120291838, ["T", "TA"]),
    # SNVs
    (120291839, ["T", "C"]),
    (120291826, ["T", "G"]),
    (120291828, ["G", "A"]),
    (120291835, ["G", "A"]),
    (120291837, ["T", "C"]),
    (120291841, ["A", "C"]),
]

# Parallel lists used by the cells below.
pos = [position for position, _ in sites_of_interest]
alleles = [ref_alt for _, ref_alt in sites_of_interest]
variant_list = [f"chr12:{position}" for position in pos]

# Every position must have a matching allele pair.
assert len(pos) == len(alleles)

In [16]:
# Build a Hail table of the sites of interest from a pandas frame.
variant_df = pd.DataFrame({"locus" : variant_list, "alleles": alleles})
sites_unparsed = hl.Table.from_pandas(variant_df)

# Turn the "chr12:<pos>" strings into locus values.
sites_parsed = sites_unparsed.annotate(locus = hl.parse_locus(sites_unparsed.locus))

# Keyed by locus only (not by locus + alleles), so lookups against this table
# match on position regardless of the allele.
variant_ht = sites_parsed.key_by("locus")

In [17]:
# Restrict the MatrixTable to the interval spanning the sites of interest before
# joining, so only the region around the sites is read instead of evaluating
# the join against every row of the full dataset. All sites lie on chr12
# between min(pos) and max(pos), so the final rows are unchanged.
sites_region = hl.locus_interval(
    "chr12", min(pos), max(pos),
    includes_start=True, includes_end=True,
    reference_genome="GRCh38",
)
mt_region = hl.filter_intervals(mt, [sites_region])

# Keep rows whose locus is one of the sites of interest. The match is by locus
# only; a locus + alleles match would be
#   mt_region.filter_rows(hl.is_defined(variant_ht[mt_region.locus, mt_region.alleles]))
# with variant_ht keyed by ("locus", "alleles").
f = mt_region.filter_rows(hl.is_defined(variant_ht[mt_region.locus]))

# Write the filtered MatrixTable to disk and continue from the written copy.
f = f.checkpoint("gs://2024-wgspd/NDD_RNU4-2/20240610_NDD_RNU4-2_SNV_locus-only.mt")

2024-06-10 17:40:13.358 Hail: INFO: Ordering unsorted dataset with network shuffle
Exception in thread "Thread-39" java.lang.NullPointerException + 1640) / 115376]
	at sparkmonitor.listener.JupyterSparkMonitorListener$TaskUpdaterThread.$anonfun$run$1(CustomListener.scala:116)
	at scala.collection.TraversableLike$grouper$1$.apply(TraversableLike.scala:465)
	at scala.collection.TraversableLike$grouper$1$.apply(TraversableLike.scala:455)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at scala.collection.TraversableLike.groupBy(TraversableLike.scala:524)
	at scala.collection.TraversableLike.groupBy$(TraversableLike.scala:454)
	at scala.collection.AbstractTraversable.groupBy(Traversable.scala:108)
	at sparkmonitor.listener.JupyterSparkMonitorListener$TaskUpdaterThread.run(CustomListener.scala:116)
	at java.base/ja

In [18]:
# Reload the checkpointed, locus-filtered MatrixTable and print its schema.
locus_only_mt_path = "gs://2024-wgspd/NDD_RNU4-2/20240610_NDD_RNU4-2_SNV_locus-only.mt"
f = hl.read_matrix_table(locus_only_mt_path)
f.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'is_case': bool
    'primary_disease': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'a_index': int32
    'was_split': bool
    'AC_raw': int32
    'AN_raw': int32
    'AF_raw': float32
    'AC': int32
    'AN': int32
    'AF': float32
----------------------------------------
Entry fields:
    'GT': call
    'DP': int32
    'GQ': int32
    'MIN_DP': int32
    'RGQ': int32
    'END': int32
    'PID': str
    'SB': array<int32>
    'gvcf_info': struct {
        ClippingRankSum: float64, 
        BaseQRankSum: float64, 
        MQ: float64, 
        MQRankSum: float64, 
        MQ_DP: int32, 
        QUALapprox: int32, 
        RAW_MQ: float64, 
        ReadPosRankSum: float64, 
        VarDP: int32
    }
    'PGT': call
    'AD': array<int32>
    'PL': array<int32>
    

In [19]:
# Flag each entry whose GT call is non-reference, for the carrier counts below.
f = f.annotate_entries(non_ref = f.GT.is_non_ref())

In [20]:
# Per-row counts of non-reference entries, split by the `is_case` column flag.
carrier = f.non_ref
in_case = f.is_case
f_case_con = f.annotate_rows(
    case_non_ref = hl.agg.count_where(carrier & in_case),
    con_non_ref = hl.agg.count_where(carrier & ~in_case),
)

# Display the row table, showing up to one row per listed site.
case_con_rows = f_case_con.rows()
case_con_rows.show(n = len(pos))



locus,alleles,rsid,a_index,was_split,AC_raw,AN_raw,AF_raw,AC,AN,AF,case_non_ref,con_non_ref
locus<GRCh38>,array<str>,str,int32,bool,int32,int32,float32,int32,int32,float32,int64,int64
chr12:120291826,"[""T"",""C""]",,1,False,1,71030,1.41e-05,1,70920,1.41e-05,1,0
chr12:120291827,"[""T"",""C""]",,1,True,1,71038,1.41e-05,0,70956,0.0,1,0
chr12:120291827,"[""T"",""TG""]",,3,True,1,71038,1.41e-05,1,70956,1.41e-05,0,1
chr12:120291827,"[""TG"",""T""]",,2,True,1,71038,1.41e-05,1,70956,1.41e-05,1,0
chr12:120291828,"[""G"",""A""]",,1,False,1,71028,1.41e-05,1,70932,1.41e-05,0,1


In [21]:
# Dimensions of the filtered MatrixTable as (number of rows, number of columns).
f.count()

(5, 35527)

In [None]:
# Next: use the region from approximately chr12:120291763 to approximately chr12:120291903