# Burden test for chr X

## Create annotation file with annovar 

In [4]:
# Write a date-stamped sbatch job script for the `annovar` workflow of
# annovar.ipynb, using the csg template through Get_Job_Script.ipynb.

# Workflow notebook and job template (kept unquoted so that `~` is expanded)
annovar_sos=~/working/bioworkflows/variant-annotation/annovar.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml

# Input bim file, working directory and output script path.
# annovar_dir ends with "/", so it can be used directly as a path prefix.
bfiles=/mnt/vast/hpc/csg/en2509/exome/All_qced_exome200k.bim
annovar_dir=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/
annovar_sbatch="${annovar_dir}chrX_annotation_$(date +%Y-%m-%d).sbatch"

# Workflow options
name_prefix=chrX
build=hg38
walltime=60h
mem=80G

# Arguments handed to Get_Job_Script.ipynb through --args as one multi-line string
annovar_args="annovar
    --cwd ${annovar_dir} 
    --bim_name ${bfiles} 
    --humandb /mnt/vast/hpc/csg/isabelle/REF/humandb  
    --xref_path /mnt/vast/hpc/csg/isabelle/REF/humandb 
    --job_size 1 
    --build ${build}
    --name_prefix ${name_prefix}
    --walltime ${walltime}
    --mem ${mem}
    --container_annovar /mnt/vast/hpc/csg/containers/gatk4-annovar.sif
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$annovar_sos" \
    --to-script "$annovar_sbatch" \
    --args "$annovar_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/chrX_annotation_2022-06-21.sbatch[0m
INFO: Workflow csg (ID=w888e7b04827ea2d8) is executed successfully with 1 completed step.


## Create the anno_file, set_list_file and mask_files necessary for burden test

In [5]:
# Compress the multianno CSV while keeping the uncompressed original next to it
multianno_csv=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.hg38.hg38_multianno.csv
gzip --keep "$multianno_csv"

In [4]:
# Write a date-stamped sbatch job script for the `burden_files` workflow of
# annovar.ipynb, using the csg template through Get_Job_Script.ipynb.

# Workflow notebook and job template (kept unquoted so that `~` is expanded)
anno_sos=~/working/bioworkflows/variant-annotation/annovar.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml

# Working directory (ends with "/", so it doubles as a path prefix),
# gzipped annotation table, bim file and output script path
burden_dir=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/
annotated_file_hg38="${burden_dir}All_qced_exome200k.hg38.hg38_multianno.csv.gz"
bim_name=/mnt/vast/hpc/csg/en2509/exome/All_qced_exome200k.bim
anno_sbatch_burden="${burden_dir}chrX_burdenfiles_$(date +%Y-%m-%d).sbatch"

# Workflow options
job_size=1
name_prefix=chrX_burden_files
# NOTE(review): the annotation cell passes /mnt/vast/hpc/csg/containers/gatk4-annovar.sif
# for --container_annovar; confirm the two images are meant to differ.
container_annovar=/mnt/vast/hpc/csg/guangyou/containers/gatk4-annovar.sif

# Arguments handed to Get_Job_Script.ipynb through --args as one multi-line string
anno_args="burden_files
    --cwd ${burden_dir}
    --annotated_file ${annotated_file_hg38}
    --bim_name ${bim_name}
    --name_prefix ${name_prefix}
    --job_size ${job_size}
    --container_annovar ${container_annovar}
    --container_lmm /mnt/vast/hpc/csg/containers/lmm.sif
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$anno_sos" \
    --to-script "$anno_sbatch_burden" \
    --args "$anno_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/chrX_burdenfiles_2022-06-23.sbatch[0m
INFO: Workflow csg (ID=w038634fbdfa5a2d6) is executed successfully with 1 completed step.


In [2]:
# Print the number of comma-separated fields on each line and list the distinct
# counts; a single value means every line splits into the same number of fields.
# The split is naive: commas inside quoted fields are counted as separators too.
multianno_csv=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.hg38.hg38_multianno.csv
awk -F ',' '{ print NF }' "$multianno_csv" | sort -nu

136


In [None]:
# Write the two-line mask file: Mask1 uses LoF only, Mask2 uses LoF and missense
mask_file=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.hg38.hg38_multianno.csv.mask_file
printf '%s\n' \
    'Mask1 LoF' \
    'Mask2 LoF,missense' > "$mask_file"

## Fix the problem of sex chr name for annovar pipeline

The annovar pipeline builds the variant IDs using "23" as the chromosome name, whereas the variant IDs in our chr X bim file use "X", which leads to a mismatch. 

In [3]:
# List the files present in the burden-test working directory
burden_dir=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/
ls "$burden_dir"

All_qced_exome200k.hg38.avinput
All_qced_exome200k.hg38.ensGene.invalid_input
All_qced_exome200k.hg38.hg38_multianno.csv
All_qced_exome200k.hg38.hg38_multianno.csv.aff_file
All_qced_exome200k.hg38.hg38_multianno.csv.anno_file
[0m[01;31mAll_qced_exome200k.hg38.hg38_multianno.csv.bak.gz[0m
All_qced_exome200k.hg38.hg38_multianno.csv.err
[01;31mAll_qced_exome200k.hg38.hg38_multianno.csv.gz[0m
All_qced_exome200k.hg38.hg38_multianno.csv.mask_file
All_qced_exome200k.hg38.hg38_multianno.csv.out
All_qced_exome200k.hg38.hg38_multianno.csv.set_list_file
All_qced_exome200k.hg38.hg38_multianno.err
All_qced_exome200k.hg38.hg38_multianno.out
All_qced_exome200k.hg38.invalid_input
All_qced_exome200k.hg38.knownGene.invalid_input
All_qced_exome200k.hg38.refGene.invalid_input
All_qced_exome200k.hg38.refGeneWithVer.invalid_input
chrX_annotation_2022-06-21-2553416.out
chrX_annotation_2022-06-21-2554591.out
chrX_annotation_2022-06-21.log
chrX_annotation_2022-06-21.sbatch
chrX_burdenfiles_2022-06-23-2554

In [5]:
# Show the first 10 lines of the anno_file to inspect how variant IDs are written
anno_file=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.hg38.hg38_multianno.csv.anno_file
head -n 10 "$anno_file"

chr23:334222:G:A NONE other
chr23:334223:C:A NONE other
chr23:334224:A:G NONE other
chr23:334229:A:C NONE other
chr23:334232:C:T NONE other
chr23:334233:G:A NONE other
chr23:334241:GAAT:G NONE other
chr23:334241:GAATA:G NONE other
chr23:334243:A:C NONE other
chr23:334248:T:C NONE other


In [7]:
# Show the first 10 lines of the set_list_file (lines can be very long)
set_list_file=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.hg38.hg38_multianno.csv.set_list_file
head -n 10 "$set_list_file"

. 23 101360035 chr23:15244692:T:GA,chr23:50607728:T:CTCC,chr23:67546514:T:CGGC,chr23:101360035:T:GAG,chr23:120438731:C:AAAAA,chr23:120438731:C:AAAAAAA,chr23:120438731:C:AAAAAAAA,chr23:154781342:AG:T


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [8]:
# Show the first 10 lines of the bim file to compare its variant IDs with the anno_file
bim_file=/mnt/vast/hpc/csg/en2509/exome/All_qced_exome200k.bim
head -n 10 "$bim_file"

23	chrX:334222:G:A	0	334222	A	G
23	chrX:334223:C:A	0	334223	A	C
23	chrX:334224:A:G	0	334224	G	A
23	chrX:334229:A:C	0	334229	C	A
23	chrX:334232:C:T	0	334232	T	C
23	chrX:334233:G:A	0	334233	A	G
23	chrX:334241:GAAT:G	0	334241	G	GAAT
23	chrX:334241:GAATA:G	0	334241	G	GAATA
23	chrX:334243:A:C	0	334243	C	A
23	chrX:334248:T:C	0	334248	C	T


In [6]:
# Show the first 10 lines (header included) of the multianno CSV
multianno_csv=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.hg38.hg38_multianno.csv
head -n 10 "$multianno_csv"

Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,Func.refGeneWithVer,Gene.refGeneWithVer,GeneDetail.refGeneWithVer,ExonicFunc.refGeneWithVer,AAChange.refGeneWithVer,Func.knownGene,Gene.knownGene,GeneDetail.knownGene,ExonicFunc.knownGene,AAChange.knownGene,Func.ensGene,Gene.ensGene,GeneDetail.ensGene,ExonicFunc.ensGene,AAChange.ensGene,Xref.ensGene,phastConsElements30way,encRegTfbsClustered,gwasCatalog,AF,AF_raw,AF_male,AF_female,AF_afr,AF_ami,AF_amr,AF_asj,AF_eas,AF_fin,AF_nfe,AF_oth,AF_sas,AF_exome,AF_popmax_exome,AF_male_exome,AF_female_exome,AF_raw_exome,AF_afr_exome,AF_sas_exome,AF_amr_exome,AF_eas_exome,AF_nfe_exome,AF_fin_exome,AF_asj_exome,AF_oth_exome,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,GME_AF,GME_NWA,GME_NEA,GME_AP,GME_Israel,GME_SD,GME_TP,GME_CA,Kaviar_AF,Kaviar_AC,Kaviar_AN,avsnp150,DamagePredCount,SIFT_pred,SIFT4G_pred,Polyphen2_HDIV_pred,Polyphen2_HVAR_pred,LRT_pred,MutationT

In [14]:
import pandas as pd

# Load the multianno CSV strictly as text so that it can be edited and written
# back without changing any other value:
#   * dtype='string' keeps every column as text (no numeric re-formatting);
#   * keep_default_na=False stops pandas from converting literal tokens such as
#     "NA", "NULL", "N/A" or empty fields into missing values, which to_csv
#     would otherwise write back as empty strings.
df = pd.read_csv(
    '/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.hg38.hg38_multianno.csv',
    header=0,
    dtype='string',
    index_col=False,
    keep_default_na=False,
)
df

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,CLNDISDB,CLNREVSTAT,CLNSIG,DN ID,Patient ID,Phenotype,Platform,Study,Pubmed ID,Otherinfo1
0,23,334222,334222,G,A,intergenic,NONE;NONE,dist=NONE;dist=NONE,.,.,...,.,.,.,.,.,.,.,.,.,chrX:334222:G:A
1,23,334223,334223,C,A,intergenic,NONE;NONE,dist=NONE;dist=NONE,.,.,...,.,.,.,.,.,.,.,.,.,chrX:334223:C:A
2,23,334224,334224,A,G,intergenic,NONE;NONE,dist=NONE;dist=NONE,.,.,...,.,.,.,.,.,.,.,.,.,chrX:334224:A:G
3,23,334229,334229,A,C,intergenic,NONE;NONE,dist=NONE;dist=NONE,.,.,...,.,.,.,.,.,.,.,.,.,chrX:334229:A:C
4,23,334232,334232,C,T,intergenic,NONE;NONE,dist=NONE;dist=NONE,.,.,...,.,.,.,.,.,.,.,.,.,chrX:334232:C:T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294835,23,155774815,155774815,G,C,intergenic,NONE;NONE,dist=NONE;dist=NONE,.,.,...,.,.,.,.,.,.,.,.,.,chrX:155774815:G:C
294836,23,155774816,155774816,G,A,intergenic,NONE;NONE,dist=NONE;dist=NONE,.,.,...,.,.,.,.,.,.,.,.,.,chrX:155774816:G:A
294837,23,155774828,155774828,A,G,intergenic,NONE;NONE,dist=NONE;dist=NONE,.,.,...,.,.,.,.,.,.,.,.,.,chrX:155774828:A:G
294838,23,155774836,155774836,C,G,intergenic,NONE;NONE,dist=NONE;dist=NONE,.,.,...,.,.,.,.,.,.,.,.,.,chrX:155774836:C:G


In [11]:
# Tally the distinct chromosome labels found in the "Chr" column
df.loc[:, "Chr"].value_counts()

23    294840
Name: Chr, dtype: Int64

In [15]:
# Relabel chromosome "23" as "X" (modifies df in place), then re-tally the
# labels to confirm the change.
is_chr23 = df["Chr"].eq("23")
df.loc[is_chr23, "Chr"] = "X"
df["Chr"].value_counts()

X    294840
Name: Chr, dtype: Int64

In [17]:
# Save the relabelled table under a new name; the row index is not written
renamed_multianno_csv = "/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.rename_chrX.hg38.hg38_multianno.csv"
df.to_csv(renamed_multianno_csv, index=False)

In [18]:
# Compress the relabelled multianno CSV while keeping the uncompressed copy
renamed_multianno_csv=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.rename_chrX.hg38.hg38_multianno.csv
gzip --keep "$renamed_multianno_csv"

## Error: no masks are left to be included in the analysis

In [20]:
import pandas as pd

# Read the space-delimited, header-less anno_file; columns get the integer
# labels 0, 1, 2. Note that this rebinds `df` to the anno_file table.
anno_file = '/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k.hg38.hg38_multianno.csv.anno_file'
df = pd.read_csv(anno_file, sep=" ", header=None)
df

Unnamed: 0,0,1,2
0,chrX:334222:G:A,NONE,other
1,chrX:334223:C:A,NONE,other
2,chrX:334224:A:G,NONE,other
3,chrX:334229:A:C,NONE,other
4,chrX:334232:C:T,NONE,other
...,...,...,...
294817,chrX:155774815:G:C,NONE,other
294818,chrX:155774816:G:A,NONE,other
294819,chrX:155774828:A:G,NONE,other
294820,chrX:155774836:C:G,NONE,other


In [22]:
# Tally the values in the third anno_file column (column label 2)
df.loc[:, 2].value_counts()

other    294822
Name: 2, dtype: int64

In [23]:
# Tally the values in the second anno_file column (column label 1)
df.loc[:, 1].value_counts()

NONE    294814
.            8
Name: 1, dtype: int64

There is no gene annotation either.

## Change the 23 to X in bim file

In [2]:
import pandas as pd

# Read the tab-delimited, header-less bim file strictly as text. The table is
# edited and written back out later, so the round trip must not alter anything:
#   * dtype=str prevents numeric columns from being re-formatted on write;
#   * keep_default_na=False prevents tokens such as "NA" or empty fields from
#     being parsed as missing values and written back as empty strings.
bim = pd.read_csv(
    '/mnt/vast/hpc/csg/en2509/exome/All_qced_exome200k.bim',
    header=None,
    sep="\t",
    dtype=str,
    keep_default_na=False,
)
bim

Unnamed: 0,0,1,2,3,4,5
0,23,chrX:334222:G:A,0,334222,A,G
1,23,chrX:334223:C:A,0,334223,A,C
2,23,chrX:334224:A:G,0,334224,G,A
3,23,chrX:334229:A:C,0,334229,C,A
4,23,chrX:334232:C:T,0,334232,T,C
...,...,...,...,...,...,...
294835,23,chrX:155774815:G:C,0,155774815,C,G
294836,23,chrX:155774816:G:A,0,155774816,A,G
294837,23,chrX:155774828:A:G,0,155774828,G,A
294838,23,chrX:155774836:C:G,0,155774836,G,C


In [3]:
# Tally the chromosome codes in the first bim column (column label 0)
bim.loc[:, 0].value_counts()

23    294840
Name: 0, dtype: int64

In [4]:
# Overwrite the chromosome column of every row with "X" (modifies bim in place),
# then re-tally the column to confirm the change.
chrom_col = 0
bim[chrom_col] = "X"
bim[chrom_col].value_counts()

X    294840
Name: 0, dtype: int64

In [5]:
# Write the relabelled bim table as tab-delimited text, without header or index
renamed_bim = "/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k_rename_chrX.bim"
bim.to_csv(renamed_bim, sep="\t", header=False, index=False)

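Change the header line of the multianno CSV from the first block below to the second: the second (duplicated) set of `AF`, `AF_popmax`, `AF_male`, ... column names gets an `_exome` suffix.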

```
Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,Func.refGeneWithVer,Gene.refGeneWithVer,GeneDetail.refGeneWithVer,ExonicFunc.refGeneWithVer,AAChange.refGeneWithVer,Func.knownGene,Gene.knownGene,GeneDetail.knownGene,ExonicFunc.knownGene,AAChange.knownGene,Func.ensGene,Gene.ensGene,GeneDetail.ensGene,ExonicFunc.ensGene,AAChange.ensGene,Xref.ensGene,phastConsElements30way,encRegTfbsClustered,gwasCatalog,AF,AF_raw,AF_male,AF_female,AF_afr,AF_ami,AF_amr,AF_asj,AF_eas,AF_fin,AF_nfe,AF_oth,AF_sas,AF,AF_popmax,AF_male,AF_female,AF_raw,AF_afr,AF_sas,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,GME_AF,GME_NWA,GME_NEA,GME_AP,GME_Israel,GME_SD,GME_TP,GME_CA,Kaviar_AF,Kaviar_AC,Kaviar_AN,avsnp150,DamagePredCount,SIFT_pred,SIFT4G_pred,Polyphen2_HDIV_pred,Polyphen2_HVAR_pred,LRT_pred,MutationTaster_pred,MutationAssessor_pred,FATHMM_pred,PROVEAN_pred,VEST4_score,MetaSVM_pred,MetaLR_pred,M-CAP_pred,REVEL_score,MutPred_score,MVP_score,MPC_score,PrimateAI_pred,DEOGEN2_pred,BayesDel_addAF_pred,BayesDel_noAF_pred,ClinPred_pred,LIST-S2_pred,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_pred,fathmm-XF_coding_pred,Eigen-raw_coding,Eigen-phred_coding,Eigen-PC-raw_coding,Eigen-PC-phred_coding,GenoCanyon_score,integrated_fitCons_score,GM12878_fitCons_score,H1-hESC_fitCons_score,HUVEC_fitCons_score,LINSIGHT,GERP++_NR,GERP++_RS,phyloP100way_vertebrate,phyloP30way_mammalian,phyloP17way_primate,phastCons100way_vertebrate,phastCons30way_mammalian,phastCons17way_primate,bStatistic,Interpro_domain,GTEx_V8_gene,GTEx_V8_tissue,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG,DN ID,Patient ID,Phenotype,Platform,Study,Pubmed ID,Otherinfo1
```

```
Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,Func.refGeneWithVer,Gene.refGeneWithVer,GeneDetail.refGeneWithVer,ExonicFunc.refGeneWithVer,AAChange.refGeneWithVer,Func.knownGene,Gene.knownGene,GeneDetail.knownGene,ExonicFunc.knownGene,AAChange.knownGene,Func.ensGene,Gene.ensGene,GeneDetail.ensGene,ExonicFunc.ensGene,AAChange.ensGene,Xref.ensGene,phastConsElements30way,encRegTfbsClustered,gwasCatalog,AF,AF_raw,AF_male,AF_female,AF_afr,AF_ami,AF_amr,AF_asj,AF_eas,AF_fin,AF_nfe,AF_oth,AF_sas,AF_exome,AF_popmax_exome,AF_male_exome,AF_female_exome,AF_raw_exome,AF_afr_exome,AF_sas_exome,AF_amr_exome,AF_eas_exome,AF_nfe_exome,AF_fin_exome,AF_asj_exome,AF_oth_exome,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,GME_AF,GME_NWA,GME_NEA,GME_AP,GME_Israel,GME_SD,GME_TP,GME_CA,Kaviar_AF,Kaviar_AC,Kaviar_AN,avsnp150,DamagePredCount,SIFT_pred,SIFT4G_pred,Polyphen2_HDIV_pred,Polyphen2_HVAR_pred,LRT_pred,MutationTaster_pred,MutationAssessor_pred,FATHMM_pred,PROVEAN_pred,VEST4_score,MetaSVM_pred,MetaLR_pred,M-CAP_pred,REVEL_score,MutPred_score,MVP_score,MPC_score,PrimateAI_pred,DEOGEN2_pred,BayesDel_addAF_pred,BayesDel_noAF_pred,ClinPred_pred,LIST-S2_pred,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_pred,fathmm-XF_coding_pred,Eigen-raw_coding,Eigen-phred_coding,Eigen-PC-raw_coding,Eigen-PC-phred_coding,GenoCanyon_score,integrated_fitCons_score,GM12878_fitCons_score,H1-hESC_fitCons_score,HUVEC_fitCons_score,LINSIGHT,GERP++_NR,GERP++_RS,phyloP100way_vertebrate,phyloP30way_mammalian,phyloP17way_primate,phastCons100way_vertebrate,phastCons30way_mammalian,phastCons17way_primate,bStatistic,Interpro_domain,GTEx_V8_gene,GTEx_V8_tissue,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG,DN ID,Patient ID,Phenotype,Platform,Study,Pubmed ID,Otherinfo1
```

In [8]:
# Compress the multianno CSV of the chrX-renamed data set, keeping the uncompressed copy
rename_chrX_multianno_csv=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k_rename_chrX.hg38.hg38_multianno.csv
gzip --keep "$rename_chrX_multianno_csv"

In [9]:
import pandas as pd

# Read the space-delimited, header-less anno_file of the chrX-renamed data set;
# columns get the integer labels 0, 1, 2.
anno_file = '/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k_rename_chrX.hg38.hg38_multianno.csv.anno_file'
df = pd.read_csv(anno_file, sep=" ", header=None)
df

Unnamed: 0,0,1,2
0,chrX:334222:G:A,PPP2R3B,other
1,chrX:334223:C:A,PPP2R3B,other
2,chrX:334224:A:G,PPP2R3B,other
3,chrX:334229:A:C,PPP2R3B,other
4,chrX:334232:C:T,PPP2R3B,other
...,...,...,...
294817,chrX:155774815:G:C,SPRY3,other
294818,chrX:155774816:G:A,SPRY3,other
294819,chrX:155774828:A:G,SPRY3,other
294820,chrX:155774836:C:G,SPRY3,other


In [10]:
# Tally the values in the second anno_file column (column label 1)
df.loc[:, 1].value_counts()

DMD             3736
PLXNB3          2633
HUWE1           2403
FLNA            2193
PLXNA3          1918
                ... 
MIR934             4
LOC101927830       4
SNORA56            3
INE1               3
CT45A2             2
Name: 1, Length: 786, dtype: int64

In [11]:
# Tally the values in the third anno_file column (column label 2)
df.loc[:, 2].value_counts()

other         151771
missense       91539
synonymous     47056
LoF             4456
Name: 2, dtype: int64

In [12]:
# Write the two-line mask file: Mask1 uses LoF only, Mask2 uses LoF and missense
mask_file=/mnt/vast/hpc/csg/en2509/exome/BurdenTest/BurdenFiles/All_qced_exome200k_rename_chrX.hg38.hg38_multianno.csv.mask_file
printf '%s\n' \
    'Mask1 LoF' \
    'Mask2 LoF,missense' > "$mask_file"