In [2]:
%load_ext autoreload

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%autoreload 2
%aimport AD_predictor_tools
%aimport AD_comparison_tools
%aimport PlottingTools
%aimport uniprotBedTools

1. Get genomic coordinates of all SFARI ADs, save as a bed file &rarr; SFARI_ADs.bed
2. Get genomic coordinates of all SFARI TFs, save as a bed file &rarr; SFARI_TFs.bed 
3. Run bedtools subtract to get bed file of SFARI TF regions that are not ADs &rarr; SFARI_TFs_without_ADs.bed
4. Save SFARI variants as bed file  &rarr; SFARI_variants.bed
5. Use bedtools intersect to compare 1 and 4  (SFARI_ADs.bed vs. SFARI_variants.bed)
6. Use bedtools intersect to compare 3 and 4  (SFARI_TFs_without_ADs.bed vs. SFARI_variants.bed)

In [4]:
# Step 1 (input): load the table of known activation domains (ADs) of SFARI TFs.
# The first CSV column stores the row labels, so it is used as the index.
SFARI_TF_ADs = pd.read_csv(
    "../output/SFARI_TF_known_ADs.csv",
    index_col=0,
)
SFARI_TF_ADs

Unnamed: 0,Gene,Start,End,uniprotID,Reference,ProteinSeq
0,MEIS2,340,477,O14770,"activation_regions.txt, GSL",DQSNRAGFLLDPSVSQGAAYSPEGQPMGSFVLDGQQHMGIRPAGLQ...
1,KLF7,1,100,O75840,Stanford,MDVLASYSIFQELQLVHDTGYFSALPSLEETWQQTCLELERYLQTE...
2,CAMTA2,471,580,O94983,Stanford,PSPAPLEPSSRVGRGEALFGGPVGASELEPFSLSSFPDLMGELISD...
3,CAMTA2,285,468,O94983,"PMID: 16678093, Soto / Stanford",KAHTSPSSSSSSSSSGFAEPLEIRPSPPTSRGGSSRGGTAILLLTG...
4,NKX2-2,220,273,O95096,"PMID: 10944215, Soto",AQDLAAATFQAGIPFSAYSAQSLQHMQYNAQYSSASTPQYPTAHPL...
...,...,...,...,...,...,...
58,KDM5B,1301,1390,Q9UGL1,Stanford,TNKVSQPPGTTSFSLPDDWDNRTSYLHSPFSTGRSCIPLHGVSPEV...
59,TCF20,1,327,Q9UGU0,"PMID: 10995766, Soto",MQSFREQSSYHGNQQSYPQEVHGSSRLEEFSPRQAQMFQNFGGTGG...
60,MYT1L,151,425,Q9UL68,"Stanford / Stanford / PMID: 29291346, Soto / S...",DEEEEEEEEEEEEEEEENEDHQMNCHNTRIMQDTEKDDNNNDEYDN...
61,TBX22,401,480,Q9Y458,Stanford,QSLAPLMMEVPMLSSLGVTNSKSGSSEDSSDQYLQAPNSTNQMLYG...


In [5]:
# Build one coordinate DataFrame per known AD by handing its UniProt ID and
# protein-level Start/End to uniprotBedTools.return_domain_bed_df.
#
# Iterate over the column values directly instead of feeding index *labels*
# into the positional `.iloc`: the two only coincide while the index is exactly
# 0..n-1, and any filtering, sorting or re-indexing of SFARI_TF_ADs would make
# the old loop pick the wrong rows or raise IndexError.
AD_coord_dfs = []
for uniprotID, Start, End in zip(
    SFARI_TF_ADs["uniprotID"], SFARI_TF_ADs["Start"], SFARI_TF_ADs["End"]
):
    # indexing = 0 is passed through unchanged; presumably it selects the
    # coordinate convention of the result -- confirm in uniprotBedTools.
    AD_coord_dfs.append(
        uniprotBedTools.return_domain_bed_df(uniprotID, Start, End, indexing = 0)
    )



In [6]:
# Stack the per-AD coordinate frames into a single table and renumber the
# rows 0..n-1 in the same step.
AD_coord_df = pd.concat(AD_coord_dfs, ignore_index = True)
AD_coord_df

Unnamed: 0,chr,start,end
0,15,36892175,36892459
1,15,36895150,36895261
2,15,36896627,36896646
3,2,207124206,207124404
4,2,207165466,207165568
...,...,...,...
163,2,1922493,1923263
164,2,1942981,1943036
165,X,80030748,80030988
166,7,26183700,26183784


In [7]:
# Step 1 (output): write the AD coordinate table to the SFARI_ADs bed file.
uniprotBedTools.save_bed_from_df(
    AD_coord_df,
    "../data/SFARI_ADs.bed",
)

---

In [8]:
# 2. Get genomic coordinates of all SFARI TFs, save as a bed file → SFARI_TFs.bed

In [9]:
# Step 2 (input): load the SFARI transcription-factor table; its `uniprotID`
# column is what the coordinate lookup iterates over.
SFARI_TFs = pd.read_csv(
    "../data/SFARI_TFs.csv",
)
SFARI_TFs

Unnamed: 0.1,Unnamed: 0,status,gene-symbol,gene-name,ensembl-id,chromosome,genetic-category,gene-score,syndromic,eagle,number-of-reports,GeneName,ProteinSeq,uniprotID
0,0,9,ADNP,Activity-dependent neuroprotector homeobox,ENSG00000101126,20,"Rare Single Gene Mutation, Syndromic, Functional",1.0,1,41.50,64,sp|Q9H2P0|ADNP_HUMAN,MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDF...,Q9H2P0
1,1,9,AHDC1,AT-hook DNA binding motif containing 1,ENSG00000126705,1,"Rare Single Gene Mutation, Syndromic",1.0,1,14.25,24,sp|Q5TGY3|AHDC1_HUMAN,MRVKPQGLVVTSSAVCSSPDYLREPKYYPGGPPTPRPLLPTRPPAS...,Q5TGY3
2,2,9,ARID2,AT-rich interaction domain 2,ENSG00000189079,12,"Rare Single Gene Mutation, Syndromic",2.0,1,,12,sp|Q68CP9|ARID2_HUMAN,MANSTGKAPPDERRKGLAFLDELRQFHHSRGSPFKKIPAVGGKELD...,Q68CP9
3,3,9,ARNT2,aryl-hydrocarbon receptor nuclear translocator 2,ENSG00000172379,15,"Rare Single Gene Mutation, Syndromic, Genetic ...",2.0,0,,15,sp|Q9HBZ2|ARNT2_HUMAN,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,Q9HBZ2
4,4,9,ARX,aristaless related homeobox,ENSG00000004848,X,"Rare Single Gene Mutation, Syndromic",1.0,1,13.80,24,sp|Q96QS3|ARX_HUMAN,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,Q96QS3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,122,9,ZNF711,zinc finger protein 711,ENSG00000147180,X,Rare Single Gene Mutation,2.0,0,,5,sp|Q9Y462|ZN711_HUMAN,MDSGGGSLGLHTPDSRMAHTMIMQDFVAGMAGTAHIDGDHIVVSVP...,Q9Y462
123,123,9,ZNF713,Zinc finger protein 713,ENSG00000178665,7,Rare Single Gene Mutation,2.0,0,,3,sp|Q8N859|ZN713_HUMAN,MEEEEMNDGSQMVRSQESLTFQDVAVDFTREEWDQLYPAQKNLYRD...,Q8N859
124,124,9,ZNF774,Zinc finger protein 774,ENSG00000196391,15,Rare Single Gene Mutation,2.0,0,,4,sp|Q6NX45|ZN774_HUMAN,MWLGTSGKSGLPGHCLENPLQECHPAQLEEWALKGISRPSVISQPE...,Q6NX45
125,125,9,ZNF804A,Zinc finger protein 804A,ENSG00000170396,2,"Rare Single Gene Mutation, Genetic Association...",2.0,0,,16,sp|Q7Z570|Z804A_HUMAN,MECYYIVISSTHLSNGHFRNIKGVFRGPLSKNGNKTLDYAEKENTI...,Q7Z570


In [10]:
# Holds one coordinate DataFrame per SFARI TF; populated in the following cell.
coordinate_dfs = []

In [11]:
# Look up the coordinates of every SFARI TF, one DataFrame per UniProt ID.
# The list is (re)initialised here so the cell is idempotent: previously,
# re-running this cell alone appended every TF a second time, which would
# duplicate all intervals in the concatenated table and the saved bed file.
coordinate_dfs = []
for uniprotID in SFARI_TFs["uniprotID"]:
    # indexing = 0 is passed through unchanged; presumably it selects the
    # coordinate convention of the result -- confirm in uniprotBedTools.
    coordinate_dfs.append(uniprotBedTools.return_protein_bed_df(uniprotID, indexing = 0))



In [12]:
# Stack the per-TF coordinate frames into one table. The row labels of the
# individual frames are kept as they are (no re-indexing here).
SFARI_TF_coords = pd.concat(
    objs = coordinate_dfs,
)
SFARI_TF_coords

Unnamed: 0,chr,start,end
590,20,50903888,50903996
589,20,50902016,50902109
592,20,50891404,50894512
54,1,27547303,27552115
434,12,45729836,45729928
...,...,...,...
1249,4,145775788,145775960
1250,4,145774505,145774672
1251,4,145765546,145765738
1252,4,145764987,145765165


In [13]:
# Step 2 (output): write the TF coordinate table to the SFARI_TFs bed file.
uniprotBedTools.save_bed_from_df(
    SFARI_TF_coords,
    "../data/SFARI_TFs.bed",
)

---

In [14]:
# 3. Run bedtools subtract to get a bed file of SFARI TF regions that are not ADs
# → SFARI_TFs_without_ADs.bed
# -a is the full TF interval set (step 2); -b is the AD interval set (step 1)
# that gets removed from it. The result is written under ../output/.

! bedtools subtract -a ../data/SFARI_TFs.bed -b ../data/SFARI_ADs.bed > ../output/SFARI_TFs_without_ADs.bed

---

In [15]:
# 4. Save SFARI variants as bed file
SFARI_variants = pd.read_csv('../data/iWES_v2.gatk.pvcf_variants.tsv', sep = '\t')
SFARI_variants

Unnamed: 0,chrom,pos,ref,alt,af
0,chr1,69081,G,C,0.000187
1,chr1,69130,C,G,9e-06
2,chr1,69134,A,G,4.2e-05
3,chr1,69135,A,G,5e-06
4,chr1,69149,T,A,9e-06
...,...,...,...,...,...
12443148,chrY,25044036,CTGG,"CTGAG,C","1.9e-05,5e-06"
12443149,chrY,25044040,T,A,1.9e-05
12443150,chrY,25044041,CT,C,1.9e-05
12443151,chrY,25044046,G,A,1.4e-05


In [16]:
# VCF positions are one-based, whereas this cell emits intervals whose start
# is one less than their end. A variant at position p therefore becomes the
# interval [p - 1, p):
#   pos 1 -> start 0, end 1
#   pos 3 -> start 2, end 3
# NOTE(review): every variant becomes a 1-bp interval at `pos`, so multi-base
# `ref` alleles (e.g. deletions) are represented by their first base only --
# confirm this is the intended behaviour.

variant_pos = SFARI_variants["pos"]
SFARI_variants_bed = pd.DataFrame(
    {
        "chr": SFARI_variants["chrom"],
        "start": variant_pos - 1,
        "end": variant_pos,
    }
)
SFARI_variants_bed

Unnamed: 0,chr,start,end
0,chr1,69080,69081
1,chr1,69129,69130
2,chr1,69133,69134
3,chr1,69134,69135
4,chr1,69148,69149
...,...,...,...
12443148,chrY,25044035,25044036
12443149,chrY,25044039,25044040
12443150,chrY,25044040,25044041
12443151,chrY,25044045,25044046


In [17]:
# Step 4 (output): write the variant intervals to the SFARI_variants bed file.
# add_chr = False: presumably because the chromosome values already carry the
# "chr" prefix (see the table above) -- confirm against save_bed_from_df.
uniprotBedTools.save_bed_from_df(
    SFARI_variants_bed,
    "../data/SFARI_variants.bed",
    add_chr = False,
)

---

In [18]:
# 5. Use bedtools intersect to compare 1 and 4 (SFARI_ADs.bed vs. SFARI_variants.bed)
# NOTE(review): this unsorted run writes ../output/SFARI_variants_in_ADs.bed,
# the same file that the sorted (`-sorted`) intersect further down writes
# again -- confirm whether this cell is still needed.
! bedtools intersect -a ../data/SFARI_ADs.bed -b ../data/SFARI_variants.bed > ../output/SFARI_variants_in_ADs.bed
# Running on command line on savio instead of in notebook

In [19]:
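# The variant file is large, so the intersect below is run with `-sorted`.
# That mode needs both inputs presorted by chromosome and then by start position
# (bedtools: "please presort your data by chromosome and then by start position").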

In [20]:
# Sort the AD intervals so they can be used with `bedtools intersect -sorted`.
! sortBed -i ../data/SFARI_ADs.bed > ../data/sorted_SFARI_ADs.bed

In [21]:
# Sort the variant intervals; the sorted file is reused by both intersect runs.
! sortBed -i ../data/SFARI_variants.bed > ../data/sorted_SFARI_variants.bed

In [22]:
# Intersect the sorted AD intervals (-a) with the sorted variants (-b) and
# write the result to ../output/SFARI_variants_in_ADs.bed.
! bedtools intersect -a ../data/sorted_SFARI_ADs.bed -b ../data/sorted_SFARI_variants.bed -sorted > ../output/SFARI_variants_in_ADs.bed

---

In [23]:
# 6. Use bedtools intersect to compare 3 and 4
# (SFARI_TFs_without_ADs.bed vs. SFARI_variants.bed)
# Running on command line on savio instead of in notebook

In [24]:
# Sort the non-AD TF intervals (output of step 3) for the `-sorted` intersect.
! sortBed -i ../output/SFARI_TFs_without_ADs.bed > ../output/sorted_SFARI_TFs_without_ADs.bed

In [25]:
# Intersect the sorted non-AD TF intervals (-a) with the sorted variants (-b)
# and write the result to ../output/SFARI_variants_not_in_ADs.bed.
! bedtools intersect -a ../output/sorted_SFARI_TFs_without_ADs.bed -b ../data/sorted_SFARI_variants.bed -sorted > ../output/SFARI_variants_not_in_ADs.bed
