In [20]:
# Load IPython's autoreload extension.
# NOTE(review): no `%autoreload` mode is set in the visible cells — confirm it is configured elsewhere.
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [22]:
# Filter the variants down to SNPs, then convert them to gnomAD format for use with ProtVar

In [23]:
# Load the iWES v2 variant table (tab-separated).
# The text columns are read explicitly as strings: `alt` and `af` hold
# comma-separated values for multi-allelic sites and are split as strings
# later, so they must not be left to chunk-wise dtype inference, which on a
# large file can produce a column mixing floats and strings.
SFARI_variants = pd.read_csv(
    "../raw_files/iWES_v2.gatk.pvcf_variants.tsv",
    sep="\t",
    dtype={"chrom": str, "ref": str, "alt": str, "af": str},
)
SFARI_variants

Unnamed: 0,chrom,pos,ref,alt,af
0,chr1,69081,G,C,0.000187
1,chr1,69130,C,G,9e-06
2,chr1,69134,A,G,4.2e-05
3,chr1,69135,A,G,5e-06
4,chr1,69149,T,A,9e-06
...,...,...,...,...,...
12443148,chrY,25044036,CTGG,"CTGAG,C","1.9e-05,5e-06"
12443149,chrY,25044040,T,A,1.9e-05
12443150,chrY,25044041,CT,C,1.9e-05
12443151,chrY,25044046,G,A,1.4e-05


In [24]:
# Drop the leading "chr" from chromosome labels ("chr1" -> "1").
# The pattern is anchored and `regex=True` is passed explicitly, so only the
# prefix is removed and the result does not depend on the pandas-version
# default for `regex`.
SFARI_variants["chrom"] = SFARI_variants["chrom"].str.replace("^chr", "", regex=True)
SFARI_variants

Unnamed: 0,chrom,pos,ref,alt,af
0,1,69081,G,C,0.000187
1,1,69130,C,G,9e-06
2,1,69134,A,G,4.2e-05
3,1,69135,A,G,5e-06
4,1,69149,T,A,9e-06
...,...,...,...,...,...
12443148,Y,25044036,CTGG,"CTGAG,C","1.9e-05,5e-06"
12443149,Y,25044040,T,A,1.9e-05
12443150,Y,25044041,CT,C,1.9e-05
12443151,Y,25044046,G,A,1.4e-05


In [25]:
# Turn the comma-separated alternate alleles into one list per row.
# Only string values are split, so re-running this cell leaves already-split
# lists untouched; `.str.split` applied to a column of lists would silently
# replace every value with NaN.
SFARI_variants["alt"] = SFARI_variants["alt"].map(
    lambda alleles: alleles.split(",") if isinstance(alleles, str) else alleles
)
SFARI_variants

Unnamed: 0,chrom,pos,ref,alt,af
0,1,69081,G,[C],0.000187
1,1,69130,C,[G],9e-06
2,1,69134,A,[G],4.2e-05
3,1,69135,A,[G],5e-06
4,1,69149,T,[A],9e-06
...,...,...,...,...,...
12443148,Y,25044036,CTGG,"[CTGAG, C]","1.9e-05,5e-06"
12443149,Y,25044040,T,[A],1.9e-05
12443150,Y,25044041,CT,[C],1.9e-05
12443151,Y,25044046,G,[A],1.4e-05


In [26]:
# Turn the comma-separated allele frequencies into one list per row
# (the values stay as text, exactly as read from the file).
# Only string values are split, so re-running this cell leaves already-split
# lists untouched; `.str.split` applied to a column of lists would silently
# replace every value with NaN.
SFARI_variants["af"] = SFARI_variants["af"].map(
    lambda freqs: freqs.split(",") if isinstance(freqs, str) else freqs
)
SFARI_variants

Unnamed: 0,chrom,pos,ref,alt,af
0,1,69081,G,[C],[0.000187]
1,1,69130,C,[G],[9e-06]
2,1,69134,A,[G],[4.2e-05]
3,1,69135,A,[G],[5e-06]
4,1,69149,T,[A],[9e-06]
...,...,...,...,...,...
12443148,Y,25044036,CTGG,"[CTGAG, C]","[1.9e-05, 5e-06]"
12443149,Y,25044040,T,[A],[1.9e-05]
12443150,Y,25044041,CT,[C],[1.9e-05]
12443151,Y,25044046,G,[A],[1.4e-05]


In [27]:
# Give every (alt, af) pair its own row, then renumber the rows from 0.
expanded_SFARI_variants = (
    SFARI_variants
    .explode(["alt", "af"])
    .reset_index(drop=True)
)
expanded_SFARI_variants

Unnamed: 0,chrom,pos,ref,alt,af
0,1,69081,G,C,0.000187
1,1,69130,C,G,9e-06
2,1,69134,A,G,4.2e-05
3,1,69135,A,G,5e-06
4,1,69149,T,A,9e-06
...,...,...,...,...,...
14335426,Y,25044041,CT,C,1.9e-05
14335427,Y,25044046,G,A,1.4e-05
14335428,Y,25044050,ATGG,A,1.4e-05
14335429,Y,25044050,ATGG,ATAG,5e-06


In [28]:
# Record the character length of each allele string in a companion column.
for source_col, length_col in (("ref", "ref_len"), ("alt", "alt_len")):
    expanded_SFARI_variants[length_col] = expanded_SFARI_variants[source_col].str.len()

In [29]:
# Keep single-character substitutions only: both alleles must have length 1.
# `.copy()` makes the result an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
# NOTE(review): a length-1 alt is not necessarily a base (a placeholder allele
# such as "*" would also pass) — confirm whether such rows occur and need excluding.
SNP_expanded_SFARI_variants = expanded_SFARI_variants[
    (expanded_SFARI_variants["ref_len"] == 1)
    & (expanded_SFARI_variants["alt_len"] == 1)
].copy()

In [30]:
# Preview the SNP-only variant table.
SNP_expanded_SFARI_variants

Unnamed: 0,chrom,pos,ref,alt,af,ref_len,alt_len
0,1,69081,G,C,0.000187,1,1
1,1,69130,C,G,9e-06,1,1
2,1,69134,A,G,4.2e-05,1,1
3,1,69135,A,G,5e-06,1,1
4,1,69149,T,A,9e-06,1,1
...,...,...,...,...,...,...,...
14335420,Y,25044016,G,A,9e-06,1,1
14335421,Y,25044034,A,C,5e-06,1,1
14335422,Y,25044035,T,C,1.9e-05,1,1
14335425,Y,25044040,T,A,1.9e-05,1,1


In [31]:
# Build a "chrom-pos-ref-alt" identifier string for every SNP row.
SNP_expanded_SFARI_variants["vcf"] = (
    SNP_expanded_SFARI_variants["chrom"].astype(str)
    + "-" + SNP_expanded_SFARI_variants["pos"].astype(str)
    + "-" + SNP_expanded_SFARI_variants["ref"]
    + "-" + SNP_expanded_SFARI_variants["alt"]
)

In [33]:
# Show only the identifier column.
SNP_expanded_SFARI_variants.loc[:, ["vcf"]]

Unnamed: 0,vcf
0,1-69081-G-C
1,1-69130-C-G
2,1-69134-A-G
3,1-69135-A-G
4,1-69149-T-A
...,...
14335420,Y-25044016-G-A
14335421,Y-25044034-A-C
14335422,Y-25044035-T-C
14335425,Y-25044040-T-A


In [34]:
# Write one "chrom-pos-ref-alt" identifier per line, with no header or index.
# The file is opened in write mode: in append mode, every re-run of this cell
# would add another full copy of the variants to the end of the file.
SNP_expanded_SFARI_variants[["vcf"]].to_csv(
    "../raw_files/expanded_iWES_v2_SNP_variants_protvar.txt",
    header=False,
    index=False,
    sep=" ",
    mode="w",
)

In [35]:
# There is a 10 MB size limit, so the variants are saved as 20 separate files

In [36]:
# ! mkdir ../raw_files/SNP_protvar

In [37]:
# Peek at the first five identifiers.
SNP_expanded_SFARI_variants[["vcf"]].head(n=5)

Unnamed: 0,vcf
0,1-69081-G-C
1,1-69130-C-G
2,1-69134-A-G
3,1-69135-A-G
4,1-69149-T-A
