In [1]:
import pandas as pd

# TUBINGEN
For Tübingen, extract all unique hgvs/vcf and annotate with VEP to get hgvs/vcf 

# SACS

In [4]:
df_sacs = pd.read_csv("SACS/WP1/P1_SACS_raw.csv", sep="\t")

In [6]:
df_sacs.columns

Index(['Submitting institution\n(select from dropdown)', 'local subject ID',
       'local family ID', 'PROSPAX subject ID',
       'NGS database ID (e.g. GENESIS, GPAP, etc)',
       'Main phenotype \n(select from dropdown)',
       'Case solved? \n(solved SACS / solved other gene / unsure / not solved)',
       'Variant ID \n(consecutive number)', 'Gene',
       'Chromosome\n(genome build hg19 / GRCh37)',
       'Position\n(use vcf format)', 'Reference allele', 'Alternative allele',
       'Transcript\n(use reference transcript ENST00000382298.3)',
       'cDNA change (e.g. ', 'Protein position', 'Het / Hom',
       'Compound heterozygous with Variant ID\n(enter variant ID, 'N/A' or 'unknown')',
       'PaxGene available?\n(yes / no / unknown)',
       'PBMC available?\n(yes / no / unknown)',
       'Fibroblasts available?\n(yes / no / unknown)', 'Comment'],
      dtype='object')

In [42]:
d_var = df_sacs[['Variant ID \n(consecutive number)', 
       'Chromosome\n(genome build hg19 / GRCh37)',
       'Position\n(use vcf format)', 'Reference allele', 'Alternative allele',
       'Transcript\n(use reference transcript ENST00000382298.3)',
       'cDNA change (e.g. ', 'Protein position']]

In [43]:
d_var

Unnamed: 0,Variant ID \n(consecutive number),Chromosome\n(genome build hg19 / GRCh37),Position\n(use vcf format),Reference allele,Alternative allele,Transcript\n(use reference transcript ENST00000382298.3),cDNA change (e.g.,Protein position
0,#1,13,23914246.0,C,A,ENST00000382298.3,c.3769G>T,p.Gly1275Ter
1,#2,13,23909431.0,T,A,ENST00000382298.3,c.8584A>T,p.Lys2862Ter
2,#3,13,23913292.0,G,A,ENST00000382298.3,c.4723C>T,p.Arg1575Trp
3,#4,13,23913292.0,G,A,ENST00000382298.3,c.4723C>T,p.Arg1575Trp
4,#5,13,23910851.0,GGT,G,ENST00000382298.3,c.7162_7163del,p.Thr2388Argfs*10
...,...,...,...,...,...,...,...,...
108,#109,13,23932473.0,C,T,ENST00000382298.3,,
109,#110,13,23905042.0,G,A,ENST00000382298.3,,
110,#111,13,23912269.0,CAT,C,ENST00000382298.3,,
111,#112,13,23905042.0,G,A,ENST00000382298.3,,


In [18]:
# get empty hgvs lines
# get empty vcf lines
import numpy as np

empty_hgvs = []
empty_vcf = []
for idx, row in d_var.iterrows():
    if np.isnan(row['Position\n(use vcf format)']):
        empty_vcf.append(row['cDNA change (e.g. '])
    elif np.isnan(row['cDNA change (e.g. ']):
        vcf = "{} {} . {} {} . . .".format(row['Chromosome\n(genome build hg19 / GRCh37)'],
                                                                    int(row['Position\n(use vcf format)']),
                                                                    row['Reference allele'],
                                                                    row['Alternative allele'])
        empty_hgvs.append(vcf)

In [22]:
with open("SACS/WP1/empty_vcf.txt", "w") as f:
    for h in set(empty_vcf):
        f.write(h + "\n")

In [23]:
with open("SACS/WP1/empty_hgvs.txt", "w") as f:
    for h in set(empty_hgvs):
        f.write(h + "\n")


In [46]:
from cyvcf2 import VCF

V_empty_vcf = VCF("SACS/WP1/P1_SACS_empty_VCF.vcf")
V_empty_hgvs = VCF("SACS/WP1/P1_sacs_empty_hgvs.vcf")


for v in V_empty_vcf:
    hgvs = [tr.split("|")[10] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000382298.3"][0].split(":")[1]
    for idx, row in df_sacs.iterrows():
        if row['cDNA change (e.g. ']==hgvs:
            df_sacs.at[idx, 'Position\n(use vcf format)'] = v.POS
            df_sacs.at[idx, 'Reference allele'] = v.REF
            df_sacs.at[idx, 'Alternative allele'] = v.ALT[0]
            
            
for v in V_empty_hgvs:
    hgvs_c = [tr.split("|")[10] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000382298.3"][0].split(":")[1]
    hgvs_p = [tr.split("|")[11] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000382298.3"][0]
    
    for idx, row in df_sacs.iterrows():
        if row['Position\n(use vcf format)']==v.POS and row['Reference allele']==v.REF and row['Alternative allele']==v.ALT[0]:
            df_sacs.at[idx, 'cDNA change (e.g. '] = hgvs_c
            df_sacs.at[idx, 'Protein position'] = hgvs_p
            

In [47]:
df_sacs.to_csv("SACS/WP1/P1_SACS_1.tsv", sep="\t", index=False)

In [114]:
list(df_sacs.columns)

Index(['Submitting institution\n(select from dropdown)', 'local subject ID',
       'local family ID', 'PROSPAX subject ID',
       'NGS database ID (e.g. GENESIS, GPAP, etc)',
       'Main phenotype \n(select from dropdown)',
       'Case solved? \n(solved SACS / solved other gene / unsure / not solved)',
       'Variant ID \n(consecutive number)', 'Gene',
       'Chromosome\n(genome build hg19 / GRCh37)',
       'Position\n(use vcf format)', 'Reference allele', 'Alternative allele',
       'Transcript\n(use reference transcript ENST00000382298.3)',
       'cDNA change (e.g. ', 'Protein position', 'Het / Hom',
       'Compound heterozygous with Variant ID\n(enter variant ID, 'N/A' or 'unknown')',
       'PaxGene available?\n(yes / no / unknown)',
       'PBMC available?\n(yes / no / unknown)',
       'Fibroblasts available?\n(yes / no / unknown)', 'Comment',
       'Alternate allele'],
      dtype='object')

# SPG7

In [50]:
df_spg7 = pd.read_csv("SPG7/WP1/P1_SPG7_raw.csv", sep="\t")

df_spg7.columns

Index(['Submitting institution\n(select from dropdown)', 'local subject ID',
       'local family ID', 'PROSPAX subject ID',
       'NGS database ID (e.g. GENESIS, GPAP, etc)',
       'Main phenotype \n(select from dropdown)',
       'Case solved? \n(solved SPG7 / solved other gene / unsure / not solved)',
       'Variant ID \n(consecutive number)', 'Gene',
       'Chromosome\n(genome build hg19 / GRCh37)',
       'Position\n(use vcf format)', 'Reference allele', 'Alternative allele',
       'Transcript\n(use reference transcript ENST00000268704.2)',
       'cDNA change (e.g. ', 'Protein position', 'Het / Hom',
       'Compound heterozygous with Variant ID\n(enter variant ID, 'N/A' or 'unknown')',
       'PaxGene available?\n(yes / no / unknown)',
       'PBMC available?\n(yes / no / unknown)',
       'Fibroblasts available?\n(yes / no / unknown)', 'Comment'],
      dtype='object')

In [56]:
# get empty hgvs lines
# get empty vcf lines
import numpy as np

empty_hgvs = []
empty_vcf = []
for idx, row in df_spg7.iterrows():
    
    if np.isnan(row['Position\n(use vcf format)']):
        empty_vcf.append(row['cDNA change (e.g. '])
    

In [71]:
mask = df_spg7['cDNA change (e.g. '].isnull()

index_missing_hgvs = mask[mask==True].index

for index in index_missing_hgvs:
    pos = int(df_spg7.loc[index, 'Position\n(use vcf format)'])
    ref = df_spg7.loc[index, 'Reference allele']
    alt = df_spg7.loc[index, 'Alternative allele']
    
    v = "16 {} . {} {} . . .\n".format(pos, ref, alt)
    empty_hgvs.append(v)

In [74]:
with open("SPG7/WP1/empty_vcf.txt", "w") as f:
    for h in set(empty_vcf):
        f.write(h + "\n")

In [75]:
with open("SPG7/WP1/empty_hgvs.txt", "w") as f:
    for h in set(empty_hgvs):
        f.write(h.replace(".0",""))

In [77]:
from cyvcf2 import VCF

V_empty_vcf = VCF("SPG7/WP1/p1_spg7_empty_vcf.vcf")
V_empty_hgvs = VCF("SPG7/WP1/p1_spg7_empty_hgvs.vcf")


for v in V_empty_vcf:
    hgvs = [tr.split("|")[10] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000268704.2"][0].split(":")[1]
    for idx, row in df_spg7.iterrows():
        if row['cDNA change (e.g. ']==hgvs:
            df_spg7.at[idx, 'Position\n(use vcf format)'] = v.POS
            df_spg7.at[idx, 'Reference allele'] = v.REF
            df_spg7.at[idx, 'Alternative allele'] = v.ALT[0]
            
            
for v in V_empty_hgvs:
    hgvs_c = [tr.split("|")[10] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000268704.2"][0]
    hgvs_p = [tr.split("|")[11] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000268704.2"][0]
    
    for idx, row in df_spg7.iterrows():
        if row['Position\n(use vcf format)']==v.POS and row['Reference allele']==v.REF and row['Alternative allele']==v.ALT[0]:
            df_spg7.at[idx, 'cDNA change (e.g. '] = hgvs_c
            df_spg7.at[idx, 'Protein position'] = hgvs_p
            

In [78]:
df_spg7.to_csv("SPG7/WP1/P1_SPG7_1.tsv", sep="\t", index=False)

# PISA

In [11]:
import pandas as pd
from cyvcf2 import VCF

df_sacs_pisa = pd.read_csv("SACS/P4_pisa/SACS SPG7 Finale.xlsx - SACS.tsv", sep="\t")
df_spg7_pisa = pd.read_csv("SPG7/P4_pisa/SACS SPG7 Finale.xlsx - SPG7.tsv", sep="\t")


df_sacs_pisa["Diagnosis"] = df_sacs_pisa["Diagnosis"].fillna("unsure")
df_spg7_pisa["Diagnosis"] = df_spg7_pisa["Diagnosis"].fillna("unsure")
df_sacs_pisa["Transcript "]

0      NM_014363.6
1      NM_014363.6
2      NM_014363.6
3      NM_014363.6
4      NM_014363.6
          ...     
192    NM_014363.6
193    NM_014363.6
194    NM_014363.6
195    NM_014363.6
196    NM_014363.6
Name: Transcript , Length: 197, dtype: object

In [12]:
df_sacs_pisa.head()

Unnamed: 0,Chromosome,Position,Reference Allele,Sample Allele,Transcript,Transcript Variant,Protein Variant,Genotype,Codice,Phenotype,Diagnosis
0,13,23907968,T,C,NM_014363.6,c.10047A>G,p.S3349S,Het,039C,HSP,unsure
1,13,23915089,G,A,NM_014363.6,c.2926C>T,p.R976C,Het,039C,HSP,unsure
2,13,23928078,C,/,NM_014363.6,c.2094-63delG,,Hom,119C,HSP,unsure
3,13,23906661,CT,/,NM_014363.6,c.11353_11354delAG,p.R3785fs*24,Het,1652B,HSP,SPG7
4,13,23912386,G,A,NM_014363.6,c.5629C>T,p.R1877*,Het,1652B,HSP,unsure


In [13]:
df_spg7_pisa.head()

Unnamed: 0,Chromosome,Position,Reference Allele,Sample Allele,Transcript,Transcript Variant,Protein Variant,Genotype,Codice,Phenotype,Diagnosis
0,16,89574945.0,G,A,NM_003119.4,c.120G>A,p.G40G,Het,958C,HSP,unsure
1,16,89619523.0,C,G,NM_003119.4,c.1916C>G,p.S639C,Het,958C,HSP,unsure
2,16,89595897.0,TG,/,NM_003119.4,c.773_774delTG,p.V258fs*30,Het,701C,HSP,SPG7
3,16,89592857.0,C,T,NM_003119.4,c.739C>T,p.R247*,Het,446B,HSP,SPG7
4,16,89598369.0,G,A,NM_003119.4,c.1045G>A,p.G349S,Het,446B,HSP,unsure


In [20]:
df_sacs_pisa

array(['NM_014363.6', 'NM_014363.9'], dtype=object)

In [22]:
df_spg7_pisa["Transcript"].unique()

array(['NM_003119.4'], dtype=object)

In [16]:
def get_hgvs_ref(row, tr_id):
    for i in range(len(row["Transcript "].split(";"))):
        if row["Transcript "].strip().startswith(tr_id):
            variants = row["Transcript Variant"].split(";")
            return "ENST00000382298.3:" + variants[i].strip() + "\n"
    
pisa_sacs_hgvs = []

for idx, row in df_sacs_pisa.iterrows():
    hgvs = get_hgvs_ref(row, "NM_014363")
    patient = row["Codice"]
    diagnosis = row["Diagnosis"]
    phenotype = row["Phenotype"]
    genotype  = row["Genotype"]
    
    
    
    pisa_sacs_hgvs.append(( patient, diagnosis, phenotype, genotype, hgvs))
    
    
    

In [17]:
len(pisa_sacs_hgvs)

197

In [18]:
def get_hgvs_ref(row, tr_id):
    for i in range(len(row["Transcript"].split(";"))):
        if row["Transcript"].strip().startswith(tr_id):
            variants = row["Transcript Variant"].split(";")
            return "ENST00000268704.2:" + variants[i].strip() + "\n"
    
pisa_spg7_hgvs = []

for idx, row in df_spg7_pisa.iterrows():
    hgvs = get_hgvs_ref(row, "NM_003119")
    patient = row["Codice"]
    diagnosis = row["Diagnosis"]
    phenotype = row["Phenotype"]
    genotype  = row["Genotype"]
    
    
    
    pisa_spg7_hgvs.append(( patient, diagnosis, phenotype, genotype, hgvs))
    
    

In [19]:
len(pisa_spg7_hgvs)

298

In [23]:
pisa_sacs_hgvs_c = list(set([entry[4] for entry in pisa_sacs_hgvs]))
pisa_spg7_hgvs_c = list(set([entry[4] for entry in pisa_spg7_hgvs]))


In [145]:
len(pisa_sacs_hgvs_c)

129

In [147]:
len(pisa_spg7_hgvs_c)

164

In [24]:
with open("SACS/P4_pisa/sacs_hgvs_2_vep.txt", "w") as f:
    for h in pisa_sacs_hgvs_c:
        f.write(h)
        
with open("SPG7/P4_pisa/spg7_hgvs_2_vep.txt", "w") as f:
    for h in pisa_spg7_hgvs_c:
        f.write(h)

In [25]:
from cyvcf2 import VCF

V_spg7_hgvs = VCF("SPG7/P4_pisa/P4_pisa_spg7_vep_v2.vcf")
V_sacs_hgvs = VCF("SACS/P4_pisa/P4_pisa_sacs_vep_v2.vcf")


In [117]:
pisa_sacs_hgvs[1]

('039C', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.2926C>T\n')

In [151]:
count = 0 
for line in V_sacs_hgvs:
    count+=1
    
print(count)
count2=0
for line in V_spg7_hgvs:
    count2+=1
print(count2)

128
156


In [29]:
pisa_sacs_hgvs

[('039C', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.10047A>G\n'),
 ('039C', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.2926C>T\n'),
 ('119C', 'unsure', 'HSP', 'Hom', 'ENST00000382298.3:c.2094-63delG\n'),
 ('1652B', 'SPG7', 'HSP', 'Het', 'ENST00000382298.3:c.11353_11354delAG\n'),
 ('1652B', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.5629C>T\n'),
 ('1137B', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.10982C>T\n'),
 ('1137B', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.2080G>A\n'),
 ('543C', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.7354T>C\n'),
 ('543C', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.4878C>T\n'),
 ('543C', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.260-54G>C\n'),
 ('733B', 'SPG7', 'HSP', 'Het', 'ENST00000382298.3:c.6557_6559delGTA\n'),
 ('733B', 'unsure', 'HSP', 'Het', 'ENST00000382298.3:c.9508C>T\n'),
 ('730B',
  'unsure',
  'HSP',
  'Hom?',
  'ENST00000382298.3:c.2186-622_2186-621insA\n'),
 ('1864A', 'unsure', 'HSP', 'Hom?', 'ENST00000382298.3:c.2186-632del

In [28]:
# sacs 
V_sacs_hgvs = VCF("SACS/P4_pisa/P4_pisa_sacs_vep_v2.vcf")


lines_f = []
for v in V_sacs_hgvs:
    hgvs_c = [tr.split("|")[10] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000382298.3"][0]
    hgvs_p = [tr.split("|")[11] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000382298.3"][0]
    chrom = v.CHROM
    pos = v.POS
    ref = v.REF
    alt = v.ALT[0]
    
    
    for entry in pisa_sacs_hgvs:
        if entry[4].strip()==hgvs_c:
            sub = "P4 - Pisa"
            local_id = entry[0]
            f_id = "F_" + local_id
            prospax = ""
            ngs_db = "Pisa Diagnostic Pipeline"
            main_phe = entry[2]
            case_solved = entry[1]
            v_id = ""
            tr = "ENST00000382298.3"
            hgvs_c_f = hgvs_c.split(":")[1]
            genotype = entry[3]
            
            cmpd = ""
            pax = ""
            pbmc = ""
            fibr = ""
            comment = ""
            
            lines_f.append([sub, local_id, f_id, prospax, ngs_db, main_phe, case_solved,
                          v_id, "SACS", chrom, pos, ref, alt, tr, hgvs_c_f, hgvs_p, genotype, cmpd,
                          pbmc, pax, fibr, comment])


In [187]:
def create_row_manual(sub, local_id, f_id, ngs_db, main_phe, case_solved,
                           gene, chrom, pos, ref, alt, tr, hgvsc, hgvsp, genotype, v_id="", prospax="",cmpd="",
                          pbmc="", pax="", fibr="", comment=""):
    return [sub, local_id, f_id, prospax, ngs_db, main_phe, case_solved,
                          v_id, gene, chrom, pos, ref, alt, tr, hgvsc, hgvsp, genotype, cmpd,
                          pbmc, pax, fibr, comment]



In [188]:
# manual variant
missing = [h for h in pisa_sacs_hgvs if h[4].split(":")[1].strip() not in list(df_p4_sacs['cDNA change (e.g. '].unique())]
for l in missing:
    n_row = create_row_manual("P4 - Pisa", l[0], "F_"+l[0], "Pisa Diagnostic Pipeline", l[2],
                             l[1], "SACS", "13", "", "", "", "ENST00000382298.3", l[4],
                             "", l[3])
    lines_f.append(n_row)

In [189]:
df_p4_sacs = pd.DataFrame(lines_f, columns=list(df_sacs.columns)[:-1])

In [190]:
df_p4_sacs.drop_duplicates(inplace=True)

In [191]:
df_p4_sacs.shape

(197, 22)

In [208]:
df_p4_sacs.to_csv("SACS/P4_pisa/P4_SACS_variant_collection.v1.tsv", sep="\t", index=False)

In [177]:
[h for h in pisa_sacs_hgvs_c if h.split(":")[1].strip() not in list(df_p4_sacs['cDNA change (e.g. '].unique())]

['ENST00000382298.3:c.259+74_259+76delTTT\n',
 'ENST00000382298.3:c.2186-632delA\n',
 'ENST00000382298.3:c.21-16304_21-16303delGC\n',
 'ENST00000382298.3:c.2186-622_2186-621insA\n',
 'ENST00000382298.3:c.2094-63delG\n',
 'ENST00000382298.3:c.2216A>G/c.4466A>G\n',
 'ENST00000382298.3:c.-501-2797dupA\n',
 'ENST00000382298.3:c.5151dupA\n',
 'ENST00000382298.3:c.6393T>C ?\n',
 'ENST00000382298.3:c.-502+6974_-502+6975delTT\n',
 'ENST00000382298.3:c.6557_6559delGTA\n',
 'ENST00000382298.3:c.21-16541_21-16540delCA\n',
 'ENST00000382298.3:c.20+13816_20+13820delGAGAT\n',
 'ENST00000382298.3:c.11353_11354delAG\n']

In [202]:
# spg7
V_spg7_hgvs = VCF("SPG7/P4_pisa/P4_pisa_spg7_vep_v2.vcf")

lines_f_spg7 = []
for v in V_spg7_hgvs:
    hgvs_c = [tr.split("|")[10] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000268704.2"][0]
    hgvs_p = [tr.split("|")[11] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000268704.2"][0]
    
    chrom = v.CHROM
    pos = v.POS
    ref = v.REF
    alt = v.ALT[0]
    
    for entry in pisa_spg7_hgvs:
        if entry[4].strip()==hgvs_c:
            sub = "P4 - Pisa"
            local_id = entry[0]
            f_id = "F_" + local_id
            prospax = ""
            ngs_db = "Pisa Diagnostic Pipeline"
            main_phe = entry[2]
            case_solved = entry[1]
            v_id = ""
            tr = "ENST00000268704.2"
            hgvs_c_f = hgvs_c.split(":")[1]
            genotype = entry[3]
            
            cmpd = ""
            pax = ""
            pbmc = ""
            fibr = ""
            comment = ""
            
            lines_f_spg7.append([sub, local_id, f_id, prospax, ngs_db, main_phe, case_solved,
                          v_id, "SPG7", chrom, pos, ref, alt, tr, hgvs_c_f, hgvs_p, genotype, cmpd,
                          pbmc, pax, fibr, comment])


In [203]:
df_p4_spg7 = pd.DataFrame(lines_f_spg7, columns=list(df_sacs.columns)[:-1])
df_p4_spg7.drop_duplicates(inplace=True)
df_p4_spg7.shape

(238, 22)

In [201]:
len([h for h in pisa_spg7_hgvs if h[4].split(":")[1].strip() not in list(df_p4_spg7['cDNA change (e.g. '].unique())])

55

In [205]:
# manual variant
missing = [h for h in pisa_spg7_hgvs if h[4].split(":")[1].strip() not in list(df_p4_spg7['cDNA change (e.g. '].unique())]
for l in missing:
    n_row = create_row_manual("P4 - Pisa", l[0], "F_"+l[0], "Pisa Diagnostic Pipeline", l[2],
                             l[1], "SPG7", "16", "", "", "", "ENST00000268704.2", l[4],
                             "", l[3])
    lines_f_spg7.append(n_row)

In [206]:
df_p4_spg7 = pd.DataFrame(lines_f_spg7, columns=list(df_spg7.columns))
df_p4_spg7.drop_duplicates(inplace=True)
df_p4_spg7.shape

(293, 22)

In [207]:
df_p4_spg7.to_csv("SPG7/P4_pisa/P4_SPG7_variant_collection.v1.tsv", sep="\t", index=False)

In [216]:
V_spg7_hgvs = VCF("SPG7/P4_pisa/P4_pisa_spg7_vep_v2.vcf")
ids = [v.ID for v in V_spg7_hgvs]
miss = [h for h in pisa_spg7_hgvs if h[4].strip() not in ids]

In [217]:
miss

[('245B', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.-1G>A\n'),
 ('052A', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.1325-60A>G\n'),
 ('1496A', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.*38G>T\n'),
 ('2110T', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.1325-20C>T\n'),
 ('896X', 'unsure', 'HA', 'Het', 'ENST00000268704.2:c.51+751C>G\n'),
 ('2781X', 'KCNA2', 'HA', 'Het', 'ENST00000268704.2:c.51+751C>G\n'),
 ('694C', 'unsure', 'HSP', 'Hom', 'ENST00000268704.2:c.51+537G>C\n'),
 ('873C', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.51+18C>G\n'),
 ('873C', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.992A>G\n'),
 ('971C', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.51+18C>G\n'),
 ('971C', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.992A>G\n'),
 ('1594B', 'unsure', 'HSP', 'Hom', 'ENST00000268704.2:c.992A>G\n'),
 ('1594B', 'unsure', 'HSP', 'Hom', 'ENST00000268704.2:c.51+18C>G\n')]

In [218]:
missing

[('701C', 'SPG7', 'HSP', 'Het', 'ENST00000268704.2:c.773_774delTG\n'),
 ('2418A',
  'unsure',
  'HSP',
  'Hom?',
  'ENST00000268704.2:c.2181+155_2181+161delGTGTTCC\n'),
 ('245B', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.-1G>A\n'),
 ('2711A',
  'unsure',
  'HSP',
  'Hom?',
  'ENST00000268704.2:c.184-19_184-18delGT\n'),
 ('052A', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.1325-60A>G\n'),
 ('1496A', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.*38G>T\n'),
 ('1443X',
  'unsure',
  'HSP',
  'Hom?',
  'ENST00000268704.2:c.1780-17_1780-15delTCT\n'),
 ('2110T', 'unsure', 'HSP', 'Het', 'ENST00000268704.2:c.1325-20C>T\n'),
 ('1825X',
  'unsure',
  'HSP',
  'Hom?',
  'ENST00000268704.2:c.1324+136_1324+137insA\n'),
 ('896X', 'unsure', 'HA', 'Het', 'ENST00000268704.2:c.51+751C>G\n'),
 ('2781X', 'KCNA2', 'HA', 'Het', 'ENST00000268704.2:c.51+751C>G\n'),
 ('2446V', 'SPG7', 'HA', 'Hom', 'ENST00000268704.2:c.73_80delCCAGGCCC\n'),
 ('194Z',
  'unsure',
  'HA',
  'Hom',
  'ENST00000268704.2:c.1780-1

# Nijmegen

In [223]:
df_sacs_nij = pd.read_csv("SACS/P3_nijmegen/SACS varianten MPEN BvdW 25-11.xlsx - Blad1.tsv", sep="\t")
df_spg7_nij = pd.read_csv("SPG7/P3_nijmegen/SPG7 varianten MPEN BvdW 25-11.xlsx - Blad1.tsv", sep="\t")

df_sacs_nij['Number of patients with this variant'] = df_sacs_nij['Number of patients with this variant'].fillna(' ')
df_spg7_nij['Number of patients with this variant'] = df_spg7_nij['Number of patients with this variant'].fillna(' ')

In [219]:
df_sacs_nij.columns

Index(['Variant (genomic)', 'Variant ((non)coding)', 'Variant (rna/protein)',
       'Classification', 'Phenotype', 'Number of patients with this variant',
       'Other variant in this gene (if applicable)'],
      dtype='object')

In [220]:
df_spg7_nij.columns

Index(['Variant (genomic)', 'Variant ((non)coding)', 'Variant (rna/protein)',
       'Classification', 'Phenotype', 'Number of patients with this variant',
       'Other variant in this gene (if applicable)'],
      dtype='object')

In [229]:
nij_raw_rows_sacs = []
for idx, row in df_sacs_nij.iterrows():
    hgvsg = row['Variant (genomic)']
    main_phenotype = row['Phenotype']
    comment = "nb="+row['Number of patients with this variant']+";"+"comments="+row['Other variant in this gene (if applicable)']
    
    nij_raw_rows_sacs.append([hgvsg, main_phenotype, comment])

In [230]:
nij_raw_rows_spg7 = []
for idx, row in df_spg7_nij.iterrows():
    hgvsg = row['Variant (genomic)']
    main_phenotype = row['Phenotype']
    comment = "nb="+row['Number of patients with this variant']+";"+"comments="+row['Other variant in this gene (if applicable)']
    
    nij_raw_rows_spg7.append([hgvsg, main_phenotype, comment])

In [231]:
with open("SACS/P3_nijmegen/SACS_2_vep.txt", "w") as f:
    for e in nij_raw_rows_sacs:
        f.write(e[0] + "\n")
        
with open("SPG7/P3_nijmegen/SPG7_2_vep.txt", "w") as f:
    for e in nij_raw_rows_spg7:
        f.write(e[0] + "\n")

In [235]:
V_nij_sacs = VCF("SACS/P3_nijmegen/P3_SACS_fromVep.vcf.vcf")


nij_lines_f_sacs = []
for v in V_nij_sacs:
    hgvs_c = [tr.split("|")[10] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000382298.3"][0]
    hgvs_p = [tr.split("|")[11] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000382298.3"][0]
    
    chrom = v.CHROM
    pos = v.POS
    ref = v.REF
    alt = v.ALT[0]
    
    idx = v.ID
    
    for entry in nij_raw_rows_sacs:
        if entry[0]==idx:
            sub = "P3 - Nijgemen"
            local_id = entry[0]
            f_id = "F_" + local_id
            prospax = ""
            ngs_db = "Nijgemen Diagnostic Pipeline"
            main_phe = entry[1]
            case_solved = ""
            v_id = ""
            tr = "ENST00000382298.3"
            hgvs_c_f = hgvs_c.split(":")[1]
            genotype = ""
            
            cmpd = ""
            pax = ""
            pbmc = ""
            fibr = ""
            comment = entry[2]
            
            nij_lines_f_sacs.append([sub, local_id, f_id, prospax, ngs_db, main_phe, case_solved,
                          v_id, "SACS", chrom, pos, ref, alt, tr, hgvs_c_f, hgvs_p, genotype, cmpd,
                          pbmc, pax, fibr, comment])

In [238]:
nij_sacs_df = pd.DataFrame(nij_lines_f_sacs, columns=df_p4_sacs.columns)

nij_sacs_df.to_csv("SACS/P3_nijmegen/P3_Nijmegen_variant_collection.v1.tsv", sep="\t", index=False)

In [244]:
V_nij_spg7 = VCF("SPG7/P3_nijmegen/P3_SPG7_fromVep.vcf.vcf")


nij_lines_f_spg7 = []
for v in V_nij_spg7:
    hgvs_c = [tr.split("|")[10] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000268704.2"][0]
    hgvs_p = [tr.split("|")[11] for tr in v.INFO["CSQ"].split(",") if tr.split("|")[6] =="ENST00000268704.2"][0]
    
    chrom = v.CHROM
    pos = v.POS
    ref = v.REF
    alt = v.ALT[0]
    
    idx = v.ID
    
    for entry in nij_raw_rows_spg7:
        if entry[0]==idx:
            sub = "P3 - Nijgemen"
            local_id = entry[0]
            f_id = "F_" + local_id
            prospax = ""
            ngs_db = "Nijgemen Diagnostic Pipeline"
            main_phe = entry[1]
            case_solved = ""
            v_id = ""
            tr = "ENST00000268704.2"
            hgvs_c_f = hgvs_c.split(":")[1]
            genotype = ""
            
            cmpd = ""
            pax = ""
            pbmc = ""
            fibr = ""
            comment = entry[2]
            
            nij_lines_f_spg7.append([sub, local_id, f_id, prospax, ngs_db, main_phe, case_solved,
                          v_id, "SPG7", chrom, pos, ref, alt, tr, hgvs_c_f, hgvs_p, genotype, cmpd,
                          pbmc, pax, fibr, comment])

In [245]:
len(nij_raw_rows_spg7)

34

In [246]:
len(nij_lines_f_spg7)

34

In [247]:
nij_spg7_df = pd.DataFrame(nij_lines_f_spg7, columns=df_p4_spg7.columns)

nij_spg7_df.to_csv("SPG7/P3_nijmegen/P3_Nijmegen_variant_collection.v1.tsv", sep="\t", index=False)

In [100]:
df_spg7_nij

Unnamed: 0,Variant (genomic),Variant ((non)coding),Variant (rna/protein),Classification,Phenotype,Number of patients with this variant,Other variant in this gene (if applicable)
0,Chr16(GRCh37):g.89574828G>A,c.3G>A,p.(Met1?),class 5,Spastic paraplegia,,homozygous
1,Chr16(GRCh37):g.89574859C>T,c.34C>T,p.(Arg12Cys),class 3,Spastic paraplegia,,no 2nd
2,Chr16(GRCh37):g.89574971C>T,c.146C>T,p.(Pro49Leu),class 3,Spastic paraplegia,,no 2nd (not highly conserved)
3,Chr16(GRCh37):g.89579404_89579405insTA,c.335_336insTA,p.(Glu112fs),class 5,spinocerebellar ataxia,,comp het. (with c.1324G>C)
4,Chr16(GRCh37):g.89590648G>T,c.611G>T,p.(Gly204Val),class 3,spastic paraplegia,,no 2nd
5,Chr16(GRCh37):g.89592812del,c.694del,p.(Glu232fs),class 5,"spastic ataxia, cerebral visual disturbances",,comp het. (with c.1529C>T)
6,Chr16(GRCh37):g.89595987dup,c.861dup,p.((Asn288*),class 5,Spastic paraplegia / progressive bipyramidal s...,4x,no 2nd / comp het. (c.1045G>A) / comp het. del...
7,Chr16(GRCh37):g.89595989dup,c.861+2dup,r.(spl?),class 5,Spastic paraplegia,3x,2x comp het. (c.1045G>A) / No 2nd
8,Chr16(GRCh37):g.89598270_89598370del,c.988-42_1046del,r.(spl?),class 5,Spastic paraplegia,,comp het. (c.1454_1462del)
9,Chr16(GRCh37):g.89598369G>A,c.1045G>A,p.(Gly349Ser),class 4,Spastic paraplegia,,found many times (>20)


---------------

In [6]:
import requests, sys, json
from pprint import pprint

def fetch_endpoint(server, request, content_type):

    r = requests.get(server+request, headers={ "Accept" : content_type})

    if not r.ok:
        r.raise_for_status()
        sys.exit()

    if content_type == 'application/json':
        return r.json()
    else:
        return r.text

def fetch_endpoint_POST(server, request, data, content_type):

    r = requests.post(server+request,
                      headers={ "Accept" : content_type},
                      data=data )

    if not r.ok:
        r.raise_for_status()
        sys.exit()

    if content_type == 'application/json':
        return r.json()
    else:
        return r.text

# define the server, extension and content type
server = "http://rest.ensembl.org/"
con = "application/json"
vep_ext = "vep/homo_sapiens/hgvs/vcf=-"

# create the list of HGVS annotations
hgvs = ["SACS:c.2094-63delG","SACS:c.11353_11354delAG","SACS:c.6557_6559delGTA",
        "SACS:c.2186-622_2186-621insA","SACS:c.2186-632delA","SACS:c.259+74_259+76delTTT",
        "SACS:c.5151dupA","SACS:c.2216A>G/c.4466A>G","SACS:c.6393T>C","SACS:c.21-16541_21-16540delCA",
        "SACS:c.-501-2797dupA","SACS:c.21-16304_21-16303delGC","SACS:c.21-16541_21-16540delCA",
        "SACS:c.20+13816_20+13820delGAGAT","SACS:c.21-16304_21-16303delGC","SACS:c.21-16304_21-16303delGC",
        "SACS:c.-502+6974_-502+6975delTT"
]

# convert the list into json format
hgvs_json = json.dumps({ "hgvs_notations" : hgvs })

# run the query
post_vep = fetch_endpoint_POST(server, vep_ext, hgvs_json, con)



In [7]:
post_vep[0]

{'id': 'SACS:c.2094-63delG',
 'end': 23353939,
 'input': 'SACS:c.2094-63delG',
 'most_severe_consequence': 'intron_variant',
 'transcript_consequences': [{'gene_symbol': 'SACS',
   'gene_id': 'ENSG00000151835',
   'variant_allele': '-',
   'strand': -1,
   'gene_symbol_source': 'HGNC',
   'transcript_id': 'ENST00000382292',
   'consequence_terms': ['intron_variant'],
   'impact': 'MODIFIER',
   'biotype': 'protein_coding',
   'hgnc_id': 'HGNC:10519'},
  {'strand': -1,
   'variant_allele': '-',
   'gene_symbol': 'SACS',
   'gene_id': 'ENSG00000151835',
   'hgnc_id': 'HGNC:10519',
   'gene_symbol_source': 'HGNC',
   'consequence_terms': ['intron_variant'],
   'transcript_id': 'ENST00000402364',
   'impact': 'MODIFIER',
   'biotype': 'protein_coding'},
  {'hgnc_id': 'HGNC:10519',
   'biotype': 'protein_coding',
   'consequence_terms': ['intron_variant'],
   'transcript_id': 'ENST00000423156',
   'gene_symbol_source': 'HGNC',
   'impact': 'MODIFIER',
   'strand': -1,
   'variant_allele': '

In [None]:
# move through the results
for variant in post_vep:
    
    # get the data
    input = variant['input']
    colocated_list = []
    for colocated in variant['colocated_variants']:
        colocated_list.append(colocated['id'])  
    print (input + ": " + (', '.join(colocated_list)))