In [1]:
!pip install seaborn
%matplotlib notebook

You should consider upgrading via the '/Users/dilution/.pyenv/versions/3.7.7/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [1]:
import os

In [2]:
from cyvcf2 import VCF
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
# prospax file collection
df_spg7 = pd.read_csv("SPG7/SPG7_all_partners.tsv", sep="\t")
# variant list formated
df_spg7_extended = pd.read_csv("SPG7/SPG7.prospax.extended.annotated.tsv", sep="\t")
# spg7_vcf_clinvar = VCF("ext_data/clinvar/05_2021/SPG7.clinvar.annotated.vcf")
df_spg7_clinvar = pd.read_csv("SPG7/SPG7.clinvar.extended.annotated.tsv", sep="\t")
# gnomad spg7
df_gnom_ad = pd.read_csv("SPG7/SPG7.gnomad.tsv", sep="\t")
# uniprot_spg7 
uniprot_var = pd.read_csv("SPG7/SPG7_uniprot_annotated.tsv", sep="\t")

# Pre-Analysis

Data cleaning

## Change df_sacs columns name

In [3]:
columns = [
    "submitter_id",
    "local_case_id",
    "local_family_id",
    "prospax_case_id",
    "ngs_database_id",
    "main_phenotype",
    "case_status",
    "id",
    "gene",
    "chrom",
    "pos",
    "ref",
    "alt",
    "transcript_id",
    "cdna",
    "prot_change",
    "genotype",
    "compound_het_id_s",
    "paxgene_availability",
    "pbmc_availability",
    "fibroblasts_availability",
    "comments",
    "Variant_based_id",
]

df_spg7.columns = columns

## Simplify mutation type

In [4]:
def regroup_csq(x):
    if x in ["3_prime_UTR_variant", "5_prime_UTR_variant"]:
        return "UTR"
    elif x in ["inframe_insertion", "inframe_deletion", 'inframe_deletion&splice_region_variant']:
        return "Inframe_Indel"
    elif x in ["intron_variant" ]:
        return "Intron"
    elif x in ["missense_variant", "missense_variant&splice_region_variant"]:
        return "Missense"
    elif x in ['splice_region_variant&intron_variant',
              'splice_region_variant&synonymous_variant', 'splice_region_variant&5_prime_UTR_variant']:
        return "Splice_region"
    elif x in ["splice_acceptor_variant", "splice_donor_variant", "splice_donor_variant&intron_variant", 'splice_donor_variant&coding_sequence_variant&intron_variant',
              'splice_acceptor_variant&coding_sequence_variant&intron_variant', 
              'splice_acceptor_variant&coding_sequence_variant']:
        return "Canonical_splice"
    elif x in ["start_lost", "stop_lost", "stop_gained", "stop_gained&splice_region_variant", 'stop_gained&inframe_insertion',
               'stop_gained&protein_altering_variant']:
        return "Nonsense"
    elif x in ['protein_altering_variant', "frameshift_variant", "frameshift_variant&stop_lost",
               "stop_gained&frameshift_variant", 'splice_donor_variant&splice_acceptor_variant&frameshift_variant&stop_lost&intron_variant',
               'frameshift_variant&splice_region_variant', 'frameshift_variant&splice_region_variant']:
        return "Frameshift"
    elif x in ["synonymous_variant"]:
        return "Synonymous"
    elif x in ['upstream_gene_variant']:
        return "Intergene"
    
    
df_spg7_clinvar["csq_minimal"] = df_spg7_clinvar["csq"].apply(lambda x: regroup_csq(x))
df_spg7_extended["csq_minimal"] = df_spg7_extended["csq"].apply(lambda x: regroup_csq(x))
df_gnom_ad["csq_minimal"] = df_gnom_ad["mutation_type"].apply(lambda x: regroup_csq(x))

## ADD variant_based_id column to files

In [5]:
df_spg7_clinvar["Variant_based_id"] = df_spg7_clinvar[["chrom", "pos", "ref", "alt"]].apply(lambda x: "-".join(x.values.astype(str)), axis=1)
df_spg7_extended["Variant_based_id"] = df_spg7_extended[["chrom", "pos", "ref", "alt"]].apply(lambda x: "-".join(x.values.astype(str)), axis=1)
df_gnom_ad.rename(columns={"variant_id":"Variant_based_id"}, inplace=True)


## Simplify case status 
unsure and not solved could be re-grouped

In [6]:
# get variant set(ids) from different case status
solved_ids = set(df_spg7[df_spg7['case_status']=="solved SPG7"]["Variant_based_id"])
unsure_notsolved_ids = set(df_spg7[df_spg7['case_status'].isin(["not solved", "unsure"])]["Variant_based_id"])
solved_other_ids = set(df_spg7[df_spg7['case_status']=="solved other gene"]["Variant_based_id"])

## ADD protein domain info

In [7]:
import hgvs.parser
import numpy as np

def clean_hgvsp(x):
    if x=="n/a" or x=="-1":
        return "n/a"
    else:
        return x.replace("%3D", "=")
        
def short_prot_desc(x):
    if x=="n/a":
        return "n/a"
    else:
        return x.split(":")[1]

def get_prot_pos(x, hp):
    if x=="n/a":
        return -1
    else:
        p = hp.parse_hgvs_variant(x)
        return p.posedit.pos.start.base
    

    
hp = hgvs.parser.Parser()

# list spg7 prospax variant
df_spg7_extended["hgvsp"].fillna("n/a", inplace=True)
df_spg7_extended["hgvsp_clean"] = df_spg7_extended["hgvsp"].apply(lambda x: clean_hgvsp(x))
df_spg7_extended["hgvsp_short"] = df_spg7_extended["hgvsp_clean"].apply(lambda x: short_prot_desc(x))
df_spg7_extended["protein_pos"] = df_spg7_extended["hgvsp_clean"].apply(lambda x: get_prot_pos(x, hp))

# list clinvar variant
df_spg7_clinvar["hgvsp"].fillna("n/a", inplace=True)
df_spg7_clinvar["hgvsp_clean"] = df_spg7_clinvar["hgvsp"].apply(lambda x: clean_hgvsp(x))
df_spg7_clinvar["hgvsp_short"] = df_spg7_clinvar["hgvsp_clean"].apply(lambda x: short_prot_desc(x))
df_spg7_clinvar["protein_pos"] = df_spg7_clinvar["hgvsp_clean"].apply(lambda x: get_prot_pos(x, hp))

# list gnomad variant
df_gnom_ad["hgvsp"].fillna("n/a", inplace=True)
df_gnom_ad["hgvsp_clean"] = df_gnom_ad["hgvsp"].apply(lambda x: clean_hgvsp(x))
df_gnom_ad["hgvsp_short"] = df_gnom_ad["hgvsp_clean"].apply(lambda x: short_prot_desc(x))
df_gnom_ad["protein_pos"] = df_gnom_ad["hgvsp_clean"].apply(lambda x: get_prot_pos(x, hp))


# list gnomad variant
uniprot_var["hgvsp"].fillna("n/a", inplace=True)
uniprot_var["hgvsp_clean"] = uniprot_var["hgvsp"].apply(lambda x: clean_hgvsp(x))
uniprot_var["hgvsp_short"] = uniprot_var["hgvsp_clean"].apply(lambda x: short_prot_desc(x))
uniprot_var["protein_pos"] = uniprot_var["hgvsp_clean"].apply(lambda x: get_prot_pos(x, hp))


In [8]:
spg7_domain = pd.read_csv("SPG7/spg7.bed", sep=" ", header=None)

In [9]:
import bisect

# map of regions
spg7_prot = {
    "0_MTS": (1, 59),
    "1_inter": (60, 142),
    "2_FtsH": (143, 144),
    "3_TM1": (145, 166),
    "4_FtsH": (167, 237),
    "5_inter": (238, 249),
    "6_TM2": (250, 272),
    "7_inter": (273, 305),
    "8_AAA+_ATPase": (306, 481),
    "9_inter": (482, 507),
    "10_AAA+_lid": (508, 543),
    "11_M41_peptidase": (544, 746),
    "12_inter": (746, 796),
    
}

# reduce to categories
ref = {
    "0_inter": (0, [1, 5, 7, 9, 12]),
    "1_MTS": (1, [0]),
    "2_FtsH": (2, [2, 4]),
    "3_TM": (3, [3, 6]),
    "4_AAA+_ATPase": (4, [8]),
    "5_AAA+_lid": (5, [10]),
    "6_M41_peptidase": (6, [11]),
}

def get_domain_name(x, map_dom):
    
    if x==-1:
        return -1
    else:
        intervals = sorted([v[1] for i, v in map_dom.items()])
        intervals_name = [i for i in map_dom]
        domain = bisect.bisect_left(intervals, x)
        domain_name = intervals_name[domain]
        return domain_name

def get_domain_cat(x, map_dom, ref):
    if x==-1:
        return -1
    else:
        intervals = sorted([v[1] for i, v in map_dom.items()])
        # intervals_name = [i for i in map_dom]
        domain = bisect.bisect_left(intervals, x)
        for i, v in ref.items():
            if domain in v[1]:
                return v[0]



In [10]:
# apply to df_var
df_spg7_extended["domain_name"] = df_spg7_extended["protein_pos"].apply(lambda x: get_domain_name(x, spg7_prot))
df_spg7_extended["cat_domain"] = df_spg7_extended["protein_pos"].apply(lambda x: get_domain_cat(x, spg7_prot, ref))

# clinvar
df_spg7_clinvar["domain_name"] = df_spg7_clinvar["protein_pos"].apply(lambda x: get_domain_name(x, spg7_prot))
df_spg7_clinvar["cat_domain"] = df_spg7_clinvar["protein_pos"].apply(lambda x: get_domain_cat(x, spg7_prot, ref))


# gnomad
df_gnom_ad["domain_name"] = df_gnom_ad["protein_pos"].apply(lambda x: get_domain_name(x, spg7_prot))
df_gnom_ad["cat_domain"] = df_gnom_ad["protein_pos"].apply(lambda x: get_domain_cat(x, spg7_prot, ref))

# uniprot
uniprot_var["domain_name"] = uniprot_var["protein_pos"].apply(lambda x: get_domain_name(x, spg7_prot))
uniprot_var["cat_domain"] = uniprot_var["protein_pos"].apply(lambda x: get_domain_cat(x, spg7_prot, ref))


# Preparatory steps




## 1 - Annotate variants uniformely --> (done with VEP)

## 2 - Identify and flag likely pathogenic variants
- All rare truncating variants (nonsense, frameshift, canonical splice)
- All ClinVar likely pathogenic / pathogenic variants


In [11]:
def flag_lp_p(x, list_v_id):
    if x in list_v_id:
        return "y"
    else:
        return "n"

In [12]:
# get truncating mutation 
truncating_type = ['Nonsense', 'Frameshift', 'Canonical_splice']
list_v_id_truncating = df_spg7_extended[df_spg7_extended["csq_minimal"].isin(truncating_type)]["Variant_based_id"].unique()

In [13]:
# get likely pathogenic and pathogenic from clinvar and uniprot
# inner merge clinvar prospax
# assertion considered are: 
# 'reviewed_by_expert_panel',
# 'criteria_provided,_single_submitter',
# 'criteria_provided,_multiple_submitters,_no_conflicts'
inner_merge_clinvar_prospax = pd.merge(df_spg7_clinvar[["Variant_based_id", 'clinsig', 'clinsigconflict','clinsigincl']], df_spg7_extended, how="inner", on=["Variant_based_id"])
lp_p_labels = ['Likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Pathogenic']
lp_p_clinvar = inner_merge_clinvar_prospax[inner_merge_clinvar_prospax["clinsig"].isin(lp_p_labels)]["Variant_based_id"].unique()

# inner merge uniprot prospax 
# pathogenic are labeled as 2 in clinsig col
inner_merge_uniprot_prospax = pd.merge(uniprot_var[["Variant_based_id", 'clinsig']], df_spg7_extended, how="inner", on=["Variant_based_id"])
lp_p_uniprot = inner_merge_uniprot_prospax[inner_merge_uniprot_prospax["clinsig"]==3]["Variant_based_id"].unique()


In [14]:
# add gross indels to list of lp_p variants
gross_indels = list(df_spg7[df_spg7["ref"].isin(["dup", "del"])]["Variant_based_id"].unique())

In [15]:
# add 16-89613145-C-T - p.Ala510Val 
manual_addition = ["16-89613145-C-T"]

In [16]:
# merge lists lp/p uniprot / clinvar / truncating variants
# and flag them in variant collection
final_lp_p_id = list(set(list(lp_p_clinvar) + list(lp_p_uniprot) + list(list_v_id_truncating) + gross_indels + manual_addition))
df_spg7_extended["is_lp_p"] = df_spg7_extended["Variant_based_id"].apply(lambda x: flag_lp_p(x, final_lp_p_id))

In [17]:
# a
print("Number of variant being reported as LP/P in clinvar/uniprot: {}".format(len(final_lp_p_id)))

Number of variant being reported as LP/P in clinvar/uniprot: 72


## 3 - Filter remaining variants uniformely
- Keep all likely pathogenic variants
- Filter criteria for remaining variants: 
    - MAF < 0.5%
    - highly conserved (missense only: PhasCons ≥ 0.7, GERP ≥ 2) (to be discussed)

In [18]:
def filter_maf(df, threshold=0.005):
    df_filt = df[(df['thKg']<=threshold) &
                 (df['gnomad_ex']<=threshold) &
                 (df['gnomad_gn']<=threshold)
                ]
    return df_filt

def filter_conservation(df, phastcons_min=0.7, gerp_min=2):
    df_filt = df[((df['gerp_rs']>=gerp_min) | (df['gerp_rs'].isnull())) &
                 ((df['phastcons']>=phastcons_min) | (df['phastcons'].isnull())) 
                ]
    return df_filt

In [19]:
# number of variants not lp/p
df_not_lp_p = df_spg7_extended[df_spg7_extended["is_lp_p"]=="n"]
df_not_lp_p.shape

(244, 56)

In [20]:
# extract the non-LP/P variant from preparatory steps
# filter on MAF<=0.5%
# filter on conservation pref (gerp, phastcons)
df_freq1 = filter_maf(df_not_lp_p)
df_freq_conserver_filtered = filter_conservation(df_freq1)
final_conserved_variant_id = list(df_freq_conserver_filtered["Variant_based_id"].unique())

In [21]:
# flag variants if lp/p - conserved - none
def flag_conserved(x, list_lp_p, list_rare_conserved):
    if x in list_rare_conserved:
        return "rare_conserved"
    elif x in list_lp_p:
        return "lp_p"
    else:
        return "not_conserved"
    
    
df_spg7_extended["is_conserved"] = df_spg7_extended["Variant_based_id"].apply(lambda x: flag_conserved(x, final_lp_p_id, final_conserved_variant_id))

In [22]:
df_freq_conserver_filtered.shape

(158, 56)

# A - Tier 1: Cases carrying 2 likely pathogenic variants

## 4 - Identify cases carrying two (likely) pathogenic variants
- flag cases as “solved”
- filter out all variants in solved cases
- collect phenotypic information, perform segregation analysis, consider functional assays (e.g. as positive controls)


In [23]:
# create unique patient (submitter-localsubjectid)
df_lp_p = df_spg7[df_spg7["Variant_based_id"].isin(final_lp_p_id)]

unique_case_list = set([(row['submitter_id'],row['local_case_id']) 
                    for idx, row in df_lp_p.iterrows()])

In [24]:
def stream_cases(cases, df):
    for case in cases:
        yield((case, df[(df['local_case_id']==case[1]) & \
                 (df['submitter_id']==case[0])]))

# check if patient has one gross dup/del        
def is_large_dup_del(df):
    if "dup" in df['ref'].values:
        return True
    elif "del" in df['ref'].values:
        return True
    else:
        False
        
# check if patient has 2 lp/p variants       
def is_2_lp_p(df, list_lp_p):
    if df[df["Variant_based_id"].isin(list_lp_p)].shape[0]>=2:
        return True
    else:
        return False

# check if patient has 1 lp/p variant + one gross dup/del
def is_lpp_del_dup(df, list_lp_p):
    if df[df["Variant_based_id"].isin(list_lp_p)].shape[0]>1:
        return True
    else:
        return False
    
# check if patient has Hom lp/p variant
def is_lp_p_hom(df, list_lp_p):
    if df[(df["Variant_based_id"].isin(list_lp_p)) &
       (df["genotype"]=="Hom")].empty:
        return False
    else:
        return True
    
def tier1_screen_case(df, list_lp_p):
    # if one variant from lp/p list only variant hom
    if df.shape[0]==1:
        return True
    # if more than one variant in case
    elif df.shape[0]>1:
        # if on variant is a gross del/dup
        if is_large_dup_del(df):
            # if one gross dup/del + one lp/p variant
            if is_lpp_del_dup(df, list_lp_p):
                return True
            else:
                # if just one gross dup/del
                return False
        # if 2 lp/p het / caveats ID27288 + 357-987-169
        elif is_2_lp_p(df, list_lp_p):
            return True
        # if one HOM lp/p variants and other non het/HOM non-lp/p variants
        elif is_lp_p_hom(df, list_lp_p):
            return True
        else:
            return False
            
            
        
        

### Test

In [24]:
test_dup_del_lp = df_spg7[df_spg7["local_case_id"]=="FSP-SAL-DEL-1469-001"]    
test_2_lp_p = df_spg7[df_spg7["local_case_id"]=="AAD-SAL-TRI-785-009"]  
test_not_2_lp = df_spg7[df_spg7["local_case_id"]=="ALS2039"]
test_hom_many = df_spg7[df_spg7["local_case_id"]=="Family-HSP-139"]
test_gross_hom = df_spg7[df_spg7["local_case_id"]=="ID34038"]

In [25]:
# true
tier1_screen_case(test_dup_del_lp, final_lp_p_id)

True

In [26]:
# true
tier1_screen_case(test_2_lp_p, final_lp_p_id)

True

In [27]:
# false
tier1_screen_case(test_not_2_lp, final_lp_p_id)

False

In [28]:
# true
tier1_screen_case(test_hom_many, final_lp_p_id)

True

In [29]:
# true
tier1_screen_case(test_gross_hom, final_lp_p_id)

True

### Run on all cases

In [25]:
# collect case local id having 2 lp/p variants or 1 lp/p variant + gross del/dup
solved_cases = []
for case in stream_cases(unique_case_list, df_spg7):
    patient = case[0]
    df = case[1]
    if tier1_screen_case(df, final_lp_p_id):
        solved_cases.append(patient)
    else:
        pass

In [26]:
# extract cases from prospax collection
# flag them in df_spg7
def is_solved(x, solved_id):
    if x in solved_id:
        return "y"
    else:
        return "n"
solved_local_ids = {case[1] for case in solved_cases}
df_spg7["tier1_is_solved"] = df_spg7["local_case_id"].apply(lambda x: is_solved(x, solved_local_ids))
df_solved = df_spg7[df_spg7["local_case_id"].isin(solved_local_ids)]

In [27]:
print("Number of unique cases being tier1: {}".format(df_solved["local_case_id"].unique().shape[0]))

Number of unique cases being tier1: 201


In [42]:
# save cases being tier 1
df_solved.to_csv("SPG7/tier1/tier1_solved_case.tsv", sep="\t", index=False)
df_solved.to_excel("SPG7/tier1/tier1_solved_case.xlsx", index=False, freeze_panes=(1,4))

In [28]:
# extract variant list being lp/p and Tier1
tiers1_variant_id = df_solved[df_solved["Variant_based_id"].isin(final_lp_p_id)]["Variant_based_id"].unique()
tiers1_variants = df_spg7_extended[df_spg7_extended["Variant_based_id"].isin(tiers1_variant_id)]

In [43]:
# saving tier1: lp-p variants 
tiers1_variants.to_csv("SPG7/tier1/tiers_solved_variants.tsv", sep="\t", index=False)
tiers1_variants.to_excel("SPG7/tier1/tiers_solved_variants.xlsx", index=False, freeze_panes=(1,4))

In [29]:
len(df_solved["Variant_based_id"].unique())

69

# B - Tier 2: Cases carrying 1 likely pathogenic variants in combination with a VUS


## 5 -  Identify cases carrying 1 likely pathogenic variant in combination with at least 1 additional variant
- depending on the number of remaining cases, proceed to functional follow up or consider additional filtering steps
- criteria supporting pathogenicity: 
    - phenotype matches SACS/SPG7
    - variants segregate in the family
    - highly conserved, predicted deleterious (to be discussed)
    - missense variant is located in a functional domain
    - variant has functional effect (splicing, downregulation of protein, downstream functional effect)


In [30]:
# create unique patient (submitter-localsubjectid)
df_tier2_prep = df_spg7[df_spg7["tier1_is_solved"]=="n"]
# print(df_tier2_prep.shape)
df_tier2_prep = df_tier2_prep[df_tier2_prep["Variant_based_id"].isin(final_lp_p_id)]
# print(df_tier2_prep.shape)
tier2_unique_case_list = set([(row['submitter_id'],row['local_case_id']) 
                    for idx, row in df_tier2_prep.iterrows()])


** remember to use non-lp/p conserved&rare variants

In [31]:
rare_conserved_variants = df_freq_conserver_filtered["Variant_based_id"].unique()

** create list of benign-likelybenign variants to investigate

In [32]:
benign_clinvar = list(df_spg7_clinvar[df_spg7_clinvar["clinsig"].isin(['Likely_benign', 'Benign', 'Benign/Likely_benign'])]["Variant_based_id"].unique())

benign_uniprot = list(uniprot_var[uniprot_var["clinsig"]==1]["Variant_based_id"].unique())


In [33]:
final_list_benign = list(set(benign_clinvar + benign_uniprot))
# add flag in main variant collection - is_variant_benign reported
def is_benign_uniprot_clinvar(x, list_benign):
    if x in list_benign:
        return "y"
    else:
        return "n"
    
df_spg7_extended["has_benign_report"] = df_spg7_extended["Variant_based_id"].apply(lambda x: is_benign_uniprot_clinvar(x, final_list_benign))

In [34]:
def stream_cases(cases, df):
    for case in cases:
        yield((case, df[(df['local_case_id']==case[1]) & \
                 (df['submitter_id']==case[0])]))
        
def get_lp_p_variant(df, list_lp_p):
    list_df_lp_p = list(df[df["Variant_based_id"].isin(list_lp_p)]["Variant_based_id"])
    return list_df_lp_p

def get_rare_conserved_variant(df, list_conserved_rare):
    list_df_conserverd_rare = list(df[df["Variant_based_id"].isin(list_conserved_rare)]["Variant_based_id"])
    return list_df_conserverd_rare

def get_benign_variant(df, list_benign):
    list_df_benign = list(df[df["Variant_based_id"].isin(list_benign)]["Variant_based_id"])
    return list_df_benign

def get_all_var_not_lpp(df, v_lp_p):
    v_not_lpp = [v for v in df["Variant_based_id"].unique() if v not in v_lp_p]
    return v_not_lpp

def get_hyp_vus(v_not_lpp, v_benign, v_rare_conserved):
    hyp_vus = [v for v in v_not_lpp if v not in v_benign and v in v_rare_conserved]
    return hyp_vus

def is_tier2(df, df_length, v_lp_p, v_benign, v_rare_conserved):
    if df_length==2:
        if len(v_lp_p)==1 and len(v_benign)==0 and len(v_rare_conserved)==1:
            return True
        else:
            return False
    else:
        not_lpp_v = get_all_var_not_lpp(df, v_lp_p)
        hyp_vus = get_hyp_vus(not_lpp_v, v_benign, v_rare_conserved)
        if hyp_vus:
            return True
        else: 
            return False
        
def tier2_screen_case(df, list_lp_p, list_conserved_rare, list_benign):
    if df.shape[0]==1:
        print("problem 1 variant only", list(df["Variant_based_id"]))
    else:
        v_lp_p = get_lp_p_variant(df, list_lp_p)
        # print("lp_p", len(v_lp_p))
        v_benign = get_benign_variant(df, list_benign)
        # print("benign", len(v_benign))
        v_rare_conserved = get_rare_conserved_variant(df, list_conserved_rare)
        # print("conserved_rare", len(v_rare_conserved))
        df_length = df.shape[0]
        
        return is_tier2(df, df_length, v_lp_p, v_benign, v_rare_conserved)
            
        
        
            
        

In [35]:
tier2_cases = []
tier2_cases_rejected = []
for case in stream_cases(tier2_unique_case_list, df_spg7):
    patient = case[0]
    df = case[1]
    if tier2_screen_case(df, final_lp_p_id, final_conserved_variant_id, final_list_benign):
        tier2_cases.append(patient)
    else:
        tier2_cases_rejected.append(patient)
        print(patient, "not tier2")

('P6 - Istanbul', 'MYO19') not tier2
('P1 - Tübingen', 'ID28189') not tier2
('P4 - Pisa', '2424A') not tier2
('P6 - Istanbul', 'FTD56') not tier2


In [36]:
# extract cases from prospax collection
# flag them in df_spg7
def flag_tier2(x, tier2_id, tier2_cases_rejected):
    if x in tier2_id:
        return "y"
    elif x in tier2_cases_rejected:
        return "rejected_automatic_not_rare/conserved_or_benign_report"
    else:
        return "n"

def flag_tier2_variant_lvl(x, lp_p, rare_conserved, benign):
    if x in lp_p:
        return "lp_p_variant"
    elif x in rare_conserved and x not in benign:
        return "rare_conserved_variant"
    elif x not in rare_conserved and x not in benign:
        return "NOT_rare_conserved_variant"
    elif x not in rare_conserved and x in benign:
        return "NOT_rare_conserved_variant_AND_benign_report"
    
tier2_local_ids = {case[1] for case in tier2_cases}
tier2_rejected_local_ids = {case[1] for case in tier2_cases_rejected}
df_spg7["is_tier2"] = df_spg7["local_case_id"].apply(lambda x: flag_tier2(x, tier2_local_ids, tier2_rejected_local_ids))
df_tier2 = df_spg7[df_spg7["local_case_id"].isin(tier2_local_ids)]
df_tier2["variant_level_status"] = df_tier2["Variant_based_id"].apply(lambda x: flag_tier2_variant_lvl(x, final_lp_p_id, final_conserved_variant_id, final_list_benign))
df_tier2_rejected = df_spg7[df_spg7["local_case_id"].isin(tier2_rejected_local_ids)]
df_tier2_rejected["variant_level_status"] = df_tier2_rejected["Variant_based_id"].apply(lambda x: flag_tier2_variant_lvl(x, final_lp_p_id, final_conserved_variant_id, final_list_benign))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [37]:
# create column to flag questionable variants 
def flag_questionable(x):
    if x == "lp_p_variant":
        return 0
    elif x == "rare_conserved_variant":
        return 2
    elif x in ["NOT_rare_conserved_variant", "NOT_rare_conserved_variant_AND_benign_report"]:
        return 1
    
df_tier2["is_questionable"] = df_tier2["variant_level_status"].apply(lambda x: flag_questionable(x))
df_tier2_rejected["is_questionable"] = df_tier2_rejected["variant_level_status"].apply(lambda x: flag_questionable(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [38]:
df_tier2_variants = df_spg7_extended[df_spg7_extended["Variant_based_id"].isin(df_tier2["Variant_based_id"].unique())]

In [39]:
def flag_questionable_var(x):
    if x == "lp_p":
        return 0
    else:
        return 1
df_tier2_variants["is_questionable_variant"] = df_tier2_variants["is_conserved"].apply(lambda x: flag_questionable_var(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [40]:
df_tier2_variants.to_csv("SPG7/tier2/tiers2_cases_variants.tsv", sep="\t", index=False)
df_tier2_variants.to_excel("SPG7/tier2/tiers2_cases_variants.xlsx", index=False, freeze_panes=(1,4))
df_tier2.to_csv("SPG7/tier2/tier2_cases.tsv", sep="\t", index=False)
df_tier2.to_excel("SPG7/tier2/tier2_cases.xlsx", index=False, freeze_panes=(1,4))
df_tier2_rejected.to_csv("SPG7/tier2/tier2_rejected_cases.tsv", sep="\t", index=False)
df_tier2_rejected.to_excel("SPG7/tier2/tier2_rejected_cases.xlsx", index=False, freeze_panes=(1,4))

In [41]:
len(df_tier2["local_case_id"].unique())

45

### Tests

In [51]:
# False - one lp-p + benign + not conserved
test_one_lp_benign = df_spg7[df_spg7["local_case_id"]=="2424A"]
tier2_screen_case(test_one_lp_benign, final_lp_p_id, final_conserved_variant_id, final_list_benign)

lp_p 1
benign 2
conserved_rare 0


False

In [65]:
# True
test_true = df_spg7[df_spg7["local_case_id"]=="HSP10"]
tier2_screen_case(test_true, final_lp_p_id, final_conserved_variant_id, final_list_benign)

True

In [42]:
# False - one lp/p + one not conserved/rare variant
test_false_not_conserved_rare = df_spg7[df_spg7["local_case_id"]=="FTD56"]
tier2_screen_case(test_false_not_conserved_rare, final_lp_p_id, final_conserved_variant_id, final_list_benign)

lp_p 1
benign 0
conserved_rare 0


False

# C - Tier 3: Cases carrying 2 VUS

## 6 - Identify cases carrying 2 variants of unknown significance
- apply standard filters
- depending on the number of remaining cases, proceed to functional follow up or consider additional filtering steps
- criteria supporting pathogenicity: 
    - phenotype matches SACS/SPG7
    - variants segregate in the family
    - missense variant is located in a functional domain
    - variant has functional effect (splicing, downregulation of protein, downstream functional effect)


In [44]:
# get cases which are not included in tier1 and tier2
df_tier3_prep = df_spg7[(df_spg7["tier1_is_solved"]=="n") & (df_spg7["is_tier2"]=="n")]

print("Number of cases to investigate in the context of Tier3 analysis: {}".format(len(df_tier3_prep["local_case_id"].unique())))

Number of cases to investigate in the context of Tier3 analysis: 141


In [45]:
# cases to investigate
tier3_unique_case_list = df_tier3_prep["local_case_id"].unique()
# get variants id tier3
variants_tier3_unique_cases = df_tier3_prep["Variant_based_id"].unique()

In [46]:
# get variants from tier3_unique_case_list
df_pre_tier3_variants = df_spg7_extended[df_spg7_extended["Variant_based_id"].isin(variants_tier3_unique_cases)]

print("Number of variants to investigate in the context of Tier3 analysis: {}".format(df_pre_tier3_variants.shape[0]))

Number of variants to investigate in the context of Tier3 analysis: 213


In [47]:
# separate variants according to mutation type
# exonic / no synonymous / no splice variants region / checked they are all intronic atm
exonic = df_pre_tier3_variants[df_pre_tier3_variants["csq_minimal"].isin(["Missense", "Inframe_Indel"])]

# missense
missense_pre_tier3 = df_pre_tier3_variants[df_pre_tier3_variants["csq_minimal"]=="Missense"]

# inframe indel
inframe_indels_tier3 = df_pre_tier3_variants[df_pre_tier3_variants["csq_minimal"]=="Inframe_Indel"]

# untranslated_region
untranslated_regions_tier3 = df_pre_tier3_variants[df_pre_tier3_variants["csq_minimal"].isin(['Intergene', 'UTR', 'Intron', 'Splice_region'])]

# synonymous
synonymous = df_pre_tier3_variants[df_pre_tier3_variants["csq_minimal"]=="Synonymous"]


### apply filtration to whole pre_tier3 collection (considering all mutation type)
- two filtration:
    - deleteriouness 
    - spliceai

In [48]:
# create filtration 
def filter_tier3(df):
    df = df[((df['metalr_pred'] == "D") | (df['metalr_pred'].isnull())) &
           ((df['m_cap_pred'] == "D") | (df['m_cap_pred'].isnull())) &
           ((df['sift']=="deleterious") | (df['sift'].isnull())) &
           ((df['polyphen'].isin(["probably_damaging", "possibly_damaging"]))  | (df['polyphen'].isnull())) &
           ((df['mutationtaster_pred']=='D&D') | (df['mutationtaster_pred']=='D') | (df['mutationtaster_pred']=='A') | (df['mutationtaster_pred'].isnull())) &
           ((df['cadd']>=(15)) | (df['cadd'].isnull())) & 
           ((df['dann']>=(0.98)) | (df['dann'].isnull())) &
           ((df['revel']>=(0.5)) | (df['revel'].isnull())) &
           ((df['provean_pred'].isin(["D&D", "D"])) | (df['provean_pred'].isnull()))]
    return df

def filter_spliceai(df, min_score=0.2):
    df = df[
            (df['spliceai_pred_DS_AG']>=min_score) |
            (df['spliceai_pred_DS_AL']>=min_score) |
            (df['spliceai_pred_DS_DG']>=min_score) |
            (df['spliceai_pred_DS_DL']>=min_score) 
    ]
    return df

# main filtration
df_filtered_pre_tier3 = filter_tier3(exonic)
    
# spliceai filtration
df_splice_filtered_pre_tier3 = filter_spliceai(df_pre_tier3_variants)

In [49]:
df_filtered_pre_tier3.shape

(18, 58)

In [50]:
df_splice_filtered_pre_tier3.shape

(2, 58)

In [51]:
# merge splice and filtered variants filtered
df_filtered_tier3_stage1 = pd.concat([df_filtered_pre_tier3, df_splice_filtered_pre_tier3])

In [52]:
# from filtered list fetch cases with variants
filtered_variants_tiers3 = df_filtered_tier3_stage1["Variant_based_id"].unique()
# get cases id
cases_filtered_tier3_stage1 = df_tier3_prep[df_tier3_prep["Variant_based_id"].isin(filtered_variants_tiers3)]["local_case_id"].unique()
# get df
df_tier3_cases_stage1 = df_tier3_prep[df_tier3_prep["local_case_id"].isin(cases_filtered_tier3_stage1)]

# did variant pass filtration?
def flag_variant_filter(x, pass_v_l):
    if x in pass_v_l:
        return "y"
    else: 
        return "n"
    
df_tier3_cases_stage1["pass_tier3_filtration"] = df_tier3_cases_stage1["Variant_based_id"].apply(lambda x: flag_variant_filter(x, filtered_variants_tiers3))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [53]:
# save variants passing filtration
df_filtered_tier3_stage1.to_csv("SPG7/tier3/stage1_tier3_variants.tsv", sep="\t", index=False)
df_filtered_tier3_stage1.to_excel("SPG7/tier3/stage1_tier3_variants.xlsx", index=False, freeze_panes=(1,4))

# save cases carrying variant
df_tier3_cases_stage1.to_csv("SPG7/tier3/stage1_tier3_cases.tsv", sep="\t", index=False)
df_tier3_cases_stage1.to_excel("SPG7/tier3/stage1_tier3_cases.xlsx", index=False, freeze_panes=(1,4))

In [54]:
len(df_tier3_cases_stage1["local_case_id"].unique())

32

# Define hotspot regions

- Located in a mutational hot spot and/or critical and well-established functional domain without benign variation. (Pathogenic, Moderate)
    - based on clinical variants (benign/likely benign vs pathogenic/likely pathogenic)

- exons ±5 bp (to include splice donor and acceptor):
    - using 25bp on either side of the variant, the rule checks that there are at least 5 pathogenic variants (only using missense and inframe-indel variants).
- protein domains:
    - if the variant is within a functional domain
    - checks that at least 2 pathogenic variants have been reported within the domain. 
    - trigger if the ratio of pathogenic to the total variants is greater than 0.2537 with strength 'supporting', and 0.2875 with strength 'moderate'


In [75]:
# create pool benign variants 
# combine clinvar uniprot benign reported
uniprot_benign = uniprot_var[uniprot_var["clinsig"]==1].reset_index(drop=True)[['Variant_based_id', 'chrom', 'pos', 'ref', 'alt', 'csq', 'gene',
       'hgvsc', 'hgvsp', 'clinsig', 'hgvsp_clean', 'hgvsp_short',
       'protein_pos', 'domain_name', 'cat_domain']]
clinvar_benign = df_spg7_clinvar[df_spg7_clinvar["clinsig"].isin(['Likely_benign', 'Benign', 'Benign/Likely_benign'])].reset_index(drop=True)[['Variant_based_id', 'chrom', 'pos', 'ref', 'alt', 'csq', 'gene',
       'hgvsc', 'hgvsp', 'clinsig', 'hgvsp_clean', 'hgvsp_short',
       'protein_pos', 'domain_name', 'cat_domain']]

df_benign = pd.concat([uniprot_benign, clinvar_benign], sort=False)
df_benign = df_benign.drop_duplicates(subset='Variant_based_id', keep="last")

In [76]:
# create pool lp/p variants 
# combine clinvar uniprot and flagged lp/p
uniprot_lp_p = uniprot_var[uniprot_var["clinsig"]==3].reset_index(drop=True)[['Variant_based_id', 'chrom', 'pos', 'ref', 'alt', 'csq', 'gene',
       'hgvsc', 'hgvsp', 'hgvsp_clean', 'hgvsp_short',
       'protein_pos', 'domain_name', 'cat_domain']]

clinvar_lp_p = df_spg7_clinvar[df_spg7_clinvar["clinsig"].isin(['Likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Pathogenic'])].reset_index(drop=True)[['Variant_based_id', 'chrom', 'pos', 'ref', 'alt', 'csq', 'gene',
       'hgvsc', 'hgvsp', 'hgvsp_clean', 'hgvsp_short',
       'protein_pos', 'domain_name', 'cat_domain']]


lp_p_prospax = df_spg7_extended[df_spg7_extended["Variant_based_id"].isin(final_lp_p_id)].reset_index(drop=True)[['Variant_based_id', 'chrom', 'pos', 'ref', 'alt', 'csq', 'gene',
       'hgvsc', 'hgvsp', 'hgvsp_clean', 'hgvsp_short',
       'protein_pos', 'domain_name', 'cat_domain']]


df_lp_p = pd.concat([uniprot_lp_p, clinvar_lp_p, lp_p_prospax], sort=False)
df_lp_p = df_lp_p.drop_duplicates(subset='Variant_based_id', keep="last")

In [None]:
def open_window(x):
    up_x = x - 25
    down_x = x + 25
    window = (up_x, down_x)
    return window

def is_benign_in_window(window):
    
    
    
    
    
    
    

In [83]:
# 
df_spg7[df_spg7["case_status"]=="solved SPG7"].drop_duplicates(subset=["submitter_id", "local_case_id"])

Unnamed: 0,submitter_id,local_case_id,local_family_id,prospax_case_id,ngs_database_id,main_phenotype,case_status,id,gene,chrom,...,prot_change,genotype,compound_het_id_s,paxgene_availability,pbmc_availability,fibroblasts_availability,comments,Variant_based_id,tier1_is_solved,is_tier2
0,P1 - Tübingen,ID11205,FN11205,843-594-125,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_1,SPG7,16,...,p.Ala510Val,Hom,,no,no,no,,16-89613145-C-T,y,n
1,P1 - Tübingen,ID11206,FN11205,409-682-435,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_2,SPG7,16,...,p.Ala510Val,Hom,,no,no,no,,16-89613145-C-T,y,n
2,P1 - Tübingen,ID12357,FN12357,855-065-005,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_4,SPG7,16,...,,Het,P1_SPG7_Tubingen_3,yes,yes,yes,,16-89613169-G-T,y,n
4,P1 - Tübingen,ID14354,FN14354,892-430-990,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_5,SPG7,16,...,p.Trp29Ter,Hom,,no,no,yes,,16-89574911-G-A,y,n
5,P1 - Tübingen,ID14645,FN14645,023-180-504,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_8,SPG7,16,...,p.Glu484GlyfsTer74,Hom,,yes,no,no,,16-89613169-G-T,y,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
869,P7 - Paris,AAR-SAL-DUR-282-003,AAR-SAL-DUR-282,NONE,Paris diagnostic pipeline,ataxia,solved SPG7,P7_SPG7_Paris_167,SPG7,16,...,p.Ala510Val,Het,P7_SPG7_Paris_168,no,yes,no,NONE,16-89613145-C-T,y,n
871,P7 - Paris,AAR-SAL-DUR-282-005,AAR-SAL-DUR-282,NONE,Paris diagnostic pipeline,ataxia,solved SPG7,P7_SPG7_Paris_169,SPG7,16,...,p.Ala510Val,Het,P7_SPG7_Paris_170,no,yes,no,NONE,16-89613145-C-T,y,n
873,P7 - Paris,FSP-SAL-COU-1416-001,NONE,NONE,Paris diagnostic pipeline,HSP,solved SPG7,P7_SPG7_Paris_171,SPG7,16,...,p.Pro350GlnfsTer36,Het,P7_SPG7_Paris_172,no,yes,no,NONE,16-89598372-CCCCCCGGCTGTGGGAAGACGCTGCTGGCC-C,n,y
875,P7 - Paris,FSP-SAL-SER-1476-001,NONE,NONE,Paris diagnostic pipeline,HSP,solved SPG7,P7_SPG7_Paris_173,SPG7,16,...,p.Arg485_Glu487del,Het,P7_SPG7_Paris_174,no,yes,no,NONE,16-89613069-AGGCGGGAGA-A,n,y


In [85]:
df_solved[df_solved["case_status"]=="solved SPG7"].drop_duplicates(subset=["submitter_id", "local_case_id"])

Unnamed: 0,submitter_id,local_case_id,local_family_id,prospax_case_id,ngs_database_id,main_phenotype,case_status,id,gene,chrom,...,cdna,prot_change,genotype,compound_het_id_s,paxgene_availability,pbmc_availability,fibroblasts_availability,comments,Variant_based_id,tier1_is_solved
0,P1 - Tübingen,ID11205,FN11205,843-594-125,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_1,SPG7,16,...,c.1529C>T,p.Ala510Val,Hom,,no,no,no,,16-89613145-C-T,y
1,P1 - Tübingen,ID11206,FN11205,409-682-435,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_2,SPG7,16,...,c.1529C>T,p.Ala510Val,Hom,,no,no,no,,16-89613145-C-T,y
2,P1 - Tübingen,ID12357,FN12357,855-065-005,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_4,SPG7,16,...,c.1552+1G>T,,Het,P1_SPG7_Tubingen_3,yes,yes,yes,,16-89613169-G-T,y
4,P1 - Tübingen,ID14354,FN14354,892-430-990,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_5,SPG7,16,...,c.86G>A,p.Trp29Ter,Hom,,no,no,yes,,16-89574911-G-A,y
5,P1 - Tübingen,ID14645,FN14645,023-180-504,TreatHSP Registry,HSP,solved SPG7,P1_SPG7_Tubingen_8,SPG7,16,...,c.1552+1G>T,p.Glu484GlyfsTer74,Hom,,yes,no,no,,16-89613169-G-T,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
864,P7 - Paris,FSP-SAL-DEL-1469-001,NONE,NONE,Paris diagnostic pipeline,HSP,solved SPG7,P7_SPG7_Paris_162,SPG7,16,...,c.1529C>T,p.Ala510Val,Het,P7_SPG7_Paris_163,no,yes,no,NONE,16-89613145-C-T,y
868,P7 - Paris,AAD-SAL-BON-1193-001,NONE,NONE,Paris diagnostic pipeline,ataxia,solved SPG7,P7_SPG7_Paris_166,SPG7,16,...,c.1529C>T,p.Ala510Val,Hom,NONE,no,yes,no,NONE,16-89613145-C-T,y
869,P7 - Paris,AAR-SAL-DUR-282-003,AAR-SAL-DUR-282,NONE,Paris diagnostic pipeline,ataxia,solved SPG7,P7_SPG7_Paris_167,SPG7,16,...,c.1529C>T,p.Ala510Val,Het,P7_SPG7_Paris_168,no,yes,no,NONE,16-89613145-C-T,y
871,P7 - Paris,AAR-SAL-DUR-282-005,AAR-SAL-DUR-282,NONE,Paris diagnostic pipeline,ataxia,solved SPG7,P7_SPG7_Paris_169,SPG7,16,...,c.1529C>T,p.Ala510Val,Het,P7_SPG7_Paris_170,no,yes,no,NONE,16-89613145-C-T,y


In [86]:
df_spg7.drop_duplicates(subset=["submitter_id", "local_case_id"]).shape

(397, 25)