In [1]:
import pandas as pd
import numpy as np

In [2]:
#Downloaded only the relevant results for each gene from CLinVar, may have to change depeending on results

cv_mybpc3_df = pd.read_csv("data/mybpc3_data.txt", delimiter="\t")
cv_myh7_df = pd.read_csv("data/myh7_data.txt", delimiter="\t")

#Removed last reviewed date from the disease status
cv_myh7_df["label"] = cv_myh7_df["Clinical significance (Last reviewed)"].apply(lambda x: x.split("(")[0])
cv_myh7_df.drop(columns=["Clinical significance (Last reviewed)"], inplace=True)

cv_mybpc3_df["label"] = cv_mybpc3_df["Clinical significance (Last reviewed)"].apply(lambda x: x.split("(")[0])
cv_mybpc3_df.drop(columns=["Clinical significance (Last reviewed)"], inplace=True)


cv_mybpc3_df.head()
cv_myh7_df.head()

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Unnamed: 15,label
0,NM_000257.4(MYH7):c.816A>T (p.Arg272Ser),MYH7,R272S,not provided,"criteria provided, single submitter",VCV002663533,14,23900189,14,23430980,2663533,2830981,,NC_000014.9:23430979:T:A,,Uncertain significance
1,NM_000257.4(MYH7):c.1072C>T (p.His358Tyr),MYH7,H358Y,not provided,"criteria provided, single submitter",VCV002663260,14,23899050,14,23429841,2663260,2830710,,NC_000014.9:23429840:G:A,,Uncertain significance
2,NM_000257.4(MYH7):c.196G>A (p.Gly66Ser),MYH7,G66S,not provided,"criteria provided, single submitter",VCV002662427,14,23902746,14,23433537,2662427,2829878,,NC_000014.9:23433536:C:T,,Uncertain significance
3,NM_000257.4(MYH7):c.639+1G>T,MYH7,,not provided,"criteria provided, single submitter",VCV002644107,14,23900969,14,23431760,2644107,2810291,,NC_000014.9:23431759:C:A,,Uncertain significance
4,NM_000257.4(MYH7):c.1669C>A (p.Leu557Met),MYH7,L557M,not provided,"criteria provided, single submitter",VCV002644106,14,23897013,14,23427804,2644106,2810290,,NC_000014.9:23427803:G:T,,Uncertain significance


In [3]:
# Load AlphaMissense predictions for the two genes

am_myh7_df = pd.read_csv("data/am_myh7_data.tsv", delimiter="\t")
am_mybpc3_df = pd.read_csv("data/am_mybpc3_data.tsv", delimiter="\t")

am_myh7_df.head()

Unnamed: 0,CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class
0,chr14,23412857,C,G,hg38,P12883,ENST00000355349.4,E1935D,0.0737,likely_benign
1,chr14,23412857,C,A,hg38,P12883,ENST00000355349.4,E1935D,0.0737,likely_benign
2,chr14,23412858,T,G,hg38,P12883,ENST00000355349.4,E1935A,0.2034,likely_benign
3,chr14,23412858,T,C,hg38,P12883,ENST00000355349.4,E1935G,0.1999,likely_benign
4,chr14,23412858,T,A,hg38,P12883,ENST00000355349.4,E1935V,0.2693,likely_benign


In [4]:
#Load EVE predictions for the two genes

eve_myh7_df = pd.read_csv("data/eve_myh7_data.csv")
eve_mybpc3_df = pd.read_csv("data/eve_mybpc3_data.csv")

eve_myh7_df.head()

Unnamed: 0,wt_aa,position,mt_aa,ClinVar_ClinicalSignificance,Gold_Stars,NumberSubmitters,Starred_Coarse_Grained_Clin_Sig,frequency_gv2,frequency_gv3,evolutionary_index_ASM,...,p_model,b_acmg_model,lb_acmg_model,lp_acmg_model,coarse_clinical_significance_post,clinical_significance_post,model_disagreement,CV_label_and_model_combined_with_other_evidence_disagreement,starred_label_and_model_disagreement,starred_label_and_model_combined_with_other_evidence_disagreement
0,M,1,A,,,,,,,,...,False,False,False,False,,,False,False,False,False
1,M,1,C,,,,,,,,...,False,False,False,False,,,False,False,False,False
2,M,1,D,,,,,,,,...,False,False,False,False,,,False,False,False,False
3,M,1,E,,,,,,,,...,False,False,False,False,,,False,False,False,False
4,M,1,F,,,,,,,,...,False,False,False,False,,,False,False,False,False


In [5]:
#Preprocess ClinVar data

for cv_df in [cv_myh7_df, cv_mybpc3_df]:
    drop_idx_conflicting = cv_df[(cv_df["label"] == 'Conflicting interpretations of pathogenicity') | (cv_df["label"] == 'conflicting data from submitters') | (cv_df["label"] == 'not provided')].index
    drop_idx_unprov = cv_df[cv_df["label"] == 'not provided'].index

    #Normalize ClinVar labels to match those of AM
    cv_df.drop(drop_idx_conflicting, inplace=True)

    print(len(cv_df[cv_df["label"].str.contains("Benign") | cv_df["label"].str.contains("benign")]))
    cv_df["label"] = cv_df["label"].apply(lambda x: "ambiguous" if x == "Uncertain significance" else ("likely_benign" if "Benign" in x or "benign" in x else "likely_pathogenic"))

    print(len(cv_df.loc[cv_df["label"] == "likely_benign"]))


1279
1279
870
870


In [22]:
#Find AM discrepancies with ClinVar for both genes

mismatch_dict = {}
for cv_df, am_df, gene in [(cv_myh7_df, am_myh7_df, "myh7"), (cv_mybpc3_df, am_mybpc3_df, "mybpc3")]:

    mismatch_df = pd.DataFrame()
    for i in range(len(cv_df)):
        matched_am_pred = am_df[am_df["protein_variant"] == cv_df.iloc[i]["Protein change"]]


        if not matched_am_pred.empty:
            matched_am_pred = matched_am_pred.iloc[0]

            exp_class = cv_df.iloc[i]["label"]
            status = cv_df.iloc[i]["Review status"]
            pred_class = matched_am_pred["am_class"]

            if exp_class != "ambiguous":
                if exp_class != pred_class:

                    matched_am_pred["ClinVar label"] = exp_class
                    matched_am_pred["ClinVar status"] = status

                    # print(matched_am_pred)
                    # print("flow")

                    mismatch_df = mismatch_df.append(matched_am_pred)
    
    mismatch_dict[gene] = mismatch_df

  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_d

In [23]:
#Observe mismatches for both genes

print(len(mismatch_dict["myh7"]))
mismatch_dict["myh7"].head(50)


61


Unnamed: 0,CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ClinVar label,ClinVar status
8260,chr14,23426061,C,G,hg38,P12883,ENST00000355349.4,V689L,0.9478,likely_pathogenic,likely_benign,"criteria provided, single submitter"
11544,chr14,23431801,G,A,hg38,P12883,ENST00000355349.4,A200V,0.5483,ambiguous,likely_pathogenic,"criteria provided, single submitter"
8838,chr14,23427672,G,C,hg38,P12883,ENST00000355349.4,L601V,0.4057,ambiguous,likely_pathogenic,"criteria provided, single submitter"
5822,chr14,23422262,G,T,hg38,P12883,ENST00000355349.4,L1055M,0.1496,likely_benign,likely_pathogenic,"criteria provided, single submitter"
10066,chr14,23429095,C,T,hg38,P12883,ENST00000355349.4,A423T,0.1865,likely_benign,likely_pathogenic,no assertion criteria provided
5917,chr14,23422306,T,C,hg38,P12883,ENST00000355349.4,Q1040R,0.2753,likely_benign,likely_pathogenic,no assertion criteria provided
7008,chr14,23424818,A,G,hg38,P12883,ENST00000355349.4,M877T,0.2905,likely_benign,likely_pathogenic,no assertion criteria provided
375,chr14,23414026,T,C,hg38,P12883,ENST00000355349.4,K1879R,0.2235,likely_benign,likely_pathogenic,"criteria provided, single submitter"
8508,chr14,23427241,C,T,hg38,P12883,ENST00000355349.4,R652K,0.5207,ambiguous,likely_pathogenic,"criteria provided, single submitter"
5979,chr14,23423556,T,G,hg38,P12883,ENST00000355349.4,Q1030H,0.3505,ambiguous,likely_benign,"criteria provided, single submitter"


In [26]:
print(len(mismatch_dict["mybpc3"]))
mismatch_dict["mybpc3"].head(50)

# mismatch_dict["mybpc3"]["ClinVar status"].iloc[10]

20


Unnamed: 0,CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ClinVar label,ClinVar status
11980,chr11,47346217,C,G,hg38,Q14896,ENST00000545968.6,K360N,0.4657,ambiguous,likely_pathogenic,"criteria provided, single submitter"
8006,chr11,47339740,C,T,hg38,Q14896,ENST00000545968.6,V660M,0.2616,likely_benign,likely_pathogenic,"criteria provided, single submitter"
10810,chr11,47343029,A,G,hg38,Q14896,ENST00000545968.6,F448S,0.4113,ambiguous,likely_pathogenic,"criteria provided, single submitter"
10110,chr11,47342697,C,A,hg38,Q14896,ENST00000545968.6,R502L,0.4558,ambiguous,likely_pathogenic,"criteria provided, single submitter"
8084,chr11,47339758,G,C,hg38,Q14896,ENST00000545968.6,R654G,0.2447,likely_benign,likely_pathogenic,no assertion criteria provided
10774,chr11,47343021,C,G,hg38,Q14896,ENST00000545968.6,E451Q,0.1996,likely_benign,likely_pathogenic,"criteria provided, single submitter"
15890,chr11,47351356,T,C,hg38,Q14896,ENST00000545968.6,T59A,0.096,likely_benign,likely_pathogenic,no assertion criteria provided
11936,chr11,47346207,C,T,hg38,Q14896,ENST00000545968.6,A364T,0.1072,likely_benign,likely_pathogenic,"criteria provided, multiple submitters, no con..."
8846,chr11,47341991,C,T,hg38,Q14896,ENST00000545968.6,R597Q,0.1985,likely_benign,likely_pathogenic,"criteria provided, multiple submitters, no con..."
11694,chr11,47343571,G,A,hg38,Q14896,ENST00000545968.6,R382W,0.3417,ambiguous,likely_benign,"criteria provided, multiple submitters, no con..."


In [11]:
#Preprocess EVE dataframe

for eve_df in [eve_myh7_df, eve_mybpc3_df]:
    eve_df.dropna(subset="EVE_classes_75_pct_retained_ASM", inplace=True)

    #Standardize labels with those used above
    eve_df["EVE_classes_75_pct_retained_ASM"] = eve_df["EVE_classes_75_pct_retained_ASM"].apply(lambda x: "ambiguous" if x == "Uncertain" else ("likely_benign" if "Benign" in x or "benign" in x else "likely_pathogenic"))

    #Add mutation label to match those of ClinVar
    eve_df["protein_variant"] = eve_df["wt_aa"] + eve_df["position"].apply(lambda x: str(x)) + eve_df["mt_aa"]



eve_myh7_df["protein_variant"].head(20)

221    A12C
222    A12D
223    A12E
224    A12F
225    A12G
226    A12H
227    A12I
228    A12K
229    A12L
230    A12M
231    A12N
232    A12P
233    A12Q
234    A12R
235    A12S
236    A12T
237    A12V
238    A12W
239    A12Y
241    A13C
Name: protein_variant, dtype: object

In [16]:
#Find mismatches between EVE and ClinVar

for cv_df, eve_df, gene in [(cv_myh7_df, eve_myh7_df, "myh7"), (cv_mybpc3_df, eve_mybpc3_df, "mybpc3")]:

    mismatch_df = pd.DataFrame()
    for i in range(len(cv_df)):
        matched_eve_pred = eve_df[eve_df["protein_variant"] == cv_df.iloc[i]["Protein change"]]

        if not matched_eve_pred.empty:
            matched_eve_pred = matched_eve_pred.iloc[0]

            exp_class = cv_df.iloc[i]["label"]
            status = cv_df.iloc[i]["Review status"]
            pred_class = matched_eve_pred["EVE_classes_75_pct_retained_ASM"]

            if exp_class != "ambiguous":
                if exp_class != pred_class:

                    matched_eve_pred["ClinVar label"] = exp_class
                    matched_eve_pred["ClinVar status"] = status

                    # print(matched_eve_pred)
                    # print("flow")

                    mismatch_df = mismatch_df.append(matched_eve_pred)
    
    mismatch_dict[gene] = mismatch_df

  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve

In [20]:
print(len(mismatch_dict["myh7"]))
mismatch_dict["myh7"][["protein_variant", "EVE_scores_ASM", "EVE_classes_75_pct_retained_ASM", "ClinVar label", "ClinVar status"]].head(50)

54


Unnamed: 0,protein_variant,EVE_scores_ASM,EVE_classes_75_pct_retained_ASM,ClinVar label,ClinVar status
6033,L302Q,0.350133,likely_benign,likely_pathogenic,"criteria provided, single submitter"
13769,V689L,0.911706,likely_pathogenic,likely_benign,"criteria provided, single submitter"
12060,T604A,0.344009,likely_benign,likely_pathogenic,"criteria provided, single submitter"
8235,T412S,0.357886,likely_benign,likely_pathogenic,"criteria provided, single submitter"
7679,S384Y,0.329737,likely_benign,likely_pathogenic,no assertion criteria provided
20794,Q1040R,0.289391,likely_benign,likely_pathogenic,no assertion criteria provided
17536,M877T,0.167544,likely_benign,likely_pathogenic,no assertion criteria provided
13028,R652K,0.102893,likely_benign,likely_pathogenic,"criteria provided, single submitter"
13313,H666Q,0.082942,likely_benign,likely_pathogenic,no assertion criteria provided
12977,A649V,0.338544,likely_benign,likely_pathogenic,"criteria provided, single submitter"


In [21]:
print(len(mismatch_dict["mybpc3"]))
mismatch_dict["mybpc3"][["protein_variant", "EVE_scores_ASM", "EVE_classes_75_pct_retained_ASM", "ClinVar label", "ClinVar status"]].head(50)

9


Unnamed: 0,protein_variant,EVE_scores_ASM,EVE_classes_75_pct_retained_ASM,ClinVar label,ClinVar status
22876,M1144T,0.432483,likely_pathogenic,likely_benign,"criteria provided, single submitter"
13065,R654G,0.124543,likely_benign,likely_pathogenic,no assertion criteria provided
15222,V762D,0.271664,likely_benign,likely_pathogenic,"criteria provided, multiple submitters, no con..."
20038,R1002W,0.677674,likely_pathogenic,likely_benign,"criteria provided, multiple submitters, no con..."
7638,R382W,0.466912,likely_pathogenic,likely_benign,"criteria provided, multiple submitters, no con..."
22746,R1138H,0.842475,likely_pathogenic,likely_benign,"criteria provided, multiple submitters, no con..."
3526,R177H,0.560917,likely_pathogenic,likely_benign,"criteria provided, multiple submitters, no con..."
19943,Q998E,0.543869,likely_pathogenic,likely_benign,"criteria provided, multiple submitters, no con..."
17910,V896M,0.451316,likely_pathogenic,likely_benign,"criteria provided, multiple submitters, no con..."
