In [149]:
import pandas as pd
import numpy as np

In [150]:
#Downloaded only the relevant results for each gene from CLinVar, may have to change depeending on results

cv_mybpc3_df = pd.read_csv("data/mybpc3_data.txt", delimiter="\t")
cv_myh7_df = pd.read_csv("data/myh7_data.txt", delimiter="\t")

#Removed last reviewed date from the disease status
cv_myh7_df["label"] = cv_myh7_df["Clinical significance (Last reviewed)"].apply(lambda x: x.split("(")[0])
cv_myh7_df.drop(columns=["Clinical significance (Last reviewed)"], inplace=True)

cv_mybpc3_df["label"] = cv_mybpc3_df["Clinical significance (Last reviewed)"].apply(lambda x: x.split("(")[0])
cv_mybpc3_df.drop(columns=["Clinical significance (Last reviewed)"], inplace=True)


cv_mybpc3_df.head()
cv_myh7_df.head()

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Review status,Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Unnamed: 15,label
0,NM_000257.4(MYH7):c.816A>T (p.Arg272Ser),MYH7,R272S,not provided,"criteria provided, single submitter",VCV002663533,14,23900189,14,23430980,2663533,2830981,,NC_000014.9:23430979:T:A,,Uncertain significance
1,NM_000257.4(MYH7):c.1072C>T (p.His358Tyr),MYH7,H358Y,not provided,"criteria provided, single submitter",VCV002663260,14,23899050,14,23429841,2663260,2830710,,NC_000014.9:23429840:G:A,,Uncertain significance
2,NM_000257.4(MYH7):c.196G>A (p.Gly66Ser),MYH7,G66S,not provided,"criteria provided, single submitter",VCV002662427,14,23902746,14,23433537,2662427,2829878,,NC_000014.9:23433536:C:T,,Uncertain significance
3,NM_000257.4(MYH7):c.639+1G>T,MYH7,,not provided,"criteria provided, single submitter",VCV002644107,14,23900969,14,23431760,2644107,2810291,,NC_000014.9:23431759:C:A,,Uncertain significance
4,NM_000257.4(MYH7):c.1669C>A (p.Leu557Met),MYH7,L557M,not provided,"criteria provided, single submitter",VCV002644106,14,23897013,14,23427804,2644106,2810290,,NC_000014.9:23427803:G:T,,Uncertain significance


In [151]:
# Load AlphaMissense predictions for the two genes

am_myh7_df = pd.read_csv("data/am_myh7_data.tsv", delimiter="\t")
am_mybpc3_df = pd.read_csv("data/am_mybpc3_data.tsv", delimiter="\t")

am_myh7_df.head()

Unnamed: 0,CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class
0,chr14,23412857,C,G,hg38,P12883,ENST00000355349.4,E1935D,0.0737,likely_benign
1,chr14,23412857,C,A,hg38,P12883,ENST00000355349.4,E1935D,0.0737,likely_benign
2,chr14,23412858,T,G,hg38,P12883,ENST00000355349.4,E1935A,0.2034,likely_benign
3,chr14,23412858,T,C,hg38,P12883,ENST00000355349.4,E1935G,0.1999,likely_benign
4,chr14,23412858,T,A,hg38,P12883,ENST00000355349.4,E1935V,0.2693,likely_benign


In [152]:
#Load EVE predictions for the two genes

eve_myh7_df = pd.read_csv("data/eve_myh7_data.csv")
eve_mybpc3_df = pd.read_csv("data/eve_mybpc3_data.csv")

eve_myh7_df.head()

Unnamed: 0,wt_aa,position,mt_aa,ClinVar_ClinicalSignificance,Gold_Stars,NumberSubmitters,Starred_Coarse_Grained_Clin_Sig,frequency_gv2,frequency_gv3,evolutionary_index_ASM,...,p_model,b_acmg_model,lb_acmg_model,lp_acmg_model,coarse_clinical_significance_post,clinical_significance_post,model_disagreement,CV_label_and_model_combined_with_other_evidence_disagreement,starred_label_and_model_disagreement,starred_label_and_model_combined_with_other_evidence_disagreement
0,M,1,A,,,,,,,,...,False,False,False,False,,,False,False,False,False
1,M,1,C,,,,,,,,...,False,False,False,False,,,False,False,False,False
2,M,1,D,,,,,,,,...,False,False,False,False,,,False,False,False,False
3,M,1,E,,,,,,,,...,False,False,False,False,,,False,False,False,False
4,M,1,F,,,,,,,,...,False,False,False,False,,,False,False,False,False


In [153]:
#Preprocess ClinVar data

for cv_df in [cv_myh7_df, cv_mybpc3_df]:
    drop_idx_conflicting = cv_df[(cv_df["label"] == 'Conflicting interpretations of pathogenicity') | (cv_df["label"] == 'conflicting data from submitters') | (cv_df["label"] == 'not provided')].index
    drop_idx_unprov = cv_df[cv_df["label"] == 'not provided'].index

    #Normalize ClinVar labels to match those of AM
    cv_df.drop(drop_idx_conflicting, inplace=True)

    print(len(cv_df[cv_df["label"].str.contains("Benign") | cv_df["label"].str.contains("benign")]))
    cv_df["label"] = cv_df["label"].apply(lambda x: "ambiguous" if x == "Uncertain significance" else ("likely_benign" if "Benign" in x or "benign" in x else "likely_pathogenic"))

    print(len(cv_df.loc[cv_df["label"] == "likely_benign"]))


1279
1279
870
870


In [154]:
#Find AM discrepancies with ClinVar for both genes

mismatch_dict = {}
for cv_df, am_df, gene in [(cv_myh7_df, am_myh7_df, "am_myh7"), (cv_mybpc3_df, am_mybpc3_df, "am_mybpc3")]:

    mismatch_df = pd.DataFrame()
    print()
    for i in range(len(cv_df)):
        matched_am_pred = am_df[am_df["protein_variant"] == cv_df.iloc[i]["Protein change"]]


        if not matched_am_pred.empty:
            matched_am_pred = matched_am_pred.iloc[0]

            exp_class = cv_df.iloc[i]["label"]
            status = cv_df.iloc[i]["Review status"]
            pred_class = matched_am_pred["am_class"]

            if exp_class != "ambiguous":
                if exp_class != pred_class:

                    matched_am_pred["ClinVar label"] = exp_class
                    matched_am_pred["ClinVar status"] = status

                    # print(matched_am_pred)
                    # print("flow")

                    mismatch_df = mismatch_df.append(matched_am_pred)
    
    mismatch_dict[gene] = mismatch_df

  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_df = mismatch_df.append(matched_am_pred)
  mismatch_d

In [155]:
#Observe mismatches for both genes

print(len(mismatch_dict["am_myh7"]))
mismatch_dict["am_myh7"].head(50)


61


Unnamed: 0,CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ClinVar label,ClinVar status
8260,chr14,23426061,C,G,hg38,P12883,ENST00000355349.4,V689L,0.9478,likely_pathogenic,likely_benign,"criteria provided, single submitter"
11544,chr14,23431801,G,A,hg38,P12883,ENST00000355349.4,A200V,0.5483,ambiguous,likely_pathogenic,"criteria provided, single submitter"
8838,chr14,23427672,G,C,hg38,P12883,ENST00000355349.4,L601V,0.4057,ambiguous,likely_pathogenic,"criteria provided, single submitter"
5822,chr14,23422262,G,T,hg38,P12883,ENST00000355349.4,L1055M,0.1496,likely_benign,likely_pathogenic,"criteria provided, single submitter"
10066,chr14,23429095,C,T,hg38,P12883,ENST00000355349.4,A423T,0.1865,likely_benign,likely_pathogenic,no assertion criteria provided
5917,chr14,23422306,T,C,hg38,P12883,ENST00000355349.4,Q1040R,0.2753,likely_benign,likely_pathogenic,no assertion criteria provided
7008,chr14,23424818,A,G,hg38,P12883,ENST00000355349.4,M877T,0.2905,likely_benign,likely_pathogenic,no assertion criteria provided
375,chr14,23414026,T,C,hg38,P12883,ENST00000355349.4,K1879R,0.2235,likely_benign,likely_pathogenic,"criteria provided, single submitter"
8508,chr14,23427241,C,T,hg38,P12883,ENST00000355349.4,R652K,0.5207,ambiguous,likely_pathogenic,"criteria provided, single submitter"
5979,chr14,23423556,T,G,hg38,P12883,ENST00000355349.4,Q1030H,0.3505,ambiguous,likely_benign,"criteria provided, single submitter"


In [156]:
print(len(mismatch_dict["am_mybpc3"]))
mismatch_dict["am_mybpc3"].head(50)

# mismatch_dict["mybpc3"]["ClinVar status"].iloc[10]

20


Unnamed: 0,CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ClinVar label,ClinVar status
11980,chr11,47346217,C,G,hg38,Q14896,ENST00000545968.6,K360N,0.4657,ambiguous,likely_pathogenic,"criteria provided, single submitter"
8006,chr11,47339740,C,T,hg38,Q14896,ENST00000545968.6,V660M,0.2616,likely_benign,likely_pathogenic,"criteria provided, single submitter"
10810,chr11,47343029,A,G,hg38,Q14896,ENST00000545968.6,F448S,0.4113,ambiguous,likely_pathogenic,"criteria provided, single submitter"
10110,chr11,47342697,C,A,hg38,Q14896,ENST00000545968.6,R502L,0.4558,ambiguous,likely_pathogenic,"criteria provided, single submitter"
8084,chr11,47339758,G,C,hg38,Q14896,ENST00000545968.6,R654G,0.2447,likely_benign,likely_pathogenic,no assertion criteria provided
10774,chr11,47343021,C,G,hg38,Q14896,ENST00000545968.6,E451Q,0.1996,likely_benign,likely_pathogenic,"criteria provided, single submitter"
15890,chr11,47351356,T,C,hg38,Q14896,ENST00000545968.6,T59A,0.096,likely_benign,likely_pathogenic,no assertion criteria provided
11936,chr11,47346207,C,T,hg38,Q14896,ENST00000545968.6,A364T,0.1072,likely_benign,likely_pathogenic,"criteria provided, multiple submitters, no con..."
8846,chr11,47341991,C,T,hg38,Q14896,ENST00000545968.6,R597Q,0.1985,likely_benign,likely_pathogenic,"criteria provided, multiple submitters, no con..."
11694,chr11,47343571,G,A,hg38,Q14896,ENST00000545968.6,R382W,0.3417,ambiguous,likely_benign,"criteria provided, multiple submitters, no con..."


In [157]:
#Preprocess EVE dataframe


eve_dfs = [eve_myh7_df, eve_mybpc3_df]
adj_eve_dfs = []
for eve_df in eve_dfs:
    adj_eve_df = pd.DataFrame()

    adj_eve_df = eve_df.dropna(subset="EVE_classes_75_pct_retained_ASM")

    #Standardize labels with those used above
    adj_eve_df["EVE_classes_75_pct_retained_ASM"] = adj_eve_df["EVE_classes_75_pct_retained_ASM"].apply(lambda x: "ambiguous" if x == "Uncertain" else ("likely_benign" if "Benign" in x or "benign" in x else "likely_pathogenic"))

    #Add mutation label to match those of ClinVar
    eve_df["protein_variant"] = eve_df["wt_aa"] + eve_df["position"].apply(lambda x: str(x)) + eve_df["mt_aa"]
    adj_eve_df["protein_variant"] = adj_eve_df["wt_aa"] + eve_df["position"].apply(lambda x: str(x)) + eve_df["mt_aa"]

    adj_eve_dfs.append(adj_eve_df)


eve_myh7_df_adj = adj_eve_dfs[0]
eve_mybpc3_df_adj = adj_eve_dfs[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_eve_df["EVE_classes_75_pct_retained_ASM"] = adj_eve_df["EVE_classes_75_pct_retained_ASM"].apply(lambda x: "ambiguous" if x == "Uncertain" else ("likely_benign" if "Benign" in x or "benign" in x else "likely_pathogenic"))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_eve_df["protein_variant"] = adj_eve_df["wt_aa"] + eve_df["position"].apply(lambda x: str(x)) + eve_df["mt_aa"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_eve_df["EVE_classes_75_pct_retained_ASM"] = adj_eve_df["EVE_classes_75_pct_retained_ASM"].apply(lambda x: "ambiguous" if x == "Uncertain" else ("likely_benign" if "Benign" in x or "benign" in x else "likely_pathogenic"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

In [158]:
#Find mismatches between EVE and ClinVar

for cv_df, eve_df, gene in [(cv_myh7_df, eve_myh7_df_adj, "eve_myh7"), (cv_mybpc3_df, eve_mybpc3_df_adj, "eve_mybpc3")]:

    mismatch_df = pd.DataFrame()
    for i in range(len(cv_df)):
        matched_eve_pred = eve_df[eve_df["protein_variant"] == cv_df.iloc[i]["Protein change"]]

        if not matched_eve_pred.empty:
            matched_eve_pred = matched_eve_pred.iloc[0]

            exp_class = cv_df.iloc[i]["label"]
            status = cv_df.iloc[i]["Review status"]
            pred_class = matched_eve_pred["EVE_classes_75_pct_retained_ASM"]

            if exp_class != "ambiguous":
                if exp_class != pred_class:

                    matched_eve_pred["ClinVar label"] = exp_class
                    matched_eve_pred["ClinVar status"] = status

                    # print(matched_eve_pred)
                    # print("flow")

                    mismatch_df = mismatch_df.append(matched_eve_pred)
    
    mismatch_dict[gene] = mismatch_df[["protein_variant", "EVE_scores_ASM", "EVE_classes_75_pct_retained_ASM", "ClinVar label", "uncertainty_ASM", "ClinVar status"]]

  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve_pred)
  mismatch_df = mismatch_df.append(matched_eve

In [159]:
print(len(mismatch_dict["eve_myh7"]))
mismatch_dict["eve_myh7"].head(50)

153


Unnamed: 0,protein_variant,EVE_scores_ASM,EVE_classes_75_pct_retained_ASM,ClinVar label,uncertainty_ASM,ClinVar status
10720,E537A,0.613361,ambiguous,likely_pathogenic,0.667221,"criteria provided, single submitter"
6033,L302Q,0.350133,likely_benign,likely_pathogenic,0.647529,"criteria provided, single submitter"
13769,V689L,0.911706,likely_pathogenic,likely_benign,0.298573,"criteria provided, single submitter"
3997,A200V,0.424342,ambiguous,likely_pathogenic,0.681655,"criteria provided, single submitter"
12017,L601V,0.364945,ambiguous,likely_pathogenic,0.65621,"criteria provided, single submitter"
12060,T604A,0.344009,likely_benign,likely_pathogenic,0.643659,"criteria provided, single submitter"
35733,E1787Q,0.619193,ambiguous,likely_pathogenic,0.664458,"criteria provided, single submitter"
8235,T412S,0.357886,likely_benign,likely_pathogenic,0.652192,"criteria provided, single submitter"
4333,E217Q,0.622046,ambiguous,likely_pathogenic,0.663054,"criteria provided, single submitter"
2954,S148R,0.525667,ambiguous,likely_pathogenic,0.691829,"criteria provided, single submitter"


In [160]:
print(len(mismatch_dict["eve_mybpc3"]))
mismatch_dict["eve_mybpc3"].head(50)

19


Unnamed: 0,protein_variant,EVE_scores_ASM,EVE_classes_75_pct_retained_ASM,ClinVar label,uncertainty_ASM,ClinVar status
13190,V660M,0.394654,ambiguous,likely_pathogenic,0.670785,"criteria provided, single submitter"
22876,M1144T,0.432483,ambiguous,likely_benign,0.684002,"criteria provided, single submitter"
10029,R502L,0.616226,ambiguous,likely_pathogenic,0.665881,"criteria provided, single submitter"
13065,R654G,0.124543,likely_benign,likely_pathogenic,0.375881,no assertion criteria provided
25264,C1264F,0.392013,ambiguous,likely_pathogenic,0.66964,"criteria provided, single submitter"
15222,V762D,0.271664,likely_benign,likely_pathogenic,0.584906,"criteria provided, multiple submitters, no con..."
7276,A364T,0.452626,ambiguous,likely_pathogenic,0.688652,"criteria provided, multiple submitters, no con..."
20038,R1002W,0.677674,likely_pathogenic,likely_benign,0.62861,"criteria provided, multiple submitters, no con..."
7638,R382W,0.466912,ambiguous,likely_benign,0.690956,"criteria provided, multiple submitters, no con..."
22746,R1138H,0.842475,likely_pathogenic,likely_benign,0.435542,"criteria provided, multiple submitters, no con..."


In [161]:
# Put all variants with at least one mismatch in table

#Start by standardizing column names

eve_gene_dict = {"eve_myh7": eve_myh7_df, "eve_mybpc3": eve_mybpc3_df}
am_gene_dict = {"am_myh7": am_myh7_df, "am_mybpc3": am_mybpc3_df}

print(len(mismatch_dict))

print(mismatch_dict.keys())

for key in mismatch_dict.keys():
    if "am" in key:
        mismatch_dict[key].drop(columns=["CHROM", "POS", "REF", "ALT", "genome", "uniprot_id", "transcript_id"], inplace=True)
        mismatch_dict[key]["EVE_scores_ASM"] = eve_gene_dict[key.replace("am", "eve")][eve_gene_dict[key.replace("am", "eve")]["protein_variant"].isin(mismatch_dict[key]["protein_variant"].values)]["EVE_scores_ASM"].values
        mismatch_dict[key]["EVE_classes_75_pct_retained_ASM"] = eve_gene_dict[key.replace("am", "eve")][eve_gene_dict[key.replace("am", "eve")]["protein_variant"].isin(mismatch_dict[key]["protein_variant"].values)]["EVE_classes_75_pct_retained_ASM"].values                      
    else:
        mismatch_dict[key]["am_pathogenicity"] = am_gene_dict[key.replace("eve", "am")][am_gene_dict[key.replace("eve", "am")]["protein_variant"].isin(mismatch_dict[key]["protein_variant"].values)]["am_pathogenicity"].values
        mismatch_dict[key]["am_class"] = am_gene_dict[key.replace("eve", "am")][am_gene_dict[key.replace("eve", "am")]["protein_variant"].isin(mismatch_dict[key]["protein_variant"].values)]["am_class"].values

    print(eve_gene_dict[key.replace("am", "eve")][eve_gene_dict[key.replace("am", "eve")]["protein_variant"].isin(mismatch_dict[key]["protein_variant"].values)]["EVE_scores_ASM"])
    print(eve_gene_dict[key.replace("am", "eve")][eve_gene_dict[key.replace("am", "eve")]["protein_variant"].isin(mismatch_dict[key]["protein_variant"].values)]["EVE_classes_75_pct_retained_ASM"])
    print(mismatch_dict[key])
    break

concat_df = pd.DataFrame()
for key in mismatch_dict.keys():
    concat_df = pd.concat([concat_df, mismatch_dict[key]])

print(len(concat_df))

concat_df.to_csv(path_or_buf="three_way_comp.csv", index=False)
concat_df.head(50)



In [199]:
#Find accuracy of AlphaMissense and EVE for both each model-gene pair_scores = {}
accuracy_scores = {}
micro_avg_precision_scores = {}
micro_avg_recall_scores = {}
macro_avg_precision_scores = {}
macro_avg_recall_scores = {}
count = 0

for cv_df, pred_df, model_gene_pair in [(cv_myh7_df, am_myh7_df, "am_myh7"), (cv_mybpc3_df, am_mybpc3_df, "am_mybpc3"),
                           (cv_myh7_df, eve_myh7_df_adj, "eve_myh7"), (cv_mybpc3_df, eve_mybpc3_df_adj, "eve_mybpc3")]:

    # mismatch_df = pd.DataFrame()
    matched_count = 0
    mismatch_count = 0
    newly_class = 0

    #Define dictionary to hold values needed to find the precision and recall for each model-gene pair
    metrics_dict = {"likely_pathogenic" : {"true_pos": 0, "false_pos": 0, "false_neg": 0},
                    "likely_benign_or_ambi" : {"true_pos": 0, "false_pos": 0, "false_neg": 0}}
    print("Number of ClinVar: " + str(len(cv_df)))
    for i in range(len(cv_df)):
        matched_pred = pred_df[pred_df["protein_variant"] == cv_df.iloc[i]["Protein change"]]

        #Drops frameshift mutations and NaNs

        if not matched_pred.empty:
            # Database may have duplicates, only consider one row
            matched_pred = matched_pred.iloc[0]

            #Segment to find accuracy
            exp_class = cv_df.iloc[i]["label"]
            if count <= 1:
                pred_class = matched_pred["am_class"]
            else:
                pred_class = matched_pred["EVE_classes_75_pct_retained_ASM"]

            # print("Expected Class :" + str(exp_class))
            # print("Predicted Class :" + str(pred_class))
            if exp_class != "ambiguous":

                if exp_class == pred_class:
                    matched_count += 1
                else:
                    mismatch_count += 1
            else:

                #New prediction made by model, mark is as positive
                matched_count += 1
                newly_class += 1
            


            #Segment to find metrics for class (to derive prec. and recall for each model-gene pair)
            curr_class = exp_class
            
            if curr_class != "ambiguous":
                
                #Use combination key if class is likely_benign or ambiguous
                if curr_class == "likely_benign":
                    curr_class = "likely_benign_or_ambi"
                
                if pred_class == "likely_benign" or pred_class == "ambiguous":
                    pred_class = "likely_benign_or_ambi"

                #True positive if class is predicted correctly
                if pred_class == curr_class:
                    metrics_dict[curr_class]["true_pos"] += 1
                
                else:

                    # False negative for this class
                    metrics_dict[curr_class]["false_neg"] += 1

                    #False positive for the predicted class
                    metrics_dict[pred_class]["false_pos"] += 1

    accuracy_scores[model_gene_pair] = matched_count/(mismatch_count+matched_count)
    print("Mismatch Count: " + str(mismatch_count))
    print("Match Count: " + str(matched_count))
    print("Newly Classified: " + str(newly_class))

    print(metrics_dict)

    precision_list = [f"Precision for {x}: " + str(metrics_dict[x]["true_pos"]/(metrics_dict[x]["true_pos"]+metrics_dict[x]["false_pos"])) for x in metrics_dict.keys()]
    print(f"PRECISION LIST FOR {model_gene_pair}: " + str(precision_list))
    # macro_avg_precision_scores[model_gene_pair] = sum(precision_list)/len(metrics_dict)
    
    recall_list = [f"Recall for {x}: " + str(metrics_dict[x]["true_pos"]/(metrics_dict[x]["true_pos"]+metrics_dict[x]["false_neg"])) for x in metrics_dict.keys()]
    print(f"RECALL LIST FOR {model_gene_pair}: " + str(recall_list))

    # macro_avg_recall_scores[model_gene_pair] = sum(recall_list)/len(metrics_dict)

    overall_tp = sum([metrics_dict[x]["true_pos"] for x in metrics_dict.keys()])
    overall_fp = sum([metrics_dict[x]["false_pos"] for x in metrics_dict.keys()])
    overall_fn = sum([metrics_dict[x]["false_neg"] for x in metrics_dict.keys()])

    micro_avg_precision_scores[model_gene_pair] = overall_tp/(overall_tp + overall_fp)
    micro_avg_recall_scores[model_gene_pair] = overall_tp/(overall_tp + overall_fn)

    count += 1
print(accuracy_scores)
# print(micro_avg_precision_scores)
# print(micro_avg_recall_scores)
# print(macro_avg_precision_scores)
# print(macro_avg_recall_scores)


'''
QUESTIONS
---Should we include the newly classified variants in the performacy metrics or not?
---Should the micro- or macro- average classified variants be used for analysis (or both)?
'''


    

Number of ClinVar: 3732
Mismatch Count: 61
Match Count: 1924
Newly Classified: 1642
{'likely_pathogenic': {'true_pos': 271, 'false_pos': 5, 'false_neg': 54}, 'likely_benign_or_ambi': {'true_pos': 13, 'false_pos': 54, 'false_neg': 5}}
PRECISION LIST FOR am_myh7: ['Precision for likely_pathogenic: 0.9818840579710145', 'Precision for likely_benign_or_ambi: 0.19402985074626866']
RECALL LIST FOR am_myh7: ['Recall for likely_pathogenic: 0.8338461538461538', 'Recall for likely_benign_or_ambi: 0.7222222222222222']
Number of ClinVar: 3036
Mismatch Count: 20
Match Count: 1149
Newly Classified: 1102
{'likely_pathogenic': {'true_pos': 22, 'false_pos': 1, 'false_neg': 17}, 'likely_benign_or_ambi': {'true_pos': 27, 'false_pos': 17, 'false_neg': 1}}
PRECISION LIST FOR am_mybpc3: ['Precision for likely_pathogenic: 0.9565217391304348', 'Precision for likely_benign_or_ambi: 0.6136363636363636']
RECALL LIST FOR am_mybpc3: ['Recall for likely_pathogenic: 0.5641025641025641', 'Recall for likely_benign_or_a

'\nQUESTIONS\n---Should we include the newly classified variants in the performacy metrics or not?\n---Should the micro- or macro- average classified variants be used for analysis (or both)?\n'

In [None]:
# Get precision and recall

'''Micro-avergaing both of the statistics will probably give a better view of the performance because we want and idea of how well the models
 do on the variants as individuals more than each of the categories'''



#Then write up a page (plain english) about findings
# And here consider the level of evidence
# AFter this,building a classifier

# Relearn XGBoost and hyperparameter tuning

['Q734E']
