In [None]:
# Colab-only setup: mount Google Drive at /content/drive. The CSV paths read
# by the loading cell live under /content/drive/MyDrive, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
# Annotated variant table; prepare_data() reads its 'AAChange.refGene' column.
sample_df = pd.read_csv('/content/drive/MyDrive/somatic_rules/cinli.csv')
# Hotspot lookup table; prepare_data() left-joins it onto the variants by 'id'.
cancer_hotspots_df = pd.read_csv("/content/drive/MyDrive/somatic_rules/cancer_hotspots_revised.csv")

# Preview the first rows of the variant table as the cell output.
sample_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,...,dbscSNV_ADA_SCORE,dbscSNV_RF_SCORE,MCAP,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG,REVEL,rules
0,0,0,12,25209871,25209871,C,T,exonic,KRAS,.,...,.,.,0.568996615134,54295,Rasopathy|not_specified,MONDO:MONDO:0021060\x2cMedGen:CN166718\x2cOrph...,criteria_provided\x2c_multiple_submitters\x2c_...,Uncertain_significance,0.745,OP4
1,1,1,12,25227330,25227330,C,A,exonic,KRAS,.,...,.,.,0.123105525756,432423,Noonan_syndrome_3|Rasopathy,MONDO:MONDO:0012371\x2cMedGen:C1860991\x2cOMIM...,reviewed_by_expert_panel,Likely_pathogenic,0.6,OP4
2,2,2,12,25227343,25227343,G,T,exonic,KRAS,.,...,.,.,0.0655137715661,175715,Non-small_cell_lung_cancer|Neoplasm_of_the_thy...,Human_Phenotype_Ontology:HP:0030358\x2cMONDO:M...,criteria_provided\x2c_single_submitter,Pathogenic,0.63,OP4
3,3,3,12,25245350,25245350,C,G,exonic,KRAS,.,...,.,.,0.0910971912578,54289,Multiple_myeloma|Lung_adenocarcinoma|Non-small...,Human_Phenotype_Ontology:HP:0006775\x2cMONDO:M...,no_assertion_criteria_provided,Conflicting_interpretations_of_pathogenicity,0.841,OP4
4,4,4,13,28018483,28018483,T,C,exonic,FLT3,.,...,.,.,0.293007099215,362981,Acute_myeloid_leukemia,Human_Phenotype_Ontology:HP:0001914\x2cHuman_P...,no_assertion_criteria_provided,Likely_pathogenic,0.925,OP4


In [4]:

import re
import pandas as pd



def convert_float(value):
    """Convert ``value`` to ``float``, or return ``False`` if it cannot be parsed.

    Args:
        value: Anything accepted by ``float()`` (number, numeric string, ...).

    Returns:
        The parsed ``float``, or the sentinel ``False`` when ``float(value)``
        raises ``TypeError``, ``ValueError`` or ``OverflowError`` (for example
        for ``"."``, ``None`` or a list).

    Note:
        ``False`` compares equal to ``0`` in numeric comparisons, so callers
        must test ``result is False`` before comparing against a threshold;
        otherwise an unparseable value behaves like a score of zero.
    """
    try:
        return float(value)
    # Only conversion failures are swallowed; a bare ``except`` would also hide
    # KeyboardInterrupt/SystemExit and make a long run impossible to stop.
    except (TypeError, ValueError, OverflowError):
        return False

def prepare_data(sample_df, cancer_hotspots_df):
    """Attach cancer-hotspot columns to each variant via a derived ``id`` key.

    The key is built from the ``AAChange.refGene`` string as
    ``<text before the first ':'>_<text after the last ':p.'>``; for example
    ``"KRAS:NM_004985:exon2:c.G35A:p.G12D"`` becomes ``"KRAS_G12D"``.

    Args:
        sample_df: Variant table with a string ``AAChange.refGene`` column.
            It is NOT modified; the ``id`` column is added to a copy.
        cancer_hotspots_df: Lookup table with an ``id`` column in the same
            format.

    Returns:
        A new DataFrame: ``sample_df`` plus ``id``, left-joined with
        ``cancer_hotspots_df``. Variants without a matching hotspot keep their
        row and get NaN in the hotspot columns.

    Raises:
        AttributeError: If an ``AAChange.refGene`` value is not a string
            (e.g. NaN), because it cannot be split.
    """
    def build_id(aa_change):
        # Gene symbol is the first ':'-separated field; the protein change is
        # whatever follows the last ':p.' marker.
        return aa_change.split(':')[0] + '_' + aa_change.split(':p.')[-1]

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    # and re-running the notebook cell stays idempotent.
    keyed_df = sample_df.assign(id=sample_df['AAChange.refGene'].map(build_id))
    return pd.merge(keyed_df, cancer_hotspots_df, left_on=["id"], right_on=["id"], how="left")

class Rules:
    """Evaluate boolean evidence flags for a single annotated variant row.

    Constructing an instance immediately runs the three rule groups and stores
    every result as an attribute on the instance:

    * population frequency  -> ``SBVS1``, ``SBS1``, ``OP4``
    * in-silico predictors  -> ``OP1``, ``SBP1``, ``SBP2``
    * cancer-hotspot counts -> ``OS3``, ``OM3``, ``OP3``

    Missing annotations (``"."``, NaN, ``None``, unparseable text) are treated
    as "no evidence": a missing score casts no vote and a missing hotspot count
    is read as zero. The module-level ``convert_float`` is deliberately not
    used here, because its ``False`` sentinel compares like ``0`` and would
    turn every missing score into a benign vote.

    NOTE(review): the flag names and cut-offs look like ClinGen/CGC/VICC
    oncogenicity codes -- confirm them against the guideline being implemented.

    Args:
        row: A ``pandas.Series`` for one variant (e.g. ``df.iloc[i]``) whose
            index holds the annotation column names.
    """

    # Cut-offs applied (with >=) to the ExAC/gnomAD column values.
    POP_SBVS1_MIN = 0.05
    POP_SBS1_MIN = 0.01

    # Score cut-offs: at or below the first value is benign, at or above the
    # second is pathogenic. DANN is the exception on the benign side: the
    # comparison is strict (<).
    CADD_BENIGN_MAX = 23.2
    CADD_PATHOGENIC_MIN = 25.6
    DANN_BENIGN_BELOW = 0.974
    DANN_PATHOGENIC_MIN = 0.995
    REVEL_BENIGN_MAX = 0.25
    REVEL_PATHOGENIC_MIN = 0.75

    # Hotspot count cut-offs.
    HOTSPOT_TOTAL_MIN = 10
    HOTSPOT_MUTATION_MIN = 50

    def __init__(self, row):
        self.variant = row
        self.pop_rules()
        self.in_silico_rules()
        self.cancer_hotspot_rules()

    @staticmethod
    def _to_float(value):
        """Return ``value`` as a float, or ``None`` if missing/unparseable/NaN."""
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if number != number else number

    @classmethod
    def _to_count(cls, value):
        """Return ``value`` as an int count, reading a missing value as 0."""
        number = cls._to_float(value)
        return 0 if number is None else int(number)

    def _categorical_vote(self, column):
        """Vote from a T/D predictor column: -1 benign, +1 pathogenic, 0 none."""
        call = self.variant[column]
        if call == "T":
            return -1
        if call == "D":
            return 1
        return 0

    def _numeric_vote(self, column, benign_cutoff, pathogenic_cutoff, benign_inclusive=True):
        """Vote from a numeric score column: -1 benign, +1 pathogenic, 0 none."""
        score = self._to_float(self.variant[column])
        if score is None:
            # A missing score is absence of evidence, not benign evidence.
            return 0
        if benign_inclusive:
            is_benign = score <= benign_cutoff
        else:
            is_benign = score < benign_cutoff
        if is_benign:
            return -1
        if score >= pathogenic_cutoff:
            return 1
        return 0

    def pop_rules(self):
        """Set ``SBVS1``/``SBS1``/``OP4`` from the ExAC and gnomAD columns.

        Every index label containing ``"ExAC"`` or ``"gnomAD"`` is used.
        Exactly one flag ends up True: ``SBVS1`` if any value is >= 0.05,
        otherwise ``SBS1`` if any value is >= 0.01, otherwise ``OP4``
        (which is also the outcome when no frequency is available).
        """
        pop_columns = [
            column for column in self.variant.index
            if "ExAC" in str(column) or "gnomAD" in str(column)
        ]
        # "." and any other non-numeric placeholder become NaN, which never
        # satisfies a >= comparison.
        frequencies = pd.to_numeric(self.variant[pop_columns], errors="coerce")
        above_sbvs1 = bool((frequencies >= self.POP_SBVS1_MIN).any())
        above_sbs1 = bool((frequencies >= self.POP_SBS1_MIN).any())

        self.SBVS1 = above_sbvs1
        self.SBS1 = above_sbs1 and not above_sbvs1
        self.OP4 = not above_sbs1

    def in_silico_rules(self):
        """Set ``OP1``/``SBP1``/``SBP2`` by majority vote of five predictors.

        FATHMM and MetaLR vote through their ``T``/``D`` calls; CADD, DANN and
        REVEL vote through score cut-offs. More pathogenic votes sets ``OP1``,
        more benign votes sets ``SBP1``; on a tie ``SBP2`` is set only when
        ``ExonicFunc.refGene`` is ``"synonymous SNV"``.
        """
        votes = [
            self._categorical_vote("FATHMM_pred"),
            self._numeric_vote("CADD_phred", self.CADD_BENIGN_MAX, self.CADD_PATHOGENIC_MIN),
            self._numeric_vote(
                "DANN_score",
                self.DANN_BENIGN_BELOW,
                self.DANN_PATHOGENIC_MIN,
                benign_inclusive=False,
            ),
            self._numeric_vote("REVEL", self.REVEL_BENIGN_MAX, self.REVEL_PATHOGENIC_MIN),
            self._categorical_vote("MetaLR_pred"),
        ]
        pathogenic_votes = votes.count(1)
        benign_votes = votes.count(-1)

        self.OP1 = pathogenic_votes > benign_votes
        self.SBP1 = benign_votes > pathogenic_votes
        # ExonicFunc.refGene is only consulted on a tie (short-circuit ``and``).
        self.SBP2 = bool(
            pathogenic_votes == benign_votes
            and self.variant["ExonicFunc.refGene"] == "synonymous SNV"
        )

    def cancer_hotspot_rules(self):
        """Set ``OS3``/``OM3``/``OP3`` from the merged hotspot counts.

        With ``total_variant_count`` >= 10, ``OS3`` is set when
        ``Mutation_Count`` >= 50 and ``OM3`` otherwise. With a total between
        1 and 9, ``OP3`` is set. A total of 0 -- including the NaN left by the
        left merge for variants with no hotspot entry -- sets none of them.
        """
        total = self._to_count(self.variant["total_variant_count"])

        self.OS3 = False
        self.OM3 = False
        self.OP3 = False
        if total >= self.HOTSPOT_TOTAL_MIN:
            # Mutation_Count is only read for well-populated positions.
            mutation_count = self._to_count(self.variant["Mutation_Count"])
            if mutation_count >= self.HOTSPOT_MUTATION_MIN:
                self.OS3 = True
            else:
                self.OM3 = True
        elif total > 0:
            self.OP3 = True

    def print_pop_rules(self):
        """Print every flag as ``NAME:value``, one per line.

        Despite the name, this prints all nine flags, not just the population
        ones; the name is kept so existing notebook cells keep working.
        """
        for name in ("SBVS1", "OP4", "SBS1", "OP1", "SBP1", "SBP2", "OS3", "OM3", "OP3"):
            print(f"{name}:{getattr(self, name)}")


# Join the hotspot table onto the variants, then evaluate the rules for one row.
sample_df_ = prepare_data(sample_df, cancer_hotspots_df)
# NOTE(review): positional row 18 is hard-coded; nothing here says why this
# variant was chosen -- confirm, or loop over all rows instead.
rule_object = Rules(sample_df_.iloc[18])


In [5]:
# Print every flag computed for the selected variant (not only the population ones).
rule_object.print_pop_rules()

SBVS1:False
OP4:True
SBS1:False
OP1:True
SBP1:False
SBP2:False
OS3:True
OM3:False
OP3:False
