In [1]:
import pandas as pd

In [2]:
# Load the filtered census table; the first CSV column is used as the row index.
cosmic = pd.read_csv(
    'cosmic_census.csv',
    index_col=0,
)

In [3]:
# Keep only the rows whose ONC_TSG value is exactly 'TSG'.
cosmic_tsg = cosmic.loc[cosmic['ONC_TSG'].eq('TSG')]

In [4]:
# Add the TP53 rows to the TSG subset and renumber the index from 0.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
cosmic_tsg = pd.concat(
    [cosmic_tsg, cosmic.loc[cosmic['GENE_NAME'] == 'TP53']],
    ignore_index=True,
)

In [5]:
# Discard rows that have no GRCh38 genome position.
cosmic_tsg = cosmic_tsg.loc[cosmic_tsg['Mutation genome position GRCh38'].notna()]

In [6]:
# Keep the first row for each (gene, GRCh38 position) pair.
cosmic_tsg = cosmic_tsg.loc[
    ~cosmic_tsg.duplicated(subset=['GENE_NAME', 'Mutation genome position GRCh38'])
]

In [7]:
# Distinct gene names in the filtered table, as a sorted Python list.
all_genes = cosmic_tsg['GENE_NAME'].unique().tolist()
all_genes.sort()

In [37]:
import requests

def get_gene_location(gene_symbol):
    """Look up a gene's chromosome and coordinates through NCBI E-utilities.

    Two HTTP requests are made: an ``esearch`` on the ``gene`` database to
    resolve ``gene_symbol`` to a Gene ID (the first hit is used), followed by
    an ``esummary`` for that ID, from which the first ``genomicinfo`` entry
    is read.

    Parameters
    ----------
    gene_symbol : str
        Gene symbol to search for, e.g. ``"TP53"``.

    Returns
    -------
    tuple
        ``(chromosome, start, end)`` taken verbatim from the summary's
        ``chromosome``, ``chrstart`` and ``chrstop`` fields. ``start`` can be
        greater than ``end`` (presumably for reverse-strand genes -- TODO
        confirm), so callers that need an ordered interval must sort the pair.

    Raises
    ------
    requests.HTTPError
        If either request comes back with an error status.
    requests.Timeout
        If either request does not answer within the timeout.
    IndexError
        If the search returns no Gene ID, or the summary carries no genomic
        location entry.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    search_url = base_url + "esearch.fcgi"
    summary_url = base_url + "esummary.fcgi"
    # Without a timeout a stalled connection would block the notebook forever.
    timeout_seconds = 30

    # Perform a search to retrieve the Gene ID for the given gene symbol
    search_params = {
        "db": "gene",
        "term": gene_symbol + "[Gene Name] AND Homo sapiens[Organism] AND GRCh38[Assembly]",
        "retmode": "json"
    }
    search_response = requests.get(search_url, params=search_params, timeout=timeout_seconds)
    # Fail on HTTP errors here instead of on a confusing JSON/KeyError later.
    search_response.raise_for_status()
    id_list = search_response.json()["esearchresult"]["idlist"]
    if not id_list:
        raise IndexError(
            "No NCBI Gene ID found for gene symbol {!r}".format(gene_symbol)
        )
    gene_id = id_list[0]

    # Retrieve summary information for the Gene ID to get the gene location
    summary_params = {
        "db": "gene",
        "id": gene_id,
        "retmode": "json"
    }
    summary_response = requests.get(summary_url, params=summary_params, timeout=timeout_seconds)
    summary_response.raise_for_status()
    gene_summary = summary_response.json()["result"][gene_id]

    # Extract and return gene location information
    chromosome = gene_summary["chromosome"]
    genomic_info = gene_summary["genomicinfo"]
    if not genomic_info:
        raise IndexError(
            "NCBI Gene {} ({!r}) has no genomic location entry".format(gene_id, gene_symbol)
        )
    start = genomic_info[0]["chrstart"]
    end = genomic_info[0]["chrstop"]
    return chromosome, start, end

# Look up every gene in all_genes and collect one
# (symbol, chromosome, start, end) tuple per gene.

gene_pos = []

for gene_symbol in all_genes:
    chromosome, start, end = get_gene_location(gene_symbol)
    gene_pos.append((gene_symbol, chromosome, start, end))

In [39]:
# Turn the collected tuples into a table with one row per gene.
pd_gene = pd.DataFrame(
    data=gene_pos,
    columns=['gene', 'chrom', 'pos_start', 'pos_stop'],
)

In [41]:
# Persist the gene-position table (row index included) as CSV.
pd_gene.to_csv(path_or_buf='gene_position.csv')

In [42]:
# Display the assembled gene-position table.
pd_gene

Unnamed: 0,gene,chrom,pos_start,pos_stop
0,ACVR1B,12,51951698,51997077
1,ACVR2A,2,147844516,147930821
2,AMER1,X,64205707,64185116
3,APC,5,112707497,112846238
4,ARHGAP35,19,46860996,47005076
...,...,...,...,...
181,XPC,3,14178600,14145146
182,ZFHX3,16,73891929,72782884
183,ZMYM3,X,71255289,71239623
184,ZNRF3,22,28883571,29057487


In [27]:
# Load the semicolon-separated gene range table; no column is used as the index.
pd_tsg_larger = pd.read_csv(
    'larger_tsg_range.csv',
    sep=';',
    index_col=None,
)

In [28]:
# Display the table loaded from larger_tsg_range.csv.
pd_tsg_larger

Unnamed: 0,Gene,Chromosome,Start,End
0,ACVR1B,chr12,53413594,53455780
1,ACVR2A,chr2,148954290,149031473
2,AMER1,chrX,70438121,70477966
3,APC,chr5,112073701,112219330
4,ARHGAP35,chr19,37767762,37829217
...,...,...,...,...
181,XPC,chr3,97941070,97956753
182,ZFHX3,chr16,68569758,68612285
183,ZMYM3,chr1,159929703,159962063
184,ZNRF3,chr22,23033545,23042018


In [2]:
# Load the full tab-separated Cancer Mutation Census export.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
pd_driver = pd.read_csv(
    '../CancerMutationCensus_AllData_v99_GRCh38.tsv',
    sep='\t',
    low_memory=False,
)

In [17]:
# List the column names of the full census table.
pd_driver.columns

Index(['GENE_NAME', 'ACCESSION_NUMBER', 'ONC_TSG', 'CGC_TIER', 'MUTATION_URL',
       'LEGACY_MUTATION_ID', 'Mutation CDS', 'Mutation AA', 'AA_MUT_START',
       'AA_MUT_STOP', 'SHARED_AA', 'GENOMIC_WT_ALLELE_SEQ',
       'GENOMIC_MUT_ALLELE_SEQ', 'AA_WT_ALLELE_SEQ', 'AA_MUT_ALLELE_SEQ',
       'Mutation Description CDS', 'Mutation Description AA',
       'ONTOLOGY_MUTATION_CODE', 'GENOMIC_MUTATION_ID',
       'Mutation genome position GRCh37', 'Mutation genome position GRCh38',
       'COSMIC_SAMPLE_TESTED', 'COSMIC_SAMPLE_MUTATED', 'DISEASE',
       'WGS_DISEASE', 'EXAC_AF', 'EXAC_AFR_AF', 'EXAC_AMR_AF', 'EXAC_ADJ_AF',
       'EXAC_EAS_AF', 'EXAC_FIN_AF', 'EXAC_NFE_AF', 'EXAC_SAS_AF',
       'GNOMAD_EXOMES_AF', 'GNOMAD_EXOMES_AFR_AF', 'GNOMAD_EXOMES_AMR_AF',
       'GNOMAD_EXOMES_ASJ_AF', 'GNOMAD_EXOMES_EAS_AF', 'GNOMAD_EXOMES_FIN_AF',
       'GNOMAD_EXOMES_NFE_AF', 'GNOMAD_EXOMES_SAS_AF', 'GNOMAD_GENOMES_AF',
       'GNOMAD_GENOMES_AFR_AF', 'GNOMAD_GENOMES_AMI_AF',
       'GNOMAD_GE

In [20]:
# Keep only mutations in significance tier 1, 2 or 3.
# Tiers are matched as strings ('1', '2', '3').
pd_tier = pd_driver.loc[
    pd_driver['MUTATION_SIGNIFICANCE_TIER'].isin(['1', '2', '3'])
]

In [21]:
# Drop rows that have no ONC_TSG annotation.
pd_tier = pd_tier.loc[pd_tier['ONC_TSG'].notna()]

In [23]:
# Narrow the table to gene name, role, tiers and genome positions.
pd_tier = pd_tier[[
    'GENE_NAME',
    'ONC_TSG',
    'CGC_TIER',
    'MUTATION_SIGNIFICANCE_TIER',
    'Mutation genome position GRCh37',
    'Mutation genome position GRCh38',
]]

In [25]:
# Persist the tier-filtered table (row index included) as CSV.
pd_tier.to_csv(path_or_buf='all_tier.csv')

In [26]:
# Take a narrow copy of the census table: gene name, role, tiers and genome positions.
pd_driver_small = pd_driver[[
    'GENE_NAME',
    'ONC_TSG',
    'CGC_TIER',
    'MUTATION_SIGNIFICANCE_TIER',
    'Mutation genome position GRCh37',
    'Mutation genome position GRCh38',
]]

In [29]:
# Drop rows that have no ONC_TSG annotation.
pd_driver_small = pd_driver_small.loc[pd_driver_small['ONC_TSG'].notna()]

In [31]:
# Keep only mutations in significance tier 1, 2 or 3.
# Tiers are matched as strings ('1', '2', '3').
pd_driver_small = pd_driver_small.loc[
    pd_driver_small['MUTATION_SIGNIFICANCE_TIER'].isin(['1', '2', '3'])
]

In [33]:
# Renumber rows from 0 after filtering; the old index is discarded.
pd_driver_small = pd_driver_small.reset_index(
    drop=True,
)

In [35]:
# Persist the filtered census table (row index included) as CSV.
pd_driver_small.to_csv(path_or_buf='cosmic_census.csv')

In [4]:
# Select rows whose ONC_TSG value is exactly 'oncogene, TSG, fusion'.
pd_driver_tsgonc = pd_driver.loc[
    pd_driver['ONC_TSG'].eq('oncogene, TSG, fusion')
]

In [7]:
# Display the rows whose ONC_TSG value is 'oncogene, TSG, fusion'.
pd_driver_tsgonc

Unnamed: 0,GENE_NAME,ACCESSION_NUMBER,ONC_TSG,CGC_TIER,MUTATION_URL,LEGACY_MUTATION_ID,Mutation CDS,Mutation AA,AA_MUT_START,AA_MUT_STOP,...,GNOMAD_GENOMES_MID_AF,GNOMAD_GENOMES_NFE_AF,GNOMAD_GENOMES_SAS_AF,CLINVAR_CLNSIG,CLINVAR_TRAIT,GERP++_RS,MIN_SIFT_SCORE,MIN_SIFT_PRED,DNDS_DISEASE_QVAL_SIG,MUTATION_SIGNIFICANCE_TIER
186121,ESR1,ENST00000440973.1,"oncogene, TSG, fusion",1.0,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6959903,c.94A>C,p.K32Q,32,32,...,,,,,,5.06,0.000,D,,Other
186122,ESR1,ENST00000440973.1,"oncogene, TSG, fusion",1.0,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM10000648,c.1567G>A,p.E523K,523,523,...,,,,,,5.31,0.001,D,,Other
186123,ESR1,ENST00000440973.1,"oncogene, TSG, fusion",1.0,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM8552626,c.388T>C,p.Y130H,130,130,...,,,,,,4.89,0.000,D,,Other
186124,ESR1,ENST00000440973.1,"oncogene, TSG, fusion",1.0,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM10252331,c.1671_1672delinsTT,p.A558S,558,558,...,,,,,,,1.000,,,Other
186125,ESR1,ENST00000440973.1,"oncogene, TSG, fusion",1.0,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM10267424,c.1756G>T,p.G586W,586,586,...,,,,,,3.53,0.012,D,,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4936271,CREBBP,ENST00000262367.5,"oncogene, TSG, fusion",1.0,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6635352,c.4111G>T,p.V1371F,1371,1371,...,,,,,,4.31,0.017,D,,Other
4936272,CREBBP,ENST00000262367.5,"oncogene, TSG, fusion",1.0,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6983531,c.6760C>T,p.L2254F,2254,2254,...,,,,,,4.07,0.061,T,,Other
4936273,CREBBP,ENST00000262367.5,"oncogene, TSG, fusion",1.0,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6929705,c.2116G>A,p.G706R,706,706,...,,,,,,4.68,0.011,D,,Other
4936274,CREBBP,ENST00000262367.5,"oncogene, TSG, fusion",1.0,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM88752,c.4496T>C,p.L1499P,1499,1499,...,,,,,,5.25,0.000,D,,Other


In [6]:
# List the distinct gene names present in pd_driver_tsgonc.
pd_driver_tsgonc['GENE_NAME'].unique()

array(['ESR1', 'ARNT', 'NFKB2', 'RUNX1', 'FOXO1', 'TET1', 'TCF3', 'FOXO4',
       'NTRK1', 'BIRC3', 'CIC', 'HOXA11', 'FOXO3', 'TRIM24', 'CBL',
       'TP53', 'IKZF3', 'MRTFA', 'ELF4', 'HOXA9', 'RUNX1T1', 'TBL1XR1',
       'WT1', 'BCL11B', 'SUZ12', 'STAT5B', 'PRKAR1A', 'IRF4', 'NOTCH1',
       'PAX5', 'CREBBP'], dtype=object)