In [5]:
from Bio import Entrez, SeqIO

def get_gene_and_protein_positions(rsid, transcript_location):
    # Step 1: Query rsid to get gene and protein information
    Entrez.email = "liweihao0401@gmail.com"
    handle = Entrez.efetch(db="snp", id=rsid, rettype="gb", retmode="text")
    record = SeqIO.read(handle, "genbank")
    gene_name = record.annotations["gene"]
    protein_accession = record.features[0].qualifiers["protein_id"][0]

    # Step 2: Query transcript location to get gene and protein positions
    transcript_accession = transcript_location.split(":")[0]
    position = int(transcript_location.split(":")[1])

    handle = Entrez.efetch(db="nucleotide", id=transcript_accession, rettype="gb", retmode="text")
    transcript_record = SeqIO.read(handle, "genbank")

    for feature in transcript_record.features:
        if feature.type == "CDS":
            cds_start = feature.location.start.position
            cds_end = feature.location.end.position
            if cds_start <= position <= cds_end:
                gene_position = feature.qualifiers["gene"][0]
                protein_position = position - cds_start + 1
                return gene_name, gene_position, protein_accession, protein_position

    return None

# Example usage
rsid = "rs10011796"
transcript_location = "NC_000003.12:183917980"
result = get_gene_and_protein_positions(rsid, transcript_location)

if result:
    gene_name, gene_position, protein_accession, protein_position = result
    print(f"Gene Name: {gene_name}")
    print(f"Gene Position: {gene_position}")
    print(f"Protein Accession: {protein_accession}")
    print(f"Protein Position: {protein_position}")
else:
    print("No matching information found.")


ValueError: No records found in handle

In [1]:
import requests

def get_gene_and_protein_positions(rsid, transcript_location):
    # Step 1: Query rsid to get gene and protein information
    url = f"https://rest.ensembl.org/variation/human/{rsid}"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    
    if response.status_code == 200:
        data = response.json()
        print(data['mappings'][0])
        gene_name = data['mappings'][0]['gene_symbol']
        protein_id = data['mappings'][0]['protein_id']
    else:
        print(f"Error: Unable to fetch information for rsid {rsid}")
        return None

    # Step 2: Parse transcript location to get gene and protein positions
    transcript_accession, position = transcript_location.split(":")
    position = int(position)

    return gene_name, transcript_accession, position, protein_id

# Example usage
rsid = "rs1000002"
transcript_location = "NC_000003.12:183917980"
result = get_gene_and_protein_positions(rsid, transcript_location)

if result:
    gene_name, transcript_accession, position, protein_id = result
    print(f"Gene Name: {gene_name}")
    print(f"Transcript Accession: {transcript_accession}")
    print(f"Position: {position}")
    print(f"Protein ID: {protein_id}")


{'location': '3:183917980-183917980', 'start': 183917980, 'strand': 1, 'ancestral_allele': 'C', 'allele_string': 'C/T', 'assembly_name': 'GRCh38', 'seq_region_name': '3', 'coord_system': 'chromosome', 'end': 183917980}


KeyError: 'gene_symbol'

In [2]:
import requests

def get_gene_and_protein_positions(rsid, transcript_location):
    # Step 1: Query rsid to get gene and protein information
    url = f"https://rest.ensembl.org/variation/human/{rsid}"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    
    if response.status_code == 200:
        data = response.json()
        if 'mappings' in data and len(data['mappings']) > 0:
            transcript_accession = data['mappings'][0]['transcript_id']
            position = data['mappings'][0]['start']
        else:
            print(f"Error: Unable to fetch information for rsid {rsid}")
            return None
    else:
        print(f"Error: Unable to fetch information for rsid {rsid}")
        return None

    # Step 2: Parse transcript location to get gene and protein positions
    transcript_accession, position = transcript_location.split(":")
    position = int(position)

    return transcript_accession, position

# Example usage
rsid = "rs1000002"
transcript_location = "NC_000003.12:183917980"
result = get_gene_and_protein_positions(rsid, transcript_location)

if result:
    transcript_accession, position = result
    print(f"Transcript Accession: {transcript_accession}")
    print(f"Position: {position}")


KeyError: 'transcript_id'

In [1]:
import requests

def get_gene_and_protein_positions(rsid, transcript_location):
    # Step 1: Query transcript location to get gene and protein information
    transcript_accession, position = transcript_location.split(":")
    position = int(position)

    url = f"https://rest.ensembl.org/map/human/{transcript_accession}/{position}..{position}/GRCh38"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    
    if response.status_code == 200:
        data = response.json()
        if 'mappings' in data and len(data['mappings']) > 0:
            gene_name = data['mappings'][0]['external_name']
            protein_id = data['mappings'][0]['protein_id']
        else:
            print(f"Error: Unable to fetch information for rsid {rsid}")
            return None
    else:
        print(f"Error: Unable to fetch information for rsid {rsid}")
        return None

    return gene_name, transcript_accession, position, protein_id

# Example usage
rsid = "rs1000002"
transcript_location = "NC_000003.12:183917980"
result = get_gene_and_protein_positions(rsid, transcript_location)

if result:
    gene_name, transcript_accession, position, protein_id = result
    print(f"Gene Name: {gene_name}")
    print(f"Transcript Accession: {transcript_accession}")
    print(f"Position: {position}")
    print(f"Protein ID: {protein_id}")


Error: Unable to fetch information for rsid rs1000002


In [4]:
import requests
gene = '1HPV'
url = 'https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=fasta&query=(reviewed:true)%20AND%20(organism_id:9606)%20AND%20' + gene
all_fastas = requests.get(url)
all_fastas

<Response [200]>

In [20]:
import requests

gene = 'ATK1'
url = 'https://rest.uniprot.org/uniprotkb/search?format=json&query=(reviewed:true)%20AND%20(organism_id:9606)%20AND%20' + gene

response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    # 处理返回的JSON数据

    print(data['results'][0])
    # print(data['results'][0]['primaryAccession'])
    # for entry in data:
    #     protein_id = entry['accession']
    #     protein_name = entry['protein']['recommendedName']['fullName']
    #     print(f'Protein ID: {protein_id}, Protein Name: {protein_name}')
else:
    print(f'Error: Unable to fetch data. Status code {response.status_code}')


IndexError: list index out of range

In [34]:
import pandas as pd

all_data_df = pd.read_csv('../middlefile/gdsc_all_table.tsv', sep = '\t')
symbol_list = all_data_df['gene symbol'].unique()
new_data_df = all_data_df.copy
# for i in range(len(symbol_list)):
for i in range(3):
    gene = symbol_list[i]
    url = 'https://rest.uniprot.org/uniprotkb/search?format=json&query=(reviewed:true)%20AND%20(organism_id:9606)%20AND%20' + gene
    response = requests.get(url)
    data = response.json()
    print(gene)
    print(data['results'][0]['primaryAccession'])
    new_data_df.loc[new_data_df['gene symbol'] == gene]['uniprot id'] = data['results'][0]['primaryAccession']

ABL1
P00519


AttributeError: 'function' object has no attribute 'loc'

In [25]:
all_data_df.head()

Unnamed: 0.1,Unnamed: 0,data source,chemical name,gene symbol,variant,uniprot id,ddg
0,0,gdsc,Nilotinib,ABL1,R608H,P00519,0.365149
1,1,gdsc,Nilotinib,ABL1,A1045T,P00519,0.62003
2,2,gdsc,Nilotinib,ABL1,V587M,P00519,0.146038
3,3,gdsc,Nilotinib,ABL1,P460L,P00519,0.494474
4,4,gdsc,Nilotinib,ABL1,G1079D,P00519,1.597725


In [32]:
all_data_df.loc[all_data_df['gene symbol'] == 'AKT1']

Unnamed: 0.1,Unnamed: 0,data source,chemical name,gene symbol,variant,uniprot id,ddg
58,58,gdsc,AKT inhibitor VIII,AKT1,A230T+Q59*,Q96B36,-0.655527
59,59,gdsc,AT13148,AKT1,R406H,Q96B36,-0.811391
60,60,gdsc,AT13148,AKT1,Q59*,Q96B36,0.500563
61,61,gdsc,AT13148,AKT1,S266L,Q96B36,0.494086
62,62,gdsc,AT13148,AKT1,Y272C,Q96B36,-0.057076
63,63,gdsc,AT13148,AKT1,R86H,Q96B36,0.220549
64,64,gdsc,AT13148,AKT1,E17K,Q96B36,0.971729
65,65,gdsc,AT13148,AKT1,R200H,Q96B36,-0.911317
66,66,gdsc,AT13148,AKT1,G478S,Q96B36,0.566527
67,67,gdsc,AT13148,AKT1,E49K+E17K,Q96B36,-0.033916
