In [5]:
from Bio import Entrez, SeqIO

def get_gene_and_protein_positions(rsid, transcript_location):
    # Step 1: Query rsid to get gene and protein information
    Entrez.email = "liweihao0401@gmail.com"
    handle = Entrez.efetch(db="snp", id=rsid, rettype="gb", retmode="text")
    record = SeqIO.read(handle, "genbank")
    gene_name = record.annotations["gene"]
    protein_accession = record.features[0].qualifiers["protein_id"][0]

    # Step 2: Query transcript location to get gene and protein positions
    transcript_accession = transcript_location.split(":")[0]
    position = int(transcript_location.split(":")[1])

    handle = Entrez.efetch(db="nucleotide", id=transcript_accession, rettype="gb", retmode="text")
    transcript_record = SeqIO.read(handle, "genbank")

    for feature in transcript_record.features:
        if feature.type == "CDS":
            cds_start = feature.location.start.position
            cds_end = feature.location.end.position
            if cds_start <= position <= cds_end:
                gene_position = feature.qualifiers["gene"][0]
                protein_position = position - cds_start + 1
                return gene_name, gene_position, protein_accession, protein_position

    return None

# Example usage
rsid = "rs10011796"
transcript_location = "NC_000003.12:183917980"
result = get_gene_and_protein_positions(rsid, transcript_location)

if result:
    gene_name, gene_position, protein_accession, protein_position = result
    print(f"Gene Name: {gene_name}")
    print(f"Gene Position: {gene_position}")
    print(f"Protein Accession: {protein_accession}")
    print(f"Protein Position: {protein_position}")
else:
    print("No matching information found.")


ValueError: No records found in handle

In [1]:
import requests

def get_gene_and_protein_positions(rsid, transcript_location):
    # Step 1: Query rsid to get gene and protein information
    url = f"https://rest.ensembl.org/variation/human/{rsid}"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    
    if response.status_code == 200:
        data = response.json()
        print(data['mappings'][0])
        gene_name = data['mappings'][0]['gene_symbol']
        protein_id = data['mappings'][0]['protein_id']
    else:
        print(f"Error: Unable to fetch information for rsid {rsid}")
        return None

    # Step 2: Parse transcript location to get gene and protein positions
    transcript_accession, position = transcript_location.split(":")
    position = int(position)

    return gene_name, transcript_accession, position, protein_id

# Example usage
rsid = "rs1000002"
transcript_location = "NC_000003.12:183917980"
result = get_gene_and_protein_positions(rsid, transcript_location)

if result:
    gene_name, transcript_accession, position, protein_id = result
    print(f"Gene Name: {gene_name}")
    print(f"Transcript Accession: {transcript_accession}")
    print(f"Position: {position}")
    print(f"Protein ID: {protein_id}")


{'location': '3:183917980-183917980', 'start': 183917980, 'strand': 1, 'ancestral_allele': 'C', 'allele_string': 'C/T', 'assembly_name': 'GRCh38', 'seq_region_name': '3', 'coord_system': 'chromosome', 'end': 183917980}


KeyError: 'gene_symbol'

In [2]:
import requests

def get_gene_and_protein_positions(rsid, transcript_location):
    # Step 1: Query rsid to get gene and protein information
    url = f"https://rest.ensembl.org/variation/human/{rsid}"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    
    if response.status_code == 200:
        data = response.json()
        if 'mappings' in data and len(data['mappings']) > 0:
            transcript_accession = data['mappings'][0]['transcript_id']
            position = data['mappings'][0]['start']
        else:
            print(f"Error: Unable to fetch information for rsid {rsid}")
            return None
    else:
        print(f"Error: Unable to fetch information for rsid {rsid}")
        return None

    # Step 2: Parse transcript location to get gene and protein positions
    transcript_accession, position = transcript_location.split(":")
    position = int(position)

    return transcript_accession, position

# Example usage
rsid = "rs1000002"
transcript_location = "NC_000003.12:183917980"
result = get_gene_and_protein_positions(rsid, transcript_location)

if result:
    transcript_accession, position = result
    print(f"Transcript Accession: {transcript_accession}")
    print(f"Position: {position}")


KeyError: 'transcript_id'

In [1]:
import requests

def get_gene_and_protein_positions(rsid, transcript_location):
    # Step 1: Query transcript location to get gene and protein information
    transcript_accession, position = transcript_location.split(":")
    position = int(position)

    url = f"https://rest.ensembl.org/map/human/{transcript_accession}/{position}..{position}/GRCh38"
    response = requests.get(url, headers={"Content-Type": "application/json"})
    
    if response.status_code == 200:
        data = response.json()
        if 'mappings' in data and len(data['mappings']) > 0:
            gene_name = data['mappings'][0]['external_name']
            protein_id = data['mappings'][0]['protein_id']
        else:
            print(f"Error: Unable to fetch information for rsid {rsid}")
            return None
    else:
        print(f"Error: Unable to fetch information for rsid {rsid}")
        return None

    return gene_name, transcript_accession, position, protein_id

# Example usage
rsid = "rs1000002"
transcript_location = "NC_000003.12:183917980"
result = get_gene_and_protein_positions(rsid, transcript_location)

if result:
    gene_name, transcript_accession, position, protein_id = result
    print(f"Gene Name: {gene_name}")
    print(f"Transcript Accession: {transcript_accession}")
    print(f"Position: {position}")
    print(f"Protein ID: {protein_id}")


Error: Unable to fetch information for rsid rs1000002


In [4]:
import requests
gene = '1HPV'
url = 'https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=fasta&query=(reviewed:true)%20AND%20(organism_id:9606)%20AND%20' + gene
all_fastas = requests.get(url)
all_fastas

<Response [200]>

In [20]:
import requests

gene = 'ATK1'
url = 'https://rest.uniprot.org/uniprotkb/search?format=json&query=(reviewed:true)%20AND%20(organism_id:9606)%20AND%20' + gene

response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    # 处理返回的JSON数据

    print(data['results'][0])
    # print(data['results'][0]['primaryAccession'])
    # for entry in data:
    #     protein_id = entry['accession']
    #     protein_name = entry['protein']['recommendedName']['fullName']
    #     print(f'Protein ID: {protein_id}, Protein Name: {protein_name}')
else:
    print(f'Error: Unable to fetch data. Status code {response.status_code}')


IndexError: list index out of range

In [34]:
import pandas as pd

all_data_df = pd.read_csv('../middlefile/gdsc_all_table.tsv', sep = '\t')
symbol_list = all_data_df['gene symbol'].unique()
new_data_df = all_data_df.copy
# for i in range(len(symbol_list)):
for i in range(3):
    gene = symbol_list[i]
    url = 'https://rest.uniprot.org/uniprotkb/search?format=json&query=(reviewed:true)%20AND%20(organism_id:9606)%20AND%20' + gene
    response = requests.get(url)
    data = response.json()
    print(gene)
    print(data['results'][0]['primaryAccession'])
    new_data_df.loc[new_data_df['gene symbol'] == gene]['uniprot id'] = data['results'][0]['primaryAccession']

ABL1
P00519


AttributeError: 'function' object has no attribute 'loc'

In [3]:
import pubchempy as pcp

def get_pubchem_info(drug_name):
    try:
        # 通过药物名称查询相关信息
        compound = pcp.get_compounds(drug_name, 'name')[0]
        
        # 获取PubChem ID
        pubchem_id = compound.cid
        
        # 获取指纹 (Fingerprint)
        fingerprint = compound.cactvs_fingerprint
        
        # 获取SMILES
        smiles = compound.isomeric_smiles
        
        return pubchem_id, fingerprint, smiles
    except Exception as e:
        return None, None, None

# 使用例子
drug_name = 'WYE-125132'
pubchem_id, fingerprint, smiles = get_pubchem_info(drug_name)

if pubchem_id is not None:
    print(f"Drug Name: {drug_name}")
    print(f"PubChem ID: {pubchem_id}")
    print(f"Fingerprint: {fingerprint}")
    print(f"SMILES: {smiles}")
else:
    print(f"无法找到与 {drug_name} 相关的信息。")


Drug Name: WYE-125132
PubChem ID: 25260757
Fingerprint: 11110000011110111011100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000101100010010001001000000000000000001111000111100010000001000000000000010110000000000000000000000001011000000000011111110000000000000000000001111000000000000110000000000000000000000000000000110000111100111000011001101100000110001111111111100011011111110010000001010000000000101010100000001000110110011001110111010000000000100100101000010000101011001000011000001010100000000111011101100010100000001010000110010010011000100110110010100010100010110000001101100111010001101001111010010000001000011011001001111000000010110111101100100000100111101100001101000000000010000011100000100000000000000000000000000000000000000000000000000000000000000100001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
SMILES: CNC(=O)NC1=CC=C(C=C1)C2=NC3=C(C=NN3C4CCC5(CC4)OCCO5)C(

In [2]:
import pandas as pd
aa_sub = pd.read_csv('../../AlphaMissense/AlpahMissense_aa_substitutions.tsv', sep='\t')
aa_sub

FileNotFoundError: [Errno 2] No such file or directory: '../../AlphaMissense/AlpahMissense_aa_substitutions.tsv'

In [34]:
import requests

token_endpoint = 'https://icdaccessmanagement.who.int/connect/token'
client_id = '8ce4ac1c-84d5-4e46-b686-796ab943d907_ce952a66-3b10-41a3-9da3-131c7739883a'
client_secret = 'ajUCkbpjrbzTSUmtp8cLtVHCIL6FjPFwz/dONYQeyAM='
scope = 'icdapi_access'
grant_type = 'client_credentials'


# get the OAUTH2 token

# set data to post
payload = {'client_id': client_id, 
	   	   'client_secret': client_secret, 
           'scope': scope, 
           'grant_type': grant_type}
           
# make request
r = requests.post(token_endpoint, data=payload, verify=False).json()
token = r['access_token']


# access ICD API

uri = 'https://id.who.int/icd/entity/search?q=Lung%20Non-small%20Cell%20Carcinoma'

# HTTP header fields to set
headers = {'Authorization':  'Bearer '+token, 
           'Accept': 'json', 
           'Accept-Language': 'en',
	   'API-Version': 'v2'}
           
# make request           
r = requests.get(uri, headers=headers, verify=False)

# print the result
print (r.json())
r.json()['destinationEntities'][0]['id']			

{'error': False, 'errorMessage': None, 'resultChopped': False, 'wordSuggestionsChopped': False, 'guessType': 2, 'uniqueSearchId': '4d15cad8-aba9-4641-8c1f-9078113ae861', 'words': None, 'destinationEntities': [{'id': 'http://id.who.int/icd/entity/1969743250', 'title': "<em class='found'>Non</em> <em class='found'>small</em> <em class='found'>cell</em> <em class='found'>lung</em> cancer", 'stemId': 'http://id.who.int/icd/entity/1969743250', 'isLeaf': True, 'postcoordinationAvailability': 0, 'hasCodingNote': False, 'hasMaternalChapterLink': False, 'hasPerinatalChapterLink': False, 'matchingPVs': [{'propertyId': 'Synonym', 'label': "<em class='found'>non</em> <em class='found'>small</em> <em class='found'>cell</em> <em class='found'>lung</em> <em class='found'>carcinoma</em>", 'score': 0.0125, 'important': True, 'foundationUri': 'http://id.who.int/icd/entity/1969743250', 'propertyValueType': 0}], 'propertiesTruncated': False, 'isResidualOther': False, 'isResidualUnspecified': False, 'chapt

In [39]:
def get_pdb_id(uniprot_id):
    base_url = "https://www.uniprot.org/uniprot/"
    params = {
        "query": f"id:{uniprot_id}",
        "format": "tab",
        "columns": "id,structure",
    }

    response = requests.get(base_url, params=params)
    if response.ok:
        lines = response.text.strip().split('\n')
        # Skip header line
        if len(lines) > 1:
            # Each line contains UniProt ID and associated PDB ID(s)
            uniprot_id, pdb_ids = lines[1].split('\t')
            return pdb_ids.split(';')
    return None

# Example usage
uniprot_id = "P00519"  # Replace with your UniProt ID
pdb_ids = get_pdb_id(uniprot_id)

if pdb_ids:
    print(f"PDB IDs for UniProt ID {uniprot_id}: {', '.join(pdb_ids)}")
else:
    print(f"No PDB IDs found for UniProt ID {uniprot_id}")


No PDB IDs found for UniProt ID P00519


In [41]:
base_url = "https://www.uniprot.org/uniprot/"
params = {
    "query": f"id:{'P00519'}",
    "format": "tab",
    "columns": "id,structure",
}

response = requests.get(base_url, params=params)


In [59]:
import requests

def get_pdb_id_from_uniprot(uniprot_id):
    search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    search_request = {
        "query": {
            "type": "group",
            "logical_operator": "and",
            "nodes": [
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "operator": "exact_match",
                        "value": uniprot_id,
                        "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession"
                    }
                },
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "operator": "exact_match",
                        "value": "UniProt",
                        "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name"
                    }
                }
            ]
        },
        "request_options": {
            "paginate": {
            "start": 0,
            "rows": 1000
            }
         },
        "return_type": "entry"
    }

    response = requests.post(search_url, json=search_request)
    
    if response.ok:
        data = response.json()
        print(data['result_set'])
        if "result_set" in data and "data" in data["result_set"]:
            for entry in data["result_set"]["data"]:
                # Extract PDB ID
                pdb_id = entry.get("rcsb_polymer_entity_container_identifiers", {}).get("entry_id")
                if pdb_id:
                    return pdb_id

    return None

# Example usage
uniprot_id = "P00519"
pdb_id = get_pdb_id_from_uniprot(uniprot_id)

if pdb_id:
    print(f"PDB ID for UniProt ID {uniprot_id}: {pdb_id}")
else:
    print(f"No PDB ID found for UniProt ID {uniprot_id}")


[{'identifier': '1AB2', 'score': 1.0}, {'identifier': '1AWO', 'score': 1.0}, {'identifier': '1BBZ', 'score': 1.0}, {'identifier': '1JU5', 'score': 1.0}, {'identifier': '1OPL', 'score': 1.0}, {'identifier': '1ZZP', 'score': 1.0}, {'identifier': '2ABL', 'score': 1.0}, {'identifier': '2E2B', 'score': 1.0}, {'identifier': '2F4J', 'score': 1.0}, {'identifier': '2FO0', 'score': 1.0}, {'identifier': '2G1T', 'score': 1.0}, {'identifier': '2G2F', 'score': 1.0}, {'identifier': '2G2H', 'score': 1.0}, {'identifier': '2G2I', 'score': 1.0}, {'identifier': '2GQG', 'score': 1.0}, {'identifier': '2HIW', 'score': 1.0}, {'identifier': '2HYY', 'score': 1.0}, {'identifier': '2HZ0', 'score': 1.0}, {'identifier': '2HZ4', 'score': 1.0}, {'identifier': '2HZI', 'score': 1.0}, {'identifier': '2O88', 'score': 1.0}, {'identifier': '2V7A', 'score': 1.0}, {'identifier': '3CS9', 'score': 1.0}, {'identifier': '3EG0', 'score': 1.0}, {'identifier': '3EG1', 'score': 1.0}, {'identifier': '3EG2', 'score': 1.0}, {'identifie

In [57]:
import pandas as pd
import requests
from tqdm import tqdm

# 读取数据
all_data_df = pd.read_csv('middlefile/fasta_table.tsv', sep='\t')

# 添加新列 'pdb list'
all_data_df['pdb list'] = None  # 或者 all_data_df['pdb list'] = pd.NA，取决于 Pandas 版本

# 获取唯一的 Uniprot ID 列表
uniprot_id_list = all_data_df['uniprot id'].unique()

# 循环处理每个 Uniprot ID
for uniprot_id in tqdm(uniprot_id_list[:10]):  # 只处理前 10 个 Uniprot ID，根据需要调整范围
    search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    search_request = {
        # ...（你的搜索请求）
    }
    try:
        response = requests.post(search_url, json=search_request)
        pdb_list = response.json()['result_set']
        print(type(pdb_list))
    except:
        print(uniprot_id)
        pdb_list = None

    # 使用 .loc 根据条件选择行并为 'pdb list' 赋值
    all_data_df.loc[all_data_df['uniprot id'] == uniprot_id, 'pdb list'] = [pdb_list]

# 打印 DataFrame
print(all_data_df)


FileNotFoundError: [Errno 2] No such file or directory: 'middlefile/fasta_table.tsv'