In [1]:
!pip install Bio

Collecting Bio
  Downloading bio-1.5.9-py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython>=1.80 (from Bio)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.0-py2.py3-none-any.whl (29 kB)
Installing collected packages: biopython, gprofiler-official, biothings-client, mygene, Bio
Successfully installed Bio-1.5.9 biopython-1.81 biothings-client-0.3.0 gprofiler-official-1.0.0 mygene-3.2.2


# Map Uniprot Residues to PDB Residues

In [2]:
from google.colab import drive
drive.mount('/content/drive')
from Bio.PDB.Polypeptide import three_to_one
import pandas as pd
import numpy as np
import requests
import tarfile
import gzip
import time
import json
import re
import os
import ast
from tqdm import tqdm

Mounted at /content/drive


## All Functions

## Functions to get the UniProt accession of each PDB chain

In [3]:
def get_response(pdb_id, timeout=60):
  """Fetch the PDBe `mappings/uniprot` record for one PDB entry.

  Args:
    pdb_id: PDB identifier; it is lower-cased before being put in the URL.
    timeout: seconds passed to `requests.get` so that a stalled connection
      raises instead of blocking the notebook indefinitely.

  Returns:
    The decoded JSON body of the response.

  Raises:
    requests.RequestException: on network failure or timeout.
    ValueError: if the body is not valid JSON.

  The HTTP status code is not checked here; callers index into the returned
  object and will fail there if the entry is missing.
  """
  pdb_id = pdb_id.lower()
  response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}", timeout=timeout)
  try:
    # decode first, then release the connection even if decoding fails
    return response.json()
  finally:
    response.close()

In [4]:
def get_uniprot_acc(response, pdb_id):
  """List the [chain_id, uniprot_accession] pairs found in a mapping response.

  Args:
    response: the output of get_response(pdb_id), i.e. a dict shaped like
      {pdb_id: {"UniProt": {accession: {"mappings": [{"chain_id": ...}, ...]}}}}.
    pdb_id: PDB identifier used as the top-level key of `response`.

  Returns:
    A list of two-element lists [chain_id, accession], one per mapping
    segment, in the order the accessions and their mappings appear.

  Raises:
    KeyError: if the entry or its "UniProt" section is absent.
  """
  # get_response() lower-cases the id it requests, so fall back to the
  # lower-case key when the caller passes e.g. "2H60".
  key = pdb_id if pdb_id in response else pdb_id.lower()
  uniprot_entries = response[key]["UniProt"]

  return [[mapping["chain_id"], accession]
          for accession, entry in uniprot_entries.items()
          for mapping in entry["mappings"]]

## Functions to map pdb residues to uniprot residues

In [5]:
def get_pdb_residues(pdb_id, chain_id, timeout=60):
  """Fetch the PDBe residue listing of one chain of a PDB entry.

  Args:
    pdb_id: PDB identifier; lower-cased for the URL and for the lookup in
      the decoded response.
    chain_id: chain identifier placed in the request URL.
    timeout: seconds passed to `requests.get`.

  Returns:
    The value stored under `[pdb_id]['molecules']` in the decoded response,
    or None if anything goes wrong (bad input, network error, invalid JSON,
    missing keys). This best-effort behaviour is deliberate: callers treat
    None as "no residues available".
  """
  response = None
  try:
    pdb_id = pdb_id.lower()
    response = requests.get(f'https://www.ebi.ac.uk/pdbe/api/pdb/entry/residue_listing/{pdb_id}/chain/{chain_id}', timeout=timeout)
    return json.loads(response.text)[pdb_id]['molecules']

  except Exception:
    # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
    # still stops a long-running apply.
    return None

  finally:
    # release the connection on the failure path as well
    if response is not None:
      response.close()

In [9]:
def get_entity_id_and_res_start_ends(pdb_residues, chain_id):
  """Return (entity_id, first_residue_number, last_residue_number) of a chain.

  Args:
    pdb_residues: the output of get_pdb_residues(): a list of entries, each
      with an 'entity_id' and a 'chains' list whose items carry a 'residues'
      list of dicts with a 'residue_number'. May be None.
    chain_id: not used for selection; kept so existing callers keep working.
      The first chain that has residues is taken — presumably the listing
      was already requested for a single chain; verify against the caller.

  Returns:
    A tuple (entity_id, res_start, res_end), where res_start / res_end are
    the smallest / largest residue_number of the chosen chain. The sentinel
    (0, 0, 0) is returned when `pdb_residues` is None, is empty, or contains
    no chain with residues, so callers can always unpack three values.
  """
  try:
    for entry in pdb_residues:
      entity_id = entry['entity_id']
      for chain in entry['chains']:
        residue_numbers = [residue['residue_number'] for residue in chain['residues']]
        if not residue_numbers:
          # min()/max() would raise on an empty chain; try the next one
          continue
        return entity_id, int(min(residue_numbers)), int(max(residue_numbers))
  except TypeError:
    # pdb_residues is None (failed download) or not shaped as expected
    pass

  return 0, 0, 0

In [7]:
def get_pdb_uniprot_residue_mapping(pdb_id, uniprot_id, chain_id, entity_id, res_start, res_end, timeout=60):
  """Map the residues of one PDB entity to residues of a UniProt accession.

  Args:
    pdb_id (str): PDB identifier. It is lower-cased for the request and for
      the lookup in the response; the output label keeps the caller's spelling.
    uniprot_id (str): UniProt accession whose mapping is wanted.
    chain_id (str): chain identifier, used only to build the output label.
    entity_id (int): entity to query; 0 means "unknown" and skips the request.
    res_start (int): first residue number of the requested range.
    res_end (int): last residue number of the requested range.
    timeout: seconds passed to `requests.get`.

  Returns:
    On success, a list with one row per residue that carries a mapping to
    `uniprot_id`:
      [pdb_id + '_' + chain_id, pdb_residue_number, pdb_one_letter_code,
       uniprot_id, unp_residue_number, unp_one_letter_code]
    On failure (entity_id == 0, network error, invalid JSON, or a response
    without the expected keys), a single FLAT row
      [pdb_id + '_' + chain_id, 0, 0, uniprot_id, 0, 0]
    The flat shape of the failure value is kept because downstream cells
    recognise failed rows by it.
  """
  failure_row = [pdb_id + '_' + chain_id, 0, 0, uniprot_id, 0, 0]
  if entity_id == 0:
    return failure_row

  pdb_key = pdb_id.lower()
  response = None
  try:
    response = requests.get(f'https://www.ebi.ac.uk/pdbe/graph-api/residue_mapping/{pdb_key}/{entity_id}/{res_start}/{res_end}', timeout=timeout)
    payload = json.loads(response.text)

    # only the first molecule / first chain of the response is read
    residues = payload[pdb_key][0]['chains'][0]['residues']

    residues_uniprot = []
    for residue in residues:
      uniprot_features = residue['features']['UniProt']
      if uniprot_id in uniprot_features:
        uniprot_res_dict = uniprot_features[uniprot_id]
        residues_uniprot.append([
            pdb_id + '_' + chain_id,
            residue['residue_number'],
            uniprot_res_dict['pdb_one_letter_code'],
            uniprot_id,
            uniprot_res_dict['unp_residue_number'],
            uniprot_res_dict['unp_one_letter_code'],
        ])

    return residues_uniprot

  except (ValueError, IndexError, KeyError, TypeError, requests.RequestException):
    # One malformed or unreachable entry must not abort a long batch run;
    # report it through the failure row instead.
    return failure_row

  finally:
    if response is not None:
      response.close()

Example usage for one chain only

In [None]:
# Worked example: map the PDB residues of a single chain to UniProt residues.
# Each row of the result holds, in order:
# 'chain_id', 'pdb_residue_number', 'pdb_one_letter_code', 'uniprot_id', 'uniprot_residue_number', 'uniprot_one_letter_code'
pdb_id = '2h60'.lower()
chain_id = 'A'
uniprot_id = 'P51532'

# step 1: residue listing of the chain
pdb_residues = get_pdb_residues(pdb_id, chain_id)
# step 2: entity id plus first / last residue number
entity_id, res_start, res_end = get_entity_id_and_res_start_ends(pdb_residues, chain_id)
# step 3: per-residue PDB -> UniProt mapping
residues_uniprot = get_pdb_uniprot_residue_mapping(
    pdb_id, uniprot_id, chain_id, entity_id, res_start, res_end
)
residues_uniprot

# Dataset

In [19]:
# Folder on the mounted Google Drive that holds the input files of this notebook
path = '/content/drive/MyDrive/Colab Notebooks/Bioinformatics'

In [27]:
# Read the tab-separated docking results and discard the 'Unnamed: 0' column
# (presumably an index that was written out with the table — verify).
df = pd.read_csv(
    os.path.join(path, 'Docking_Results.txt'),
    sep='\t',
).drop('Unnamed: 0', axis=1)
df

Unnamed: 0,target_1,target_2,interface,energy,time
0,1k05B,5vvcD,2z3rGK,-12.999,2023-08-05 08:08:34
1,1k05B,5vvcD,3ezqAC,-11.740,2023-08-05 08:08:34
2,2h60A,7x5eB,3l0lAB,-8.678,2023-08-05 08:08:34
3,2h60A,7x5fF,3l0lAB,-6.398,2023-08-05 08:08:34
4,2lz1A,7vrbA,3tlxAC,-10.374,2023-08-05 08:08:34
...,...,...,...,...,...
18762,5buoA,1jm4B,2nxbAB,-17.144,2023-08-04 14:07:13
18763,5buoA,1n72A,1hzdBF,-11.370,2023-08-04 14:07:13
18764,5buoA,1n72A,1oedAE,-14.311,2023-08-04 14:07:13
18765,5buoA,1n72A,1r0dAI,-7.545,2023-08-04 14:07:13


In [28]:
# unique pdbs in the docking results
pdb_list = list(set(df['target_1'].tolist()).union(set(df['target_2'].tolist())))
len(pdb_list), pdb_list

(220,
 ['2vqjA',
  '1j3sA',
  '2k04B',
  '7apjA',
  '7y8rI',
  '1gzkA',
  '7b5lL',
  '3ojvC',
  '4rwkA',
  '2kjeA',
  '3nylA',
  '6hokA',
  '3bceB',
  '4ec4A',
  '7el4A',
  '2l2tB',
  '1unpA',
  '2o61B',
  '7lvsF',
  '3oduA',
  '4zdrA',
  '2n9jA',
  '5hezB',
  '5houA',
  '5hheA',
  '3od6X',
  '3lueA',
  '6yl6A',
  '2l9uB',
  '4e3cA',
  '4yknA',
  '7tb3A',
  '5vvcD',
  '7vrbA',
  '6xreM',
  '2cr9A',
  '1wugA',
  '1vywA',
  '4hc9A',
  '2lqiB',
  '3u7uB',
  '5etcA',
  '4opxA',
  '7b3kA',
  '1qe6D',
  '1jm4B',
  '2lp1A',
  '1c9qA',
  '5w21C',
  '2ahxB',
  '6es5B',
  '2riqA',
  '2opzA',
  '2n8aA',
  '1ba4A',
  '2h96A',
  '2r4bA',
  '6gu7C',
  '3q05A',
  '5i8bA',
  '1va1A',
  '6yhfA',
  '6t58A',
  '2h8nA',
  '7xijA',
  '1ni6C',
  '5hpdA',
  '1i3oE',
  '1kswA',
  '1zoqC',
  '4gcjA',
  '6qvwA',
  '2k8fB',
  '2knaA',
  '2uzkA',
  '1xo2B',
  '8ba3A',
  '3oe8B',
  '8f2hA',
  '3moqA',
  '2k86A',
  '3q05B',
  '7qj6E',
  '3q01B',
  '3gb8A',
  '4bsmA',
  '4yc3A',
  '2kwfA',
  '4kikB',
  '5disA',
  '7

In [29]:
# One row per unique chain label, in the order of pdb_list
df_pdbs_nonredundant = pd.DataFrame({'chain': pdb_list})
df_pdbs_nonredundant

Unnamed: 0,chain
0,2vqjA
1,1j3sA
2,2k04B
3,7apjA
4,7y8rI
...,...
215,2lxsA
216,6s9wA
217,1blxA
218,2h60A


In [36]:
# split pdb id and chain of chains
df_pdb = pd.DataFrame(columns=['pdbID', 'chain'])
df_pdb['pdbID'] = df_pdbs_nonredundant['chain'].apply(lambda x: x[:4])
df_pdb['chain'] = df_pdbs_nonredundant['chain'].apply(lambda x: x[4:])
df_pdb

Unnamed: 0,pdbID,chain
0,2vqj,A
1,1j3s,A
2,2k04,B
3,7apj,A
4,7y8r,I
...,...,...
215,2lxs,A
216,6s9w,A
217,1blx,A
218,2h60,A


In [38]:
# PDB ids as a plain Python list, one item per chain row (ids may repeat)
pdb_id_list = list(df_pdb['pdbID'])
pdb_id_list

['2vqj',
 '1j3s',
 '2k04',
 '7apj',
 '7y8r',
 '1gzk',
 '7b5l',
 '3ojv',
 '4rwk',
 '2kje',
 '3nyl',
 '6hok',
 '3bce',
 '4ec4',
 '7el4',
 '2l2t',
 '1unp',
 '2o61',
 '7lvs',
 '3odu',
 '4zdr',
 '2n9j',
 '5hez',
 '5hou',
 '5hhe',
 '3od6',
 '3lue',
 '6yl6',
 '2l9u',
 '4e3c',
 '4ykn',
 '7tb3',
 '5vvc',
 '7vrb',
 '6xre',
 '2cr9',
 '1wug',
 '1vyw',
 '4hc9',
 '2lqi',
 '3u7u',
 '5etc',
 '4opx',
 '7b3k',
 '1qe6',
 '1jm4',
 '2lp1',
 '1c9q',
 '5w21',
 '2ahx',
 '6es5',
 '2riq',
 '2opz',
 '2n8a',
 '1ba4',
 '2h96',
 '2r4b',
 '6gu7',
 '3q05',
 '5i8b',
 '1va1',
 '6yhf',
 '6t58',
 '2h8n',
 '7xij',
 '1ni6',
 '5hpd',
 '1i3o',
 '1ksw',
 '1zoq',
 '4gcj',
 '6qvw',
 '2k8f',
 '2kna',
 '2uzk',
 '1xo2',
 '8ba3',
 '3oe8',
 '8f2h',
 '3moq',
 '2k86',
 '3q05',
 '7qj6',
 '3q01',
 '3gb8',
 '4bsm',
 '4yc3',
 '2kwf',
 '4kik',
 '5dis',
 '7p1h',
 '3oe0',
 '1olg',
 '3epz',
 '3krj',
 '2jvn',
 '6les',
 '6vg8',
 '3d0e',
 '5zcs',
 '2k05',
 '6yhp',
 '2l30',
 '6zwm',
 '5mez',
 '7bwn',
 '1cm0',
 '6e2q',
 '3ktm',
 '1l3e',
 '2dbf',
 

In [40]:
# get uniprot ids using get_response() and get_uniprot_acc(functions)
for pdb_id in pdb_id_list:
    pdb_id = pdb_id.lower()
    response = get_response(pdb_id)
    uniprot = get_uniprot_acc(response, pdb_id)

    for i in range(len(uniprot)):
        index = df_pdb.index[(df_pdb["pdbID"]==pdb_id) & (df_pdb["chain"]==uniprot[i][0])]
        df_pdb.loc[index, ["uniprot_acc"]] = uniprot[i][1]
        # to track
        #print(index, pdb_id, uniprot[i][1])

2vqj
1j3s
2k04
7apj
7y8r
1gzk
7b5l
3ojv
4rwk
2kje
3nyl
6hok
3bce
4ec4
7el4
2l2t
1unp
2o61
7lvs
3odu
4zdr
2n9j
5hez
5hou
5hhe
3od6
3lue
6yl6
2l9u
4e3c
4ykn
7tb3
5vvc
7vrb
6xre
2cr9
1wug
1vyw
4hc9
2lqi
3u7u
5etc
4opx
7b3k
1qe6
1jm4
2lp1
1c9q
5w21
2ahx
6es5
2riq
2opz
2n8a
1ba4
2h96
2r4b
6gu7
3q05
5i8b
1va1
6yhf
6t58
2h8n
7xij
1ni6
5hpd
1i3o
1ksw
1zoq
4gcj
6qvw
2k8f
2kna
2uzk
1xo2
8ba3
3oe8
8f2h
3moq
2k86
3q05
7qj6
3q01
3gb8
4bsm
4yc3
2kwf
4kik
5dis
7p1h
3oe0
1olg
3epz
3krj
2jvn
6les
6vg8
3d0e
5zcs
2k05
6yhp
2l30
6zwm
5mez
7bwn
1cm0
6e2q
3ktm
1l3e
2dbf
1l8c
6apx
7sc0
1j1b
6lhd
2lgc
1ikn
7p8w
4ddp
6ltj
8a8m
7b3j
2loh
2zoq
1n72
3co6
2f1x
7jul
3kxx
5zoo
1g73
2dmj
3oe8
6iyc
2loh
3byh
1iyt
2l5g
3qkm
1owt
1vca
6zr5
4ic2
7tvb
7vdv
1qqg
7s6h
6vgl
2lqh
3u2p
3ktm
4und
6d65
3u7u
5hp0
4e3c
2llm
3oll
3oe8
2cs2
1vsc
4a69
7w7z
4ny0
4rws
6es7
7x5f
5swp
4bkx
5ea1
2lz1
6yq1
1t2k
2l9u
2k04
2lcx
6lhd
6c4s
1dd1
3odu
6yhi
4n4f
1il8
6d66
7mn6
3g76
6xnk
2cr3
4pqd
5ydr
2l2t
1y57
7tbh
3p11
7o7b
5buo
2br9
6d67
2z6h


In [42]:
# add uniprot accession to docking results
df_pdbs_nonredundant['PDB'] = df_pdb['pdbID'].apply(lambda x: x.upper())
df_pdbs_nonredundant['chain'] = df_pdb['pdbID'] + df_pdb['chain']
df_pdbs_nonredundant['uniprot_acc'] = df_pdb['uniprot_acc']
df_pdbs_nonredundant

Unnamed: 0,chain,PDB,uniprot_acc
0,2vqjA,2VQJ,P56524
1,1j3sA,1J3S,P99999
2,2k04B,2K04,P61073
3,7apjA,7APJ,M4MD44
4,7y8rI,7Y8R,P51532
...,...,...,...
215,2lxsA,2LXS,Q92793
216,6s9wA,6S9W,P31749
217,1blxA,1BLX,Q00534
218,2h60A,2H60,P51532


In [43]:
# The chain id is whatever follows the four-character PDB id in the chain label
df_pdbs_nonredundant['chain_id'] = df_pdbs_nonredundant['chain'].str[4:]
df_pdbs_nonredundant

Unnamed: 0,chain,PDB,uniprot_acc,chain_id
0,2vqjA,2VQJ,P56524,A
1,1j3sA,1J3S,P99999,A
2,2k04B,2K04,P61073,B
3,7apjA,7APJ,M4MD44,A
4,7y8rI,7Y8R,P51532,I
...,...,...,...,...
215,2lxsA,2LXS,Q92793,A
216,6s9wA,6S9W,P31749,A
217,1blxA,1BLX,Q00534,A
218,2h60A,2H60,P51532,A


## Map uniprot residues to pdb residues

In [44]:
# this part takes a while, tqdm to show progress
tqdm.pandas()
# get mapping of pdb residyes to uniprot residues
# functions used (in order!) get_pdb_residues(), get_entity_id_and_res_start_ends(), get_pdb_uniprot_residue_mapping()
# example output: [[res1, res2, ...]] = [[chain_id, pdb_residue_number, pdb_residue_name, uniprot_acc, unp_residue_number, unp_residue_name]] [[2h60_A, 1, M, P51532, 1451, L] ...]
df_pdbs_nonredundant['pdb_residues'] = df_pdbs_nonredundant.progress_apply(lambda x: get_pdb_residues(x['PDB'], x['chain_id']), axis=1)
df_pdbs_nonredundant['entity_id'] = df_pdbs_nonredundant.progress_apply(lambda x: get_entity_id_and_res_start_ends(x['pdb_residues'], x['chain_id']), axis=1)
df_pdbs_nonredundant[['entity_id', 'res_start', 'res_end']] = pd.DataFrame(df_pdbs_nonredundant['entity_id'].tolist(), index=df.index, dtype=int)
df_pdbs_nonredundant = df_pdbs_nonredundant.fillna(0)
df_pdbs_nonredundant['residues_uniprot'] = df_pdbs_nonredundant.progress_apply(lambda x: get_pdb_uniprot_residue_mapping(x['PDB'].lower(), x['uniprot_acc'], x['chain_id'], int(x['entity_id']), int(x['res_start']), int(x['res_end'])), axis=1)
df_pdbs_nonredundant

100%|██████████| 220/220 [01:59<00:00,  1.83it/s]
100%|██████████| 220/220 [00:00<00:00, 6346.92it/s]
100%|██████████| 220/220 [26:37<00:00,  7.26s/it]


Unnamed: 0,chain,PDB,uniprot_acc,chain_id,pdb_residues,entity_id,res_start,res_end,residues_uniprot
0,2vqjA,2VQJ,P56524,A,"[{'entity_id': 1, 'chains': [{'struct_asym_id'...",1,1,413,"[[2vqj_A, 4, T, P56524, 648, T], [2vqj_A, 5, K..."
1,1j3sA,1J3S,P99999,A,"[{'entity_id': 1, 'chains': [{'struct_asym_id'...",1,1,104,"[[1j3s_A, 1, G, P99999, 2, G], [1j3s_A, 2, D, ..."
2,2k04B,2K04,P61073,B,"[{'entity_id': 2, 'chains': [{'struct_asym_id'...",2,1,40,"[[2k04_B, 3, M, P61073, 1, M], [2k04_B, 4, E, ..."
3,7apjA,7APJ,M4MD44,A,"[{'entity_id': 1, 'chains': [{'struct_asym_id'...",1,1,440,"[[7apj_A, 122, A, M4MD44, 121, A], [7apj_A, 12..."
4,7y8rI,7Y8R,P51532,I,"[{'entity_id': 6, 'chains': [{'struct_asym_id'...",6,1,1647,"[7y8r_I, 0, 0, P51532, 0, 0]"
...,...,...,...,...,...,...,...,...,...
215,2lxsA,2LXS,Q92793,A,"[{'entity_id': 1, 'chains': [{'struct_asym_id'...",1,1,87,"[[2lxs_A, 1, G, Q92793, 587, G], [2lxs_A, 2, V..."
216,6s9wA,6S9W,P31749,A,"[{'entity_id': 1, 'chains': [{'struct_asym_id'...",1,1,446,"[[6s9w_A, 2, S, P31749, 2, S], [6s9w_A, 3, D, ..."
217,1blxA,1BLX,Q00534,A,"[{'entity_id': 1, 'chains': [{'struct_asym_id'...",1,1,326,"[[1blx_A, 1, M, Q00534, 1, M], [1blx_A, 2, E, ..."
218,2h60A,2H60,P51532,A,"[{'entity_id': 1, 'chains': [{'struct_asym_id'...",1,1,128,"[[2h60_A, 1, M, P51532, 1451, L], [2h60_A, 2, ..."


In [45]:
# the problematic ones
def _is_failed_mapping(mapping):
    # A successful mapping is a list of per-residue rows (lists); a failed one
    # is a single flat row [label, 0, 0, uniprot_id, 0, 0]. Checking the shape
    # rather than `len(x) <= 6` avoids flagging genuine mappings that happen to
    # have six or fewer residues. Empty mappings are still reported.
    return len(mapping) == 0 or not isinstance(mapping[0], list)

df_pdbs_nonredundant.loc[df_pdbs_nonredundant['residues_uniprot'].apply(_is_failed_mapping)]

Unnamed: 0,chain,PDB,uniprot_acc,chain_id,pdb_residues,entity_id,res_start,res_end,residues_uniprot
4,7y8rI,7Y8R,P51532,I,"[{'entity_id': 6, 'chains': [{'struct_asym_id'...",6,1,1647,"[7y8r_I, 0, 0, P51532, 0, 0]"
30,4yknA,4YKN,P42336,A,"[{'entity_id': 1, 'chains': [{'struct_asym_id'...",1,1,1383,"[4ykn_A, 0, 0, P42336, 0, 0]"
99,5zcsE,5ZCS,Q6R327,E,"[{'entity_id': 3, 'chains': [{'struct_asym_id'...",3,1,1708,"[5zcs_E, 0, 0, Q6R327, 0, 0]"
103,6zwmE,6ZWM,Q6R327,E,"[{'entity_id': 3, 'chains': [{'struct_asym_id'...",3,1,1708,"[6zwm_E, 0, 0, Q6R327, 0, 0]"
120,6ltjI,6LTJ,P51532,I,"[{'entity_id': 5, 'chains': [{'struct_asym_id'...",5,1,1647,"[6ltj_I, 0, 0, P51532, 0, 0]"
145,7vdvA,7VDV,P51532,A,"[{'entity_id': 7, 'chains': [{'struct_asym_id'...",7,1,1485,"[7vdv_A, 0, 0, P51532, 0, 0]"
207,4wxxB,4WXX,P26358,B,"[{'entity_id': 1, 'chains': [{'struct_asym_id'...",1,1,1256,"[4wxx_B, 0, 0, P26358, 0, 0]"


In [46]:
# example of mappings
# residue mapping stored for the row labelled 0
df_pdbs_nonredundant.at[0, 'residues_uniprot']

[['2vqj_A', 4, 'T', 'P56524', 648, 'T'],
 ['2vqj_A', 5, 'K', 'P56524', 649, 'K'],
 ['2vqj_A', 6, 'P', 'P56524', 650, 'P'],
 ['2vqj_A', 7, 'R', 'P56524', 651, 'R'],
 ['2vqj_A', 8, 'F', 'P56524', 652, 'F'],
 ['2vqj_A', 9, 'T', 'P56524', 653, 'T'],
 ['2vqj_A', 10, 'T', 'P56524', 654, 'T'],
 ['2vqj_A', 11, 'G', 'P56524', 655, 'G'],
 ['2vqj_A', 12, 'L', 'P56524', 656, 'L'],
 ['2vqj_A', 13, 'V', 'P56524', 657, 'V'],
 ['2vqj_A', 14, 'Y', 'P56524', 658, 'Y'],
 ['2vqj_A', 15, 'D', 'P56524', 659, 'D'],
 ['2vqj_A', 16, 'T', 'P56524', 660, 'T'],
 ['2vqj_A', 17, 'L', 'P56524', 661, 'L'],
 ['2vqj_A', 18, 'M', 'P56524', 662, 'M'],
 ['2vqj_A', 19, 'L', 'P56524', 663, 'L'],
 ['2vqj_A', 20, 'K', 'P56524', 664, 'K'],
 ['2vqj_A', 21, 'H', 'P56524', 665, 'H'],
 ['2vqj_A', 22, 'Q', 'P56524', 666, 'Q'],
 ['2vqj_A', 23, 'C', 'P56524', 667, 'C'],
 ['2vqj_A', 24, 'T', 'P56524', 668, 'T'],
 ['2vqj_A', 25, 'C', 'P56524', 669, 'C'],
 ['2vqj_A', 26, 'G', 'P56524', 670, 'G'],
 ['2vqj_A', 27, 'S', 'P56524', 671, 'S']