# 以Biopython、NGLview處理蛋白質與ligand鍵結
https://nbviewer.jupyter.org/github/cgoliver/Notebooks/blob/master/COMP_364/L27/L27.ipynb


In [11]:
# import Bio.PDB
from Bio.PDB import *

# Access to the structure lists on the PDB
pdbl = PDBList()
# Fetch PDB structure file from PDB server, and store it locally.
pdbl.retrieve_pdb_file('4XP1')

Structure exists: '/home/magic/xp/4xp1.cif' 




'/home/magic/xp/4xp1.cif'

In [12]:
# Last step download a folder in our working directory
import os
os.listdir()

['Downloads',
 '2_pdb_superimpose.ipynb',
 'Untitled6.ipynb',
 '.bashrc',
 'Public',
 'Documents',
 'Pictures',
 '1ubq.pdb',
 '.bash_history',
 '.pki',
 '.gphoto',
 'Untitled11.ipynb',
 '.config',
 '.steampath',
 'anaconda3',
 '.ICEauthority',
 'Untitled7.ipynb',
 '.thunderbird',
 '.profile',
 '.mozilla',
 'Untitled9.ipynb',
 '1UBQ_aligned.pdb',
 'Untitled12.ipynb',
 '.python_history-17470.tmp',
 'cif_distance.ipynb',
 'Music',
 'breast_cancer_kernel.ipynb',
 '.condarc',
 'ls_orchid.fasta',
 'Untitled.ipynb',
 '.ipython',
 '.chewing',
 'snap',
 'Untitled5.ipynb',
 '.python_history',
 '.gnupg',
 'Videos',
 'Untitled4.ipynb',
 '.ipynb_checkpoints',
 '.chimera',
 '.jupyter',
 '.gnome',
 'Untitled1.ipynb',
 'breast_cancer.ipynb',
 '.sudo_as_admin_successful',
 '.ssh',
 'Templates',
 'google-chrome-stable_current_amd64.deb',
 '2klg.pdb',
 'Untitled10.ipynb',
 '.steampid',
 '.pulse-cookie',
 'Untitled2.ipynb',
 'Untitled13.ipynb',
 '.anaconda',
 '.local',
 '.cache',
 'Untitled8.ipynb',
 '.co

In [13]:
os.listdir('xp')

['4xp1.cif']

# Parsing Structure files

In [14]:
# Parse a mmCIF file and return a Structure object.
parser = MMCIFParser()

In [15]:
# Return the structure.
# get_structure(self, structure_id, filename)
structure = parser.get_structure('4XP1', 'xp/4xp1.cif')



In [17]:
# Check the attributes.
def cleandir(obj):
    print(', '.join([a for a in dir(obj) if not a.startswith("_")]))
cleandir(structure)

add, child_dict, child_list, copy, detach_child, detach_parent, full_id, get_atoms, get_chains, get_full_id, get_id, get_iterator, get_level, get_list, get_models, get_parent, get_residues, has_id, header, id, insert, level, parent, set_parent, transform, xtra


# Model -> Chain -> Residue -> Atom
- Model: whole structure, can contain multiple models
- Chain: Tertiary structure of protein
- Residue: amino acids
- atom: atom

# Visualizing structures in jupyter Notebooks
NGLview troubleshooting (When NGLview can't show image on jupyter)

https://github.com/SBRG/ssbio/wiki/Troubleshooting#nglviewer-fresh-install-tips

In [18]:
# import NGLview
import nglview as nv

In [36]:
# Show NGLwidget with Biopython structural entity.(Structure, Model, Chain, Residue or Atom)
view = nv.show_biopython(structure)
# Show structure（預設是cartoon)
view

NGLWidget()

In [37]:
# clear all representations for given component. 不加此行，會同時顯示cartoon跟ball_and_stick
view.clear_representations()
view.add_ball_and_stick()
view

NGLWidget(n_components=1)

In [42]:
# protein以cartoon顯示，其餘分子以ball_and_stick顯示
view.clear_representations()
view.add_cartoon('protein')
view.add_ball_and_stick('not protein')
view

NGLWidget(n_components=1)

In [50]:
# 顯示structure中有多少model，此範例只有一個
for model in structure:
    print(f"model {model}")

model <Model id=0>


In [51]:
# 顯示model中有多少chain，要先選擇structure中的某個model才能選擇chain
model = structure[0]
for chain in model:
    print(f"chain {chain}, Chain ID: {chain.id}")

chain <Chain id=A>, Chain ID: A
chain <Chain id=L>, Chain ID: L
chain <Chain id=H>, Chain ID: H


In [52]:
# 選擇model中的chain A
chain_A = model['A']

In [53]:
# 顯示chain A中的所有residue，列出殘基縮寫與殘基編號(殘基包含胺基酸、鍵結分子、溶劑分子等）
for res in chain_A:
    print(f"Residue name: {res.resname}, number: {res.id[1]}")

Residue name: ASP, number: 25
Residue name: GLU, number: 26
Residue name: ARG, number: 27
Residue name: GLU, number: 28
Residue name: THR, number: 29
Residue name: TRP, number: 30
Residue name: SER, number: 31
Residue name: GLY, number: 32
Residue name: LYS, number: 33
Residue name: VAL, number: 34
Residue name: ASP, number: 35
Residue name: PHE, number: 36
Residue name: LEU, number: 37
Residue name: LEU, number: 38
Residue name: SER, number: 39
Residue name: VAL, number: 40
Residue name: ILE, number: 41
Residue name: GLY, number: 42
Residue name: PHE, number: 43
Residue name: ALA, number: 44
Residue name: VAL, number: 45
Residue name: ASP, number: 46
Residue name: LEU, number: 47
Residue name: ALA, number: 48
Residue name: ASN, number: 49
Residue name: VAL, number: 50
Residue name: TRP, number: 51
Residue name: ARG, number: 52
Residue name: PHE, number: 53
Residue name: PRO, number: 54
Residue name: TYR, number: 55
Residue name: LEU, number: 56
Residue name: CYS, number: 57
Residue na

In [56]:
# 選擇chain A的第56個殘基
res = chain_A[56]

In [57]:
# 顯示所選擇的殘基
print(res)

<Residue LEU het=  resseq=56 icode= >


In [18]:
# 顯示該殘基的所有原子名(IUPAC命名）
for atom in res:
    print(f'{atom.name}')

N
CA
C
O
CB
CG
CD1
CD2


In [58]:
# 顯示structrue中的所有原子名
for model in structure:
    for chain in model:
        for residue in chain:
            for atom in residue:
                print(atom)

<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom OD1>
<Atom OD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom OE1>
<Atom OE2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom NE>
<Atom CZ>
<Atom NH1>
<Atom NH2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom OE1>
<Atom OE2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom NE1>
<Atom CE2>
<Atom CE3>
<Atom CZ2>
<Atom CZ3>
<Atom CH2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom CE>
<Atom NZ>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom OD1>
<Atom OD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom CE1>
<

<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom SG>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG1>
<Atom CG2>
<Atom CD1>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom CE1>
<Atom CE2>
<Atom CZ>
<Atom OH>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG1>
<Atom CG2>
<Atom CD1>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom SG>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom CE1>
<Atom CE2>
<Atom CZ>
<Atom OH>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom CE1>
<Atom CE2>
<Atom CZ>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG>

<Atom O>
<Atom CB>
<Atom OG>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom OE1>
<Atom OE2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom OD1>
<Atom OD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG1>
<Atom CG2>
<Atom CD1>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom NE>
<Atom CZ>
<Atom NH1>
<Atom NH2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom OD1>
<Atom OD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom SD>
<Atom CE>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG1>
<Atom CG2>
<Atom CD1>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom CE1>
<Atom CE2>
<Atom CZ>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<A

<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom SD>
<Atom CE>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom CE>
<Atom NZ>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom OD1>
<Atom OD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom CE1>
<Atom CE2>
<Atom CZ>
<Atom OH>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom OE1>
<Atom OE2>
<Atom N>
<Atom CA>
<Atom C>
<A

In [63]:
# Parse mmcif file information into a dictionary.
# mmcif檔案內容 http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/index.html
struc_dict = MMCIF2Dict.MMCIF2Dict('xp/4xp1.cif')

In [64]:
# Show keys of struc_dict
print(struc_dict.keys())

dict_keys(['data_', '_entry.id', '_audit_conform.dict_name', '_audit_conform.dict_version', '_audit_conform.dict_location', '_database_2.database_id', '_database_2.database_code', '_pdbx_database_related.db_name', '_pdbx_database_related.details', '_pdbx_database_related.db_id', '_pdbx_database_related.content_type', '_pdbx_database_status.status_code', '_pdbx_database_status.status_code_sf', '_pdbx_database_status.status_code_mr', '_pdbx_database_status.entry_id', '_pdbx_database_status.recvd_initial_deposition_date', '_pdbx_database_status.SG_entry', '_pdbx_database_status.deposit_site', '_pdbx_database_status.process_site', '_pdbx_database_status.status_code_cs', '_pdbx_database_status.methods_development_category', '_pdbx_database_status.pdb_format_compatible', '_audit_author.name', '_audit_author.pdbx_ordinal', '_citation.abstract', '_citation.abstract_id_CAS', '_citation.book_id_ISBN', '_citation.book_publisher', '_citation.book_publisher_city', '_citation.book_title', '_citation

In [73]:
# _struct_site: structurally relevent sites(active sites, substract-binding subsites, metal-coordination sites)
struc_dict['_struct_site.details']

['binding site for residue NA A 701',
 'binding site for residue NA A 702',
 'binding site for residue CL A 703',
 'binding site for residue MAL A 704',
 'binding site for residue MAL A 705',
 'binding site for residue P4G A 707',
 'binding site for residue LDP A 708',
 'binding site for residue EDO A 709',
 'binding site for residue Y01 A 710',
 'binding site for residue CLR A 711',
 'binding site for residue NA L 301',
 'binding site for Mono-Saccharide NAG A 706 bound to ASN A 141']

In [74]:
# Find id of LDP: AC7
struc_dict['_struct_site.id']

['AC1',
 'AC2',
 'AC3',
 'AC4',
 'AC5',
 'AC6',
 'AC7',
 'AC8',
 'AC9',
 'AD1',
 'AD2',
 'AD3']

In [23]:
# 顯示文獻標題
struc_dict['_citation.title']

['Neurotransmitter and psychostimulant recognition by the dopamine transporter.']

In [78]:
site_ID = struc_dict['_struct_site_gen.site_id']
site_chain = struc_dict['_struct_site_gen.auth_asym_id']
site_resnum = struc_dict['_struct_site_gen.auth_seq_id']
site_resname = struc_dict['_struct_site_gen.label_comp_id']

# 找所有binding site ID為AC7的殘基
cif_binding_residues = []
for bind_id, ch, num, name in zip(site_ID, site_chain, site_resnum, site_resname):
    if bind_id == 'AC7':
        print(bind_id, ch, num, name)
        try:
            cif_binding_residues.append(structure[0][ch][int(num)])
        except:
            continue
    else:
        continue

AC7 A 46 ASP
AC7 A 117 ALA
AC7 A 120 VAL
AC7 A 121 ASP
AC7 A 124 TYR
AC7 A 325 PHE
AC7 A 422 SER
AC7 A 425 GLY
AC7 A 812 HOH


In [111]:
# get_residues(): 擷取特定殘基的資訊
LDP = None
for res in structure[0].get_residues():
    if res.resname == "LDP":
        LDP = res
        break
print(LDP)

<Residue LDP het=H_LDP resseq=708 icode= >


In [181]:
cutoff = 5

binding_residues = []
# 找所有model中與LDP距離5 angstroms的殘基
for chain in model:
    for res in chain:
        # 跳過LDP本身
        if res == LDP:
            continue
        # 跳過非胺基酸殘基('H')
        elif res.id[0].startswith("H"):
            continue
        else:
            alpha_carbon = res['CA']
            distances = []
            for atom in LDP:
                # 兩個殘基X, Y, Z座標的差(LDP為整個分子，其他殘基為'CA'原子，所以個每個殘基會產生多個差）
                diff_vector = alpha_carbon.coord - atom.coord
                # 兩個殘基的距離
                distances.append(np.sqrt(np.sum(diff_vector * diff_vector)))
            # 只要殘基與LDP的最小距離< cutoff，殘基就會被加入binding_residues
            if min(distances) < cutoff:
                binding_residues.append(res.id[1])
# 會跳出KeyError，但binding_residues還是有收到所需的id

KeyError: 'CA'

In [191]:
cutoff = 5

binding_residues = []
# 找所有model中與LDP距離5 angstroms的殘基
for res in chain:
        # 跳過LDP本身
    if res == LDP:
        continue
        # 跳過非胺基酸殘基('H')
    elif res.id[0].startswith("H"):
        continue
    else:
        alpha_carbon = res['CA']
        distances = []
        for atom in LDP:
                # 兩個殘基X, Y, Z座標的差(LDP為整個分子，其他殘基為'CA'原子，所以個每個殘基會產生多個差）
            diff_vector = alpha_carbon.coord - atom.coord
                # 兩個殘基的距離
            distances.append(np.sqrt(np.sum(diff_vector * diff_vector)))
            # 只要殘基與LDP的最小距離< cutoff，殘基就會被加入binding_residues
        if min(distances) < cutoff:
            binding_residues.append(res.id[1])
# 會跳出KeyError，但binding_residues還是有收到所需的id

KeyError: 'CA'

In [192]:
binding_residues

[117, 118, 120, 121, 421, 422, 425]

# 參考資料的_set_color_by_residue無法使用，改使用以下方式顯示顏色
https://nbviewer.jupyter.org/github/arose/nglview/blob/644fa66a8461b52c01a5e4df4d2a99486b690fea/notebooks/custom_color.ipynb

In [168]:
from nglview.color import ColormakerRegistry

In [184]:
chain = structure[0]['A']

In [185]:
chain

<Chain id=A>

In [193]:
binding_residues_str = ', '.join(map(str, binding_residues))

In [194]:
binding_residues_str

'117, 118, 120, 121, 421, 422, 425'

In [167]:
cm = ColormakerRegistry
cm.add_selection_scheme("awesome", [['blue', binding_residues_str]])

In [195]:
view = nv.show_biopython(structure)
view.clear_representations()
view.add_cartoon(color='awesome')
view.add_ball_and_stick('not protein')
view
# 會選到別的chain編號一樣的殘基

NGLWidget()