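동일한 단백질의 서로 다른 두 conformation을 비교한다. 두 PDB에서 atom name, res_type, chain, res_num이 모두 일치하는 원자끼리 merge하고(값이 하나라도 다르면 빠짐), 대응되는 원자 사이의 좌표 거리를 구한 뒤 잔기별로 평균한다. 이 노트북에서는 이 값을 편의상 RMSF(rmsf)라고 부르지만, 실제로는 두 구조 사이의 원자별 거리(RMSD 성격의 값)이다. 코드에서 구조 정렬(superposition)은 수행하지 않는다.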

In [173]:
import pandas as pd
import math
import os 

In [174]:
# Make the data directory the current working directory so the bare PDB/CSV
# file names used in later cells resolve against it.
path = '/home/siu/temp6'
os.chdir(path)

## Parse PDB. Extract Atom Names & Coords

In [251]:
ls

1.csv                           TMEM175_Q65P_pH7_5_chainA.pdb
3.csv                           TMEM175_WT_LAMP_8FY5_chainA.pdb
4.csv                           TMEM175_WT_pH4_5_chainA.pdb
5.csv                           TMEM175_WT_pH4_5_DCPIB_ACD_chainA.pdb
TMEM175_M393T_pH5_5_chainA.pdb  TMEM175_WT_pH4_5_ML6733_200uM__chainA.pdb
TMEM175_M393T_pH7_5_chainA.pdb  TMEM175_WT_pH7_5_chainA.pdb


In [306]:
# The two structures to compare. After the merge further down, columns coming
# from fileA carry the '_A' suffix and columns coming from fileB carry '_B'.
fileA = 'TMEM175_WT_LAMP_8FY5_chainA.pdb'
fileB = 'TMEM175_WT_pH4_5_ML6733_200uM__chainA.pdb'

In [307]:
# Parse the ATOM records of fileA into a DataFrame with integer column labels
# 0..10 (one column per whitespace-separated field, all values kept as strings).
#
# NOTE(review): PDB ATOM records are fixed-width, so whitespace splitting
# misaligns the fields whenever two adjacent columns touch (for example very
# large or negative coordinates) - confirm the inputs never do that.
with open(fileA, mode='r') as f:
    # Collect every row first and build the frame once; calling pd.concat per
    # line re-copies the whole table for each atom.
    rows = [line.split()[:11] for line in f if line.startswith('ATOM')]

tableA = pd.DataFrame(rows)

tableA[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,ATOM,1,N,ILE,A,30,138.706,148.565,140.003,1.0,53.24
1,ATOM,2,CA,ILE,A,30,138.052,149.757,140.61,1.0,53.24
2,ATOM,3,C,ILE,A,30,137.879,149.523,142.104,1.0,53.24
3,ATOM,4,O,ILE,A,30,137.33,148.504,142.517,1.0,53.24
4,ATOM,5,CB,ILE,A,30,136.697,150.057,139.942,1.0,53.24
5,ATOM,6,CG1,ILE,A,30,136.878,150.323,138.444,1.0,53.24
6,ATOM,7,CG2,ILE,A,30,136.029,151.255,140.608,1.0,53.24
7,ATOM,8,CD1,ILE,A,30,137.024,149.071,137.604,1.0,53.24
8,ATOM,9,H1,ILE,A,30,138.745,148.665,138.998,1.0,53.24
9,ATOM,10,H2,ILE,A,30,139.666,148.501,140.311,1.0,53.24


In [308]:
# Parse the ATOM records of fileB into a DataFrame with integer column labels
# 0..10 (one column per whitespace-separated field, all values kept as strings).
#
# NOTE(review): PDB ATOM records are fixed-width, so whitespace splitting
# misaligns the fields whenever two adjacent columns touch (for example very
# large or negative coordinates) - confirm the inputs never do that.
with open(fileB, mode='r') as f:
    # Collect every row first and build the frame once; calling pd.concat per
    # line re-copies the whole table for each atom.
    rows = [line.split()[:11] for line in f if line.startswith('ATOM')]

tableB = pd.DataFrame(rows)

tableB[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,ATOM,1,N,GLY,A,29,140.459,146.466,140.125,1.0,80.44
1,ATOM,2,CA,GLY,A,29,139.095,147.026,140.135,1.0,81.22
2,ATOM,3,C,GLY,A,29,139.117,148.492,140.518,1.0,82.82
3,ATOM,4,O,GLY,A,29,140.202,148.992,140.858,1.0,81.5
4,ATOM,5,H1,GLY,A,29,140.453,145.486,139.882,1.0,80.44
5,ATOM,6,H2,GLY,A,29,141.043,146.944,139.454,1.0,80.44
6,ATOM,7,H3,GLY,A,29,140.891,146.559,141.036,1.0,80.44
7,ATOM,8,HA3,GLY,A,29,138.563,146.423,140.872,1.0,81.22
8,ATOM,9,HA2,GLY,A,29,138.698,146.856,139.133,1.0,81.22
9,ATOM,10,N,ILE,A,30,137.969,149.163,140.459,1.0,74.4


## Calculate RMSF in DataFrame

In [309]:
# Pair up atoms present in both structures. Columns 2-5 are the atom name,
# residue name, chain and residue number; an atom is kept only when all four
# match, and the remaining columns are suffixed by their source table.
merge_keys = [2, 3, 4, 5]
df = tableA.merge(tableB, how='inner', on=merge_keys, suffixes=('_A', '_B'))

In [310]:
# Per-atom distance between the two structures, stored in the 'rmsf' column.
# Fields 6, 7 and 8 of each table are the parsed coordinates; they are still
# strings at this point, so cast them to float before doing arithmetic.
coord_columns = [f'{field}_{tag}' for tag in ('A', 'B') for field in (6, 7, 8)]
df = df.astype({column: 'float' for column in coord_columns})

delta_6 = df['6_A'] - df['6_B']
delta_7 = df['7_A'] - df['7_B']
delta_8 = df['8_A'] - df['8_B']
df['rmsf'] = (delta_6**2 + delta_7**2 + delta_8**2).map(math.sqrt)

# Keep only the identifying columns plus the distance, under readable names.
df = df[[2, 3, 4, 5, 'rmsf']]
df.columns = ['atom', 'residue', 'chain', 'resnum', 'rmsf']
df

Unnamed: 0,atom,residue,chain,resnum,rmsf
0,N,ILE,A,30,1.052953
1,CA,ILE,A,30,0.884547
2,C,ILE,A,30,1.304654
3,O,ILE,A,30,2.706368
4,CB,ILE,A,30,1.235941
...,...,...,...,...,...
5984,H,ALA,A,471,3.324287
5985,HA,ALA,A,471,6.093140
5986,HB1,ALA,A,471,10.020016
5987,HB2,ALA,A,471,8.389452


In [311]:
# Per-residue mean of the per-atom 'rmsf' values for three atom selections:
# CA only ('ca'), every matched atom ('all') and everything except N/CA/C/O
# ('side').
residue_keys = ['residue', 'chain', 'resnum']
backbone_atoms = ['N', 'CA', 'C', 'O']


def _mean_by_residue(atoms, label):
    """Average 'rmsf' per residue and return it in a column named *label*.

    Residues keep their order of first appearance (sort=False).
    """
    averaged = atoms.groupby(residue_keys, sort=False).mean(numeric_only=True)
    return averaged.reset_index().rename(columns={'rmsf': label})


# ca
ca = _mean_by_residue(df.loc[df['atom'] == 'CA', :], 'ca')

# all - held in all_atoms so the builtin all() is not shadowed for later cells
all_atoms = _mean_by_residue(df, 'all')

# side
# NOTE(review): only N, CA, C and O are excluded, so hydrogens bonded to the
# backbone also count towards 'side' - confirm this is intended.
resi = _mean_by_residue(df[~df['atom'].isin(backbone_atoms)], 'side')

# To DataFrame
# Inner merges: a residue missing from any one of the three tables is dropped.
rmsf = all_atoms.merge(resi, on=residue_keys).merge(ca, on=residue_keys)
value_columns = ['all', 'side', 'ca']
rmsf[value_columns] = rmsf[value_columns].round(2)
rmsf

Unnamed: 0,residue,chain,resnum,all,side,ca
0,ILE,A,30,1.57,1.59,0.88
1,GLN,A,31,3.14,3.83,1.07
2,CYS,A,32,1.87,2.04,1.55
3,SER,A,33,1.90,1.91,1.76
4,GLN,A,34,1.62,1.91,0.58
...,...,...,...,...,...,...
372,GLY,A,467,1.32,1.18,1.28
373,LEU,A,468,3.03,3.33,1.87
374,ALA,A,469,1.38,1.28,1.49
375,LEU,A,470,6.20,7.32,2.33


## Define Loop residues
Maestro의 Define에서 loop를 선택한 뒤, 다시 Define에서 residue number로 선택하기를 고르고 하단의 Selection을 누르면 현재 선택된 원자들의 번호가 다음과 같이 표시된다: at.n 1-42,158-187,217-258 ... 이어서 하단의 Res Num을 클릭하면 residue number로 바뀌어 다음과 같이 표시된다: res.num 28, 29, 30, 31 ... 이 residue number 목록을 아래 loops에 붙여 넣는다.

In [312]:
# Residue numbers treated as loop regions. A later cell removes these from the
# high-RMSF residue lists before printing the Maestro selection commands.
loops = [29, 30, 31, 32, 33, 49, 50, 51, 52, 57, 58, 59, 60, 67, 99, 100, 101, 102, 103, 104, 105, 106, 121, 122, 133, 134, 135, 136, 137, 138, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 190, 191, 204, 205, 206, 211, 220, 221, 255, 256, 257, 274, 275, 276, 284, 285, 286, 287, 288, 289, 294, 295, 296, 297, 298, 305, 306, 332, 333, 334, 335, 336, 337, 338, 339, 353, 354, 362, 363, 364, 371, 372, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 424, 425, 440, 441, 442, 443, 444, 445, 459, 460, 461, 469, 470, 471]

## Display high RMSF residues

In [313]:
import plotly.graph_objs as go

# Plot the three per-residue curves for chain A against residue number.
rmsf_A = rmsf[rmsf['chain'] == 'A']

fig = go.Figure()
for column, legend_label in (('all', 'All'), ('side', 'Side Chain'), ('ca', 'Ca')):
    trace = go.Scatter(x=rmsf_A['resnum'], y=rmsf_A[column], name=legend_label)
    fig.add_trace(trace)

fig.update_layout(
    title='RMSF',
    xaxis_title='Residue Number',
    yaxis_title='RMSF (Å)',
    width=1200,
    height=800,
)
fig.show()

In [314]:
# copy and paste printed text to maestro commands input
chain = 'A'
rmsf_criteria = 2


def _maestro_selection(table, column, chain_name, threshold, excluded):
    """Build the Maestro command selecting the high-value residues of one chain.

    Args:
        table: per-residue frame with 'chain', 'resnum' and *column* columns.
        column: name of the value column to threshold ('all' or 'ca' below).
        chain_name: chain identifier to select.
        threshold: residues whose value is strictly greater than this are kept.
        excluded: residue numbers to leave out of the selection.

    Returns:
        The ``workspaceselectionreplace ...`` command as a string.
    """
    is_hit = (table['chain'] == chain_name) & (table[column] > threshold)
    # Build the set once so each membership test below is a hash lookup.
    skip = set(excluded)
    residues = [num for num in map(int, table.loc[is_hit, 'resnum']) if num not in skip]
    res_text = ', '.join(map(str, residues))
    return f"workspaceselectionreplace (chain.name {chain_name}) AND (res.num {res_text})"


# rmsf > 2, all
print(_maestro_selection(rmsf, 'all', chain, rmsf_criteria, loops))

# rmsf > 2, Ca
print(_maestro_selection(rmsf, 'ca', chain, rmsf_criteria, loops))

workspaceselectionreplace (chain.name A) AND (res.num 35, 56, 61, 62, 63, 64, 65, 66, 68, 72, 83, 97, 140, 163, 179, 180, 182, 183, 185, 186, 189, 259, 291, 293, 300, 309, 365, 366, 367, 368, 369, 370, 375, 394, 464, 468)
workspaceselectionreplace (chain.name A) AND (res.num 62, 63, 64, 65, 66, 68, 179, 180, 182, 365, 366, 367, 368, 369)


In [241]:
# Save the per-residue table without the index column. The file name is fixed,
# so an existing file with this name in the working directory is overwritten.
output_csv = '5.csv'
rmsf.to_csv(output_csv, index=False)