<a href="https://colab.research.google.com/github/porekhov/CG_probeMD/blob/main/colabind_probeMD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title **Install Conda Colab**

# @markdown **Important note:** Select the Runtime > "Change runtime type" menu to enable a GPU before you run this notebook. It is required for efficient molecular dynamics simulations using the OpenMM engine, https://openmm.org/.


# @markdown The kernel will crash and restart during the installation (this is normal).

# @markdown Please don't run this notebook using the "Run all" option.

# Install the condacolab helper with pip, then let it set up conda in this runtime.
# Output of the cell is silenced by %%capture.
%%capture
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
#@title **Check wherher you connected a GPU runtime**

# @markdown Run this cell to check if the Colab is using a GPU runtime.

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print('The GPU is enabled.')
  print(gpu_info)

In [None]:
#@title **Install dependencies**
# @markdown It will take up to 4-5 minutes to install openMM and other dependencies
%%capture
!conda install -c omnia openmm
!conda install -c conda-forge mdtraj -y
!conda install -c conda-forge pdbfixer -y
!conda install -c conda-forge mdanalysis -y
!pip install py3Dmol
!pip install vermouth
# install martini_openmm from git
%cd /content
!git clone https://github.com/maccallumlab/martini_openmm.git
%cd /content/martini_openmm
#!sed -i '269s/.*/        with open(file, encoding="utf-8") as lines:/' /content/martini_openmm/martini_openmm/martini.py
!python setup.py install
%cd /content

import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
#@title **Upload target PDB file and choose concentrations of probes**
# @markdown Upload a pdb file with the target and choose the chain(s) to be simulated (upon running the cell, scroll down and click "upload").

# @markdown The uploaded file will be processed with pdbfixer, https://github.com/openmm/pdbfixer. So, minor problems such as missing heavy atoms, non-standard residue names, etc. should be automatically fixed.

#@markdown Choose chain(s), comma separated:
ChainID = "A" #@param {type:"string"}

#@markdown Set relative concentrations for water/probes:

#@markdown Water:
W = 0.952 #@param {type:"number"}
#@markdown Phenol:
PHEN = 0.003 #@param {type:"number"}
#@markdown Acetic acid:
ACET = 0.009 #@param {type:"number"}
#@markdown Isopropyl amide:
IPA = 0.009 #@param {type:"number"}
#@markdown Dimethylacetamide:
DMAD = 0.009 #@param {type:"number"}
#@markdown Isopropanol:
IPO = 0.009 #@param {type:"number"}
#@markdown Acetone:
PPN = 0.009 #@param {type:"number"}

from google.colab import files
import MDAnalysis as mda
import mdtraj as md
import warnings
warnings.filterwarnings('ignore')

![ ! -d "/content/simulation" ] && mkdir /content/simulation
%cd /content/simulation
! [ -n "$(find . -maxdepth 1 -type f -name '*.pdb')" ] && rm *pdb
pdb_file = files.upload()
pdb_file_name = list(pdb_file.keys())[0]

u = mda.Universe(pdb_file_name)
sel = u.select_atoms('protein and segid ' + ChainID.replace(',', ' '))
sel.write('all_atom_selected.pdb')

![ ! -d "/content/simulation/openmm_probeMD" ] && git clone https://github.com/porekhov/openmm_probeMD.git
!cp -r /content/simulation/openmm_probeMD/* /content/simulation

# fix PDB
import pdbfixer
from openmm.app import PDBFile
fixer = pdbfixer.PDBFixer(filename = 'all_atom_selected.pdb')
fixer.findMissingResidues()
# Non-standard residues are replaced BEFORE missing atoms are detected, so that
# atoms required by the replacement residue are also rebuilt by addMissingAtoms().
fixer.findNonstandardResidues()
fixer.replaceNonstandardResidues()
fixer.findMissingAtoms()
fixer.addMissingAtoms()
# The context manager guarantees the file is flushed and closed before it is
# read again (by mdtraj and martinize2) later in this cell.
with open('aa_fixed.pdb', 'w') as f_fixed:
    PDBFile.writeFile(fixer.topology, fixer.positions, f_fixed)

# define the secondary structure
str_ini = md.load('aa_fixed.pdb', top='aa_fixed.pdb')
ss = md.compute_dssp(str_ini, simplified=True)[0]
ss = "'" + ''.join(ss) + "'"

# string with the insane.py keys setting
# the selected probes and their concentrations
sol_str = ' -sol W:' + str(W) + ' -sol PHEN:' + str(PHEN) + \
' -sol ACET:' + str(ACET) + ' -sol IPA:'+ str(IPA) + \
' -sol DMAD:' + str(DMAD) + ' -sol IPO:' + str(IPO) + \
' -sol PPN:' + str(PPN)

# create the topology file
with open('topol.top', 'w') as f_out:
    f_out.write('#include "_martini/martini_v3.0.0.itp"\n')
    f_out.write('#include "_martini/martini_v3.0.0_ions_v1.itp"\n')
    f_out.write('#include "_martini/martini_v3.0.0_probes_v1.itp"\n')
    f_out.write('#include "_martini/martini_v3.0.0_solvents_v1.itp"\n')
    f_out.write('#include "molecule_0.itp"\n')
    f_out.write('[ system ]\n')
    f_out.write('CG system for ' + pdb_file_name + '\n')
    f_out.write('[ molecules ]\n')
    f_out.write('molecule_0    1\n')

with open('prep.sh', 'w') as f_out:
    f_out.write('martinize2 -maxwarn 100 -f aa_fixed.pdb -x _cg.pdb -o _topol.top -scfix -cys auto  -elastic -p backbone -nt -merge ' + ChainID + ' -ss ' + ss + '\n')
    f_out.write('python insane_probes.py -f _cg.pdb -o _cg_sol.gro -p _insane.top -salt 0 -charge auto ' + sol_str + ' -pbc cubic -d 3.0' + '\n')
    f_out.write("sed -n -E '/^(CLBZ|PHEN|BENZ|ACE|IPA|DMAD|IPO|PPN|NA|CL|W)/p' _insane.top >> topol.top")
!chmod +x prep.sh
!./prep.sh
!sed -i '/^;/d' molecule_0.itp

In [None]:
#@title **Run the simulations**
# @markdown It will take about 15-20 minutes to obtain a 10 0ns trajectory for a protein with 150 a.a.
# @markdown After energy minimization, two equilibration simulations are run in NVT and NPT ensembles followed by the production simulation in NVT.

#@markdown Temperature (in Kelvin)
Temperature = 303.15 #@param {type:"string"}
temp = float(Temperature)
#@markdown Number of steps in the production simulation (* 0.02 ps = total simulation time)
Num_steps = 5000000 #@param {type:"integer"}
nsteps = Num_steps

%cd /content/simulation

from openmm import unit as u
import openmm as mm
from openmm import app
from openmm.app import StateDataReporter
import martini_openmm.martini_openmm as martini
from mdtraj.reporters import XTCReporter

epsilon_r = 15.0
gpu_id = '0'

platform = mm.Platform.getPlatformByName("CUDA")
properties = {'DeviceIndex': gpu_id, 'Precision': 'single'}
defines = {}

### Minimization and NVT eq ###
conf = app.GromacsGroFile("_cg_sol.gro")
box_vectors = conf.getPeriodicBoxVectors()

top = martini.MartiniTopFile("topol.top",
                             periodicBoxVectors=box_vectors,
                             defines=defines,
                             epsilon_r=epsilon_r)

system = top.create_system(nonbonded_cutoff=1.1 * u.nanometer)
integrator = mm.LangevinIntegrator(temp * u.kelvin, 1.0 / u.picosecond, 1 * u.femtosecond)
md = mm.app.Simulation(top.topology, system, integrator, platform, properties)
md.context.setPositions(conf.positions)
md.context.setVelocitiesToTemperature(temp*u.kelvin)
# minimize
print('Minimizing...')
md.minimizeEnergy(maxIterations=5000,tolerance=1.0)
print('Running the NVT equilibration...')
md.step(50000)
system.addForce(mm.openmm.MonteCarloBarostat(1*u.bar, temp*u.kelvin))
integrator.setStepSize(20 * u.femtosecond)
print('Running the NPT equilibration...')
md.step(500000)
xtc_reporter = XTCReporter('prod.xtc', 500)
md.reporters.append(xtc_reporter)
integrator.setFriction(10.0 / u.picosecond)
print('Running the production simulation for ', round(nsteps * 0.00002, 2), ' ns...')
md.step(nsteps)

In [None]:
#@title **Preprocess the trajectory**
# @markdown Firstly, each frame is translated to the protein's center of mass, then solvent and probes are wrapped around the protein, and, finally, each frame is aligned to the protein again using the roto-translational fit.

import MDAnalysis as mda
import MDAnalysis.transformations

u = mda.Universe('_cg_sol.gro', 'prod.xtc', in_memory=True)

# The .gro file carries no bonds; take them from the CG PDB written by martinize2.
u_bonds = mda.Universe('_cg.pdb')
bonds = [(i[0].index, i[1].index) for i in u_bonds.bonds]
# Extra bond between the 4th and 5th bead of every TRP residue.
# NOTE(review): presumably needed so that unwrap() sees TRP as one connected fragment - confirm.
for trp in u_bonds.select_atoms('resname TRP').residues:
    bonds.append((trp.atoms[3].index, trp.atoms[4].index))
u.add_TopologyAttr('bonds', bonds)

prot = u.select_atoms('not resname W ION PHEN ACET IPA DMAD IPO PPN')
prot_BB = u.select_atoms('name BB and not resname W ION PHEN ACET IPA DMAD IPO PPN')
probes = u.select_atoms('resname PHEN ACET IPA DMAD IPO PPN')
sel_prot_probes = u.select_atoms('not resname W ION')
sel_all = u.select_atoms('all')

# reference backbone: the starting (solvated) structure
u_ref = mda.Universe("_cg_sol.gro")
prot_ref = u_ref.select_atoms('name BB and not resname W ION PHEN ACET IPA DMAD IPO PPN')

# applied to every frame, in this order
workflow = [mda.transformations.unwrap(prot),
            mda.transformations.fit_translation(prot_BB, prot_ref),
            mda.transformations.wrap(probes),
            mda.transformations.fit_rot_trans(prot_BB, prot_ref)]

u.trajectory.add_transformations(*workflow)

# protein + probes only (no water/ions); file names are kept as used by the later cells
sel_prot_probes.write("prod_wraped.pdb")

# The writer is bound to `xtc_writer`, not `W`: `W` is the water-concentration
# form parameter of the upload cell and must not be overwritten here.
with mda.Writer("prod_wraped.xtc", sel_prot_probes.n_atoms) as xtc_writer:
    for ts in u.trajectory:
        xtc_writer.write(sel_prot_probes)

# last frame of the full system, used later for the bulk-density estimate
u.trajectory[-1]
sel_all.write('prod_last.gro')
print('Done.')

In [None]:
#@title **Calculate densities**
# @markdown
from MDAnalysis.analysis import align
from MDAnalysis.analysis.density import DensityAnalysis
from gridData import Grid
from sklearn.cluster import MeanShift
import numpy as np

# bandwidth for MeanShift
bandwidth = 6.0

# calculate bulk densities
u = mda.Universe('prod_last.gro')
# NOTE(review): the factor 4 presumably reflects one CG water bead standing for
# four water molecules - confirm. Only used by the alternative n0 formula below.
n_water = u.select_atoms('resname W').n_atoms * 4
# box volume as the product of the three box lengths (the box is built with "-pbc cubic")
box_volume = u.dimensions[0]*u.dimensions[1]*u.dimensions[2]

# each entry: [list of residue names, output label, bulk number density n0]
probes = []
probe_resnames = ['PHEN', 'ACET', 'IPA', 'DMAD', 'IPO', 'PPN']
probe_names = ['PHE', 'ACT', 'IPA', 'DMA', 'IPL', 'ACN']

# joint density of all probes (virtual sites, name VS*, are not counted)
n_probe = u.select_atoms('not name VS* and (resname ' + ' '.join(probe_resnames) + ')').n_atoms
n0 = n_probe/box_volume
# uncomment to calculate n0 as in 10.1073/pnas.2214024119
#n0 = 6.02e23*55.56*n_probe/n_water/10e27
probes.append([probe_resnames, 'ALL', n0])

for p, n in zip(probe_resnames, probe_names):
    n_probe = u.select_atoms('not name VS* and (resname ' + p + ')').n_atoms
    # Per-probe bulk density. The result used to be computed and thrown away, so
    # every probe was normalised by the joint density of ALL probes and probes
    # absent from the system (n0 == 0) were never skipped downstream.
    n0 = n_probe/box_volume
    #n0 = 6.02e23*55.56*n_probe/n_water/10e27
    probes.append([[p], n, n0])

# Settings shared by the density loop that follows.
# `temp` is the simulation temperature defined in the "Run the simulations" cell.
grid_d = 2 # grid spacing
# NOTE(review): the constant 0.0083188 differs slightly from the 8.314/1000 used
# for the density -> energy conversion in the density loop - confirm which is intended.
kt = 0.0083188*temp # kT value for cut-off

# align the AA model to the CG model

ref = mda.Universe('prod_wraped.pdb')
# rename the CG backbone beads to CA so that the 'name CA' selection below
# picks atoms in both the CG reference and the all-atom model
ref.select_atoms('name BB').names = 'CA'
mobile = mda.Universe('aa_fixed.pdb')

align.alignto(mobile, ref, select='name CA and not altloc B C D E', match_atoms=False)
mobile.select_atoms('protein').write('aa_aligned.pdb')

# processed trajectory (protein + probes) written by the preprocessing cell
u = mda.Universe('prod_wraped.pdb', 'prod_wraped.xtc')

# the protein centre of mass (current frame) is used as the centre of the density grids
protein = u.select_atoms('protein')
protein_COM = protein.center_of_mass()

for p in probes:
    # each entry is [residue names, output label, bulk number density]
    p_name, out_name, n0 = p

    # nothing to normalise against when the probe is absent from the system
    if n0 == 0:
        continue

    # probe particles (virtual sites excluded) within 6 of the protein,
    # re-evaluated on every frame
    sel_str = 'not name VS* and (resname {} and around 6 protein)'.format(' '.join(p_name))
    p_sel = u.select_atoms(sel_str, updating=True)

    # density calculation on a grid centred on the protein and spanning the box
    box_x, box_y, box_z = u.dimensions[0], u.dimensions[1], u.dimensions[2]
    D = DensityAnalysis(p_sel, delta=grid_d, gridcenter=protein_COM,
                        xdim=box_x, ydim=box_y, zdim=box_z)
    D.run()

    # ln(density / bulk density); empty cells give log(0) = -inf and are reset to 0
    log_ratio = np.log(D.results.density.grid/n0)
    log_ratio[np.isneginf(log_ratio)] = 0
    # density -> kJ/mol
    grid = (-8.314*temp/1000)*log_ratio
    # keep only the cells below the -kt cut-off, zero everything else
    grid = grid*(grid < -1*kt)

    D.results.density.grid = grid
    D.results.density.export('dens_' + out_name + '.dx')
    print("Wrote file for: {}, min value: {:.2f}".format(out_name, grid.min()))
print('Done.')

In [None]:
#@title **Density clustering**
# @markdown Change the bandwidth parameter if you are not satisfied with the clustering.
# @markdown For more details, see https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html
# @markdown Set to 0 for automatic estimation; however, 6 usually performs well.

Bandwidth = 6 #@param {type:"number"}
# MeanShift bandwidth used below; 0 selects MeanShift's own bandwidth estimate
bandwidth = float(Bandwidth)

def pdb_line(at_num, at_type, res_type, chain_id, res_num, xyz, occ, temp):
    """Return one fixed-width PDB ``ATOM`` record (without a trailing newline).

    at_num   -- atom serial number (right-aligned, width 5)
    at_type  -- atom name (width 4)
    res_type -- residue name (width 3)
    chain_id -- chain identifier (width 1)
    res_num  -- residue number (integer, width 4)
    xyz      -- indexable with three coordinates, written with 3 decimals
    occ      -- value for the occupancy column, 2 decimals
    temp     -- value for the temperature (beta) factor column, 2 decimals
    """
    x, y, z = xyz[0], xyz[1], xyz[2]
    fields = [
        "ATOM  ",
        "%5s" % (at_num,),
        " ",
        "%4s" % (at_type,),
        " ",
        "%3s" % (res_type,),
        " ",
        "%1s" % (chain_id,),
        "%4d" % (res_num,),
        "    ",
        "%8.3f" % (x,),
        "%8.3f" % (y,),
        "%8.3f" % (z,),
        "%6.2f" % (occ,),
        "%6.2f" % (temp,),
    ]
    return "".join(fields)

# Cluster the favourable grid cells of every probe density and write them out.
# All six output files are opened in one with-statement so that they are closed
# (and flushed) even if clustering fails for one of the probes.
with open('centers_mean_ene.pdb', 'w') as f_out_centers_mean_ene, \
     open('centers_min_ene.pdb', 'w') as f_out_centers_min_ene, \
     open('centers_sum_ene.pdb', 'w') as f_out_centers_sum_ene, \
     open('clusters_mean_ene.pdb', 'w') as f_out_clusters_mean_ene, \
     open('clusters_min_ene.pdb', 'w') as f_out_clusters_min_ene, \
     open('clusters_sum_ene.pdb', 'w') as f_out_clusters_sum_ene:

    for p in probes:
        p_name = p[0]
        out_name = p[1]
        n0 = p[2]

        # no density file is written for probes that are absent from the system
        if n0 == 0:
            continue

        G = Grid('dens_' + out_name + '.dx')

        grid = G.grid
        # A grid value belongs to the centre of its cell, so hotspot coordinates are
        # taken from the cell midpoints. Using the cell edges instead would shift
        # every hotspot by half a grid spacing along each axis.
        cell_centers = G.midpoints

        # indices of all cells with a negative (favourable) energy, in i, j, k order
        idx = np.argwhere(grid < 0)
        if idx.shape[0] == 0:
            # MeanShift cannot be fitted to an empty set of points
            print('No hotspots found for: {}, skipping.'.format(out_name))
            continue

        hotspots = np.column_stack([cell_centers[0][idx[:, 0]],
                                    cell_centers[1][idx[:, 1]],
                                    cell_centers[2][idx[:, 2]]])
        energies = grid[idx[:, 0], idx[:, 1], idx[:, 2]]

        if bandwidth != 0:
            clustering = MeanShift(bandwidth=bandwidth).fit(hotspots)
        else:
            # bandwidth 0 means: let MeanShift estimate it
            clustering = MeanShift().fit(hotspots)
        labels = clustering.labels_

        # clusters[id] = [size, mean energy, min energy, sum energy, [position of min energy], [center of cluster]]
        clusters = {}
        for i in np.unique(labels):
            member = labels == i
            ene = energies[member]
            xyz = hotspots[member]
            # only clusters whose total energy is below -10 are kept
            if ene.sum() < -10:
                clusters[i] = [xyz.shape[0], ene.mean(), ene.min(), ene.sum(),
                               xyz[ene.argmin()], xyz.mean(axis=0)]

        # cluster ids ranked from the lowest to the highest mean / min / sum energy
        clusters_mean_sort = [i[0] for i in sorted(clusters.items(), key=lambda item: item[1][1])]
        clusters_min_sort = [i[0] for i in sorted(clusters.items(), key=lambda item: item[1][2])]
        clusters_sum_sort = [i[0] for i in sorted(clusters.items(), key=lambda item: item[1][3])]

        # pdb_line(at_num, at_type, res_type, chain_id, res_num, xyz, occ, temp)
        # One record per cluster. Columns give the indices into clusters[id] used
        # for the position, the occupancy and the beta factor:
        #   mean ranking: position of min energy, occ = min energy,  beta = mean energy
        #   min ranking:  position of min energy, occ = mean energy, beta = min energy
        #   sum ranking:  center of cluster,      occ = mean energy, beta = sum energy
        center_outputs = (
            (f_out_centers_mean_ene, clusters_mean_sort, 4, 2, 1),
            (f_out_centers_min_ene, clusters_min_sort, 4, 1, 2),
            (f_out_centers_sum_ene, clusters_sum_sort, 5, 1, 3),
        )
        for f_out, ranking, pos_col, occ_col, beta_col in center_outputs:
            for rank, cid in enumerate(ranking, start=1):
                c = clusters[cid]
                f_out.write(pdb_line(rank, out_name, out_name, 'A', rank,
                                     c[pos_col], c[occ_col], c[beta_col]) + '\n')

        # All hotspots of every cluster: residue number = cluster rank,
        # occupancy = hotspot energy, beta = the cluster energy used for the ranking.
        cluster_outputs = (
            (f_out_clusters_mean_ene, clusters_mean_sort, 1),
            (f_out_clusters_min_ene, clusters_min_sort, 2),
            (f_out_clusters_sum_ene, clusters_sum_sort, 3),
        )
        for f_out, ranking, beta_col in cluster_outputs:
            k = 1  # running atom serial number, restarted for every probe and file
            for rank, cid in enumerate(ranking, start=1):
                member = labels == cid
                for spot_ene, spot_xyz in zip(energies[member], hotspots[member]):
                    f_out.write(pdb_line(k, out_name, out_name, 'A', rank,
                                         spot_xyz, spot_ene, clusters[cid][beta_col]) + '\n')
                    k += 1

print('Done.')



In [None]:
#@title **Run to download PDBs with results**

# @markdown The following files are created:

# @markdown **1. centers_mean_ene.pdb** Clusters sorted by their mean free energy (from the lowest to the highest). Each cluster is represented by a single hotspot with the minimal energy. Residue name = probe type (ALL for the joint density); Residue number = cluster rank; occupancy = the cluster minimal energy; beta factor = the cluster mean energy.

# @markdown **2. centers_min_ene.pdb** Clusters sorted by their minimal free energy (from the lowest to the highest). Each cluster is represented by a single hotspot with the minimal energy. Residue name = probe type (ALL for the joint density); Residue number = cluster rank; occupancy = the cluster mean energy; beta factor = the cluster minimal energy.

# @markdown **3. centers_sum_ene.pdb** Clusters sorted by their total free energy (from the lowest to the highest). Each cluster is represented by its geometric center (the mean position of its hotspots). Residue name = probe type (ALL for the joint density); Residue number = cluster rank; occupancy = the cluster mean energy; beta factor = the cluster total energy.

# @markdown **4. clusters_mean_ene.pdb** Clusters sorted by their mean free energy (from the lowest to the highest). Each cluster is represented by all belonging hotspots. Residue name = probe type (ALL for the joint density); Residue number = cluster rank; occupancy = hotspot energy; beta factor = the cluster mean energy.

# @markdown **5. clusters_min_ene.pdb**  Clusters sorted by their minimal free energy (from the lowest to the highest). Each cluster is represented by all belonging hotspots. Residue name = probe type (ALL for the joint density); Residue number = cluster rank; occupancy = hotspot energy; beta factor = the cluster minimal energy.

# @markdown **6. clusters_sum_ene.pdb** Clusters sorted by their total free energy (from the lowest to the highest). Each cluster is represented by all belonging hotspots. Residue name = probe type (ALL for the joint density); Residue number = cluster rank; occupancy = hotspot energy; beta factor = the cluster total energy.

# @markdown File **aa_aligned.pdb** is an all-atom model aligned to the coarse-grained one (it is stored in **_cg.pdb**), which can be used as a reference structure for visualization and interpretation of the results.

# Rebuild the results/ folder from scratch, copy every PDB file of the working
# directory into it, zip it and hand the archive to the browser.
# `files` is google.colab.files, imported in the upload cell.
![ -d "/content/simulation/results" ] && rm -fr results
!mkdir results
!cp *pdb results
!zip -r results.zip results
files.download("results.zip")

In [None]:
#@title **Show clusters**
# @markdown Choose the probe type from the dropdown list and select the number of top-ranked clusters to show.

probe_type   = "ALL"       #@param ['ALL', 'PHE', 'ACT', 'IPA', 'DMA', 'IPL', 'ACN']
nclusters    = 5           #@param {type:"integer"}

import py3Dmol

view = py3Dmol.view(width=400, height=300)
# Model 0: aligned all-atom protein; model 1 (addressed as -1, i.e. the last one):
# the clusters ranked by minimal energy. Files are read through context managers
# so that the handles are closed right away.
with open('aa_aligned.pdb', 'r') as f_in:
    view.addModel(f_in.read(),'pdb')
with open('clusters_min_ene.pdb', 'r') as f_in:
    view.addModel(f_in.read(),'pdb')
view.setStyle({"cartoon": {'color': 'spectrum'}})

# Residue number = cluster rank, so ranks 1..nclusters of the chosen probe are
# drawn as spheres coloured by rank and labelled with the rank.
for i in range(1, nclusters + 1):
    sel = {'resn': probe_type, 'resi':i, 'model':-1}
    view.setStyle(sel,{'sphere':{'colorscheme':{'prop':'resi','gradient':'roygb','min':1,'max':10}}})
    view.addLabel(str(i), {'fontColor':'black', 'backgroundColor':'lightgray'},
                {'model':-1, 'resn': probe_type,  'resi': i})

view.zoomTo()
view.setBackgroundColor('white')
view.show()

In [None]:
#@title **Run to download densities**
# @markdown You can visualize densities using VMD or Pymol.
# @markdown Alternatively, you can try the basic visualising snippet below.
# Rebuild the densities/ folder from scratch, copy all .dx grids plus the aligned
# all-atom model into it, zip it and hand the archive to the browser.
# `files` is google.colab.files, imported in the upload cell.
![ -d "/content/simulation/densities" ] && rm -fr densities
!mkdir densities
!cp *dx densities
!cp aa_aligned.pdb densities
!zip -r densities.zip densities
files.download("densities.zip")

In [None]:
#@title **Run to download trajectory**
# @markdown This archive contains the processed trajectory with the protein and probes alone (i.e., no water).
# @markdown In order to download the raw files:
# @markdown 1. Click the Files menu in the right panel;
# @markdown 2. Navigate to /content/simulation/;
# @markdown 3. Download prod.xtc and _cg_sol.gro.
# Rebuild the trajectory/ folder from scratch, copy the processed structure and
# trajectory into it, zip it and hand the archive to the browser.
# `files` is google.colab.files, imported in the upload cell.
![ -d "/content/simulation/trajectory" ] && rm -fr trajectory
!mkdir trajectory
!cp prod_wraped.pdb prod_wraped.xtc trajectory
!zip -r trajectory.zip trajectory
files.download("trajectory.zip")