In [31]:
import openbabel
import MDAnalysis as mda
import multiprocessing as mp
import nglview as nv
import pandas as pd
import pybel
import py3Dmol

from Bio.PDB    import PDBParser
from meeko      import MoleculePreparation, PDBQTWriterLegacy
from rdkit      import Chem
from rdkit.Chem import AllChem
from vina       import Vina
from pdbfixer   import PDBFixer
from openmm.app import PDBFile

In [2]:
mp.cpu_count()

20

# Chemdbl

In [17]:
%%time

chembl = pd.read_table('data/chembl/chembl_33_chemreps.txt', sep='\t')

CPU times: user 8.98 s, sys: 856 ms, total: 9.84 s
Wall time: 9.88 s


In [18]:
chembl

Unnamed: 0,chembl_id,canonical_smiles,standard_inchi,standard_inchi_key
0,CHEMBL153534,Cc1cc(-c2csc(N=C(N)N)n2)cn1C,InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10...,MFRNFCWYPYSFQQ-UHFFFAOYSA-N
1,CHEMBL440060,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H...,InChI=1S/C123H212N44O34S/c1-19-63(12)96(164-11...,RSEQNZQKBMRQNM-VRGFNVLHSA-N
2,CHEMBL440245,CCCC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(...,InChI=1S/C160H268N50O41/c1-23-27-41-95-134(228...,FTKBTEIKPOYCEX-OZSLQWTKSA-N
3,CHEMBL440249,CC(C)C[C@@H]1NC(=O)CNC(=O)[C@H](c2ccc(O)cc2)NC...,InChI=1S/C124H154ClN21O39/c1-57(2)48-81-112(17...,UYSXXKGACMHPIM-KFGDMSGDSA-N
4,CHEMBL405398,Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1,InChI=1S/C19H21BrN6O/c20-15-2-1-3-17(18(15)22-...,VDSXZXJEWIWBCG-UHFFFAOYSA-N
...,...,...,...,...
2372669,CHEMBL4298696,CCCCCCCCCCCCCCCCCCPCCCCCCCCCCCCCC.F[PH](F)(F)(...,InChI=1S/C32H67P.F6HP/c1-3-5-7-9-11-13-15-17-1...,ZAKUDCIPPLAGQL-UHFFFAOYSA-N
2372670,CHEMBL4298698,C[n+]1cn([C@@H]2O[C@H](CO[P@@](=O)(S)OP(=O)([O...,InChI=1S/C11H18N5O13P3S/c1-15-3-16(8-5(15)9(19...,OTIKKVINVWNBOQ-LDJOHHLFSA-N
2372671,CHEMBL4298702,c1ccc(C2CC(C3CC(c4ccccc4)OC(c4ccccc4)C3)CC(c3c...,InChI=1S/C34H34O2/c1-5-13-25(14-6-1)31-21-29(2...,NZIGZXNUFVMHNV-UHFFFAOYSA-N
2372672,CHEMBL4298703,CSCC[C@H](NC=O)C(=O)N[C@@H](CCCNC(=N)NS(=O)(=O...,InChI=1S/C78H107N18O21PS2/c1-43-44(2)65(45(3)5...,IIHLOGWTFCCTPB-WTIPWMETSA-N


In [None]:
smiles = chembl.canonical_smiles.tolist()

# Processing

In [None]:
def smiles_to_rdkit_molecule(smiles_string):
    # Convert the SMILES string to an RDKit molecule object
    smiles_molecule = Chem.MolFromSmiles(smiles_string)

    # Check if the molecule was created successfully
    if smiles_molecule is None:
        print("Error: Invalid SMILES string or failed to create molecule.")
        return None, False

    try:
        # Sanitize the molecule to check for valence problems
        Chem.SanitizeMol(smiles_molecule)

        # Add hydrogen atoms to the molecule
        molecule_with_hydrogens = Chem.AddHs(smiles_molecule)

        # Generate 3D coordinates and check if embedding was successful
        if AllChem.EmbedMolecule(molecule_with_hydrogens, AllChem.ETKDG()) >= 0:
            # Optionally, optimize the geometry
            AllChem.UFFOptimizeMolecule(molecule_with_hydrogens)
            return molecule_with_hydrogens, True
        else:
            print("3D embedding was not successful.")
            return molecule_with_hydrogens, False

    except ValueError as e:
        print(f"Valence Error: {e}")
        return None, False

In [None]:
def parallel_smiles_to_rdkit_molecule(list_smiles_string, num_cores=mp.cpu_count()):
    
    with mp.Pool(num_cores) as pool:
        rdkit_objects, success  = pool.map(smiles_to_rdkit_molecule, list_smiles_string)
        
    return rdkit_objects, success

# Get molfile and save it to .pdb

In [None]:
%%time

molfile, success = smiles_to_rdkit_molecule(smiles[3])

In [None]:
molfile

In [None]:
Chem.MolToPDBFile(molfile, "data/molecule.pdb")

In [32]:
angptl7 = Chem.MolFromPDBFile('data/angptl7.pdb', removeHs=False)

In [33]:
molfile = Chem.MolFromPDBFile('data/molecule.pdb', removeHs=False)

In [35]:
def visualize_molecule(molecule, width=400, height=300):
    """
    Visualize an RDKit molecule using Py3Dmol.
    :param molecule: RDKit molecule object
    """
    mb     = Chem.MolToMolBlock(molecule)
    viewer = py3Dmol.view(width=width, height=height)
    viewer.addModel(mb, 'mol')
    viewer.setStyle({'stick': {}})
    viewer.zoomTo()
    return viewer.show()

In [36]:
visualize_molecule(molfile, 800, 600)

In [37]:
visualize_molecule(angptl7, 800, 600)

In [None]:
Chem.MolToPDBFile(molfile, "data/molecule_new.pdb")

# Fixing the protein

In [None]:
ph = 7.4  

In [None]:
pdb_filename = 'data/angptl7.pdb'  
fixer        = PDBFixer(filename=pdb_filename)

fixer.findMissingResidues()
fixer.findMissingAtoms()
fixer.addMissingAtoms()
fixer.addMissingHydrogens(ph)

PDBFile.writeFile(fixer.topology, fixer.positions, open('data/angptl7_fixed.pdbqt'  , 'w'))

# Fixing the molecule 

In [3]:
def fix_valence_issues(mol):
    """
    Attempt to fix valence issues in a molecule.
    :param mol: RDKit molecule object
    :return: Fixed RDKit molecule object, or None if unable to fix
    """
    try:
        # Sanitize the molecule (checks valences, aromaticity, etc.)
        Chem.SanitizeMol(mol)

        # Add hydrogens
        mol = Chem.AddHs(mol)

        # Attempt to perform a 3D embedding, increasing the number of attempts
        num_attempts = 10
        for attempt in range(num_attempts):
            if AllChem.EmbedMolecule(mol, AllChem.ETKDGv3()) >= 0:
                AllChem.MMFFOptimizeMolecule(mol)
                print("3D embedding successful.")
                return mol
            else:
                print(f"Embedding attempt {attempt + 1} failed.")

        print("Warning: All embedding attempts failed, optimization skipped")
        return None
    except Exception as e:
        print(f"Error fixing molecule: {e}")
        return None

In [4]:
pdb_file = 'data/molecule_new.pdb'  
mol = Chem.MolFromPDBFile(pdb_file, removeHs=False)

if mol:
    fixed_mol = fix_valence_issues(mol)
    if fixed_mol:
        print("Molecule fixed successfully.")
    else:
        print("Unable to fix the molecule.")
else:
    print("Error reading the molecule.")

3D embedding successful.
Molecule fixed successfully.


In [None]:
Chem.MolToPDBFile(fixed_mol, "data/molecule_fixed.pdb")

In [48]:
smiles_string = "Cc1cc(-c2csc(N=C(N)N)n2)cn1C"  # Replace with your SMILES string

# Define the file path
file_path = "data/molecule.smi"

# Write the SMILES string to the file
with open(file_path, "w") as file:
    file.write(smiles_string + "\n")

# Preparation

In [5]:
preparator = MoleculePreparation()
mol_setups = preparator.prepare(fixed_mol)
for setup in mol_setups:
    pdbqt_string, is_ok, error_msg = PDBQTWriterLegacy.write_string(setup)
    if is_ok:
        print(pdbqt_string, end="")

REMARK SMILES CC(C)C[C@@H]1NC(=O)CNC(=O)[C@H](c2ccc(O)cc2)NC(=O)[C@@H]([C@@H](C)O)NC(=O)[C@H](c2ccc(O[C@H]3O[C@H](CO)[C@@H](O)[C@H](O)[C@@H]3O[C@H]3O[C@H](CO)[C@@H](O)[C@H](O)[C@@H]3O)cc2)NC(=O)[C@@H](CCCN)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H]([C@@H](C)O)NC(=O)[C@@H](c2ccc(O)cc2)NC(=O)[C@H](c2ccc(O)cc2)NC(=O)[C@@H](C(C)C)NC(=O)[C@@H](CCCN)NC(=O)[C@@H](c2ccc(O)cc2)NC(=O)[C@@H](CNC(=O)[C@H](CC(N)=O)NC(=O)Cc2cccc3ccccc23)[C@@H](C(N)=O)OC(=O)[C@H](c2ccc(O)c(Cl)c2)NC(=O)[C@@H](C)NC1=O
REMARK SMILES IDX 5 1 6 2 7 3 8 4 9 5 10 6 11 7 12 8 13 9 21 10 22 11 23 12
REMARK SMILES IDX 24 13 28 14 29 15 30 16 31 17 61 18 62 19 63 20 64 21 69 22
REMARK SMILES IDX 70 23 71 24 72 25 80 26 81 27 82 28 83 29 87 30 88 31 89 32
REMARK SMILES IDX 90 33 98 34 99 35 100 36 101 37 109 38 110 39 111 40 112 41
REMARK SMILES IDX 116 42 117 43 118 44 119 45 124 46 125 47 126 48 127 49
REMARK SMILES IDX 135 50 136 51 137 52 138 53 162 54 166 55 167 56 168 57
REMARK SMILES IDX 169 58 178 59 179 60 180 61 181 62 182 63 1

# Calculate geometric center

In [38]:
def calculate_geometric_center(pdb_filename):
    parser = PDBParser()
    structure = parser.get_structure('structure', pdb_filename)
    atom_coords = [atom.get_coord() for atom in structure.get_atoms()]
    x_coords = [coord[0] for coord in atom_coords]
    y_coords = [coord[1] for coord in atom_coords]
    z_coords = [coord[2] for coord in atom_coords]
    center_x = sum(x_coords) / len(x_coords)
    center_y = sum(y_coords) / len(y_coords)
    center_z = sum(z_coords) / len(z_coords)
    return [center_x, center_y, center_z]

In [39]:
pdb_filename = 'data/angptl7_fixed.pdbqt'
center = calculate_geometric_center(pdb_filename)
print('Geometric center:', center)

Geometric center: [10.273092066516384, 2.421546035539128, -3.6259281597777466]




# Box size

In [40]:
def calculate_optimized_box_size(file_path, buffer=10):
    """
    Calculate an optimized box size for docking based on a PDBQT file.
    :param file_path: Path to the PDBQT file
    :param buffer: Extra space around the molecule (in Angstroms)
    :return: List of box dimensions [x, y, z]
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Filter out lines that do not start with ATOM or HETATM
    atom_lines = [line for line in lines if line.startswith("ATOM") or line.startswith("HETATM")]

    # Parse coordinates from these lines
    coordinates = []
    for line in atom_lines:
        parts = line.split()
        if len(parts) >= 8:  # Check if there are enough parts to include coordinates
            try:
                # The coordinates are typically in columns 6, 7, and 8 in PDB format
                x, y, z = float(parts[5]), float(parts[6]), float(parts[7])
                coordinates.append((x, y, z))
            except ValueError:
                # Handle case where conversion to float fails
                continue

    if not coordinates:
        raise ValueError("No valid atom coordinates found in file")

    # Calculate the axis-aligned bounding box
    xs, ys, zs = zip(*coordinates)
    box_size = [max(xs) - min(xs) + buffer, max(ys) - min(ys) + buffer, max(zs) - min(zs) + buffer]
    return box_size

In [44]:
file_path = 'data/molecule_fixed.pdbqt' 
try:
    box_size = calculate_optimized_box_size(file_path)
    print("Optimized Box Size:", box_size)
except ValueError as e:
    print(e)

Optimized Box Size: [10.0, 47.561, 36.918]


# Docking

In [11]:
num_cores = 14

In [19]:
v = Vina(sf_name='vina', cpu=num_cores)

v.set_receptor('data/angptl7_fixed.pdbqt')
v.set_ligand_from_string(pdbqt_string)

In [14]:
protein_path = 'data/angptl7_fixed.pdbqt'
ligand_path  = 'data/molecule_fixed.pdbqt'

In [15]:
%%time

v.compute_vina_maps(
    center   = calculate_geometric_center(protein_path), 
    box_size = calculate_optimized_box_size(ligand_path))



Computing Vina grid ... done.
CPU times: user 1.14 s, sys: 75.8 ms, total: 1.21 s
Wall time: 1.21 s


In [46]:
%%time

v.compute_vina_maps(
    center   = [-13.350000381469727, 10.0, 21.81999969482422],
    box_size = [50,68,47])

CPU times: user 3.13 s, sys: 157 ms, total: 3.28 sComputing Vina grid ... 
Wall time: 3.24 s




done.


In [47]:
%%time

# Score the current pose
energy = v.score()
print('Score before minimization: %.3f (kcal/mol)' % energy[0])

RuntimeError: 

Vina runtime error: The ligand is outside the grid box. Increase the size of the grid box or center it accordingly around the ligand.


In [14]:
%%time

# Minimize locally the current pose
energy_minimized = v.optimize()
print('Score after minimization : %.3f (kcal/mol)' % energy_minimized[0])
v.write_pose('data/1iep_ligand_minimized.pdbqt', overwrite=True)

Performing local search ... Score after minimization : -1.668 (kcal/mol)
done.
CPU times: user 42.2 ms, sys: 2.01 ms, total: 44.2 ms
Wall time: 44.3 ms


In [None]:
%%time

# Dock the ligand
v.dock(exhaustiveness=8, n_poses=5)
v.write_poses('data/1iep_ligand_vina_out.pdbqt', n_poses=5, overwrite=True)

# Parse output

In [None]:
def parse_vina_output(vina_output_file):

    with open(vina_output_file, 'r') as file:
        lines = file.readlines()
    poses = []
    current_pose = []
    binding_affinity = None
    for line in lines:
        if line.startswith('REMARK VINA RESULT:'):
            # Extract binding affinity
            parts = line.split()
            binding_affinity = float(parts[3])  # Binding affinity is the 4th item
        elif line.startswith('MODEL'):
            current_pose = []
        elif line.startswith('ENDMDL'):
            poses.append((binding_affinity, current_pose))
        else:
            current_pose.append(line)
    return poses

In [None]:
vina_output_file = '1iep_ligand_vina_out.pdbqt'
poses = parse_vina_output(vina_output_file)

for i, (binding_affinity, pose) in enumerate(poses):
    print(f"Pose {i + 1}: Binding Affinity = {binding_affinity} kcal/mol, Number of Atoms = {len(pose)}")

# Visualize output

In [None]:
def pdbqt_to_pdb(pdbqt_file, pdb_file):
    u = mda.Universe(pdbqt_file)
    u.atoms.write(pdb_file)

In [None]:
pdbqt_to_pdb('data/1iep_ligand_vina_out.pdbqt', 'data/1iep_ligand_vina_out.pdb')

In [None]:
pdbqt_to_pdb('data/angptl7_fixed.pdbqt', 'data/angptl7_fixed.pdb')

In [None]:
pdb_filename = 'data/molecule.pdb'  
fixer        = PDBFixer(filename=pdb_filename)

fixer.findMissingResidues()
fixer.findMissingAtoms()
fixer.addMissingAtoms()
fixer.addMissingHydrogens(ph)

PDBFile.writeFile(fixer.topology, fixer.positions, open('data/molecule_fixed.pdb'  , 'w'))

In [None]:
pdbqt_to_pdb('data/molecule_fixed.pdbqt', 'data/moleculed_fixed.pdb')

In [None]:
import py3Dmol


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import py3Dmol

def read_molecule_from_pdb(file_path):
    mol = Chem.MolFromPDBFile(file_path, removeHs=False)
    return mol

# Function to visualize the molecules
def visualize_docking_results(mol1, mol2):
    # Convert RDKit molecules to 3Dmol.js viewable format
    mb1 = Chem.MolToMolBlock(mol1)
    mb2 = Chem.MolToMolBlock(mol2)

    # Create a viewer object
    viewer = py3Dmol.view(width=800, height=400)

    # Add the first molecule
    viewer.addModel(mb1, "mol")
    viewer.setStyle({'stick': {}})

    # Add the second molecule with a different color
    viewer.addModel(mb2, "mol")
    viewer.setStyle({'model': 1}, {'stick': {'colorscheme': 'greenCarbon'}})

    # Update the view
    viewer.zoomTo()
    return viewer.show()

In [None]:
# File paths to the PDB files
pdb_file1 = 'data/1iep_ligand_vina_out.pdb'  
pdb_file2 = 'data/molecule_fixed.pdb' 

In [None]:
mol1 = read_molecule_from_pdb(pdb_file1)
print('Read molecule 1')

In [None]:
type(mol1)

In [None]:
mol2 = read_molecule_from_pdb(pdb_file2)
print('Read molecule 2')

In [None]:
# Visualize if both molecules are read successfully
if mol1 and mol2:
    visualize_docking_results(mol1, mol2)
else:
    print("Error: Unable to read one or both molecules.")