# Lab 3

## Setup for this lab

1. Download the data from the following link: https://smu.box.com/s/smqmwlef0yehpieicwxqdr99k7f9ru04
2. Extract the downloaded data into Lab3/data
3. Run the `query.py` script in the data folder
4. Install RDKit for Python: https://www.rdkit.org/docs/Install.html

NOTE: I found it significantly easier to install RDKit using the apt package on Ubuntu. However, this installs the Python 2 version of RDKit rather than the Python 3 version. If you are using Python 3, the only difference is how the data is read in with pickle.

In [16]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import MACCSkeys, AllChem
from rdkit.Chem import ChemicalFeatures
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import RDConfig
import os
import pickle
from random import shuffle

import keras
from keras.layers import Input, Dense
from keras.models import Model


In [2]:
# Load the pickled dataset. The context manager closes the file handle as
# soon as the data has been read instead of leaving it open.
# NOTE: unpickling can execute arbitrary code -- only load data files
# obtained from a trusted source.
with open('data/data.pkl', 'rb') as data_file:
    X = pickle.load(data_file)

In [3]:
# Turn the unpickled data into a DataFrame and keep only its first 200 rows.
df = pd.DataFrame.from_dict(X).iloc[:200]

In [4]:
# Parse ic50 as float once and reuse it for both the filter and the label.
ic50_values = df.ic50.astype(float)

# Remove values in the middle: keep only rows with ic50 > 10000 or
# ic50 < 300.
keep_mask = (ic50_values > 10000) | (ic50_values < 300)

# Take an explicit copy so the column assignment below writes to an
# independent frame rather than to a slice of the original one (this is
# what triggers pandas' SettingWithCopyWarning).
df = df[keep_mask].copy()

# Binarize ic50: True for the retained rows below 10000, False otherwise.
# `.values` sidesteps index alignment; the rows are already in df's order.
df['ic50'] = (ic50_values[keep_mask] < 10000).values

In [5]:
def get_morgan_fingerprints(smiles, radius=2, n_bits=512):
    """Compute a Morgan fingerprint bit vector for each SMILES string.

    Args:
        smiles: Iterable of SMILES strings.
        radius: Radius passed to ``GetMorganFingerprintAsBitVect``
            (default 2).
        n_bits: Value passed as ``nBits``, i.e. the length of each bit
            vector (default 512).

    Returns:
        A list with one entry per input, in input order: the fingerprint,
        or None where ``Chem.MolFromSmiles`` returned None for that string.
    """
    fps = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # Keep a placeholder so the output stays aligned with the input.
            fps.append(None)
        else:
            fps.append(
                AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            )
    return fps

def get_topological_fingerprints(smiles):
    """Compute a fingerprint with ``FingerprintMols.FingerprintMol`` per SMILES.

    Args:
        smiles: Iterable of SMILES strings.

    Returns:
        A list with one entry per input, in input order: the fingerprint,
        or None where ``Chem.MolFromSmiles`` returned None for that string.
    """
    fps = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # Keep a placeholder so the output stays aligned with the input.
            fps.append(None)
        else:
            fps.append(FingerprintMols.FingerprintMol(mol))
    return fps

In [6]:
# Build the feature factory. Nothing in this cell reads `factory`; it is
# kept at module level in case a later cell depends on it.
fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdefName)

df['morgan_fps'] = get_morgan_fingerprints(df.smiles)
df['topological_fps'] = get_topological_fingerprints(df.smiles)

# Drop rows whose SMILES string could not be parsed (fingerprint is None).
# `series != None` is evaluated elementwise and is True for every row, so
# it filters nothing; `.notna()` is the test that actually detects None.
df = df[df.morgan_fps.notna() & df.topological_fps.notna()]
df = df.drop(columns=['smiles'])

In [7]:
# Group the rows by target. Each target maps to a three-element list:
# [morgan fingerprint arrays, topological fingerprint arrays, ic50 labels].
uq_targets = set(df.target)
target_data = {name: [[], [], []] for name in uq_targets}

for index, row in df.iterrows():
    # Each fingerprint is converted into its own freshly allocated array,
    # which is passed to ConvertToNumpyArray as the destination.
    morgan = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(row.morgan_fps, morgan)
    topo = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(row.topological_fps, topo)

    morgan_list, topo_list, label_list = target_data[row.target]
    morgan_list.append(morgan)
    topo_list.append(topo)
    label_list.append(row.ic50)

# Replace the per-target Python lists with numpy arrays, in place.
for target, columns in target_data.items():
    morgan_list, topo_list, label_list = columns
    columns[:] = [
        np.array(morgan_list),
        np.array(topo_list),
        # Multiplying by 1 turns the labels into numbers (ic50 is boolean
        # after the binarization step).
        np.array(label_list) * 1,
    ]

The data is now stored in a dictionary called `target_data`, which contains one entry per target. Each entry is a list of three values: a NumPy array with the Morgan fingerprints, a NumPy array with the topological fingerprints, and a NumPy array with the binary binding affinity.

In [20]:
def build_models(input_shape, target_names):
    """Build one compiled binary classifier per target on a shared trunk.

    Every model starts from the same input tensor and passes through the
    same three hidden Dense layers (512, 1024 and 1024 units, ReLU), so all
    models reuse those layer instances. Each target then gets its own
    2048-unit ReLU layer followed by a single sigmoid output unit.

    Args:
        input_shape: Shape handed to the Keras ``Input`` layer.
        target_names: Iterable of keys; one model is built per key.

    Returns:
        Dict mapping each target name to its ``Model``, compiled with
        binary cross-entropy loss and the Adam optimizer.
    """
    model_input = Input(input_shape)

    # Hidden layers common to every target.
    trunk = model_input
    for units in (512, 1024, 1024):
        trunk = Dense(units, activation='relu')(trunk)

    models = {}
    for target_name in target_names:
        # Layers that belong to this target only.
        head = Dense(2048, activation='relu')(trunk)
        prediction = Dense(1, activation='sigmoid')(head)

        classifier = Model(model_input, prediction)
        classifier.compile(loss='binary_crossentropy', optimizer='adam')
        models[target_name] = classifier
    return models

def train_models(target_data, models, epochs, batch_size, data_type='morgan'):
    """Fit every per-target model once per epoch, in a fresh random order.

    Args:
        target_data: Dict mapping target -> sequence whose item 0 holds the
            Morgan features, item 1 the topological features and item 2
            the labels.
        models: Dict mapping target -> model object exposing ``fit``.
        epochs: Number of passes over all targets.
        batch_size: Batch size forwarded to each ``fit`` call.
        data_type: ``'morgan'`` selects item 0 of each entry; any other
            value selects item 1.
    """
    data_index = 0 if data_type == 'morgan' else 1
    for _ in range(epochs):
        # shuffle() needs a mutable sequence; on Python 3 a dict key view is
        # not one, so copy the keys into a list first.
        target_order = list(models)
        shuffle(target_order)
        for target in target_order:
            x_data = target_data[target][data_index]
            y_data = target_data[target][2]
            # One Keras epoch per outer epoch so the targets are interleaved.
            models[target].fit(
                x_data, y_data, batch_size=batch_size, epochs=1, verbose=0
            )

In [None]:
# To inspect one target's Morgan array shape:
#   target_data[next(iter(target_data))][0].shape

# One model per target, each taking a 512-element input vector.
models = build_models((512,), list(target_data))

# A single epoch over all targets with batch size 2, on the Morgan features.
train_models(target_data, models, epochs=1, batch_size=2, data_type='morgan')