<a href="https://colab.research.google.com/github/sachaRfd/Drug-Likeness-Prediction/blob/main/Druglikeness_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Machine learning Predictions for drug likeness score:

Database: 
- Data includes SMILES - logP (water-octanal partition coefficient) - Drug-Likeness
- from https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv

Classic Imports: 

In [1]:
import torch
import torch.nn as nn
import string
import random
import sys
import os
import pandas as pd
import numpy as np

!pip install unidecode
import unidecode
from torch.utils.tensorboard import SummaryWriter

!pip install rdkit -q

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from google.colab import drive
drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


Seed-setting function:

In [2]:

def set_seed(seed):
    """
    Use this to set ALL the random seeds to a fixed value and take out any randomness from cuda kernels
    """

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

Data was downloaded from: 


In [9]:
data = pd.read_csv('250k_rndm_zinc_drugs.csv')
data

Unnamed: 0,smiles,logP,qed,SAS
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1\n,5.05060,0.702012,2.084095
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1\n,3.11370,0.928975,3.432004
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682,2.470633
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944,2.822753
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027,4.035182
...,...,...,...,...
249450,CC1(C)CC[C@H](CNC(=O)Cn2ncc3ccccc3c2=O)c2ccccc...,3.36790,0.745901,2.900726
249451,Cn1ccnc1C(=O)c1ccc(NC(=O)C2CCN(C(=O)C(C)(C)C)C...,2.87430,0.799426,2.326627
249452,Cc1ccc(NC(=O)C(=O)N(C)Cc2ccccc2)c(C)c1\n,2.90054,0.878086,1.840642
249453,Cc1cc(C(=O)Nc2ccc(OCC(N)=O)cc2)c(C)n1C1CC1\n,2.55624,0.852917,2.024638


Let's check for duplicates: 

In [22]:
data.duplicated().sum()

0

Let's check for missing values: 


In [25]:
data.isna().sum()

smiles    0
logP      0
qed       0
SAS       0
dtype: int64

In [27]:
data.isnull().sum()

smiles    0
logP      0
qed       0
SAS       0
dtype: int64

Now let's add Molecular weights: 

In [114]:
from rdkit.Chem import Descriptors

# Calculated MW - total molecular weight
# Descriptors.MolWt(x)
test = data.smiles.iloc[0]
mol = Chem.MolFromSmiles(test)

Descriptors.MolWt(mol)

325.38300000000004

Number of atoms: 

In [115]:
mol.GetNumAtoms()

24

Now let's add Atomic Composition: 

In [117]:
fp = Chem.RDKFingerprint(mol)

# Convert the fingerprint to a bit string for visualisation: 
bit_string = fp.ToBitString()

# Print the bit string
print(bit_string)

0101010010000000111000001100110111101100100001000010000111110100100001001000110001110100000000001011100101000100100000011010100000001000100100101100001000000000011000100000010101011101100010001010011100000100100000000001100110111100100100000000010000101101001001111000011000101010111100000000001000000111001010000001001110110000001110100011000010100011000001000110001110000000011001000000000101000000010100011001000100000011000010101010101101101000010011000000100011100111000001010000000000100000001001110011111001000001110010000100101110001010111000101101100010001000110101000001000000000000001000001001101100010000001100110001000001001011010000011001101111100110010100111000001001000000000001010001011010000100011111010001000100001100001100001000001011001000000100010010000100010101111000001000001101100100000010010111000011100110100000000110000010000011111000000000100101111001000100101000010010100001011000010011001101000000001001001010100100001011110001001000011010001000010000000110000101010001

Let's try and add the fingerprint to the dataframe by iterating all the strings: 

In [128]:
SMILES = data.smiles.values
fingerprints = [Chem.RDKFingerprint(Chem.MolFromSmiles(i)) for i in SMILES]
data['Finger_print'] = fingerprints
data

In [153]:
bits = [float(i.ToBitString()) for i in data.Finger_print]
data['BITS'] = bits

In [154]:
data

Unnamed: 0,smiles,logP,qed,SAS,Finger_print,BITS
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1\n,5.05060,0.702012,2.084095,"[0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",inf
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1\n,3.11370,0.928975,3.432004,"[1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, ...",inf
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682,2.470633,"[0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, ...",inf
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944,2.822753,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, ...",inf
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027,4.035182,"[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...",inf
...,...,...,...,...,...,...
249450,CC1(C)CC[C@H](CNC(=O)Cn2ncc3ccccc3c2=O)c2ccccc...,3.36790,0.745901,2.900726,"[1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, ...",inf
249451,Cn1ccnc1C(=O)c1ccc(NC(=O)C2CCN(C(=O)C(C)(C)C)C...,2.87430,0.799426,2.326627,"[1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, ...",inf
249452,Cc1ccc(NC(=O)C(=O)N(C)Cc2ccccc2)c(C)c1\n,2.90054,0.878086,1.840642,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",inf
249453,Cc1cc(C(=O)Nc2ccc(OCC(N)=O)cc2)c(C)n1C1CC1\n,2.55624,0.852917,2.024638,"[1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, ...",inf


Great now that we are sure there are no duplicates and missing values, let's start doing some data-analysis and ML.


# Druglikeness Prediction: 

## In this part of the notebook:  Let's predict the druglikeness of chemical compounds. 

Our x variable will be the SMILES string and Y will be the SAS score: 
  

In [144]:
x = pd.DataFrame(data.BITS)
y = pd.DataFrame(data.SAS)

Let's split the data into train-test set with 80-20 split as data is quite large: 

In [145]:
from sklearn.model_selection import train_test_split

x_train, x_test, Y_train, Y_test = train_test_split(x, y, test_size=0.1, shuffle = True, random_state=42)

Let's now split the train set into train and validation sets: 

In [146]:
x_train, x_val, Y_train, Y_val = train_test_split(x_train, Y_train, test_size=0.2, shuffle = True, random_state = 42)

In [147]:
x_train.iloc[0].values[0]

'110001000000010010011010000111001110010000110100011100110011000111011100001011000011010010101100100001111100010110010010011100000000000000110110000111111101000011010010101101100101110001111000011001000010110011100100000101010001111011010100011010000111111011110001010010000001100100010100000000111010001111001011110000010000000101110010100110000000101010001000010000000011100011000101001100100011010010001000011010000001100100000100100011001110111011001001000000111110010000110011011010100011111000011000001110111010000000100111110001100001000100101010100110001010000011000100000100001101000101000100000110010110110010000000001111001100011111001001110101001000010010111110001110100101100110100111010110100001010110010011001100010010010101010111000000101011001011011011101010110010011010100010100010100111010110111000000100111010000010000110111000110011010010101000111100110001110101001101101001100010100101111111100100100011110100110000101100000100000001110000111010100010000101010011011100110000010

In [148]:
print(f'The following molecule has a Druglikeness score of {Y_train.iloc[0].values[0]}')
Chem.MolFromSmiles(x_train.iloc[0].values[0])

The following molecule has a Druglikeness score of 2.865524868075328


In [150]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

LR_model = LinearRegression()

cv = cross_validate(LR_model , x_train, Y_train, cv = 2, n_jobs=-1)


2 fits failed out of a total of 2.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py", line 662, in fit
    X, y = self._validate_data(
  File "/usr/local/lib/python3.8/dist-packages/sklearn/base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py", line 964, in check_X_y
    X = check_array(
  File "/usr/local/lib/python3.8/dist-