<a href="https://colab.research.google.com/github/spatiallysaying/SMAI/blob/master/COVID_19_RF_500.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO


logger = getLogger(__name__)
logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False):
    """install rdkit from miniconda
    ```
    import rdkit_installer
    rdkit_installer.install()
    ```
    """

    python_path = os.path.join(
        conda_path,
        "lib",
        "python{0}.{1}".format(*sys.version_info),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add {} to PYTHONPATH".format(python_path))
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    python_version = "{0}.{1}.{2}".format(*sys.version_info)

    logger.info("python version: {}".format(python_version))

    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove {}".format(conda_path))
        os.remove(conda_path)

    logger.info('fetching installer from {}'.format(url))
    res = requests.get(url, stream=True)
    res.raise_for_status()
    with open(file_name, 'wb') as f:
        for chunk in res.iter_content(chunk_size):
            f.write(chunk)
    logger.info('done')

    logger.info('installing miniconda to {}'.format(conda_path))
    subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
    logger.info('done')

    logger.info("installing rdkit")
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        "python=={}".format(python_version),
        "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)])
    logger.info("done")

    import rdkit
    logger.info("rdkit-{} installation finished!".format(rdkit.__version__))


if __name__ == "__main__":
    install()


add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH
python version: 3.6.9
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
installing miniconda to /root/miniconda
done
installing rdkit
done
rdkit-2020.03.1 installation finished!


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
import warnings
warnings.filterwarnings('ignore')
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression,RidgeCV,Ridge,LassoCV,ElasticNet

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
os.chdir('/content/drive/My Drive/drug')

In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
def evaluation(model, X_test, y_test):
    prediction = model.predict(X_test)
    mae = mean_absolute_error(y_test, prediction)
    mse = np.sqrt(mean_squared_error(y_test, prediction))
    
    plt.figure(figsize=(15, 10))
    plt.plot(prediction[:300], "red", label="prediction", linewidth=1.0)
    plt.plot(y_test[:300], 'green', label="actual", linewidth=1.0)
    plt.legend()
    plt.ylabel('Binding Affinity')
    plt.title("MAE {}, RMSE {}".format(round(mae, 4), round(mse, 4)))
    plt.show()
    
    print('MAE score:', round(mae, 4))
    print('RMSE score:', round(mse,4))

# **WORD2VEC**

In [0]:
!pip install git+https://github.com/samoturk/mol2vec;

In [0]:
mdf=pd.read_csv('train.csv')

In [0]:
mdf.head()

Unnamed: 0,SMILES sequence,Binding Affinity
0,CCNC(C)C(NC)c1ccccc1,-18.0861
1,CONC(=O)c1cncnc1,-17.5783
2,CCNC1CCCN(Cc2ccsc2)C1,-20.3645
3,CC(NC(=O)CSCCN)c1ccccc1,-19.3144
4,CCC(CS)CN(C)c1ccccc1,-15.8451


In [0]:
target = mdf['Binding Affinity']
mdf.drop(columns='Binding Affinity',inplace=True)

In [0]:
mdf['mol'] = mdf['SMILES sequence'] .apply(lambda x: Chem.MolFromSmiles(x))
mdf['fingerprint'] = mdf['mol'] .apply(lambda x: Chem.RDKFingerprint(x).ToBitString())

In [0]:
#Loading pre-trained model via word2vec
from gensim.models import word2vec
model = word2vec.Word2Vec.load('model_300dim.pkl')

In [0]:
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
print('Molecular sentence:', mol2alt_sentence(mdf['mol'][1], radius=1))
print('\nMolSentence object:', MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1)))
print('\nDfVec object:',DfVec(sentences2vec(MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1)), model, unseen='UNK')))

Molecular sentence: ['2246728737', '3975275337', '864674487', '903112553', '847961216', '2204949651', '2246699815', '1054767590', '864942730', '1510328189', '3217380708', '2994748777', '3218693969', '3777168895', '2041434490', '3118255683', '3218693969', '725322217', '2041434490', '3118255683', '3218693969', '3777168895']

MolSentence object: MolSentence with 22 words

DfVec object: (22, 100) dimensional vector


In [0]:
mdf.shape

(9000, 3)

In [0]:
#Constructing sentences
mdf['sentence'] = mdf.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)

#Extracting embeddings to a numpy.array
#Note that we always should mark unseen='UNK' in sentence2vec() so that model is taught how to handle unknown substructures
mdf['mol2vec'] = [DfVec(x) for x in sentences2vec(mdf['sentence'], model, unseen='UNK')]
X = np.array([x.vec for x in mdf['mol2vec']])
y = target.values

X.shape

(9000, 100)

In [0]:
mdf.columns

Index(['SMILES sequence', 'mol', 'fingerprint', 'sentence', 'mol2vec'], dtype='object')

In [0]:
mdf.head()

Unnamed: 0,SMILES sequence,mol,fingerprint,sentence,mol2vec
0,CCNC(C)C(NC)c1ccccc1,"<img data-content=""rdkit/molecule"" src=""data:i...",0100000010100000100000000001000000000000000000...,"(2246728737, 3542456614, 2245384272, 773607102...","(100,) dimensional vector"
1,CONC(=O)c1cncnc1,"<img data-content=""rdkit/molecule"" src=""data:i...",1100010000100000001000000000000000000000000000...,"(2246728737, 3975275337, 864674487, 903112553,...","(100,) dimensional vector"
2,CCNC1CCCN(Cc2ccsc2)C1,"<img data-content=""rdkit/molecule"" src=""data:i...",0100000000100000100000000001000001000001001100...,"(2246728737, 3542456614, 2245384272, 773607102...","(100,) dimensional vector"
3,CC(NC(=O)CSCCN)c1ccccc1,"<img data-content=""rdkit/molecule"" src=""data:i...",0000000000100000001000000000000000000001001000...,"(2246728737, 3537119515, 2245273601, 146777865...","(100,) dimensional vector"
4,CCC(CS)CN(C)c1ccccc1,"<img data-content=""rdkit/molecule"" src=""data:i...",1000010000100000000000010001000000001000000000...,"(2246728737, 3542456614, 2245384272, 150656359...","(100,) dimensional vector"


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)

# **RidgeCV**

In [0]:

alphas = [0.1, 1, 10, 100, 1e3, 1e4, 2e4, 5e4, 8e4, 1e5, 1e6, 1e7, 1e8]
ridge = RidgeCV(alphas=alphas)
ridge.fit(X_train, y_train)

print('Training Accuracy')
evaluation(ridge, X_train, y_train)
print('\nTesting Accuracy')
evaluation(ridge, X_test, y_test)

Training Accuracy
MAE score: 1.8002
RMSE score: 2.4499

Testing Accuracy
MAE score: 1.8686
RMSE score: 2.5971


# **XGBRegressor**

In [0]:
from xgboost import XGBRegressor
XGBModel = XGBRegressor(n_estimators = 500,reg='squarederror', oob_score = True, n_jobs = -1,random_state =50,verbose=1)
XGBModel.fit(X, y)

print('Training Accuracy')
evaluation(XGBModel, X, y)
print('\nTesting Accuracy')
evaluation(XGBModel, X, y)

Training Accuracy
MAE score: 1.344
RMSE score: 1.7968

Testing Accuracy
MAE score: 1.344
RMSE score: 1.7968


# **RandomForestRegressor**

In [0]:
from sklearn.ensemble import RandomForestRegressor
#rf = RandomForestRegressor()
rf=RandomForestRegressor(n_estimators = 500, oob_score = True, n_jobs = -1,random_state =50,verbose=1)
rf.fit(X, y)
'''
print('Training Accuracy')
evaluation(rf, X_train, y_train)
print('\nTesting Accuracy')
evaluation(rf, X_test, y_test)
'''
print('Accuracy')
evaluation(rf, X, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.9min finished


Accuracy


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.3s


MAE score: 0.6943
RMSE score: 0.947


[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.7s finished
