In [1]:
"""
This module was taken from target-pred-py https://github.com/ikmckenz/target-pred-py/blob/master/src/data/chembl_etl.py

This module will download the SQLite version of the ChEMBL database if it
doesn't exist in data/, and use it to create the interim data sets.
"""

"\nThis module was taken from target-pred-py https://github.com/ikmckenz/target-pred-py/blob/master/src/data/chembl_etl.py\n\nThis module will download the SQLite version of the ChEMBL database if it\ndoesn't exist in data/, and use it to create the interim data sets.\n"

In [2]:
import csv
import os
import sqlite3
import tarfile
import urllib.request

In [3]:
OLD_QUERY = """
            SELECT 
                canonical_smiles, published_value, published_units, pref_name
            FROM 
                compound_structures as cs, activities as ac, assays as assays, target_dictionary as td
            WHERE 
                cs.molregno = ac.molregno 
            AND
                /* Endpoint is Ki*/
                ac.bao_endpoint = "BAO_0000192"
            AND
                ac.published_value IS NOT NULL
            AND
                ac.assay_id = assays.assay_id
            AND
                assays.tid = td.tid
            AND
                td.pref_name != "Unchecked"
            AND
                td.organism = "Homo sapiens"
            """

SWISS_QUERY = """
                SELECT
                    canonical_smiles, standard_value, standard_units, pref_name
                FROM
                    compound_structures as cs, target_dictionary as td, assays, activities, compound_properties
                WHERE
                    compound_properties.molregno = cs.molregno
                AND
                    cs.molregno = activities.molregno
                AND
                    activities.assay_id = assays.assay_id
                AND 
                    assays.tid = td.tid
                AND
                    assays.assay_organism = "Homo sapiens"
                AND
                    assays.assay_type = "B"
                AND
                    activities.type IN ("Ki","Kd","IC50","EC50")
                AND
                    activities.standard_units IN ("mM", "uM","nM")
                AND
                    activities.standard_relation in ("=", "<", "<=")
                AND
                    compound_properties.heavy_atoms < 80
                AND
                    td.target_type IN ("SINGLE PROTEIN", "PROTEIN COMPLEX")
              """

CYP_ASSAY_QUERY = """
                SELECT
                    canonical_smiles, standard_value, standard_units, pref_name
                FROM
                    compound_structures as cs, target_dictionary as td, assays, activities, compound_properties
                WHERE
                    compound_properties.molregno = cs.molregno
                AND
                    cs.molregno = activities.molregno
                AND
                    activities.assay_id = assays.assay_id
                AND 
                    assays.tid = td.tid
                AND
                    assays.assay_organism = "Homo sapiens"
                AND
                    assays.assay_type = "A"
                AND
                    activities.type IN ("Ki","Kd","IC50","EC50")
                AND
                    activities.standard_units IN ("mM", "uM","nM")
                AND
                    activities.standard_relation in ("=", "<", "<=")
                AND
                    compound_properties.heavy_atoms < 80
                AND
                    td.target_type IN ("SINGLE PROTEIN", "PROTEIN COMPLEX")
              """

In [4]:
class ChEMBL_SQLite:  # pylint: disable=invalid-name
    """ChEMBL data http://www.ebi.ac.uk/chembl version chembl_24_1.

    Args:
        path (string, optional): Where the data will be downloaded. Defaults to
            the `data` directory of code.
    """
    url = "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_25/"
    filename = "chembl_25_sqlite.tar.gz"
    dbpath = "chembl_25/chembl_25_sqlite/chembl_25.db"
    csvfilename = "../interim/smiles_to_activity.csv"

    def __init__(self, path="../data/external/"):
        self.path = path

    def get_raw_data(self):
        """Will create the raw data if it does not already exist."""
        if not os.path.isfile(self.path + self.csvfilename):
            self._write_raw_data(query=SWISS_QUERY)

    def db_connect(self):
        """Returns a connection to the ChEMBL database,
        will download if it does not exist.

        Returns:
            conn (sqlite3.Connection): The connection to the database
        """
        if not os.path.isfile(self.path + self.dbpath):
            self._download()

        conn = sqlite3.connect(self.path + self.dbpath)
        return conn

    def _write_raw_data(self, query):
        """This runs the query to get our data from the database
        For now this query returns good-enough data to do a quick analysis,
        this is not the final query.
        """

        conn = self.db_connect()
        print("Running SQL query")
        cur = conn.execute(query)
        headers = [x[0] for x in cur.description]
        data_table = cur.fetchall()

        with open(self.path + self.csvfilename, "w") as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(data_table)

    def _download(self):
        """Downloads the ChEMBL database if it doesn't exist"""
        if not os.path.isfile(self.path + self.dbpath):
            delete_tar = False

            if not os.path.isfile(self.path + self.filename):
                delete_tar = True
                print("Downloading ChEMBL database")
                urllib.request.urlretrieve(self.url + self.filename,
                                           self.path + self.filename)

            print("Extracting tarfile")
            tarfile.open(self.path + self.filename).extractall(path=self.path)
            if delete_tar:
                os.remove(self.path + self.filename)

In [5]:
cs = ChEMBL_SQLite()

In [6]:
conn = cs.db_connect()

In [7]:
# dbconn = cs.db_connect('data/external/chembl_25/chembl_25_sqlite/chembl_25.db')

TypeError: db_connect() takes 1 positional argument but 2 were given

In [8]:
c = conn.cursor()

In [9]:
query = c.execute(SWISS_QUERY)

In [10]:
print(c.fetchone())

('COc1cc(N)c(Cl)cc1C(=O)OCCN2CCN(CC2)c3ncccn3', 4.5, 'nM', 'Serotonin 4 (5-HT4) receptor')


In [11]:
import pandas as pd

In [12]:
SQL_Query = pd.read_sql_query(SWISS_QUERY, conn)

In [13]:
df = pd.DataFrame(SQL_Query)

In [16]:
df.head(30)

Unnamed: 0,canonical_smiles,standard_value,standard_units,pref_name
0,COc1cc(N)c(Cl)cc1C(=O)OCCN2CCN(CC2)c3ncccn3,4.5,nM,Serotonin 4 (5-HT4) receptor
1,COc1cc(N)c(Cl)cc1C(=O)OCCN2CCN(CC2)c3ncccn3,66.0,nM,Serotonin 4 (5-HT4) receptor
2,O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12,26000.0,nM,Neuronal acetylcholine receptor; alpha3/beta4
3,O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12,100000.0,nM,Acetylcholine receptor protein delta chain
4,CNCC\C=C\c1cccnc1,46000.0,nM,Neuronal acetylcholine receptor; alpha3/beta4
5,CN1CCC[C@H]1COc2ccc(Cl)nc2,1400.0,nM,Neuronal acetylcholine receptor; alpha4/beta2
6,CCc1cncc(OC[C@@H]2CCCN2C)c1,1000.0,nM,Neuronal acetylcholine receptor; alpha4/beta2
7,Clc1ccc(cn1)C2CC3CCC2N3,7.0,nM,Neuronal acetylcholine receptor; alpha3/beta4
8,Clc1ccc(cn1)C2CC3CCC2N3,200.0,nM,Acetylcholine receptor protein delta chain
9,C[C@H](CCC(=O)NCC(=O)O)[C@H]1CC[C@H]2[C@@H]3[C...,3000.0,nM,Bile acid receptor FXR


In [15]:
df.shape

(544894, 4)

In [None]:
# str_choice = "cyp450|CYP450|cyp"

In [19]:
cyp_df = df[df['pref_name'].str.contains('cytochrome', na=False)]
cyp_df.info

<bound method DataFrame.info of Empty DataFrame
Columns: [canonical_smiles, standard_value, standard_units, pref_name]
Index: []>

In [None]:
CYP_ASSAY_Query = pd.read_sql_query(CYP_ASSAY_QUERY, conn)

In [None]:
cyp_df = pd.DataFrame(CYP_ASSAY_QUERY)

In [None]:
# cypquery = df['pref_name']