### Joshua Silva

In [2]:
from chembl_webresource_client.new_client import new_client
import csv
import sqlite3

## Getting data through the API, determining potential features, and saving the initial data to a SQLite DB

In [3]:
# Query the ChEMBL API for a single structure, matched by its SMILES string.
# (The original plan was to take SMILES from the ZINC DB, but a simpler solution was found.)
query_smiles = 'C[N+](C)(C)CCOC(N)=O'
r = new_client.molecule.filter(
    molecule_structures__canonical_smiles__flexmatch=query_smiles)

In [4]:
# Inspect the first hit of the SMILES query. Fields that matter for this project:
#   - molecule_chembl_id
#   - molecule_properties
#   - indication_class (the website shows more indications than this call returns)
#   - chirality
# The other fields will not be predictive for the models being used.
r[0]

{'atc_classifications': [],
 'availability_type': '1',
 'biotherapeutic': None,
 'chebi_par_id': None,
 'chirality': '2',
 'cross_references': [{'xref_id': 'carbachol',
   'xref_name': 'carbachol',
   'xref_src': 'DailyMed'},
  {'xref_id': '11110943', 'xref_name': 'SID: 11110943', 'xref_src': 'PubChem'},
  {'xref_id': '11110944', 'xref_name': 'SID: 11110944', 'xref_src': 'PubChem'},
  {'xref_id': '26756576', 'xref_name': 'SID: 26756576', 'xref_src': 'PubChem'},
  {'xref_id': '90341139',
   'xref_name': 'SID: 90341139',
   'xref_src': 'PubChem'}],
 'dosed_ingredient': False,
 'first_approval': 1972,
 'first_in_class': '0',
 'helm_notation': None,
 'indication_class': 'Cholinergic (ophthalmic)',
 'inorganic_flag': '0',
 'max_phase': 4,
 'molecule_chembl_id': 'CHEMBL965',
 'molecule_hierarchy': {'molecule_chembl_id': 'CHEMBL965',
  'parent_chembl_id': 'CHEMBL965'},
 'molecule_properties': {'acd_logd': '-4.1',
  'acd_logp': '-4.1',
  'acd_most_apka': '12.54',
  'acd_most_bpka': None,
  'al

In [71]:
# Drug ids can be pulled by indication; this may be the only way to link all indications together.
# Note: the indication_class returned by the molecule search is not the same as the indication here.
drug_indication = new_client.drug_indication
lung_cancer_ind = drug_indication.filter(efo_term__icontains="LUNG CARCINOMA")
# Only the molecule id is needed from each indication record
lung_cancer_ind = lung_cancer_ind.only(['molecule_chembl_id'])

In [2]:
# For the drug ids found through the indication search, fetch the fields of interest,
# keeping only approved drugs (max_phase=4).
molecules = new_client.molecule
lung_cancer_ids = [hit['molecule_chembl_id'] for hit in lung_cancer_ind]
lung_cancer_mols = molecules.filter(molecule_chembl_id__in=lung_cancer_ids, max_phase=4).only(
    ['molecule_chembl_id', 'molecule_properties', 'indication_class', 'chirality', 'molecule_structures'])

NameError: name 'lung_cancer_ind' is not defined

In [3]:
# All approved drugs (max_phase=4), restricted to the fields this project cares about;
# this may be sufficient on its own.
# A significant portion of the data had no SMILES string, so this was not ultimately used.
approved_mols = molecules.filter(max_phase=4)
approved_mols = approved_mols.only(
    ['molecule_chembl_id', 'molecule_properties', 'indication_class', 'chirality',
     'molecule_structures'])

In [4]:
# Collect the indication_class of every approved molecule; these are the values to predict
indications = list(mol['indication_class'] for mol in approved_mols)

In [102]:
# Dump the approved drugs to disk, one record per line, as a fallback in case of API issues
with open('approved.txt', 'w') as out_file:
    out_file.writelines("%s\n" % mol for mol in approved_mols)

In [99]:
# Write every distinct indication class to indications.csv, one per row
# (originally reported as 585 values; that count may have included the missing-value entry).
#
# csv.writer is used so that values which themselves contain commas, such as
# 'Adrenergic (ophthalmic),Antiglaucoma Agent', are quoted and stay in a single field.
# Molecules without an indication_class (None) are skipped instead of being written as the
# text "None", and the values are sorted so the file is identical from run to run.
unique_indications = sorted({ind for ind in indications if ind is not None})
# newline='' is what the csv module expects; lineterminator keeps plain '\n' line endings
with open('indications.csv', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerows([ind] for ind in unique_indications)

{'Acidifier',
 'Acidifier (urlnary); Vitamin (antiscorbutic)',
 'Acidifier; Diuretic',
 'Adrenergic',
 'Adrenergic (alpha12-agonist)',
 'Adrenergic (ophthalmic)',
 'Adrenergic (ophthalmic),Adrenergic (vasoconstrictor)',
 'Adrenergic (ophthalmic),Antiglaucoma Agent',
 'Adrenergic (vasoconstrictor)',
 'Adrenergic (vasoconstrictor),Decongestant,Nasal Decongestant',
 'Adrenergic,Bronchodilator',
 'Adrenocortical Steroid (salt-regulating)',
 'Adrenocortical Steroid,Glucocorticoid',
 'Adrenocortical Suppressant',
 'Adrenocortical Suppressant; Antineoplastic',
 'Advanced Colorectal Cancer Treatment (thymidylate synthase inhibitor)',
 'Alcohol Deterrent',
 'Aldosterone Antagonist',
 'Alkalizer',
 'Alkalizer (systemic)',
 'Alkalizer (systemic); Replenisher (electrolyte)',
 "Alzheimer's Disease Treatment (adjunct)",
 'Amino Acid',
 'Amino Acid,Radioactive Agent',
 'Ammonia Detoxicant',
 'Amyotrophic Lateral Sclerosis Treatment',
 'Anabolic',
 'Analeptic (treatment of narcolepsy hypersomnia)',
 '

In [10]:
# Open the project's SQLite database file (sqlite3 creates it if it does not exist yet)
# and grab a cursor for issuing statements.
conn = sqlite3.connect('project.db')
c = conn.cursor()

In [57]:
# Schema: `molecule` holds one row of properties per molecule, keyed by id;
# `indication` holds one row per (molecule id, indication) pair.
# NOTE: SQLite only enforces FOREIGN KEY constraints when `PRAGMA foreign_keys = ON` has been
# issued on the connection; no such pragma appears in the cells shown here.
molecule_ddl = '''CREATE TABLE IF NOT EXISTS molecule
             (chirality INT, id TEXT  PRIMARY KEY, logd REAL, logp REAL, most_apka REAL,
             most_bpka REAL, alogp REAL, aromatic_rings INT, formula TEXT, mw REAL, hba INT,
             hbd INT, heavy_atoms INT, molecular_species TEXT, qed_weighted REAL)'''

indication_ddl = '''CREATE TABLE IF NOT EXISTS indication
             (id text, indication text, FOREIGN KEY (id) REFERENCES molecule(id))'''

# IF NOT EXISTS makes both statements safe to run again against an existing database
for ddl in (molecule_ddl, indication_ddl):
    c.execute(ddl)
conn.commit()

In [53]:
# Reshape the property data so that it can be put in the DB: each molecule becomes one flat row
# [chirality, molecule_chembl_id, <properties in the order listed below>], which is the column
# order of the molecule table.
properties = ['acd_logd', 'acd_logp', 'acd_most_apka', 'acd_most_bpka', 'alogp', 'aromatic_rings',
              'full_molformula', 'full_mwt', 'hba', 'hbd', 'heavy_atoms', 'molecular_species',
              'qed_weighted']
mol_data = [
    [mol['chirality'], mol['molecule_chembl_id']]
    + [mol['molecule_properties'][name] for name in properties]
    for mol in approved_mols
    # Some molecules have no molecular properties at all. Imputing them would only give the
    # average case, which is not useful data to keep, so those molecules are dropped.
    if mol['molecule_properties'] is not None
]

[['2',
  'CHEMBL2',
  '2.08',
  '2.14',
  None,
  '6.52',
  '1.78',
  3,
  'C19H21N5O4',
  '383.41',
  8,
  1,
  28,
  'NEUTRAL',
  '0.73'],
 ['1',
  'CHEMBL3',
  '-0.62',
  '0.57',
  None,
  '8',
  '1.85',
  1,
  'C10H14N2',
  '162.24',
  2,
  0,
  12,
  'NEUTRAL',
  '0.63'],
 ['0',
  'CHEMBL4',
  '-0.39',
  '1.85',
  '5.19',
  '7.37',
  '1.54',
  2,
  'C18H20FN3O4',
  '361.37',
  6,
  1,
  26,
  'ACID',
  '0.87'],
 ['2',
  'CHEMBL5',
  '-1.54',
  '0.03',
  '3.45',
  '6.12',
  '1.42',
  2,
  'C12H12N2O3',
  '232.24',
  4,
  1,
  17,
  'ACID',
  '0.85'],
 ['2',
  'CHEMBL6',
  '0.98',
  '4.25',
  '3.96',
  None,
  '3.93',
  3,
  'C19H16ClNO4',
  '357.79',
  4,
  1,
  25,
  'ACID',
  '0.77'],
 ['1',
  'CHEMBL403',
  '-3.33',
  '0.39',
  '2.62',
  None,
  '-0.79',
  0,
  'C8H11NO5S',
  '233.24',
  4,
  1,
  15,
  'ACID',
  '0.6'],
 ['1',
  'CHEMBL404',
  '-3.13',
  '0.6',
  '2.33',
  '0.85',
  '-1.52',
  1,
  'C10H12N4O5S',
  '300.3',
  7,
  1,
  20,
  'ACID',
  '0.67'],
 ['2',
  'CHEMBL8

In [61]:
# Add the molecule rows to the DB.
# INSERT OR REPLACE is used instead of a plain INSERT because molecule.id is the PRIMARY KEY:
# re-running this cell against an already populated project.db would otherwise fail with an
# IntegrityError on the first duplicate id. With OR REPLACE the existing row is overwritten, so
# the cell can be run repeatedly and leaves the table in the same state.
c.executemany('INSERT OR REPLACE INTO molecule VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', mol_data)
conn.commit()

In [16]:
# Clean the indication data so entries can have multiple indications, and reshape it for
# insertion into the DB as [molecule_chembl_id, indication] rows.
# indication_class packs several indications into one string, separated by ',' or ';'.
ind_data = list()
for e in approved_mols:
    if e['indication_class'] is None:
        # Nothing to record for molecules without an indication
        continue
    for i in e['indication_class'].replace(',', ';').split(';'):
        # Separators are often followed by a space ('Acidifier; Diuretic'); without stripping,
        # ' Diuretic' and 'Diuretic' would be stored as two different indications.
        i = i.strip()
        if i:  # skip empty fragments left by doubled or trailing separators
            ind_data.append([e['molecule_chembl_id'], i])

In [17]:
# Load the [molecule id, indication] pairs into the indication table.
# The table has no uniqueness constraint, so running this cell again appends the same rows again.
insert_indication_sql = 'INSERT INTO indication VALUES (?, ?)'
c.executemany(insert_indication_sql, ind_data)
conn.commit()

In [139]:
# Close the SQLite connection; the inserts above were each followed by conn.commit()
conn.close()