In [1]:
#!/usr/bin/python

# Standard library.
import urllib2

# External library.
import pandas as pd
import rdkit.Chem as Chem
# from rdkit.ML.Descriptors import MoleculeDescriptors as md
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [2]:
def smi_to_iupac(smi):
    '''
    Fetch the IUPAC name for a SMILES string from the NCI CACTUS resolver.

    The SMILES is percent-encoded before it is placed in the request URL.
    SMILES routinely contain characters that are special inside a URL; for
    example '#' (a triple bond) would otherwise start the URL fragment and
    cut the request short.

    Parameters
    ----------
    smi : str
        SMILES string to look up.

    Returns
    -------
    str or None
        The response body on success. None when the body contains "html"
        (treated as an error page rather than a name) or when the request
        fails; on failure a short message is printed instead of raising.
    '''
    # Function-scope import keeps the notebook's import cell untouched.
    # quote lives in urllib on Python 2 and in urllib.parse on Python 3.
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    try:
        # quote() leaves '/' unescaped by default, so SMILES stereo-bond
        # slashes are sent as-is, exactly as before this change.
        url = ('http://cactus.nci.nih.gov/chemical/structure/'
               + quote(smi) + '/iupac_name')
        response = urllib2.urlopen(url)
        try:
            iupacName = response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()

        # On Python 3 the body arrives as bytes; on Python 2 it is already
        # a str and is returned unchanged.
        if not isinstance(iupacName, str):
            iupacName = iupacName.decode('utf-8')

        if "html" in iupacName:
            return None
        return iupacName

    except urllib2.HTTPError as e:
        print("HTTP error: %d" % e.code)
        return None
    except urllib2.URLError as e:
        # reason may be a plain string or an exception with fewer than two
        # args; indexing it blindly would raise from inside this handler.
        reason_args = getattr(e.reason, 'args', ())
        detail = reason_args[1] if len(reason_args) > 1 else e.reason
        print("Network error: %s" % (detail,))
        return None
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts a long
        # run; %s formatting so a non-string smi cannot raise here.
        print("Conversion failed for smiles %s" % (smi,))
        return None

In [3]:
# Load data.
def _combine_taste_sets(train, test, flag_column, true_label, false_label):
    '''
    Stack a train/test pair into one frame with a textual 'Target' column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain 'Name', 'SMILES' and `flag_column`.
    flag_column : str
        Name of the column holding the True/False taste flag.
    true_label, false_label : str
        Text that replaces True and False respectively.

    Returns
    -------
    pandas.DataFrame
        Columns 'Name', 'SMILES', 'Target'. Row labels are carried over from
        the inputs (ignore_index is not set), so they may repeat.
    '''
    columns = ['Name', 'SMILES', flag_column]
    combined = pd.concat([train[columns], test[columns]])
    combined = combined.rename(columns={flag_column: 'Target'})
    # Assign the relabelled column back explicitly. Calling
    # combined.Target.replace(..., inplace=True) only mutates the Series
    # returned by the attribute access, which does not update the frame
    # under pandas Copy-on-Write.
    target = combined['Target'].replace(to_replace=True, value=true_label)
    combined['Target'] = target.replace(to_replace=False, value=false_label)
    return combined


bitter_train = pd.read_csv('bitter-train.tsv', sep='\t')
bitter_test = pd.read_csv('bitter-test.tsv', sep='\t')
sweet_train = pd.read_csv('sweet-train.tsv', sep='\t')
sweet_test = pd.read_csv('sweet-test.tsv', sep='\t')

# In the bitter files True means 'Bitter' and False means 'Sweet';
# the sweet files use the opposite mapping.
df_bitter = _combine_taste_sets(bitter_train, bitter_test, 'Bitter', 'Bitter', 'Sweet')
df_sweet = _combine_taste_sets(sweet_train, sweet_test, 'Sweet', 'Sweet', 'Bitter')

# Both halves now share the same three columns, so they stack directly.
df = pd.concat([df_bitter, df_sweet])

In [4]:
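# NOTE: This cell is kept commented out. It is the code that wrote
# 'df_canon_smiles.pkl', which the next cell reads back. It calls
# smi_to_iupac for every row, i.e. one web request per compound.
# length = df.shape[0]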

# count = 0

# iupac_name = []
# name = []
# smiles = []
# canon_smiles = []
# target = []
# for i in range(length):
#     try:
#         c = Chem.CanonSmiles(str(df.iloc[i]['SMILES']))
#         canon_smiles.append(c)
#         smiles.append(str(df.iloc[i]['SMILES']))
#         target.append(str(df.iloc[i]['Target']))
#         name.append(str(df.iloc[i]['Name']))
#         iupac_name.append(smi_to_iupac(c))
#     except:
#         count += 1
#         continue

# dict_canon_smiles = {'Name':name,'IUPAC Name':iupac_name, 'SMILES':smiles, 'Canonical SMILES':canon_smiles, 'Target':target}
# df_canon_smiles = pd.DataFrame(data=dict_canon_smiles)


# df_canon_smiles.to_pickle('df_canon_smiles.pkl')

In [5]:
# Load the table from 'df_canon_smiles.pkl' in the working directory
# (presumably the file written by the commented-out cell above).
# NOTE(review): unpickling can execute arbitrary code; only load a pickle
# that you produced yourself.
df_can = pd.read_pickle('df_canon_smiles.pkl')

In [6]:
# Calculate descriptors.
# The first element of every Descriptors._descList entry is the descriptor name.
desc_list = [entry[0] for entry in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_list)

# One row per entry of df_can, one column per descriptor, filled row by row.
df_desc = pd.DataFrame(index=df_can.index, columns=desc_list)
for row_label in df_can.index:
    mol = Chem.MolFromSmiles(df_can.loc[row_label, 'Canonical SMILES'])
    df_desc.loc[row_label] = list(calc.CalcDescriptors(mol))

In [7]:
# Write the descriptor table to 'df_desc.pkl' in the working directory.
df_desc.to_pickle('df_desc.pkl')

In [8]:
# Keep only the identifier columns of the loaded table.
identifier_columns = ['Name', 'SMILES', 'Canonical SMILES']
df_final = df_can[identifier_columns]

In [9]:
# Join column-wise (axis=1): identifiers, then descriptors, then the target label.
export_parts = [df_final, df_desc, df_can[['Target']]]
df_save = pd.concat(export_parts, axis=1)

In [10]:
# Write the combined table as tab-separated text to 'bitter_sweet.tsv'.
# No index argument is passed, so to_csv also writes the row labels as the first column.
df_save.to_csv('bitter_sweet.tsv', sep='\t')