In [None]:
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup
import plotly.express as px
import plotly.graph_objects as go

import torch
from torch_geometric.data import Data

# Toy graph with three nodes and a single scalar feature per node.
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

# Connectivity as a 2 x 4 integer tensor. Read column-wise the pairs are
# (0, 1), (1, 0), (1, 2), (2, 1): every connection appears in both directions.
edge_index = torch.tensor(
    [
        [0, 1, 1, 2],
        [1, 0, 2, 1],
    ],
    dtype=torch.long,
)

data = Data(x=x, edge_index=edge_index)
print(data)

Data(x=[3, 1], edge_index=[2, 4])


In [3]:
import os

def _report(label, value):
    """Print ``label`` and ``value`` on one line, followed by a blank line."""
    print(label, value, "\n")


# Show where the kernel is running and what that directory contains.
_report("current directory is", os.getcwd())
_report("updated current workng directory with renamed directory", os.listdir())

current directory is /work 

updated current workng directory with renamed directory ['.deepnote', '3d_molecule'] 



# Scraping with Beautiful Soup

In [None]:
url = 'https://www.aatbio.com/data-sets/solubility-reference-table'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)
# Stop on 4xx/5xx responses instead of parsing an error page and failing
# later with an unrelated lookup error.
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html')
# find() returns the first matching <table> (the element find_all(...)[0]
# selected) or None, which lets us report a missing table explicitly.
table = soup.find('table', class_='base_cds_standard__c5usT')
if table is None:
    raise LookupError(
        f"No <table class='base_cds_standard__c5usT'> found at {url}; "
        "the page markup may have changed."
    )

def col_headers(table):
    """Collect the column titles of a scraped table.

    Parameters
    ----------
    table : object exposing ``find_all(name, class_=...)``
        Parsed table element whose header labels live in
        ``<span class="base_cds_text__jizk7">`` tags.

    Returns
    -------
    list
        The ``.text`` of each matching span, in document order.
    """
    header_spans = table.find_all('span', class_='base_cds_text__jizk7')
    return [span.text for span in header_spans]
        
# Header labels scraped from the table become the columns of an empty frame
# that the row-extraction step fills in.
col_names = col_headers(table)
mydata = pd.DataFrame(columns=col_names)

# Only the <tbody> element is handed to the row extraction below.
table_data = table.find('tbody')

def table_values(table,df):
    """Append one row to ``df`` for every data row in ``table``.

    Parameters
    ----------
    table : object exposing ``find_all('tr')``
        Parsed table (or table body); each ``<tr>`` is asked for its ``<td>``
        cells.
    df : pandas.DataFrame
        Frame to fill. It is modified in place, so it must have as many
        columns as each row has ``<td>`` cells.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the new rows added.

    Raises
    ------
    ValueError
        If a row has cells but their count differs from ``df``'s columns.
    """
    for table_row in table.find_all('tr'):
        cells = table_row.find_all('td')
        if not cells:
            # Rows without <td> cells (e.g. header or spacer rows) carry no
            # data; assigning an empty list would raise ValueError.
            continue
        # len(df) is the next free label while df keeps its 0..n-1 index.
        df.loc[len(df)] = [cell.text for cell in cells]
    return df
# table_values writes into `mydata` and returns that same object, so `df` and
# `mydata` refer to one frame after this line.
df = table_values(table_data, mydata)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Compound Name,Formula,Water Solubility (mg/L),Water Solubility (g/L)
0,(+)(r)-6-Hydroxy-4-Methyl-Caproaldehyde,C7H14O2,11000,11
1,(+)-Fenchol,C10H18O,828,0.828
2,(+)-Perseitol,C7H16O7,64600,64.6
3,(-)-Menthone,C10H18O,497,0.497
4,"(1r,2s)-(-)-Ephedrine",C10H15NO,56900,56.9
...,...,...,...,...
6630,"[1,2,4,5]tetrazino[1,2-A][1,2,4,5]tetrazine, Oct",C4H12N6,140000,140
6631,"[1,2,5]selenadiazolo[3,4-D]pyrimidine-5,7(4h,6h)",C6H6N4O2Se,1230,1.23
6632,"[3,3]db18c6-Dibenzo Crown Ether",C20H24O6,7.3,0.0073
6633,[4-(2-Ethoxyethylamino)phenyl]-(4-Aminophenyl)-s,C16H20N2O3S,100,0.1


# Functional API Imports

In [2]:
import requests
import pandas as pd
import json
import plotly.express as px
import plotly.graph_objects as go

class Chem_API():
    """Look up a compound on PubChem (PUG REST) and plot its 3D conformer.

    Parameters
    ----------
    name : str
        Compound identifier placed in the request URL: a SMILES string when
        ``data_type == 'SMILES'``, otherwise a compound name.
    data_type : str
        ``'SMILES'`` selects the SMILES lookup; any other value selects the
        lookup by name.

    Notes
    -----
    * The periodic-table CSV is read while the class body executes, so the
      relative path ``3d_molecule/Periodic Table.csv`` must resolve from the
      working directory at definition time.
    * Nothing is cached on the instance. Each public method performs its own
      HTTP requests; the composite methods (``bond_coordinates``,
      ``bonds_coord_combined``, ``plot_3d``) download the conformer JSON once
      per call and reuse it.
    """

    # Periodic-table lookup indexed by atomic number; the dicts below map
    # atomic number -> element name / atomic mass.
    elements = pd.read_csv('3d_molecule/Periodic Table.csv').set_index('Atomic_Number')
    elements_dict = elements.to_dict()

    atom_names = elements_dict['Name']
    atom_weights = elements_dict['Atomic_Mass']

    def __init__(self, name, data_type):
        self.name = name
        self.data_type = data_type

    def get_conformers(self):
        """Search PubChem for the compound and return ``(conformer_id, CID)``.

        The first entry of the response's ``InformationList`` is used, and the
        first conformer ID listed for it.

        Raises
        ------
        KeyError, IndexError
            If the response JSON does not have the expected structure (for
            example when PubChem reports no match).
        """
        # Imported locally so the class does not depend on another cell's imports.
        from urllib.parse import quote

        namespace = 'smiles' if self.data_type == 'SMILES' else 'name'
        # Percent-encode the identifier so characters such as '#', '?' or '/'
        # are sent as part of the identifier instead of being read as URL
        # syntax. '%' is left untouched so identifiers that are already
        # percent-encoded keep working as before.
        identifier = quote(str(self.name), safe='%')
        response = requests.get(
            f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{namespace}/{identifier}/conformers/json'
        )

        first_match = response.json()['InformationList']['Information'][0]
        conformer_id = first_match['ConformerID'][0]
        CID = first_match['CID']
        return conformer_id, CID

    def get_atoms_json(self):
        """Download and return the 3D conformer record (parsed JSON) for the compound."""
        conformer_id = self.get_conformers()[0]
        response = requests.get(f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/conformers/{conformer_id}/json')
        json_file_3d = response.json()
        return json_file_3d

    def _atoms_frame(self, json_file_3d):
        """Build the per-atom DataFrame from an already downloaded conformer record."""
        record = json_file_3d['PC_Compounds'][0]
        coordinates = record['coords'][0]
        conformer = coordinates['conformers'][0]

        compound_atoms = pd.DataFrame(columns=['ID', 'Atom', 'Size','X', 'Y', 'Z'])

        compound_atoms['ID'] = coordinates['aid']
        # 'element' holds atomic numbers; translate them to mass (marker size)
        # first, then to the element name, because both lookups key on the number.
        compound_atoms['Atom'] = record['atoms']['element']
        compound_atoms['Size'] = compound_atoms['Atom'].replace(self.atom_weights)
        compound_atoms['Atom'] = compound_atoms['Atom'].replace(self.atom_names)
        compound_atoms['X'] = conformer['x']
        compound_atoms['Y'] = conformer['y']
        compound_atoms['Z'] = conformer['z']

        return compound_atoms.set_index('ID')

    @staticmethod
    def _bond_coordinates_from(json_file_3d, compound, axis):
        """Compute ``bond_coordinates`` output from data that is already in memory."""
        bond_pairs = json_file_3d['PC_Compounds'][0]['bonds']
        df = pd.DataFrame({'bond1': bond_pairs['aid1'], 'bond2':bond_pairs['aid2']})
        bond_pairs_df = df.copy()

        # Translate each bonded atom ID into that atom's coordinate on `axis`.
        df['coord1'] = df.bond1.map(compound[axis])
        df['coord2'] = df.bond2.map(compound[axis])

        # Flat sequence start, end, None, start, end, None, ... so each bond
        # is its own segment when drawn as a single line trace.
        coord_list = [
            value
            for i in range(df.shape[0])
            for value in (df.coord1[i], df.coord2[i], None)
        ]

        return coord_list, bond_pairs_df

    def _bonds_frame(self, json_file_3d, compound):
        """Combine the X/Y/Z bond coordinate lists into one DataFrame."""
        return pd.DataFrame({
            'X_coords': self._bond_coordinates_from(json_file_3d, compound, 'X')[0],
            'Y_coords': self._bond_coordinates_from(json_file_3d, compound, 'Y')[0],
            'Z_coords': self._bond_coordinates_from(json_file_3d, compound, 'Z')[0],
        })

    @staticmethod
    def _pick_smiles(props):
        """Return the SMILES string from a compound record's ``props`` list.

        Properties are selected by their ``urn`` label rather than by list
        position, because the position of a given property is not the same for
        every record. An entry named ``'Canonical'`` is preferred when
        present; otherwise the first SMILES entry is returned.

        Raises
        ------
        KeyError
            If the record contains no property labelled ``'SMILES'``.
        """
        smiles_props = [
            prop for prop in props
            if prop.get('urn', {}).get('label') == 'SMILES'
        ]
        if not smiles_props:
            raise KeyError('No SMILES property found in the PubChem compound record')
        for prop in smiles_props:
            if prop['urn'].get('name') == 'Canonical':
                return prop['value']['sval']
        return smiles_props[0]['value']['sval']

    def atomic_df(self):
        """Return a DataFrame of the compound's atoms, indexed by atom ``ID``.

        Columns: ``Atom`` (element name), ``Size`` (atomic mass), and the
        ``X``/``Y``/``Z`` coordinates of the first conformer in the record.
        """
        return self._atoms_frame(self.get_atoms_json())

    def get_SMILES(self):
        """Fetch the compound record for this compound's CID and return a SMILES string.

        Raises
        ------
        KeyError
            If the record has no SMILES property or lacks the expected keys.
        """
        CID = self.get_conformers()[1]
        response = requests.get(f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{CID}/json')
        more_info = response.json()
        compound_SMILES = self._pick_smiles(more_info['PC_Compounds'][0]['props'])
        return compound_SMILES

    def bond_coordinates(self, axis):
        """Return bond endpoint coordinates along one axis.

        Parameters
        ----------
        axis : str
            Column of the atom table to read: ``'X'``, ``'Y'`` or ``'Z'``.

        Returns
        -------
        tuple
            ``(coord_list, bond_pairs_df)`` where ``coord_list`` is the flat
            list ``[start, end, None, start, end, None, ...]`` with one triple
            per bond, and ``bond_pairs_df`` has the columns ``bond1`` and
            ``bond2`` holding the bonded atom IDs.
        """
        # Download once and derive the atom table from the same record.
        json_file_3d = self.get_atoms_json()
        compound = self._atoms_frame(json_file_3d)
        return self._bond_coordinates_from(json_file_3d, compound, axis)

    def bonds_coord_combined(self):
        """Return a DataFrame with ``X_coords``, ``Y_coords`` and ``Z_coords`` bond columns."""
        json_file_3d = self.get_atoms_json()
        bonds = self._bonds_frame(json_file_3d, self._atoms_frame(json_file_3d))

        return bonds

    def plot_3d(self):
        """Show an interactive 3D plot of the molecule: atoms as markers, bonds as lines."""
        # One download feeds both the atom markers and the bond segments.
        json_file_3d = self.get_atoms_json()
        compound = self._atoms_frame(json_file_3d)
        bonds = self._bonds_frame(json_file_3d, compound)

        trace1 = px.scatter_3d(compound, x='X', y='Y', z='Z',
                            color='Atom',
                            size='Size',
                            size_max=70,
                            opacity=1)

        trace2 = px.line_3d(bonds, x='X_coords', y='Y_coords', z='Z_coords',
                            color_discrete_sequence=['black'])
        # Bonds are decoration only: no hover labels for the line trace.
        trace2.update_traces(hovertemplate = None, hoverinfo = "skip", line=dict(width=10))

        fig=go.Figure(data=trace1.data + trace2.data)

        fig.update_layout(margin=dict(l=0, r=40, b=0, t=30),
                        title=f'{self.name.capitalize()} 3D Plot')
        fig.show()

In [4]:
# Any data_type other than 'SMILES' makes Chem_API look the compound up by name.
compound = Chem_API('fructose', 'name')
compound.plot_3d()

# Solubility by SMILES

In [None]:
# Load the solubility dataset; the path is relative to the kernel's working directory.
solubility_data = pd.read_csv('curated-solubility-dataset.csv')
# Bare last expression so the notebook renders the frame.
solubility_data

Unnamed: 0,ID,Name,InChI,InChIKey,SMILES,Solubility,SD,Ocurrences,Group,MolWt,...,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumSaturatedRings,NumAliphaticRings,RingCount,TPSA,LabuteASA,BalabanJ,BertzCT
0,A-3,"N,N,N-trimethyloctadecan-1-aminium bromide",InChI=1S/C21H46N.BrH/c1-5-6-7-8-9-10-11-12-13-...,SZEMGTQCPRNXEG-UHFFFAOYSA-M,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127,0.000000,1,G1,392.510,...,17.0,142.0,0.0,0.0,0.0,0.0,0.00,158.520601,0.000000e+00,210.377334
1,A-4,Benzo[cd]indol-2(1H)-one,InChI=1S/C11H7NO/c13-11-8-5-1-3-7-4-2-6-9(12-1...,GPYLCFQEKPUWLD-UHFFFAOYSA-N,O=C1Nc2cccc3cccc1c23,-3.254767,0.000000,1,G1,169.183,...,0.0,62.0,2.0,0.0,1.0,3.0,29.10,75.183563,2.582996e+00,511.229248
2,A-5,4-chlorobenzaldehyde,InChI=1S/C7H5ClO/c8-7-3-1-6(5-9)2-4-7/h1-5H,AVPYQKSLYISFPO-UHFFFAOYSA-N,Clc1ccc(C=O)cc1,-2.177078,0.000000,1,G1,140.569,...,1.0,46.0,1.0,0.0,0.0,1.0,17.07,58.261134,3.009782e+00,202.661065
3,A-8,"zinc bis[2-hydroxy-3,5-bis(1-phenylethyl)benzo...",InChI=1S/2C23H22O3.Zn/c2*1-15(17-9-5-3-6-10-17...,XTUPUYCJWKHGSW-UHFFFAOYSA-L,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409,0.000000,1,G1,756.226,...,10.0,264.0,6.0,0.0,0.0,6.0,120.72,323.755434,2.322963e-07,1964.648666
4,A-9,4-({4-[bis(oxiran-2-ylmethyl)amino]phenyl}meth...,InChI=1S/C25H30N2O4/c1-5-20(26(10-22-14-28-22)...,FAUAZXVRLVIARB-UHFFFAOYSA-N,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065,0.000000,1,G1,422.525,...,12.0,164.0,2.0,4.0,4.0,6.0,56.60,183.183268,1.084427e+00,769.899934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9977,I-84,tetracaine,InChI=1S/C15H24N2O2/c1-4-5-10-16-14-8-6-13(7-9...,GKCBAIGFKIBETG-UHFFFAOYSA-N,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C,-3.010000,0.000000,1,G1,264.369,...,8.0,106.0,1.0,0.0,0.0,1.0,41.57,115.300645,2.394548e+00,374.236893
9978,I-85,tetracycline,InChI=1S/C22H24N2O8/c1-21(31)8-5-4-6-11(25)12(...,OFVLGDICTFRJMM-WESIUVDSSA-N,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...,-2.930000,0.000000,1,G1,444.440,...,2.0,170.0,1.0,0.0,3.0,4.0,181.62,182.429237,2.047922e+00,1148.584975
9979,I-86,thymol,InChI=1S/C10H14O/c1-7(2)9-5-4-8(3)6-10(9)11/h4...,MGSRCZKZVOBKFT-UHFFFAOYSA-N,c1(cc(ccc1C(C)C)C)O,-2.190000,0.019222,3,G5,150.221,...,1.0,60.0,1.0,0.0,0.0,1.0,20.23,67.685405,3.092720e+00,251.049732
9980,I-93,verapamil,"InChI=1S/C27H38N2O4/c1-20(2)27(19-28,22-10-12-...",SGTNSNPWRIOYBX-UHFFFAOYSA-N,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...,-3.980000,0.000000,1,G1,454.611,...,13.0,180.0,2.0,0.0,0.0,2.0,63.95,198.569223,2.023333e+00,938.203977


# GNN

In [None]:
# # import library --------------------------------------------------------------
# from rdkit import Chem
# import networkx as nx
# import matplotlib.pyplot as plt

# # define the smiles string and convert it into a molecule structure ------------
# caffeine_smiles = 'C(C1C(C(C(C(O1)O)O)O)O)O'
# caffeine_mol = Chem.MolFromSmiles(caffeine_smiles)

# # define the function for converting rdkit object to networkx object -----------
# def mol_to_nx(mol):
#     G = nx.Graph()

#     for atom in mol.GetAtoms():
#         G.add_node(atom.GetIdx(),
#                    atomic_num=atom.GetAtomicNum(),
#                    is_aromatic=atom.GetIsAromatic(),
#                    atom_symbol=atom.GetSymbol())
        
#     for bond in mol.GetBonds():
#         G.add_edge(bond.GetBeginAtomIdx(),
#                    bond.GetEndAtomIdx(),
#                    bond_type=bond.GetBondType())
        
#     return G

# # convert rdkit object to networkx object --------------------------------------
# caffeine_nx = mol_to_nx(caffeine_mol)

# caffeine_atom = nx.get_node_attributes(caffeine_nx, 'atom_symbol')

# color_map = {'C': 'cyan',
#              'O': 'orange',
#              'N': 'magenta'}  

# caffeine_colors = []
# for idx in caffeine_nx.nodes():
#     if (caffeine_nx.nodes[idx]['atom_symbol'] in color_map):
#         caffeine_colors.append(color_map[caffeine_nx.nodes[idx]['atom_symbol']])
#     else:
#         caffeine_colors.append('gray')
    
# nx.draw(caffeine_nx,
#         labels=caffeine_atom,
#         with_labels = True,
#         node_color=caffeine_colors,
#         node_size=800)

# plt.show()

# # print out the adjacency matrix ---------------------------------------------- 
# matrix = nx.to_numpy_matrix(caffeine_nx)
# print(matrix)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9406bf03-f66f-4894-ad73-e6bff2a29f29' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>