# Data preprocessing of the Mcule dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit.Chem as Chem
from rdkit.Chem import Draw

## Import the dataset
If it is not working, put the file mcule_purchasable_in_stock_prices_230324_RKoqmy.csv in the folder /datasets

In [2]:
df = pd.read_csv('../datasets/mcule_purchasable_in_stock_prices_230324_RKoqmy.csv')

  df = pd.read_csv('../datasets/mcule_purchasable_in_stock_prices_230324_RKoqmy.csv')


In [3]:
df.head()

Unnamed: 0,Mcule ID,SMILES,stereo label,price 1 (USD),amount 1 (mg),delivery time 1 (w.days),available amount 1 (mg),price 2 (USD),amount 2 (mg),delivery time 2 (w.days),...,available amount 3 (mg),price 4 (USD),amount 4 (mg),delivery time 4 (w.days),available amount 4 (mg),price 5 (USD),amount 5 (mg),delivery time 5 (w.days),available amount 5 (mg),class
0,MCULE-5933021454,N1N=CNN=1,,46.22,1,21,1100.0,48.75,5.0,11.0,...,,141.7,100.0,16.0,,331.5,1000.0,16.0,,
1,MCULE-3707390029,C1(=NNC=N1)N(=O)=O,,5.53,1,14,,5.53,5.0,14.0,...,,5.53,100.0,14.0,,5.53,1000.0,14.0,,1.0
2,MCULE-2532563996,N(=O)(=O)C1=CC2=CC=CN=C2C(O)=C1,,39.52,1,16,6350.0,39.52,5.0,16.0,...,6350.0,39.52,100.0,16.0,6350.0,180.96,1000.0,16.0,6350.0,1.0
3,MCULE-6058343212,N(=O)(=O)C1C=CC(=C(O)C=1)Cl,,5.2,1,16,,5.2,5.0,16.0,...,,5.2,100.0,16.0,,5.2,1000.0,16.0,,1.0
4,MCULE-6510012271,C(C1C=CC=CC=1)(=O)C(CCC#N)(CCC#N)CCC#N,,88.9,1,12,26.0,96.99,5.0,12.0,...,,208.0,100.0,26.0,,228.15,1000.0,22.0,,


## Check the SMILES validity

In [4]:
def check_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None

In [6]:
df['valid_smiles'] = df['SMILES'].apply(check_smiles)

[08:34:28] Explicit valence for atom # 0 Br, 4, is greater than permitted
[08:34:29] Explicit valence for atom # 4 Br, 3, is greater than permitted
[08:34:46] Explicit valence for atom # 3 N, 5, is greater than permitted
[08:34:47] Explicit valence for atom # 3 N, 5, is greater than permitted
[08:34:48] Explicit valence for atom # 19 N, 6, is greater than permitted
[08:34:48] Explicit valence for atom # 5 Cl, 3, is greater than permitted
[08:34:50] Explicit valence for atom # 0 N, 5, is greater than permitted
[08:35:36] Explicit valence for atom # 1 N, 5, is greater than permitted
[08:35:42] Explicit valence for atom # 0 N, 6, is greater than permitted
[08:35:43] Explicit valence for atom # 6 N, 5, is greater than permitted
[08:36:00] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[08:36:01] Explicit valence for atom # 0 N, 5, is greater than permitted
[08:36:11] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[08:36:39] Explicit valence for atom # 18 C

Show the invalid SMILES

In [7]:
invalid_df = df[df['valid_smiles'] == False]

## First model : Morgan fingerprints

In [10]:
valid_df = df[df['valid_smiles'] == True]
valid_df['mol'] =  valid_df['SMILES'].apply(Chem.MolFromSmiles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['mol'] =  valid_df['SMILES'].apply(Chem.MolFromSmiles)


In [12]:
# Calculate the fingerprints
from rdkit.Chem import AllChem
from rdkit import DataStructs

def get_fingerprint(mol):
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

valid_df['fp'] = valid_df['mol'].apply(get_fingerprint)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['fp'] = valid_df['mol'].apply(get_fingerprint)


## Machine learning model : Multilinear regression

Split the dataset into training and test sets

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [40]:
def data_split(X, y):
    # split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=0.25, random_state=0)

    return X_train, X_test, y_train, y_test, X_validation, y_validation


X_train, X_test, y_train, y_test, X_validation, y_validation = data_split(valid_df['fp'], valid_df['price 1 (USD)'])

normalize the prices

In [46]:
def normalize_price(price):
    return (price - df_min) / (df_max - df_min)

def get_price_back(n_price):
    return n_price * (df_max - df_min) + df_min

df_min = y_train.min()
df_max = valid_df['price 1 (USD)'].max()

valid_df['n_price_1mg'] = valid_df['price 1 (USD)'].apply(normalize_price)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['n_price_1mg'] = valid_df['price 1 (USD)'].apply(normalize_price)
