# Import libraries

In [None]:
!pip install rdkit

In [2]:
import rdkit, rdkit.Chem, rdkit.Chem.Draw
from rdkit.Chem import Descriptors
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import colormaps
from rdkit.Chem import Descriptors
from pandas.plotting import scatter_matrix

# Read the database

In [3]:
# Raw-GitHub location of the Lipophilicity CSV that the next cell downloads.
url_corso = (
    'https://raw.githubusercontent.com/stefano-bosio/CTF_ML_MD/main/'
    'Lipophilicity.csv'
)

In [4]:
# Download and parse the CSV at `url_corso` into a DataFrame.
data = pd.read_csv(filepath_or_buffer=url_corso)

In [None]:
# Bare expression: the notebook renders the loaded DataFrame as cell output.
data

In [8]:
# Parse every entry of the 'smiles' column into an RDKit molecule, keeping the
# row order. RDKit returns None for a SMILES string it cannot parse, so the
# list may contain None entries.
molecules = list(map(rdkit.Chem.MolFromSmiles, data['smiles']))

In [None]:
# Bare expression: display the entry at position 10 of `molecules`.
molecules[10]

In [16]:
# First nine parsed molecules, used for the preview grid below.
subset = list(molecules[:9])

In [None]:
# Draw the molecules in `subset` as one grid image, three per row, with
# subImgSize=(300, 300) for each tile. The trailing bare name makes the
# notebook display the image.
img = rdkit.Chem.Draw.MolsToGridImage(
    subset,
    molsPerRow=3,
    subImgSize=(300, 300),
)
img

In [18]:
# Keep only the entries of `molecules` that are actual molecule objects.
# `is not None` is an identity test, so it never goes through the object's
# own comparison operators the way `!= None` does.
valid_mols = [mol for mol in molecules if mol is not None]

In [None]:
# Show the size of `molecules` next to the size of `valid_mols`; any
# difference is the number of None entries that were filtered out.
len(molecules),len(valid_mols)

In [21]:
# Build the descriptor table: one row per parsed molecule, one column per
# descriptor listed below.
#
# * Column name -> RDKit function that computes it from a molecule. The dict
#   order is the column order of `features`.
# * Entries of `molecules` that are None are skipped instead of being passed
#   to the descriptor functions (which would raise). Each surviving row keeps
#   its position in `molecules` as its index label, so it can still be matched
#   against the corresponding row of `data`.
# * Rows are collected first and the DataFrame is built once; growing a frame
#   cell by cell with `.loc` reallocates it on every new row.
descriptor_funcs = {
    'MolWt': Descriptors.MolWt,
    'NumHAcceptors': Descriptors.NumHAcceptors,
    'NumHDonors': Descriptors.NumHDonors,
    'NumRotBonds': Descriptors.NumRotatableBonds,
    'NumHeteroatoms': Descriptors.NumHeteroatoms,
    'FractionCSP3': Descriptors.FractionCSP3,
    'RingCount': Descriptors.RingCount,
    'TPSA': Descriptors.TPSA,
    'Stereocenters': rdkit.Chem.rdMolDescriptors.CalcNumAtomStereoCenters,
    'Spiro': rdkit.Chem.rdMolDescriptors.CalcNumSpiroAtoms,
}

feature_rows = {}
for i, mol in enumerate(molecules):
    if mol is None:
        continue
    feature_rows[i] = [compute(mol) for compute in descriptor_funcs.values()]

# dtype=float stores integer counts and real-valued descriptors uniformly as
# float64.
features = pd.DataFrame(
    list(feature_rows.values()),
    index=list(feature_rows),
    columns=list(descriptor_funcs),
    dtype=float,
)



In [None]:
# Bare expression: display the descriptor table as cell output.
features

# Scaling of variables

In [23]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [44]:
# Min-max scale every descriptor column (scikit-learn's default target range
# is [0, 1]). fit_transform returns a plain array, so the column names AND the
# row index of `features` are re-attached; this keeps each scaled row matched
# to its source row even when the index of `features` is not 0..n-1.
features_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(features),
    index=features.index,
    columns=features.columns,
)

In [None]:
# Print per-column minima and maxima, first for the raw descriptors and then
# for the scaled ones, each followed by a blank line.
for summary in (
    features.min(),
    features.max(),
    features_scaled.min(),
    features_scaled.max(),
):
    print(summary, "\n")

# Compute correlation

In [None]:
# Pairwise Pearson correlation between the scaled descriptor columns
# ('pearson' is the pandas default, spelled out here). The trailing bare name
# displays the matrix.
correlations = features_scaled.corr(method='pearson')
correlations

In [47]:
# Column names of the scaled table, as a plain list, used as axis tick labels.
ticks = list(features_scaled.columns)

In [None]:
# Bare expression: display the list of tick labels.
ticks

In [None]:
# Heat map of the correlation matrix with descriptor names on both axes.
plt.figure(figsize=(8, 8))
# fignum=0 makes matshow draw into the current figure (the 8x8 one created
# above) instead of opening a new one.
plt.matshow(np.array(correlations), fignum=0, cmap='RdBu')
# One tick per label, derived from `ticks` so the plot keeps working when the
# number of descriptor columns changes.
tick_positions = list(range(len(ticks)))
plt.xticks(tick_positions, ticks, rotation=90)
plt.yticks(tick_positions, ticks)
plt.colorbar()
# Pin the colour scale to the full correlation range so 0 maps to the middle
# of the diverging colormap.
plt.clim((-1, 1))

In [None]:
# Grid of pairwise scatter plots over all scaled descriptor columns.
scatter_matrix(
    frame=features_scaled,
    figsize=(20, 20),
)
plt.show()