# Import libraries

In [None]:
!pip install rdkit

In [None]:
import rdkit, rdkit.Chem, rdkit.Chem.Draw
from rdkit.Chem import Descriptors
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import colormaps
from rdkit.Chem import Descriptors
from pandas.plotting import scatter_matrix

# Read the database

In [None]:
# Raw GitHub URL of the Lipophilicity CSV file that the next cell loads.
url_corso= 'https://raw.githubusercontent.com/stefano-bosio/CTF_ML_MD/main/Lipophilicity.csv'

In [None]:
# Download and parse the CSV into a DataFrame; later cells read its
# 'smiles' column (molecule structures) and 'exp' column (regression target).
data = pd.read_csv(url_corso)

In [None]:
# Show the loaded table as the cell output.
data

In [None]:
# Parse every SMILES string of the table into an RDKit molecule object.
# The list keeps one entry per row of `data`, in the same order.
molecules=list(map(rdkit.Chem.MolFromSmiles, data['smiles']))

In [None]:
# Inspect a single parsed molecule (the entry at index 10) as the cell output.
molecules[10]

In [None]:
# First nine molecules, copied into their own list for the preview grid.
subset=list(molecules[:9])

In [None]:
# Render the preview molecules on a grid: three per row, 300x300 per tile.
grid_options=dict(molsPerRow=3,subImgSize=(300,300))
img=rdkit.Chem.Draw.MolsToGridImage(subset,**grid_options)
img

In [None]:
# Keep only the entries that are actual molecule objects. `is not None` is
# used because None is a singleton: an identity test cannot be affected by a
# custom __ne__ on the molecule class, unlike `!= None`.
valid_mols=[mol for mol in molecules if mol is not None]

In [None]:
# Compare the total number of entries with the number of non-None molecules;
# equal values mean every SMILES string was parsed.
len(molecules),len(valid_mols)

In [None]:
# Descriptor table: output column name -> RDKit function that computes the
# descriptor for one molecule. Adding a descriptor is a one-line change here.
descriptor_funcs={
    'MolWt':Descriptors.MolWt,
    'NumHAcceptors':Descriptors.NumHAcceptors,
    'NumHDonors':Descriptors.NumHDonors,
    'NumRotBonds':Descriptors.NumRotatableBonds,
    'NumHeteroatoms':Descriptors.NumHeteroatoms,
    'FractionCSP3':Descriptors.FractionCSP3,
    'RingCount':Descriptors.RingCount,
    'TPSA':Descriptors.TPSA,
    'Stereocenters':rdkit.Chem.rdMolDescriptors.CalcNumAtomStereoCenters,
    'Spiro':rdkit.Chem.rdMolDescriptors.CalcNumSpiroAtoms,
}

# Collect one dict per molecule and build the DataFrame once at the end:
# growing a DataFrame cell by cell with .loc re-allocates on every new row.
feature_rows=[]
for i,mol in enumerate(molecules):
    if mol is None:
        # Rows must stay aligned with `data`, so an unparsed molecule cannot be
        # skipped silently; fail with a message that points at the bad row.
        raise ValueError(f"molecule at row {i} is None (SMILES could not be parsed)")
    feature_rows.append({name:func(mol) for name,func in descriptor_funcs.items()})

# One row per molecule (index 0..n-1), one float column per descriptor.
features=pd.DataFrame(feature_rows,columns=list(descriptor_funcs),dtype=float)



In [None]:
# Show the descriptor table as the cell output.
features

# Scaling of variables

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Min-max scale every descriptor column, then wrap the resulting array back
# into a DataFrame that carries the original column names.
scaler=MinMaxScaler()
scaled_values=scaler.fit_transform(features)
features_scaled=pd.DataFrame(scaled_values, columns=features.columns)

In [None]:
# Print per-column minima and maxima before and after scaling, each followed
# by a blank line, to check the effect of the scaler.
range_summaries=(
    features.min(),
    features.max(),
    features_scaled.min(),
    features_scaled.max(),
)
for summary in range_summaries:
    print(summary,"\n")

# Compute correlation

In [None]:
# Pairwise correlation between all scaled descriptor columns
# (DataFrame.corr with its default settings), shown as the cell output.
correlations=features_scaled.corr()
correlations

In [None]:
# Column names as a plain list, used as axis tick labels in the heat-map.
ticks=list(features_scaled.columns)

In [None]:
# Show the tick labels as the cell output.
ticks

In [None]:
# Heat-map of the correlation matrix with one labelled tick per feature.
plt.figure(figsize=(8,8))
# fignum=0 makes matshow draw into the figure created above.
plt.matshow(np.array(correlations), fignum=0, cmap='RdBu')
# Derive the tick positions from the label list instead of hard-coding ten of
# them, so the plot stays valid when descriptors are added or removed.
tick_positions=np.arange(len(ticks))
plt.xticks(tick_positions,ticks,rotation= 90)
plt.yticks(tick_positions,ticks)
plt.colorbar()
# Fix the colour scale to the full correlation range so 0 sits mid-colormap.
plt.clim(-1,1)

In [None]:
# Grid of pairwise scatter plots over all scaled descriptor columns,
# drawn on a 20x20 figure.
scatter_matrix(features_scaled, figsize=(20,20))
plt.show()

# Feature Selection

## 1. Univariate Linear Filtering



In [None]:
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest, r_regression
import pandas as pd

In [None]:
# Feature matrix: the scaled descriptors as a plain NumPy array.
X = np.array(features_scaled)
# Regression target: the 'exp' column of the loaded table.
Y = data['exp']
# Descriptor names, used to label the columns of X in the ranking tables.
feat=features.columns

In [None]:
# Check the dimensions of the feature matrix.
X.shape

In [None]:
# Check the length of the target; it must match the number of rows of X.
Y.shape

In [None]:
# Univariate score of each column of X against Y (scikit-learn's
# r_regression: Pearson correlation coefficient per feature).
test=r_regression(X,Y)

In [None]:
# Pair every feature name with its univariate score and list the features
# from the highest to the lowest signed score.
# NOTE(review): the sort is on the signed value, so strongly negative
# correlations end up at the bottom of the table.
scores=(
    pd.DataFrame(feat,columns=['feat'])
    .assign(scores=test)
    .sort_values(by='scores', ascending=False)
)
print(scores)

## 2. Wrapping through Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()
# Ranking vector of every RFE run, in the order the runs are executed.
# Deliberately not named `data`: that name holds the DataFrame read from the
# CSV, and the cell defining Y (`data['exp']`) would break on a re-run if it
# were overwritten with a list here.
rfe_rankings=[]

# Run RFE once per target size, from all features down to a single one.
for n_feat in range(len(feat),0,-1):
    rfe = RFE(model, n_features_to_select=n_feat)   # keep exactly n_feat features
    fit = rfe.fit(X, Y)

    rfe_rankings.append(fit.ranking_)

    # Tabulate feature name vs. RFE rank, lowest (best) rank first.
    scores=pd.DataFrame(feat,columns=['feat'])
    scores['scores']=fit.ranking_
    scores.sort_values(by='scores',ascending=True,inplace=True)

    print(f"Model with {n_feat} features")
    print(scores)
    print('\n')

In [None]:
# Cross-validated RFE: the number of features to keep is chosen by RFECV
# itself instead of being given up front.
rfe = RFECV(model)
fit = rfe.fit(X, Y)

# Feature name vs. rank, lowest rank first.
scores=(
    pd.DataFrame(feat,columns=['feat'])
    .assign(scores=fit.ranking_)
    .sort_values(by='scores',ascending=True)
)

print(f"Model with {fit.n_features_} features")
print(scores)

In [None]:
# Mean cross-validation score recorded by RFECV for each feature count tried.
fit.cv_results_['mean_test_score']

In [None]:
# Plot the mean CV score against the number of features, with the standard
# deviation of the score drawn as error bars.
cv_results=fit.cv_results_
mean_scores=cv_results['mean_test_score']
feature_counts=range(1,len(feat)+1)

plt.plot(feature_counts,mean_scores,marker='o')
plt.errorbar(
    x=cv_results["n_features"],
    y=cv_results["mean_test_score"],
    yerr=cv_results["std_test_score"],
    c='Tab:Blue',
)

plt.xlabel("# Features")
plt.ylabel("CV score")

plt.grid()