# Import libraries

In [None]:
!pip install rdkit

In [None]:
import rdkit, rdkit.Chem, rdkit.Chem.Draw
from rdkit.Chem import Descriptors
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import pairwise_distances_argmin_min
import matplotlib

# Read the database

In [None]:
!wget https://raw.githubusercontent.com/stefano-bosio/CTF_ML_MD/main/kinase-examples.smi
!wget https://raw.githubusercontent.com/stefano-bosio/CTF_ML_MD/main/lgic-examples.smi

In [None]:
# Parse the kinase set. The supplier yields None for any SMILES that RDKit
# cannot parse, and the descriptor functions used later fail on None, so those
# entries are dropped here.
suppl_kinases = rdkit.Chem.rdmolfiles.SmilesMolSupplier('kinase-examples.smi')
kinases = [mol for mol in suppl_kinases if mol is not None]

In [None]:
# Parse the lgic set. The supplier yields None for any SMILES that RDKit
# cannot parse, and the descriptor functions used later fail on None, so those
# entries are dropped here.
suppl_lgic = rdkit.Chem.rdmolfiles.SmilesMolSupplier('lgic-examples.smi')
lgic = [mol for mol in suppl_lgic if mol is not None]

In [None]:
features_kinases=pd.DataFrame()
for i,mol in enumerate(kinases):
    features_kinases.loc[i,'MolWt']=Descriptors.MolWt(mol)
    features_kinases.loc[i,'NumHAcceptors']=Descriptors.NumHAcceptors(mol)
    features_kinases.loc[i,'NumHDonors']=Descriptors.NumHDonors(mol)
    features_kinases.loc[i,'NumRotBonds']=Descriptors.NumRotatableBonds(mol)
    features_kinases.loc[i,'NumHeteroatoms']=Descriptors.NumHeteroatoms(mol)
    features_kinases.loc[i,'FractionCSP3']=Descriptors.FractionCSP3(mol)
    features_kinases.loc[i,'RingCount']=Descriptors.RingCount(mol)
    features_kinases.loc[i,'TPSA']=Descriptors.TPSA(mol)
    features_kinases.loc[i,'Stereocenters']=rdkit.Chem.rdMolDescriptors.CalcNumAtomStereoCenters(mol)
    features_kinases.loc[i,'Spiro']=rdkit.Chem.rdMolDescriptors.CalcNumSpiroAtoms(mol)
    features_kinases.loc[i,'NumArR']=Descriptors.NumAromaticRings(mol)
    features_kinases.loc[i,'NumAliR']=Descriptors.NumAliphaticRings(mol)
    features_kinases.loc[i,'set']=0

In [None]:
features_kinases

In [None]:
features_lgic=pd.DataFrame()
for i,mol in enumerate(lgic):
    features_lgic.loc[i,'MolWt']=Descriptors.MolWt(mol)
    features_lgic.loc[i,'NumHAcceptors']=Descriptors.NumHAcceptors(mol)
    features_lgic.loc[i,'NumHDonors']=Descriptors.NumHDonors(mol)
    features_lgic.loc[i,'NumRotBonds']=Descriptors.NumRotatableBonds(mol)
    features_lgic.loc[i,'NumHeteroatoms']=Descriptors.NumHeteroatoms(mol)
    features_lgic.loc[i,'FractionCSP3']=Descriptors.FractionCSP3(mol)
    features_lgic.loc[i,'RingCount']=Descriptors.RingCount(mol)
    features_lgic.loc[i,'TPSA']=Descriptors.TPSA(mol)
    features_lgic.loc[i,'Stereocenters']=rdkit.Chem.rdMolDescriptors.CalcNumAtomStereoCenters(mol)
    features_lgic.loc[i,'Spiro']=rdkit.Chem.rdMolDescriptors.CalcNumSpiroAtoms(mol)
    features_lgic.loc[i,'NumArR']=Descriptors.NumAromaticRings(mol)
    features_lgic.loc[i,'NumAliR']=Descriptors.NumAliphaticRings(mol)
    features_lgic.loc[i,'set']=1

In [None]:
features_lgic

In [None]:
features=pd.concat((features_kinases,features_lgic))

In [None]:
features

# Feature Selection

### Scaling of variables

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Min-max scale every column of `features` (the 'set' label column goes through
# the scaler together with the descriptors) and restore the column names,
# because fit_transform returns a bare array.
features_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(features),
    columns=features.columns,
)

## 1. Univariate Linear Filtering

In [None]:
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [None]:
array = features_scaled.values

In [None]:
X = array[:,0:12]
Y = array[:,12]
feat=features.columns

In [None]:
X.shape

In [None]:
Y.shape

In [None]:
test = SelectKBest(score_func=mutual_info_classif)
fit = test.fit(X, Y)

In [None]:
scores=pd.DataFrame(feat[:-1],columns=['feat'])
scores['scores']=fit.scores_
scores.sort_values(by='scores', ascending=False, inplace=True)
print(scores)

## 2. Wrapping through Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

In [None]:
# Recursive feature elimination with a logistic-regression estimator, repeated
# for every target size from "all descriptors" down to a single one.
model = LogisticRegression(solver='lbfgs', max_iter=5000)
data = []
descriptor_names = feat[:-1]   # the trailing 'set' column is the label

for n_feat in range(len(descriptor_names), 0, -1):
  rfe = RFE(model, n_features_to_select=n_feat)   # keep exactly n_feat features
  fit = rfe.fit(X, Y)

  # Rankings of every run are collected in `data`.
  data.append(fit.ranking_)

  scores = pd.DataFrame(descriptor_names, columns=['feat'])
  scores['scores'] = fit.ranking_
  scores = scores.sort_values(by='scores', ascending=True)

  print(f"Model with {n_feat} features")
  print(scores)

In [None]:
model = LogisticRegression(solver='lbfgs', max_iter=5000)

rfe = RFECV(model)   # seek for best number of features
fit = rfe.fit(X, Y)

scores=pd.DataFrame(feat[:-1],columns=['feat'])
scores['scores']=fit.ranking_
scores.sort_values(by='scores',ascending=True,inplace=True)

print(f"Model with {fit.n_features_} features")
print(scores)

In [None]:
plt.plot(range(1,len(feat)),fit.cv_results_['mean_test_score'],marker='o')
plt.xlabel("# Features")
plt.ylabel("CV score")

# Feature Extraction aka Dimensionality Reduction

## Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca=PCA()

In [None]:
low_dim=pca.fit_transform(X)

In [None]:
low_dim.shape

In [None]:
low_dim[:,0].shape

In [None]:
# One bar per principal component at integer positions 1..n.
# np.linspace(0, n, num=n) spreads n points over [0, n] with a step of
# n/(n-1), so the bars sat at non-integer x values that did not match the
# component numbers on the "Eigenvector #" axis.
component_ids = np.arange(1, len(pca.explained_variance_ratio_) + 1)
plt.bar(component_ids, pca.explained_variance_ratio_)
plt.xlabel("Eigenvector #")
plt.ylabel("Explained Variance Ratio")

In [None]:
# PC1/PC2 projection coloured by data set. Rows follow the concatenation
# order (kinases first, then lgic), so the split position is the number of
# kinase molecules. The former slices [:1999] / [2000:] hard-coded that size
# and silently dropped the row at index 1999.
n_kinases = len(kinases)
plt.figure(figsize=(8,8))
plt.scatter(low_dim[:n_kinases,0], low_dim[:n_kinases,1], label="kinases")
plt.scatter(low_dim[n_kinases:,0], low_dim[n_kinases:,1], label="lgic")
plt.legend()
plt.xlabel("PC1")
plt.ylabel("PC2")

In [None]:
# Import only what the notebook uses; a wildcard import hides where names come
# from and can shadow existing ones.
from sklearn.cluster import KMeans

In [None]:
kmeans=KMeans(n_clusters=8,random_state=8).fit(low_dim[:,:2])

In [None]:
np.unique(kmeans.labels_)

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(low_dim[:,0], low_dim[:,1],c=kmeans.labels_)
plt.xlabel("PC1")
plt.ylabel("PC2")

In [None]:
tot=kinases+lgic

In [None]:
len(tot)

In [None]:
closest_points= pairwise_distances_argmin_min(kmeans.cluster_centers_, low_dim[:,:2])
closest_points

In [None]:
low_dim[closest_points[0]].shape

In [None]:
low_dim[closest_points[0],0]

In [None]:
low_dim[closest_points[0],1]

In [None]:
it=0
plt.figure(figsize=(8,8))
plt.scatter(low_dim[:,0], low_dim[:,1],c=kmeans.labels_)
for i in closest_points[0]:
  it+=1
  plt.scatter(low_dim[i,0], low_dim[i,1],marker='o',s=100,edgecolor="black",color="red" )
  plt.annotate(it, (low_dim[i,0]+0.05, low_dim[i,1]+0.05), color="black", fontsize=15)

plt.xlabel("PC1")
plt.ylabel("PC2")

In [None]:
subset=[]
for i in closest_points[0]:
  subset.append(tot[i])

In [None]:
subset[0]

In [None]:
# Draw the representatives in a grid. The legends are numbered from the actual
# number of molecules instead of a hard-coded 1..8, so they stay in sync with
# the number of clusters.
img = rdkit.Chem.Draw.MolsToGridImage(
    subset,
    molsPerRow=4,
    subImgSize=(300, 300),
    legends=[str(n) for n in range(1, len(subset) + 1)],
)
img

## Non-linear dimensionality reduction methods

### 1. ISOMAP

In [None]:
from sklearn.manifold import Isomap

In [None]:
iso=Isomap(n_neighbors=12)

In [None]:
data_red=iso.fit_transform(X)

In [None]:
# Isomap embedding coloured by data set. Rows follow the concatenation order
# (kinases first, then lgic), so the split position is the number of kinase
# molecules. The former slices [:1999] / [2000:] hard-coded that size and
# silently dropped the row at index 1999.
n_kinases = len(kinases)
plt.figure(figsize=(8,8))
plt.scatter(data_red[:n_kinases,0], data_red[:n_kinases,1], label="kinases")
plt.scatter(data_red[n_kinases:,0], data_red[n_kinases:,1], label="lgic")
plt.legend()
plt.xlabel("ISOMAP1")
plt.ylabel("ISOMAP2")

In [None]:
kmeans=KMeans(n_clusters=8,random_state=8).fit(data_red[:,:2])

In [None]:
np.unique(kmeans.labels_)

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(data_red[:,0], data_red[:,1],c=kmeans.labels_)
plt.xlabel("ISOMAP1")
plt.ylabel("ISOMAP2")

In [None]:
closest_points= pairwise_distances_argmin_min(kmeans.cluster_centers_, data_red[:,:2])
closest_points

In [None]:
it=0
plt.figure(figsize=(8,8))
plt.scatter(data_red[:,0], data_red[:,1],c=kmeans.labels_)
for i in closest_points[0]:
  it+=1
  plt.scatter(data_red[i,0], data_red[i,1],marker='o',s=100,edgecolor="black",color="red" )
  plt.annotate(it, (data_red[i,0]+0.1, data_red[i,1]+0.1), color="black", fontsize=15)

plt.xlabel("ISOMAP1")
plt.ylabel("ISOMAP2")

In [None]:
subset=[]
for i in closest_points[0]:
  subset.append(tot[i])

In [None]:
subset[0]

In [None]:
# Draw the representatives in a grid. The legends are numbered from the actual
# number of molecules instead of a hard-coded 1..8, so they stay in sync with
# the number of clusters.
img = rdkit.Chem.Draw.MolsToGridImage(
    subset,
    molsPerRow=4,
    subImgSize=(300, 300),
    legends=[str(n) for n in range(1, len(subset) + 1)],
)
img

### 2. t-SNE

In [None]:
# Import only what the notebook uses; a wildcard import hides where names come
# from and can shadow existing ones.
from sklearn.manifold import TSNE

In [None]:
tsne=TSNE(perplexity=np.sqrt(len(X)))

In [None]:
data_red=tsne.fit_transform(X)

In [None]:
# t-SNE embedding coloured by data set. Rows follow the concatenation order
# (kinases first, then lgic), so the split position is the number of kinase
# molecules. The former slices [:1999] / [2000:] hard-coded that size and
# silently dropped the row at index 1999.
n_kinases = len(kinases)
plt.figure(figsize=(8,8))
plt.scatter(data_red[:n_kinases,0], data_red[:n_kinases,1], label="kinases")
plt.scatter(data_red[n_kinases:,0], data_red[n_kinases:,1], label="lgic")
plt.legend()
plt.xlabel("t-SNE1")
plt.ylabel("t-SNE2")

In [None]:
kmeans=KMeans(n_clusters=8,random_state=8).fit(data_red[:,:2])

In [None]:
np.unique(kmeans.labels_)

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(data_red[:,0], data_red[:,1],c=kmeans.labels_)
plt.xlabel("t-SNE1")
plt.ylabel("t-SNE2")

In [None]:
closest_points= pairwise_distances_argmin_min(kmeans.cluster_centers_, data_red[:,:2])
closest_points

In [None]:
it=0
plt.figure(figsize=(8,8))
plt.scatter(data_red[:,0], data_red[:,1],c=kmeans.labels_)
for i in closest_points[0]:
  it+=1
  plt.scatter(data_red[i,0], data_red[i,1],marker='o',s=100,edgecolor="black",color="red" )
  plt.annotate(it, (data_red[i,0]+0.5, data_red[i,1]+0.5), color="black", fontsize=15)

plt.xlabel("t-SNE1")
plt.ylabel("t-SNE2")

In [None]:
subset=[]
for i in closest_points[0]:
  subset.append(tot[i])

In [None]:
subset[0]

In [None]:
# Draw the representatives in a grid. The legends are numbered from the actual
# number of molecules instead of a hard-coded 1..8, so they stay in sync with
# the number of clusters.
img = rdkit.Chem.Draw.MolsToGridImage(
    subset,
    molsPerRow=4,
    subImgSize=(300, 300),
    legends=[str(n) for n in range(1, len(subset) + 1)],
)
img

In [None]:
distance=np.sqrt(np.square(data_red[:,0]-(data_red[closest_points[0][2],0]))+np.square(data_red[:,1]-(data_red[closest_points[0][2],1])))

In [None]:
nearest_points=np.argsort(distance,axis=0)[:50]

In [None]:
nearest_points

In [None]:
from rdkit import Chem

In [None]:
Chem.MolToSmiles(tot[0])

In [None]:
nearest_mols=[]
unique_smiles=[]
for i in nearest_points:
  if Chem.MolToSmiles(tot[i]) not in unique_smiles:
    nearest_mols.append(tot[i])
    unique_smiles.append(Chem.MolToSmiles(tot[i]))


In [None]:
img=rdkit.Chem.Draw.MolsToGridImage(nearest_mols,molsPerRow=3,subImgSize=(300,300))
img

# Classification

## 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import DecisionBoundaryDisplay

In [None]:
x_reduced=data_red[:,0]

In [None]:
x_reduced

In [None]:
Y

In [None]:
clr=LogisticRegression()

In [None]:
clr.fit(x_reduced.reshape(-1,1),Y)

In [None]:
clr.coef_

In [None]:
clr.intercept_

In [None]:
plt.scatter(x_reduced,Y,alpha=0.05,c=Y,cmap=matplotlib.colors.ListedColormap(["C0", "C1"]))
plt.xlabel("t-SNE1")
plt.ylabel("Class")

In [None]:
x=np.linspace([x_reduced.min()-1,x_reduced.max()+1],100)

In [None]:
response=np.exp(clr.intercept_+clr.coef_*x)/(np.exp(clr.intercept_+clr.coef_*x)+1)

In [None]:
plt.scatter(x_reduced,Y,alpha=0.05,c=Y,cmap=matplotlib.colors.ListedColormap(["C0", "C1"]))
plt.plot(x,response,'--',c='k',lw=2)
plt.xlabel("t-SNE1")
plt.ylabel("Class")

## 2. Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
lda=LinearDiscriminantAnalysis()

In [None]:
x_reduced=data_red[:,:2]

In [None]:
lda.fit(x_reduced,Y)

In [None]:
feature_1, feature_2 = np.meshgrid(
    np.linspace(x_reduced[:, 0].min(), x_reduced[:, 0].max()),
    np.linspace(x_reduced[:, 1].min(), x_reduced[:, 1].max()))

In [None]:
grid = np.vstack([feature_1.ravel(), feature_2.ravel()]).T

In [None]:
y_predict=np.reshape(lda.predict(grid),feature_1.shape)

In [None]:
display = DecisionBoundaryDisplay(xx0=feature_1, xx1=feature_2, response=y_predict)

In [None]:
display.plot(cmap=matplotlib.colors.ListedColormap(["C0", "C1"]))
display.ax_.scatter(x_reduced[:, 0], x_reduced[:, 1], c=Y, edgecolor="k",cmap=matplotlib.colors.ListedColormap(["C0", "C1"]))
plt.xlabel("t-SNE1")
plt.ylabel("t-SNE2")

## 3. Support Vector Machines

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:
svc=SVC(kernel='linear')
#svc=SVC(kernel='poly', degree=2)
#svc=SVC(kernel='poly', degree=3)
#svc=SVC(kernel='poly', degree=4)
#svc=SVC(kernel='sigmoid')
#svc=SVC(kernel='rbf')

In [None]:
svc.fit(x_reduced,Y)

In [None]:
y_predict=np.reshape(svc.predict(grid),feature_1.shape)

In [None]:
display = DecisionBoundaryDisplay(xx0=feature_1, xx1=feature_2, response=y_predict)

In [None]:
display.plot(cmap=matplotlib.colors.ListedColormap(["C0", "C1"]))
display.ax_.scatter(x_reduced[:, 0], x_reduced[:, 1], c=Y, edgecolor="k",cmap=matplotlib.colors.ListedColormap(["C0", "C1"]))
plt.xlabel("t-SNE1")
plt.ylabel("t-SNE2")