In [47]:
import pandas as pd
import numpy as np
from lazypredict.Supervised import LazyClassifier as lc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [48]:
XV = pd.read_csv("../Data/X-values.csv", index_col=[0])
X = XV.drop(["bioactivity_class","Name","canonical_smiles", "molecule_chembl_id"],axis=1)
yc = pd.read_csv("../Data/Y-class.csv", index_col=[0])
yv = pd.read_csv("../Data/Y-values.csv", index_col=[0])

# Importing the data

### **Setting Output Labels**

Now, we will set output labels from pIC50 values into three classes based on thresholds

In [49]:
o = []
for i in range(len(yv)):
    if (yv.iloc[i][0])>7.0: o.append("Extremely Potent Candidate")
    elif (yv.iloc[i][0])>6.0: o.append("Possible Candidate")
    else: o.append("Unsuitable Candidate")
y = pd.DataFrame(data = o, columns = ['Suitability'])

In [50]:
print(yv.head(2))

   pIC50
0   5.14
1   5.03


In [21]:
print(yc.head(2))

  bioactivity_class
0      intermediate
1      intermediate


In [22]:
print(y.head(2))

            Suitability
0  Unsuitable Candidate
1  Unsuitable Candidate


In [51]:
y.describe()

Unnamed: 0,Suitability
count,133
unique,3
top,Unsuitable Candidate
freq,118


In [38]:
X.head(2)

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,MW,LogP,NumHDonors,NumHAcceptors
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,281.27,1.89,0.0,5.0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,415.59,3.81,0.0,2.0


In [52]:
for i in range(len(y)):
    if (y.iloc[i][0]=="Possible Candidate"):
        print(XV.iloc[i][882],"Possible")
    elif (y.iloc[i][0]=="Extremely Potent Candidate"):
        print(XV.iloc[i][882],"Potent")

CHEMBL365134 Possible
CHEMBL190743 Possible
CHEMBL358279 Possible
CHEMBL212454 Possible
CHEMBL212218 Possible
CHEMBL222840 Potent
CHEMBL222769 Potent
CHEMBL222735 Possible
CHEMBL222628 Possible
CHEMBL222893 Potent
CHEMBL225515 Potent
CHEMBL222234 Potent
CHEMBL426898 Possible
CHEMBL187460 Possible
CHEMBL363535 Possible


### **Applying Classifier on PubChem Descriptors**

We will now apply the lazy CLassifier on the PubChem Descriptors

In [42]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93 entries, 13 to 119
Columns: 885 entries, PubchemFP0 to NumHAcceptors
dtypes: float64(4), int64(881)
memory usage: 643.7 KB


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=64)
# print(X_test," ",y_test)
m = lc(verbose=0,ignore_warnings=True, custom_metric=None)
models, predictions = m.fit(X_train, X_test, y_train, y_test)
models

100%|██████████| 29/29 [00:04<00:00,  6.66it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
QuadraticDiscriminantAnalysis,0.5,0.82,,0.6,0.11
RidgeClassifier,0.95,0.67,,0.93,0.07
KNeighborsClassifier,0.95,0.67,,0.93,0.07
BaggingClassifier,0.9,0.65,,0.9,0.11
DecisionTreeClassifier,0.72,0.59,,0.8,0.07
BernoulliNB,0.65,0.56,,0.75,0.07
NearestCentroid,0.57,0.53,,0.69,0.07
LabelPropagation,0.42,0.48,,0.55,0.07
LabelSpreading,0.42,0.48,,0.55,0.07
CalibratedClassifierCV,0.93,0.33,,0.89,1.26


### **Applying same Classifier on 2D Descriptors**

We will now apply the same lazy CLassifier on the 2D Descriptors

In [54]:
XV2 = pd.read_csv("../Data/Descriptors2D.csv", index_col=[0])
X_train2, X_test2, y_train, y_test = train_test_split(XV2, y, test_size=0.3, random_state=64)
# print(X_test," ",y_test)
m2 = lc(verbose=0, ignore_warnings=True, custom_metric=None)
models2, predictions2 = m.fit(X_train2, X_test2, y_train, y_test)
models2

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:03<00:00,  8.21it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BaggingClassifier,0.95,0.67,,0.93,0.11
KNeighborsClassifier,0.93,0.66,,0.91,0.06
LabelSpreading,0.7,0.58,,0.78,0.06
LabelPropagation,0.7,0.58,,0.78,0.06
QuadraticDiscriminantAnalysis,0.55,0.51,,0.66,0.09
LinearDiscriminantAnalysis,0.93,0.49,,0.93,0.11
SVC,0.93,0.33,,0.89,0.07
RidgeClassifierCV,0.93,0.33,,0.9,0.08
RidgeClassifier,0.93,0.33,,0.9,0.06
RandomForestClassifier,0.93,0.33,,0.9,0.3


In [56]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93 entries, 13 to 119
Columns: 885 entries, PubchemFP0 to NumHAcceptors
dtypes: float64(4), int64(881)
memory usage: 643.7 KB


In [55]:
X_train2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93 entries, CHEMBL196635 to CHEMBL227075
Columns: 780 entries, APC2D1_C_C to APC2D10_X_X
dtypes: float64(780)
memory usage: 567.4+ KB


In [62]:
X_train3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93 entries, CHEMBL196635 to CHEMBL227075
Columns: 307 entries, SubFP1 to SubFP307
dtypes: int64(307)
memory usage: 223.8+ KB


### **Applying same Classifier on SubStructure Descriptors**

We will now apply the same lazy CLassifier on the 2D Descriptors

In [61]:
XV3 = pd.read_csv("../Data/DescriptorsSS.csv", index_col=[0])
X_train3, X_test3, y_train, y_test = train_test_split(XV3, y, test_size=0.3, random_state=64)
# print(X_test," ",y_test)
m3 = lc(verbose=0, ignore_warnings=True, custom_metric=None)
models3, predictions3 = m.fit(X_train3, X_test3, y_train, y_test)
models3

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:02<00:00, 13.38it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
QuadraticDiscriminantAnalysis,0.75,0.75,,0.82,0.07
LGBMClassifier,0.95,0.67,,0.93,0.08
RandomForestClassifier,0.93,0.66,,0.91,0.29
ExtraTreesClassifier,0.9,0.65,,0.9,0.27
BaggingClassifier,0.85,0.63,,0.87,0.08
SGDClassifier,0.85,0.63,,0.87,0.02
RidgeClassifierCV,0.85,0.63,,0.87,0.05
ExtraTreeClassifier,0.82,0.62,,0.86,0.04
LogisticRegression,0.82,0.62,,0.86,0.08
AdaBoostClassifier,0.82,0.61,,0.86,0.22
