# 1. Import libraries

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyRegressor



In [2]:
df = pd.read_csv('cyp1a2_06_bioactivity_data_3class_pIC50_pubchem_fp_dataset.csv')
df

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.85
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.75
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.45
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.90
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12763,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.52
12764,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.79
12765,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.30
12766,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.70


# 2. Load the data set

In [3]:
X = df.drop('pIC50', axis=1)
Y = df.pIC50

In [4]:
X.shape

(12768, 881)

In [10]:
Y.shape

(12768,)

# 3. Data pre-processing

In [5]:
# Remove low variance features
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))    
X = selection.fit_transform(X)
X.shape

(12768, 156)

In [6]:
# Perform data splitting using 80/20 ratio
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [11]:
X_train.shape, X_test.shape

((10214, 156), (2554, 156))

In [12]:
Y_train.shape, Y_test.shape

((10214,), (2554,))

# 4. Compare ML algorithms

In [13]:
# Defines and builds the lazyclassifier
clf = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
train, test = clf.fit(X_train, X_test, Y_train, Y_test)

100%|██████████| 42/42 [02:16<00:00,  3.25s/it]


In [14]:
train

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,-0.06,0.0,0.75,0.27
BayesianRidge,-0.06,0.0,0.75,0.19
TweedieRegressor,-0.06,0.0,0.75,0.14
GeneralizedLinearRegressor,-0.06,0.0,0.75,0.1
LassoLarsIC,-0.06,0.0,0.75,0.15
LassoCV,-0.06,0.0,0.75,3.76
ElasticNetCV,-0.06,0.0,0.75,4.62
LassoLarsCV,-0.06,0.0,0.75,0.43
LarsCV,-0.06,0.0,0.75,0.62
OrthogonalMatchingPursuitCV,-0.06,0.0,0.75,0.28


In [15]:
test

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,-0.06,0.0,0.75,0.27
BayesianRidge,-0.06,0.0,0.75,0.19
TweedieRegressor,-0.06,0.0,0.75,0.14
GeneralizedLinearRegressor,-0.06,0.0,0.75,0.1
LassoLarsIC,-0.06,0.0,0.75,0.15
LassoCV,-0.06,0.0,0.75,3.76
ElasticNetCV,-0.06,0.0,0.75,4.62
LassoLarsCV,-0.06,0.0,0.75,0.43
LarsCV,-0.06,0.0,0.75,0.62
OrthogonalMatchingPursuitCV,-0.06,0.0,0.75,0.28


In [7]:
# Defines and builds the lazyclassifier
clf = LazyRegressor (verbose = 0, ignore_warnings = True, custom_metric = None)
models_train, predictions_train = clf.fit (X_train, X_train, Y_train, Y_train)
models_test, predictions_test = clf.fit (X_train, X_test, Y_train, Y_test)

100%|██████████| 42/42 [03:01<00:00,  4.31s/it]
100%|██████████| 42/42 [00:00<00:00, 583313.80it/s]


In [8]:
# Performance table of the training set (80% subset)
predictions_train

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.84,0.84,0.3,16.15
DecisionTreeRegressor,0.84,0.84,0.3,0.29
ExtraTreeRegressor,0.84,0.84,0.3,0.29
GaussianProcessRegressor,0.84,0.84,0.3,26.85
RandomForestRegressor,0.73,0.73,0.39,11.37
BaggingRegressor,0.68,0.68,0.42,1.31
MLPRegressor,0.52,0.53,0.51,12.18
XGBRegressor,0.51,0.52,0.52,1.14
LGBMRegressor,0.2,0.22,0.66,0.33
KNeighborsRegressor,0.2,0.21,0.67,23.11


In [9]:
# Performance table of the test set (20% subset)
predictions_test

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
