In [1]:
## Importing required libraries
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [4]:
import sys


In [22]:
## Other imports

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [23]:
## Reading the data
qsar = pd.read_csv('data/qsar_fish_toxicity2.csv', sep=';', header=None)
qsar.columns = ['CICO', 'SM1_Dz(Z)', 'GATS1i', 'NdsCH', 'NdssC', 'MLOGP', 'LC50(mol/L)']

## Looking into the data
df = qsar.copy()
df.head()

Unnamed: 0,CICO,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP,LC50(mol/L)
0,3.26,0.829,1.676,0,1,1.453,3.77
1,2.189,0.58,0.863,0,0,1.348,3.115
2,2.125,0.638,0.831,0,0,1.348,3.531
3,3.027,0.331,1.472,1,0,1.807,3.51
4,2.094,0.827,0.86,0,0,1.886,5.39


In [24]:
## Splitting the data into independent features and dependent features
X, y = df.drop('LC50(mol/L)', axis=1), df['LC50(mol/L)']

## Splitting the data into training data and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=34, stratify=X[['NdsCH']])

In [25]:
## Creating a pipeline for data preprocessing
preprocess_pipe = Pipeline(steps=[
    ('scaling', StandardScaler())
])

In [27]:
## Creating objects of the machine learning algorithms that we want to try

## model 1: Support Vector Regression
svr = SVR()

## model 2: Nearest Neighbors Regression
knr = KNeighborsRegressor()

## model 3: Decision Tree Regression
dtr = DecisionTreeRegressor(max_depth=4,random_state=123)

## model 4: Random Forest Regression
rfr = RandomForestRegressor()


## model 6: AdaBoost Regressor
abr = AdaBoostRegressor(random_state=123)

## model 7: Gradient Boosting Regressor
gbr = GradientBoostingRegressor()


## model 9: voting regressor of model 1, model 2 and model 4
vr = VotingRegressor(estimators = [('svr', svr), ('knr', knr), ('rfr', rfr), ('gbr', gbr)])


In [28]:
## svr pipeline
svr_pipe = Pipeline(steps=[
    ('preprocess_pipe',preprocess_pipe),
    ('svr_model',svr)
])

## knr pipeline
knr_pipe = Pipeline(steps=[
    ('preprocess_pipe',preprocess_pipe),
    ('knr_model',knr)
])

## dtr pipeline
dtr_pipe = Pipeline(steps=[
    ('preprocess_pipe',preprocess_pipe),
    ('dtr_model',dtr)
])

## rfr pipeline
rfr_pipe = Pipeline(steps=[
    ('preprocess_pipe',preprocess_pipe),
    ('rfr_model',rfr)
])


## abr pipeline
abr_pipe = Pipeline(steps=[
    ('preprocess_pipe',preprocess_pipe),
    ('abr_model', abr)
])

## gbr pipeline
gbr_pipe = Pipeline(steps=[
    ('preprocess_pipe',preprocess_pipe),
    ('gbr_model', gbr)
])

## vr pipeline
vr_pipe = Pipeline(steps=[
    ('preprocess_pipe', preprocess_pipe),
    ('vr_model', vr)
])




In [32]:
## Finding out the best model using the cross validation  

pipelines = [svr_pipe, knr_pipe, dtr_pipe, rfr_pipe,abr_pipe, gbr_pipe, vr_pipe]
models = ['SupportVectorRegressor', 'KneighborsRegressor', 'DecisionTreeRegressor', 'RandomForestRegressor',  'AdaboostRegressor', 'GradientBoostRegressor', 'VotingRegressor', ]
cv = KFold(n_splits=5)
for index, pipeline in enumerate(pipelines):
    cv_list = cross_val_score(estimator=pipeline, X=X, y=y,cv=cv,scoring='r2')
    mean = np.round(cv_list.mean(),3)
    std = np.round(cv_list.std(),3)
    print(f"The cross validation score for the model {models[index]} is {mean} +/- {std}.")

The cross validation score for the model SupportVectorRegressor is 0.614 +/- 0.065.
The cross validation score for the model KneighborsRegressor is 0.583 +/- 0.077.
The cross validation score for the model DecisionTreeRegressor is 0.445 +/- 0.108.
The cross validation score for the model RandomForestRegressor is 0.596 +/- 0.081.
The cross validation score for the model AdaboostRegressor is 0.485 +/- 0.071.
The cross validation score for the model GradientBoostRegressor is 0.589 +/- 0.097.
The cross validation score for the model VotingRegressor is 0.631 +/- 0.079.
