In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, r2_score
from sklearn.gaussian_process.kernels import RBF, Matern

In [2]:
from src.config import *
from src.utils import per_error
from src.load_dataset import load_dataset
from src.load_models import select_model

In [3]:
# Turn on matplotlib integration; with no backend argument IPython selects its default GUI backend
# (the cell output reports which backend was chosen).
%matplotlib
# Load the dataset via the project helper; the four returned objects are unpacked as
# train/test features (X_*) and train/test targets (y_*).
X_train, X_test, y_train, y_test = load_dataset()

Using matplotlib backend: <object object at 0x15705f260>
######Data Distribution:#########
Training {0: 23, 8: 25, 16: 21}
Testing {0: 15, 8: 16, 16: 15}
#################################


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={"PH": 'univariate, max(S)', 'signal_std':'univariate, std(S)', 'signal_mean':'univariate, mean(S)', 'peak area':'univariate, area(S)', \


In [None]:
# NOTE(review): this cell was never executed (In [None]) and repeats the dataset-loading
# cell In [3] verbatim. Running it re-binds X_train / X_test / y_train / y_test.
# Candidate for removal -- confirm nothing relies on it.
%matplotlib
# Load Training Dataset
X_train, X_test, y_train, y_test = load_dataset()

In [4]:
# Number of feature columns in the training frame
len(X_train.columns)

13

In [5]:
# Custom percentage-error metric. NOTE: this definition shadows the `per_error`
# imported from src.utils earlier in the notebook.
def per_error(y_test, y_pred, y_LOD=1.6193237802284837)->float:
    """Mean percentage error that scores zero and non-zero targets differently.

    Predictions are clipped at zero first.  Samples whose target is non-zero
    are scored with the symmetric relative error
    ``|y - p| / (0.5 * (y + p))``.  Samples whose target is exactly zero are
    scored with ``|y - p| / y_LOD``, since a relative error is undefined when
    the target is zero.  The result is the mean over all samples times 100.

    Parameters
    ----------
    y_test : array-like
        True target values (list, NumPy array, pandas Series, column vector).
    y_pred : array-like
        Predicted values; must hold as many elements as ``y_test``.
    y_LOD : float, optional
        Normalising constant for the zero-target samples.  The default is the
        value that used to be hard-coded in the function body
        (presumably a limit of detection -- TODO confirm).

    Returns
    -------
    float
        Mean percentage error, or ``nan`` when there are no samples.

    Raises
    ------
    ValueError
        If ``y_test`` and ``y_pred`` do not contain the same number of elements.
    """
    # Work on flat float arrays so lists, Series and (n, 1) columns all behave
    # the same and are compared positionally.
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_hat = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"y_test and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_hat.size}"
        )

    # Negative predictions are treated as zero.
    y_hat = np.maximum(y_hat, 0.0)

    nonzero = y_true != 0      # non-zero concentration
    zero = ~nonzero            # zero concentration

    # Symmetric relative error, only for the non-zero targets.
    nonzero_err = np.abs(y_true[nonzero] - y_hat[nonzero]) / (
        0.5 * (y_true[nonzero] + y_hat[nonzero])
    )

    # Absolute error scaled by y_LOD for the zero targets.
    zero_err = np.abs(y_true[zero] - y_hat[zero]) / y_LOD

    errors = np.concatenate((nonzero_err, zero_err))
    if errors.size == 0:
        # Avoid NumPy's "mean of empty slice" warning; the answer is still nan.
        return float("nan")

    return float(np.mean(errors) * 100)
    


In [6]:
# Model names to tune (resolved by select_model) and the metric used for model selection
models = ['SVM', 'RF', 'KNN', 'GP', 'Ridge', 'Lasso']
metric = 'per_error'

# Hyper-parameter search space, keyed by model name
param_grids = {
    'SVM': {
        'C': [0.1, 1, 10, 100, 200],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0005, 0.0001],
        'kernel': ['rbf'],
    },
    'RF': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
    'GP': {
        'kernel': [
            1.0 * RBF(length_scale=1.0),
            1.0 * RBF(length_scale=0.5),
            1.0 * Matern(length_scale=1.0, nu=1.5),
        ],
    },
    'Ridge': {'alpha': [0.001, 0.01, 0.1, 1.0, 1.5, 2.0]},
    'Lasso': {'alpha': [0.001, 0.01, 0.1, 1.0, 1.5, 2.0]},
}

# R2 is a score to maximise; the percentage error is a loss to minimise, so
# make_scorer negates it (greater_is_better=False) to keep "higher is better".
if metric == 'r2':
    scorer = make_scorer(r2_score, greater_is_better=True)
else:
    scorer = make_scorer(per_error, greater_is_better=False)

for model_name in models:
    print(model_name)

    # Base estimator for this model name, plus its search space
    estimator = select_model(model_name)
    param_grid = param_grids[model_name]

    # Exhaustive 5-fold cross-validated search over the grid, using all cores
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=scorer,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # best_score_ is expressed in scorer units, i.e. it is the NEGATED
    # percentage error whenever metric is not 'r2'.
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Score: {grid_search.best_score_}")

    # With GridSearchCV's default refit, best_estimator_ has already been
    # refit on the whole training set, so it can predict on the test set directly.
    best_svc = grid_search.best_estimator_
    y_pred = best_svc.predict(X_test)

    # Held-out test metrics, in their natural (un-negated) units
    print("R2 Score",r2_score(y_test, y_pred))
    print("% error", per_error(y_test, y_pred))

    print("******************************************")
    

SVM
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Best Score: -19.094092404741474
R2 Score 0.858670347025452
% error 22.202404478530152
******************************************
RF
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: -10.437568176781735
R2 Score 0.8560558823529412
% error 17.05027931822751
******************************************
KNN
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'uniform'}
Best Score: -12.015563001447026
R2 Score 0.8565426170468187
% error 16.39885099123309
******************************************
GP
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
Best Score: -18.154146633552024
R2 Score 0

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [None]:
# Build the 'SVM' estimator through the project helper and display its parameters.
# NOTE(review): this re-binds `estimator`, the same name the tuning loop assigns on each iteration.
estimator = select_model('SVM')
estimator.get_params()