In [2]:
###############################################################
##########  Hyperparameter Tuning with GridSearchCV ###########
###############################################################
#We'll build a lasso regression model with optimal hyperparameters 
#to predict blood glucose levels using the features in the diabetes_df dataset.

import pandas as pd
import numpy as np

diabetes_df = pd.read_csv("diabetes_clean.csv")
diabetes_df


Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [19]:
#Lasso is a linear Regression model 

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold

X = diabetes_df.drop("glucose", axis=1).values
y = diabetes_df["glucose"].values

# Split the data into training and test sets, 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

# Instantiate the model
lasso = Lasso()

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Set up the parameter grid
param_grid = {"alpha": np.linspace(0.00001, 1, 20)}

# Instantiate lasso_cv
lasso_cv = GridSearchCV(lasso, param_grid, cv=kf)

# Fit to the training data
lasso_cv.fit(X_train, y_train)

print("Tuned lasso paramaters: {}".format(lasso_cv.best_params_))
print("Tuned lasso score: {}".format(lasso_cv.best_score_))

#Unfortunately, the best model only has an R-squared score of 0.33, 
#highlighting that using the optimal hyperparameters does not guarantee a high performing model!


Tuned lasso paramaters: {'alpha': 1e-05}
Tuned lasso score: 0.3307880723812198


In [32]:
###############################################################
#####   Hyperparameter tuning with RandomizedSearchCV  ########
###############################################################
import warnings
warnings.filterwarnings('ignore')
#GridSearchCV can be computationally expensive, 
#especially if we are searching over a large hyperparameter space. 

#In this case, we can use RandomizedSearchCV, which tests a fixed number of hyperparameter 
#settings from specified probability distributions
#You will define a range of hyperparameters and use RandomizedSearchCV, 
#to look for optimal hyperparameters from these options.


#We'll used a logistic regression model

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, KFold

X = diabetes_df.drop("diabetes", axis=1).values
y = diabetes_df["diabetes"].values

# Split the data into training and test sets, 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

# Instantiate the model
logreg = LogisticRegression()

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create the parameter space
params = {"penalty": ["l1", "l2"],
         "tol": np.linspace(0.0001, 1.0, 50),
         "C": np.linspace(0.1, 1.0, 50),
         "class_weight": ["balanced", {0:0.8, 1:0.2}]}

# Instantiate the RandomizedSearchCV object
logreg_cv = RandomizedSearchCV(logreg, params, cv=kf)

# Fit the data to the model
logreg_cv.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Best Accuracy Score: {}".format(logreg_cv.best_score_))

#Even without trying every combination of hyperparameters, 
#the model has an accuracy of over 70% on the test set! 


Tuned Logistic Regression Parameters: {'tol': 0.2041612244897959, 'penalty': 'l2', 'class_weight': 'balanced', 'C': 0.1}
Tuned Logistic Regression Best Accuracy Score: 0.7460082633613221
