## Step 0: Setup

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

## Step 1: Load Dataset

In [2]:
# Load the Iris dataset; the result is a dictionary-like object whose keys are listed in the next cell.
iris = load_iris()

In [3]:
# List the fields exposed by the loaded dataset, one per line.
print(*iris, sep="\n")

data
target
frame
target_names
DESCR
feature_names
filename
data_module


In [4]:
# Feature matrix and class labels that are later passed to `grid.fit`.
X, y = iris.data, iris.target

In [7]:
# View the features and the target side by side as a pandas DataFrame.
# The frame is built in one expression with `assign`, so no already-created
# object is modified after the fact and the cell stays safe to re-run.
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Step 2: Initialise the classification model

In [8]:
# Base estimator for the grid search; its C and solver values are supplied by the search grid defined below.
# max_iter=1000 sets the solver's iteration cap — presumably to avoid convergence warnings (TODO confirm).
model = LogisticRegression(max_iter = 1000)

## Step 3: Hyperparameter tuning setup using GridSearchCV

In [9]:
# Hyperparameter grid to search: every combination of the values below is evaluated.
param_grid = dict(
    # C controls the regularization strength of logistic regression
    # (in scikit-learn it is the inverse strength: smaller C = stronger regularization).
    C=[0.01, 0.1, 1, 10],
    # solver is the optimization algorithm, i.e. the strategy scikit-learn uses to
    # minimize the log loss and find the best model parameters (the weights).
    # NOTE(review): 'liblinear' only supports one-vs-rest for multiclass targets and
    # newer scikit-learn releases deprecate that usage — confirm against the installed version.
    solver=["lbfgs", "liblinear"],
)

In [13]:
# Exhaustive grid search, each candidate scored with 5-fold cross-validation.
grid = GridSearchCV(
    model,               # estimator whose hyperparameters are tuned
    param_grid,          # candidate values for C and solver
    scoring="accuracy",  # metric used to compare candidates
    cv=5,                # number of cross-validation folds
)

In [11]:
# Echo the search configuration (estimator, grid, cv, scoring) as plain text.
print(grid)

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=1000),
             param_grid={'C': [0.01, 0.1, 1, 10],
                         'solver': ['lbfgs', 'liblinear']},
             scoring='accuracy')


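### Fitting the model
Calling `fit` on the grid search also performs the hyperparameter tuning: every parameter combination in the grid is evaluated with cross-validation.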

In [14]:
# Run the search on the full dataset: each C/solver combination from param_grid is scored with 5-fold CV.
grid.fit(X, y)

## Step 4: Results

In [15]:
# Printing the fitted search object shows only its configuration (same text as before fitting);
# the actual results are read from best_params_ and best_score_ in the following cells.
print(grid)

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=1000),
             param_grid={'C': [0.01, 0.1, 1, 10],
                         'solver': ['lbfgs', 'liblinear']},
             scoring='accuracy')


In [18]:
# Parameter combination that achieved the highest mean cross-validation score.
best_params = grid.best_params_
print("Best Hyperparameter value : ", best_params)

Best Hyperparameter value :  {'C': 1, 'solver': 'lbfgs'}


In [26]:
# Mean cross-validated accuracy obtained by the best parameter combination.
best_score = grid.best_score_
print("Best Mean Cross validation Score : ", best_score)

Best Mean Cross validation Score :  0.9733333333333334


i.e. with `C = 1` and `solver = 'lbfgs'`, the mean 5-fold cross-validation accuracy is the highest (≈ 0.973).