In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the dataset from the Kaggle input directory into a DataFrame.
csv_path = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to get a feel for the columns.
data.head(n=5)

In [None]:
# Feature matrix: everything except the identifier, the target, and the
# 'Unnamed: 32' column (presumably an empty artifact column -- verify against the CSV).
non_feature_cols = ['id', 'diagnosis', 'Unnamed: 32']
X = data.drop(columns=non_feature_cols)

# Target vector: the diagnosis label of each row.
y = data['diagnosis']

In [None]:
# Count missing values per feature column before modelling;
# LogisticRegression does not accept NaN inputs, so every count should be 0.
X.isnull().sum()

In [None]:
# Sanity check: features and target must have the same number of rows.
(X.shape, y.shape)

# Build Logistic Regression
Let's first build a baseline Logistic Regression model.

In [None]:
# Baseline model. max_iter is raised well above the library default so the
# solver has room to converge (presumably the default did not -- verify).
lr = LogisticRegression(max_iter=5_000)

In [None]:
# Train the baseline model on the full dataset.
lr.fit(X, y)

# Check Accuracy

In [None]:
# Mean accuracy of the baseline model. X, y are the same data the model was
# fitted on, so this is a training accuracy rather than a held-out estimate.
lr.score(X, y)

# Build Logistic Regression with Hyperparameter Tuning
Now let's build the Logistic Regression model with tuned hyperparameters; we will use GridSearchCV to achieve this.

Defining the hyper-parameters.

The details on these parameters can be checked from https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html 

In [None]:
# Hyper-parameter grid for GridSearchCV, written as one sub-grid per penalty type.
# LogisticRegression only supports certain solver/penalty pairs; a single
# cross-product of all penalties and all solvers contains unsupported pairs
# (e.g. 'lbfgs' with 'l1'), whose fits fail and only produce NaN scores.
c_values = np.logspace(-4, 4, 20)        # Inverse of regularization strength; must be a positive float.
max_iter_values = [100, 1000, 2500, 5000]  # Maximum number of iterations taken for the solvers to converge.

params = [
    {   # L2 penalty: supported by every solver listed here.
        'penalty': ['l2'],
        'C': c_values,
        'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
        'max_iter': max_iter_values,
    },
    {   # L1 penalty: only 'liblinear' and 'saga' support it.
        'penalty': ['l1'],
        'C': c_values,
        'solver': ['liblinear', 'saga'],
        'max_iter': max_iter_values,
    },
    {   # Elastic net: only 'saga' supports it, and it requires an explicit l1_ratio.
        'penalty': ['elasticnet'],
        'l1_ratio': [0.5],
        'C': c_values,
        'solver': ['saga'],
        'max_iter': max_iter_values,
    },
    {   # No penalty: not supported by 'liblinear'. C is ignored without a penalty,
        # so it is not searched. Recent scikit-learn spells this None; older
        # releases (before 1.2) used the string 'none' instead.
        'penalty': [None],
        'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
        'max_iter': max_iter_values,
    },
]

# There are many other parameters that we could use, but for now we will start with these.

As we will be using GridSearchCV we have to import it first.

For more details on GridSearchCV, refer to https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html?highlight=gridsearchcv#sklearn.model_selection.GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over `params`, starting from the baseline estimator `lr`.
clf = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring='accuracy',  # metric used to rank the parameter combinations
    cv=3,                # cross-validation splitting strategy: 3 folds
    verbose=True,        # verbosity control: report progress while the search runs
    n_jobs=-1,           # number of jobs to run in parallel; -1 means use all processors
)

In [None]:
# Run the grid search on the full dataset and keep a handle to the fitted search.
clf_fit = clf.fit(X, y)

In [None]:
# Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data.
clf_fit.best_estimator_

In [None]:
# Score on the given data. The metric is the one passed as `scoring` when
# provided; otherwise the best_estimator_.score method is used.
# X, y here are the same data the search was fitted on.
clf_fit.score(X, y)

In [None]:
# Mean cross-validated score of the best_estimator
# Mean cross-validated score obtained by the best_estimator_.
clf_fit.best_score_

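So we have seen that the plain Logistic Regression scored ~95%, while the model tuned with the hyper-parameter search scored ~98%. Keep in mind that `lr.score(X, y)` and `clf_fit.score(X, y)` are both computed on the same data the models were fitted on, so they are training accuracies; `clf_fit.best_score_` is the cross-validated figure.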