In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Display all the columns of a dataframe in the notebook (None = no limit).
# `pd.set_option` is the documented entry point; the `pd.pandas` attribute
# used previously is not part of the public pandas API.
pd.set_option('display.max_columns', None)

In [None]:
# Location of the dataset CSV inside the Kaggle input directory.
filePath = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filePath)
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the 'id' and 'Unnamed: 32' columns.  Re-binding `df` with
# errors='ignore' (rather than dropping in place) keeps this cell re-runnable:
# a second run no longer raises KeyError because the columns are already gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Feature matrix: every remaining column except the target.
X = df.drop(columns=['diagnosis'])

# Encode the diagnosis labels as integer class codes.  The fitted encoder is
# kept so the codes can be mapped back with lbl_encoder.inverse_transform().
lbl_encoder = LabelEncoder()
y = pd.Series(lbl_encoder.fit_transform(df['diagnosis']))

X.shape, y.shape

In [None]:
# Summarise the feature columns: dtypes, non-null counts and memory usage.
X.info()

# Feature Selection

In [None]:
# Check for constant / low-variance features.
#
# VarianceThreshold keeps only the columns whose variance is above the
# threshold, so 0.001 is a cut-off on variance in each feature's own units,
# not a share of identical values.
# NOTE(review): X is not scaled before this step, so a feature with small
# magnitudes can fall below 0.001 even though it varies — confirm the cut-off.
const_features = VarianceThreshold(threshold=0.001).fit(X)

# Number of features that pass the variance filter.
const_features.get_support().sum()

In [None]:
# Column names kept by the variance filter; get_support() returns the boolean mask.
non_const_features = X.columns[const_features.get_support()]

In [None]:
# Check for Duplicate Features
#
# Transpose so that every feature becomes a row; DataFrame.duplicated() then
# flags features whose values exactly repeat an earlier feature.
# (.T already returns a DataFrame, so no re-wrapping is needed.)
x_transpose = X[non_const_features].T

# Show the transposed shape together with the duplicate count.  A bare
# expression in the middle of a cell is evaluated but never displayed.
x_transpose.shape, x_transpose.duplicated().sum()

In [None]:
# Feature selection based on a Random Forest classifier.
#
# Every feature is scored on its own: a forest is trained on that single
# column and its ROC AUC is measured on rows it was NOT trained on.
#   * Scoring on the training rows mostly measures how well the forest
#     memorised them, which inflates every feature's score.
#   * ROC AUC ranks continuous scores, so it is computed from the predicted
#     probability of the positive class rather than from hard class labels.
# Held-out scores are lower than in-sample ones, so any cut-off applied to
# `roc_score` afterwards should be reviewed against the new values.

# One stratified split, reused for every feature so the scores are comparable.
X_fs_train, X_fs_valid, y_fs_train, y_fs_valid = train_test_split(
    X[non_const_features], y, test_size=0.3, random_state=0, stratify=y
)

roc_score = []  # one ROC AUC per feature, in the order of non_const_features
for feature in non_const_features:
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_fs_train[[feature]], y_fs_train)
    # Column 1 of predict_proba is the probability of clf.classes_[1].
    y_score = clf.predict_proba(X_fs_valid[[feature]])[:, 1]
    roc_score.append(roc_auc_score(y_fs_valid, y_score))

In [None]:
# Turn the per-feature scores into a table indexed by feature name, best first.
#
# `roc_score` arrives as a plain list in column order.  The feature names are
# attached only while it is still that list: if this cell is re-run,
# `roc_score` is already a sorted DataFrame, and assigning the unsorted column
# names as its index again would label the scores with the wrong features.
if not isinstance(roc_score, pd.DataFrame):
    roc_score = pd.DataFrame(
        {'ROC_score': roc_score},
        index=X[non_const_features].columns,
    )

roc_score = roc_score.sort_values(by='ROC_score', ascending=False)
roc_score

In [None]:
# Keep the features whose single-feature ROC score is above 0.95.
roc_feature = X[roc_score.loc[roc_score['ROC_score'] > 0.95].index].columns

# How many features were selected.
len(roc_feature)

## Selected Features

In [None]:
# Display the names of the selected features.
roc_feature

In [None]:
# Preview the selected feature columns; head() avoids rendering the whole frame.
X[roc_feature].head()

# Model Building

In [None]:
# Hold out 30% of the rows as a test set; random_state fixes the shuffle.
# NOTE(review): this splits the full X, not X[roc_feature] — confirm whether
# the features selected earlier were meant to be used for modelling.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [None]:
class DummyEstimator(BaseEstimator):
    """Placeholder estimator used as the final step of a Pipeline.

    It learns nothing.  It exists so that a parameter grid can replace the
    step with a real estimator, which means its methods only need to accept
    the arguments scikit-learn passes: ``fit(X, y)`` and ``score(X, y)``.
    All arguments are optional so that calls without data keep working.
    """

    def fit(self, X=None, y=None, **fit_params):
        """Ignore the training data and return ``self``, as scikit-learn estimators do."""
        return self

    def score(self, X=None, y=None):
        """Ignore the evaluation data; a placeholder has no score, so return ``None``."""
        return None

In [None]:
# Pipeline shared by every candidate model.
#
# The scaler sits inside the pipeline, so it is re-fitted on the training part
# of each cross-validation fold.  Standardising matters here: the 'sag' and
# 'saga' solvers and SGDClassifier are gradient-based and converge poorly when
# features are on very different scales.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # Placeholder; each grid below replaces it through its 'estimator' entry.
    ('estimator', DummyEstimator()),
])

# Search space for logistic regression.
lr = {
    'estimator': [LogisticRegression()],
    'estimator__penalty': ['l2'],
    'estimator__C': np.arange(1, 100, 10),
    'estimator__random_state': [0],
    'estimator__solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
    'estimator__max_iter': np.arange(100, 1000, 100),
    'estimator__n_jobs': [-1],
}

# Search space for the SGD classifier.
sgd = {
    'estimator': [SGDClassifier()],
    'estimator__penalty': ['l1', 'l2', 'elasticnet'],
    # Regularisation strengths 1, 0.1, 0.01, 0.001, 0.0001.
    'estimator__alpha': [10 ** -exp for exp in range(5)],
    'estimator__random_state': [0],
    'estimator__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'estimator__eta0': np.arange(1, 100, 20),
    'estimator__max_iter': np.arange(500, 10000, 500),
    'estimator__n_jobs': [-1],
}

# Grids to search, one per candidate model.
est = [lr, sgd]

In [None]:
# Run one grid search per candidate model, then report the best
# cross-validated score and the score on the held-out test set.
estimators = ['Logistic Regression', 'SGD Classifier']
for label, param_grid in zip(estimators, est):
    models = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        n_jobs=-1,
        cv=5,
        verbose=0,
    )
    models.fit(X_train, Y_train)

    print("----------------" + label + "----------------")
    print("Best score : ", models.best_score_)
    score = models.score(X_test, Y_test)
    print("Test score", score)