In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

import xgboost as xgb

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

%pylab inline

# Data import and exploration

In [None]:
# Load the dataset CSV, using its `id` column as the row index.
data = pd.read_csv('../input/wisconsin-diagnostic-breast-cancer-wdbc/data.csv', index_col='id')

In [None]:
data.head()

In [None]:
# Remove columns that hold no data at all (e.g. an empty trailing column created
# by a trailing delimiter in the CSV). Dropping `data.columns[-1]` by position is
# not idempotent: every re-run of the cell would strip one more real feature.
# NOTE(review): assumes the last column of the raw file is entirely empty —
# confirm against the `data.head()` output.
data = data.dropna(axis=1, how='all')

In [None]:
data.shape

In [None]:
data.describe()

In [None]:
data['diagnosis'].value_counts()

In [None]:
data.isnull().sum()

In [None]:
def plots(columns):
    """Draw a distribution, box and diagnosis-scatter plot for each feature.

    Builds one figure with three rows and one grid column per entry in
    `columns`:

    * row 0 - `sns.distplot` of the feature,
    * row 1 - `sns.boxplot` of the feature,
    * row 2 - `sns.scatterplot` of the feature against `data['diagnosis']`.

    Parameters
    ----------
    columns : iterable of str
        Column names of the module-level `data` frame to plot. Any number of
        columns is accepted; nothing is drawn for an empty iterable.

    Returns
    -------
    None
    """
    columns = list(columns)
    if not columns:
        # plt.subplots cannot build a grid with zero columns.
        return

    n_cols = len(columns)
    # Keep the 16 x 12 inch layout for three features and scale the width
    # proportionally for any other count. `squeeze=False` keeps `axs` 2-D even
    # when a single column is requested, so `axs[row, idx]` always works.
    fig, axs = plt.subplots(3, n_cols, figsize=(16 * n_cols / 3, 12), squeeze=False)
    for idx, col in enumerate(columns):
        sns.distplot(data[col], ax=axs[0, idx])
        sns.boxplot(y=col, data=data, palette='Accent', ax=axs[1, idx])
        sns.scatterplot(x=data[col], y=data['diagnosis'], ax=axs[2, idx])

## Radius

In [None]:
plots(['radius_mean', 'radius_se', 'radius_worst'])

## Texture

In [None]:
plots(['texture_mean', 'texture_se', 'texture_worst'])

## Smoothness

In [None]:
plots(['smoothness_mean', 'smoothness_se', 'smoothness_worst'])

## Compactness

In [None]:
plots(['compactness_mean', 'compactness_se', 'compactness_worst'])

## Concavity

In [None]:
plots(['concavity_mean', 'concavity_se', 'concavity_worst'])

## Concave points

In [None]:
plots(['concave points_mean', 'concave points_se', 'concave points_worst'])

## Symmetry

In [None]:
plots(['symmetry_mean', 'symmetry_se', 'symmetry_worst'])

## Fractal dimension

In [None]:
plots(['fractal_dimension_mean', 'fractal_dimension_se', 'fractal_dimension_worst'])

## Area worst

In [None]:
# Features plotted against `area_worst`: every `*_mean` and `*_worst` column.
cols = ['radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst']

# 10 x 2 grid: one panel per feature, coloured by diagnosis.
fig, axs = plt.subplots(10, 2, figsize=(18, 34))
# `axs.flat` walks the grid row by row, which is the same fill order as manual
# row/column counters, but without leaving counters named `x` and `y` in the
# notebook namespace (`y` is used for the target further down).
for feature, ax in zip(cols, axs.flat):
    sns.scatterplot(data=data, x=feature, y='area_worst', hue='diagnosis', ax=ax)

# Data preprocessing

In [None]:
# Select the target and the features by name rather than by position, so the
# split does not silently depend on `diagnosis` being the first column.
X = data.drop(columns='diagnosis')
y = data['diagnosis']

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [None]:
# Learn the per-feature min/max from the training split only, then apply that
# same scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model training and evaluation

In [None]:
# C is the inverse regularisation strength and must be strictly positive, so
# the sweep starts at 0.2 instead of 0 (C=0 makes every such fit fail).
# Rounding removes float noise such as 0.6000000000000001 from `best_params_`.
c_values = np.round(np.arange(0.2, 1.5, 0.2), 1)
max_iter_values = np.arange(100, 200, 10)

# One sub-grid per penalty, listing only the solvers that support it. A single
# full cross-product also contains unsupported pairs (e.g. l1 + lbfgs) whose
# fits fail and are silently scored as NaN because warnings are suppressed.
parameters = [
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        'max_iter': max_iter_values,
        'C': c_values,
    },
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': max_iter_values,
        'C': c_values,
    },
    {
        # C has no effect without a penalty, so it is not swept here.
        # NOTE(review): scikit-learn >= 1.2 spells this option `None` and later
        # releases reject the string 'none' — adjust to the installed version.
        'penalty': ['none'],
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'max_iter': max_iter_values,
    },
]
grid_cv = GridSearchCV(estimator=LogisticRegression(random_state=1), param_grid=parameters, cv=10)

In [None]:
%%time
grid_cv.fit(X_train_scaled, y_train)

In [None]:
print(grid_cv.best_score_)
print(grid_cv.best_params_)

In [None]:
print(classification_report(y_test, grid_cv.best_estimator_.predict(X_test_scaled)))

In [None]:
grid_cv.best_estimator_.score(X_test_scaled, y_test)

In [None]:
# Score of the malignant class ('M'). Look its column up through `classes_`
# instead of hard-coding index 1, so the scores always match `pos_label`.
best_model = grid_cv.best_estimator_
malignant_idx = list(best_model.classes_).index('M')
y_score = best_model.predict_proba(X_test_scaled)[:, malignant_idx]
fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label='M')
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], ls='dashed', color='black')
ax.set_title('Logistic regression ROC curve')
# FPR is 1 - specificity (it is not precision); TPR is recall.
ax.set_xlabel('FPR (1 - Specificity)')
ax.set_ylabel('TPR (Recall)')
plt.show()
print('Area under curve (AUC): ', roc_auc)