In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import tarfile
import urllib


# Print the full path of every file found under the Kaggle input directory.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries:

## Libraries for EDA & Feature-Selection:

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
!pip install seaborn --upgrade
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
!pip install swiftviz
import swiftviz as sv
!pip install pandas_profiling
import pandas_profiling

# Feature Selection
!pip install ppscore
import ppscore as pps

In [None]:
# Show which seaborn version is actually imported (an upgrade is pip-installed in the import cell).
sns.__version__

## Libraries for Preprocessing & Model-Selection:

In [None]:
# Library for pre-processing:
from sklearn.preprocessing import StandardScaler

# Library for Dimensionality-Reduction:
from sklearn.decomposition import PCA

# Libraries for modelling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
!pip install catboost
from catboost import CatBoostClassifier


# Model Selection:
from sklearn.pipeline import Pipeline as Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone

# Libraries for model evaluation
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_predict

# --CLASSIFICATION:
from sklearn import metrics

# Library for plotting confusion matrix
from mlxtend.plotting import plot_confusion_matrix

# Miscellaneous libraries
from IPython.display import display

## Loading Dataset:

In [None]:
# Read the CSV from the Kaggle input directory and preview its first rows.
bc_df = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
bc_df.head()

In [None]:
# List the column names of the loaded frame.
bc_df.columns

In [None]:
# Drop the two columns that are not used as features.
# Reassigning (rather than `inplace=True`) combined with `errors='ignore'` keeps the
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
bc_df = bc_df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

For each measurement, the data contains its mean, its standard error, and its "worst" value (the mean of the three largest values). From this we can already expect many features to be strongly correlated with one another. But first, let's check for null values.

In [None]:
# Print each column's dtype and non-null count.
bc_df.info()

### No Null-values & every column has the correct data-type. All good!

In [None]:
# Summary statistics from describe(), transposed so that every described column becomes a row.
report = bc_df.describe().transpose()
report

In [None]:
# The label column is kept in its own list; the feature names are the row labels
# of the summary table built from describe().
target = ['diagnosis']
features = np.array(report.index)
features

# Data Exploration:

#### Setting default custom palette:

In [None]:
# Create a swiftviz palette helper, preview the 'Dark2' palette and select it as default.
# NOTE(review): the effect of these swiftviz calls is not visible in this notebook;
# presumably set_default_custom_palette changes the palette used by later seaborn
# plots — confirm against the swiftviz source.
cust_palette = sv.CustomPalette()
cust_palette.display_palette('Dark2')
cust_palette.set_default_custom_palette('Dark2')

In [None]:
# Class balance: print the number of rows per diagnosis label, then draw it as a count plot.
diagnosis_col = bc_df["diagnosis"]
print(diagnosis_col.value_counts())
sns.countplot(x=diagnosis_col);

In [None]:
# NOTE(review): row_col_merge is a swiftviz helper whose return value is not shown here.
# Later cells zip `axis` with feature names and index each item as [0] (row) and
# [1] (column) of a 2x5 subplot grid, so it presumably yields the grid positions — confirm.
axis = sv.Plotter.row_col_merge(rows=2, cols=5)

### Distributions of Means:

In [None]:
# Histograms of the first ten features, coloured by diagnosis, laid out on a 2x5 grid.
cust_palette.set_default_custom_palette('dark')
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(30, 10))
axis_var_list = list(zip(axis, features[:10]))

for grid_pos, feature_name in axis_var_list:
    # grid_pos[0] selects the subplot row, grid_pos[1] the subplot column.
    sns.histplot(data=bc_df, x=feature_name, hue="diagnosis",
                 ax=axs[grid_pos[0], grid_pos[1]])

### Distribution of Standard Errors:

In [None]:
# Histograms of features 10-19, coloured by diagnosis, laid out on a 2x5 grid.
cust_palette.set_default_custom_palette('Set1')
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(30, 10))
axis_var_list = list(zip(axis, features[10:20]))

for grid_pos, feature_name in axis_var_list:
    # grid_pos[0] selects the subplot row, grid_pos[1] the subplot column.
    sns.histplot(data=bc_df, x=feature_name, hue="diagnosis",
                 ax=axs[grid_pos[0], grid_pos[1]])

### Distributions of Worst:

In [None]:
# Histograms of the features from index 20 onward, coloured by diagnosis, on a 2x5 grid.
cust_palette.set_default_custom_palette('Set2')
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(30, 10))
axis_var_list = list(zip(axis, features[20:]))

for grid_pos, feature_name in axis_var_list:
    # grid_pos[0] selects the subplot row, grid_pos[1] the subplot column.
    sns.histplot(data=bc_df, x=feature_name, hue="diagnosis",
                 ax=axs[grid_pos[0], grid_pos[1]])

## Outlier Analysis:

### Remember: 
The larger the distance between the two medians ('M' and 'B'), the more useful the feature.<br>
A larger distance between the medians<br>
means a larger distance between the two clusters, which makes them easier to tell apart.

In [None]:
# Build a swiftviz OutlierAnalysis helper on the full frame and scale the feature columns.
# NOTE(review): the scaling method used by data_scaler is not visible here; later cells
# index the result by feature name, so it presumably returns a DataFrame with those
# columns — confirm.
outlier_handler = sv.OutlierAnalysis(bc_df)
scaled_bc_df = outlier_handler.data_scaler(list(features))

### Outlier Report For Means:

In [None]:
# Split violin plot (one half per diagnosis) of the first ten scaled features,
# followed by the swiftviz outlier report for the same features.
cust_palette.set_default_custom_palette('ocean_r')

# Long format: one row per (sample, feature), with diagnosis kept as the identifier.
mean_wide = pd.concat([scaled_bc_df[features[:10]], bc_df[target]], axis=1)
data_mean = mean_wide.melt(id_vars="diagnosis", var_name="features", value_name='value')

fig, axs = plt.subplots(figsize=(30, 10))

mean_violin_ax = sns.violinplot(data=data_mean, x="features", y="value", hue="diagnosis",
                                inner="quart", split=True)
display(mean_violin_ax);

mean_outlier_report = outlier_handler.OutlierReport(list(features[:10]))
mean_outlier_report

The table shown is for the entire dataset, and the plots are shown to visualize w.r.t "diagnosis".

### Outlier Report for Standard Errors:

In [None]:
# Split violin plot (one half per diagnosis) of scaled features 10-19,
# followed by the swiftviz outlier report for the same features.

# Long format: one row per (sample, feature), with diagnosis kept as the identifier.
se_wide = pd.concat([scaled_bc_df[features[10:20]], bc_df[target]], axis=1)
data_se = se_wide.melt(id_vars="diagnosis", var_name="features", value_name='value')

fig, axs = plt.subplots(figsize=(30, 10))

# Two colours picked from 'Dark2' (indices 3 and 5), one per diagnosis label.
se_colours = sns.color_palette('Dark2')[3:6:2]
se_violin_ax = sns.violinplot(data=data_se, x="features", y="value", hue="diagnosis",
                              inner="quart", split=True, palette=se_colours)
display(se_violin_ax);

se_outlier_report = outlier_handler.OutlierReport(list(features[10:20]))
se_outlier_report

### Outlier Report for Worst:

In [None]:
# Split violin plot (one half per diagnosis) of scaled features 20-29,
# followed by the swiftviz outlier report for the same features.

# Long format: one row per (sample, feature), with diagnosis kept as the identifier.
worst_wide = pd.concat([scaled_bc_df[features[20:30]], bc_df[target]], axis=1)
data_worst = worst_wide.melt(id_vars="diagnosis", var_name="features", value_name='value')

fig, axs = plt.subplots(figsize=(30, 10))

# Two colours picked from 'viridis' (indices 3 and 5), one per diagnosis label.
worst_colours = sns.color_palette('viridis')[3:6:2]
worst_violin_ax = sns.violinplot(data=data_worst, x="features", y="value", hue="diagnosis",
                                 inner="quart", split=True, palette=worst_colours)
display(worst_violin_ax);

worst_outlier_report = outlier_handler.OutlierReport(list(features[20:30]))
worst_outlier_report

## Random Visualizations for Feature Selections:

We will use these visualizations to decide whether to use Pearson's or Spearman's correlation.

In [None]:
# Switch to a hand-picked hex palette, then hand radius_mean and a set of other
# "mean" features to the swiftviz plotter with diagnosis as the grouping column.
# NOTE(review): the meaning of the trailing positional arguments (30, 7) is not
# visible here — presumably the figure size; confirm.
hex_palette = ["#f7a400", "#3a9efd", "#3e4491", "#292a73", "#1a1b4b"]
cust_palette.set_default_custom_palette(hex_palette)

plot_maker = sv.Plotter()
mean_partner_features = ['texture_mean', 'perimeter_mean', 'smoothness_mean',
                         'concavity_mean', 'fractal_dimension_mean', 'area_mean']
plot_maker.plotter(bc_df, 'radius_mean', mean_partner_features, ['diagnosis'], 30, 7)

In [None]:
# Hand radius_se and a set of other standard-error features to the swiftviz plotter,
# with diagnosis as the grouping column.
se_partner_features = ['texture_se', 'perimeter_se', 'smoothness_se',
                       'concavity_se', 'fractal_dimension_se', 'area_se']
plot_maker.plotter(bc_df, 'radius_se', se_partner_features, ['diagnosis'], 30, 5)

In [None]:
# Hand radius_worst and a set of other "worst" features to the swiftviz plotter,
# with diagnosis as the grouping column.
worst_partner_features = ['texture_worst', 'perimeter_worst', 'area_worst',
                          'smoothness_worst', 'compactness_worst', 'fractal_dimension_worst']
plot_maker.plotter(bc_df, 'radius_worst', worst_partner_features, ['diagnosis'], 30, 5)

In [None]:
# Hand radius_mean and the se/worst variants of radius, area and perimeter to the
# swiftviz plotter, with diagnosis as the grouping column.
size_partner_features = ['radius_se', 'radius_worst', 'area_se',
                         'area_worst', 'perimeter_se', 'perimeter_worst']
plot_maker.plotter(bc_df, 'radius_mean', size_partner_features, ['diagnosis'], 30, 5)

## Heatmap: (Using Pearson's Corr)

In [None]:
# Pearson correlation between the numeric feature columns.
# `diagnosis` holds string labels: pandas >= 2.0 raises on non-numeric columns in
# DataFrame.corr() instead of silently skipping them, so the numeric columns are
# selected explicitly (the result is identical on older pandas).
corr_matrix = bc_df.select_dtypes(include='number').corr()

fig, axs = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_matrix, annot=True)

Let's filter out the values using a threshold, it will be a lot easier for us to find strong correlations:

In [None]:
# Keep only strong correlations: cells whose value is not above +threshold or below
# -threshold become NaN and are therefore left blank in the heatmap.
threshold = 0.8
strong_mask = (corr_matrix > threshold) | (corr_matrix < -threshold)
cond_corr_matrix = corr_matrix[strong_mask]

fig, axs = plt.subplots(figsize=(20, 10))
sns.heatmap(cond_corr_matrix, annot=True)

We can see many strong correlations from the matrix, they are:

* radius_mean, perimeter_mean, area_mean, radius_worst, perimeter_worst, area_worst
* radius_se, perimeter_se, area_se
* compactness_mean, concavity_mean, concave points_mean
* texture_mean, texture_worst
<br>

and many more. Most of the features listed above are going to be removed because they are strongly correlated with each other.<br>
Since one such feature can largely be derived from another, there is little value in keeping all of them.

We will keep only one feature from each of the four groups above (the choice within a group is largely arbitrary). We will keep:
* area_mean (It has got more correlations than radius_mean)
* area_se
* concave points_mean
* texture_mean

In [None]:
# Features removed because they are strongly correlated with a feature that is kept.
drop_features = ['radius_mean', 'perimeter_mean', 'radius_worst', 'perimeter_worst',
                 'area_worst', 'radius_se', 'perimeter_se', 'compactness_mean', 'concave points_worst', 'smoothness_worst',
                 'concavity_mean', 'compactness_worst', 'texture_worst']

modified_bc_df = bc_df.drop(columns=drop_features)

# `diagnosis` is still present and holds string labels: pandas >= 2.0 raises on
# non-numeric columns in DataFrame.corr(), so restrict to numeric columns first
# (the result is identical on older pandas, which skipped such columns silently).
modified_corr_matrix = modified_bc_df.select_dtypes(include='number').corr()

fig, axs = plt.subplots(figsize=(20, 10))
sns.heatmap(modified_corr_matrix, annot=True)

# Model-Selection:

In [None]:
# List the columns that remain after dropping the correlated features.
modified_bc_df.columns

In [None]:
# Separate the feature matrix from the diagnosis label.
X = modified_bc_df.drop(columns='diagnosis')
y = modified_bc_df.diagnosis

# 60% train / 20% validation / 20% test.
# A fixed seed makes the split (and every score reported below) reproducible across
# runs; stratifying keeps the class ratio the same in each subset.
# NOTE(review): the labels stay as the original strings; recent xgboost releases
# reject non-numeric class labels, so encode them if the XGBClassifier candidates
# fail during the grid search — confirm against the installed version.
split_seed = 4
X_train, X_test_cv, y_train, y_test_cv = train_test_split(
    X, y, test_size=0.4, random_state=split_seed, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_cv, y_test_cv, test_size=0.5, random_state=split_seed, stratify=y_test_cv)

In [None]:
# Print the shape of the features and the target for each split.
# The loop variables are deliberately NOT called `x` / `y`: at notebook level the loop
# variable would overwrite the full target series `y` with the last split's target.
for split_name, split_X, split_y in (('Train', X_train, y_train), ('Validation', X_val, y_val), ('Test', X_test, y_test)):
    print(f'{split_name}:')
    print(f'Data: {split_X.shape}')
    print(f'Target: {split_y.shape}')
    print('')

## PCA:

In [None]:
# Fit a full PCA on the standardized training features and inspect how much variance
# the first k components explain cumulatively.
X_train_scaled = StandardScaler().fit_transform(X_train)
pca = PCA()
pca.fit(X_train_scaled)
cumsum = np.cumsum(pca.explained_variance_ratio_)
print(cumsum)

# cumsum[i] is the variance explained by the first i + 1 components, so the curve is
# plotted against 1..n rather than the 0-based array index; otherwise the x-axis
# reads one component too low. The x-limit follows the actual number of components
# instead of a hard-coded 30.
n_components_axis = np.arange(1, len(cumsum) + 1)

plt.figure(figsize=(10,7))
plt.plot(n_components_axis, cumsum, linewidth=3)
plt.axis([1, len(cumsum), 0, 1])
plt.xlabel("n_components")
plt.ylabel("Explained Variance")
plt.grid(True)

We can see that, to get an explained variance >= 95%, we need n_components >= 8.

## Main Pipeline:

In [None]:
# Standardize -> PCA -> classifier. The 'model' step is left as None here because
# the parameter grid supplies the estimator for that step.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('model', None),
]
main_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Seed passed to every candidate model.
rnd_state = 4

# Value lists that are shared by several of the sub-grids below.
pca_component_choices = [8, 9, 10, 11, 12]
boosting_learning_rates = [0.005, 0.01, 0.05, 0.1]
many_estimators = [100, 500, 1000]
few_estimators = [100, 1000]
shallow_depths = [3, 4, 5, 6, 10]
seed_choice = [rnd_state]

# One sub-grid per model family; each one swaps its estimator into the pipeline's
# 'model' step and tunes that estimator together with the number of PCA components.
params_grid = [
    # XGBoost
    {
        "pca__n_components": pca_component_choices,
        "model": [XGBClassifier()],
        "model__n_estimators": many_estimators,
        "model__learning_rate": boosting_learning_rates,
        "model__max_depth": shallow_depths,
        "model__gamma": [0, 1, 5],
        "model__random_state": seed_choice,
    },
    # scikit-learn gradient boosting
    {
        "pca__n_components": pca_component_choices,
        "model": [GradientBoostingClassifier()],
        "model__n_estimators": few_estimators,
        "model__learning_rate": [0.001, 0.05, 0.1, 0.5],
        "model__random_state": seed_choice,
    },
    # Random forest
    {
        "pca__n_components": pca_component_choices,
        "model": [RandomForestClassifier()],
        "model__n_estimators": few_estimators,
        "model__max_depth": [3, 4, 5, 6, 10, 15, 20],
        "model__random_state": seed_choice,
    },
    # LightGBM
    {
        "pca__n_components": pca_component_choices,
        "model": [LGBMClassifier()],
        "model__n_estimators": many_estimators,
        "model__learning_rate": boosting_learning_rates,
        "model__max_depth": [3, 4, 5, 6, 10, -1],
        "model__random_state": seed_choice,
    },
    # CatBoost
    {
        "pca__n_components": pca_component_choices,
        "model": [CatBoostClassifier()],
        "model__n_estimators": many_estimators,
        "model__learning_rate": boosting_learning_rates,
        "model__max_depth": shallow_depths,
        "model__random_state": seed_choice,
    },
]

In [None]:
# Exhaustive search over params_grid, scored by accuracy with 3-fold cross-validation.
main_grid = GridSearchCV(
    estimator=main_pipeline,
    param_grid=params_grid,
    scoring="accuracy",
    cv=3,
    verbose=2,
)

In [None]:
# Run the grid search on the training split only; validation and test data stay unseen.
main_grid.fit(X_train, y_train)

In [None]:
# Display the pipeline configuration that the grid search selected.
main_grid.best_estimator_

In [None]:
# Evaluate the selected pipeline on the validation split.
y_pred = main_grid.best_estimator_.predict(X_val)
print(metrics.classification_report(y_val, y_pred))

# `metrics.plot_roc_curve` was removed in scikit-learn 1.2. Use its replacement when
# it is available and fall back to the old helper on older installations.
if hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(main_grid.best_estimator_, X_val, y_val)
else:
    metrics.plot_roc_curve(main_grid.best_estimator_, X_val, y_val)

In [None]:
# Confusion matrix of the most recent validation predictions, drawn with mlxtend.
conf_matrix = metrics.confusion_matrix(y_true=y_val, y_pred=y_pred)
plot_confusion_matrix(conf_mat=conf_matrix);

## Trying to increase model accuracy using Soft Vote Ensembling:

In [None]:
# Keep and display the winning parameter combination from the first grid search.
best_params = main_grid.best_params_
best_params

In [None]:
# Take an unfitted copy of the winning pipeline (same hyper-parameters) and fit it
# on the training split.
best_pipeline = clone(main_grid.best_estimator_)

best_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the re-fitted copy of the winning pipeline on the validation split.
y_pred = best_pipeline.predict(X_val)
print(metrics.classification_report(y_val, y_pred))

# `metrics.plot_roc_curve` was removed in scikit-learn 1.2. Use its replacement when
# it is available and fall back to the old helper on older installations.
if hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(best_pipeline, X_val, y_val)
else:
    metrics.plot_roc_curve(best_pipeline, X_val, y_val)

In [None]:
# Confusion matrix of the most recent validation predictions, drawn with mlxtend.
conf_matrix = metrics.confusion_matrix(y_true=y_val, y_pred=y_pred)
plot_confusion_matrix(conf_mat=conf_matrix);

Creating second CatBoostClassifier Pipeline for VotingEnsemble:

In [None]:
# PCA -> CatBoost. Unlike main_pipeline, this pipeline has no StandardScaler step,
# so PCA is applied to the features on their original scale.
second_steps = [
    ("pca", PCA()),
    ("cat", CatBoostClassifier()),
]
second_pipeline = Pipeline(steps=second_steps)

In [None]:
# Search space for the PCA + CatBoost pipeline, using the same seed as the first search.
second_param_grid = [
    dict(
        pca__n_components=[8, 9, 10, 11, 12],
        cat__n_estimators=[100, 500, 1000],
        cat__learning_rate=[0.005, 0.01, 0.05, 0.1],
        cat__max_depth=[3, 4, 5, 6, 10],
        cat__random_state=[rnd_state],
    )
]

In [None]:
# Second grid search (accuracy, 3-fold CV) over the PCA + CatBoost pipeline,
# fitted on the training split.
main_grid_2 = GridSearchCV(
    estimator=second_pipeline,
    param_grid=second_param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
)
main_grid_2.fit(X_train, y_train)

In [None]:
# Display the pipeline configuration that the second grid search selected.
main_grid_2.best_estimator_

In [None]:
# Keep and display the winning parameter combination from the second grid search.
best_params_2 = main_grid_2.best_params_
best_params_2

In [None]:
# Evaluate the pipeline selected by the second grid search on the validation split.
y_pred = main_grid_2.best_estimator_.predict(X_val)
print(metrics.classification_report(y_val, y_pred))

# `metrics.plot_roc_curve` was removed in scikit-learn 1.2. Use its replacement when
# it is available and fall back to the old helper on older installations.
if hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(main_grid_2.best_estimator_, X_val, y_val)
else:
    metrics.plot_roc_curve(main_grid_2.best_estimator_, X_val, y_val)

Let's combine the two estimators using Soft Voting Ensemble:

In [None]:
# Unfitted copy of the second search's winning pipeline, to be used as an ensemble member.
best_pipeline_2 = clone(main_grid_2.best_estimator_)

In [None]:
# Soft-voting ensemble of the two selected pipelines.
ensemble_members = [
    ('cat_1', best_pipeline),
    ('cat_2', best_pipeline_2),
]
softVoter = VotingClassifier(estimators=ensemble_members, voting="soft")

In [None]:
# Fit the ensemble on the training split.
softVoter.fit(X_train, y_train)

In [None]:
# Evaluate the soft-voting ensemble on the validation split.
y_pred = softVoter.predict(X_val)
print(metrics.classification_report(y_val, y_pred))

# `metrics.plot_roc_curve` was removed in scikit-learn 1.2. Use its replacement when
# it is available and fall back to the old helper on older installations.
if hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(softVoter, X_val, y_val)
else:
    metrics.plot_roc_curve(softVoter, X_val, y_val)

In [None]:
# Final check of the soft-voting ensemble on the held-out test split.
y_pred_test = softVoter.predict(X_test)
print(metrics.classification_report(y_test, y_pred_test))

# `metrics.plot_roc_curve` was removed in scikit-learn 1.2. Use its replacement when
# it is available and fall back to the old helper on older installations.
if hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(softVoter, X_test, y_test)
else:
    metrics.plot_roc_curve(softVoter, X_test, y_test)

### Thank You!!