In [None]:
# Importing important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Loading the breast cancer dataset that ships with scikit-learn
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

# Quick look at the container: which fields it exposes and what type it is
for label, value in (('Keys :', cancer.keys()), ('Type :', type(cancer))):
    print(label, value)

Keys : dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
Type : <class 'sklearn.utils._bunch.Bunch'>


In [None]:
# EDA: read the dataset's own description (DESCR) before touching the numbers
dataset_description = cancer.DESCR
print(dataset_description)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

In [None]:
# Separating the inputs (feature matrix) from the outputs (class labels)
X, y = cancer.data, cancer.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Creating a model
# random_state is fixed so that re-running the notebook (Restart & Run All)
# reproduces the same fitted ensemble; the seed matches the one used for the
# train/test split.
# NOTE(review): learning_rate = 0.001 is far below scikit-learn's default of 1.0,
# so each of the 100 estimators contributes very little -- confirm this is intended.
model = AdaBoostClassifier(n_estimators = 100, learning_rate = 0.001, random_state = 5)

# Fitting the model on the training split only (the test split stays unseen)
model.fit(X_train, y_train)

In [None]:
# Predicted class labels from the fitted model, for the training and the testing split
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

In [None]:
# Getting the performance
# Each row is (label to print, scoring function, true labels, predicted labels).
# The leading '\n' on the first testing label inserts the blank line that
# separates the training group from the testing group in the output.
performance_report = [
    ('Training Accuracy :', metrics.accuracy_score, y_train, y_train_pred),
    ('Training Precision :', metrics.precision_score, y_train, y_train_pred),
    ('Training Recall :', metrics.recall_score, y_train, y_train_pred),
    ('Training F1 Score :', metrics.f1_score, y_train, y_train_pred),
    ('\nTesting Accuracy :', metrics.accuracy_score, y_test, y_test_pred),
    ('Testing Precision :', metrics.precision_score, y_test, y_test_pred),
    ('Testing Recall :', metrics.recall_score, y_test, y_test_pred),
    ('Testing F1 Score :', metrics.f1_score, y_test, y_test_pred),
]

# Scores are rounded to three decimals before printing
for label, scorer, y_true, y_pred in performance_report:
    print(label, np.round(scorer(y_true, y_pred), 3))

Training Accuracy : 0.941
Training Precision : 0.937
Training Recall : 0.973
Training F1 Score : 0.954

Testing Accuracy : 0.956
Testing Precision : 0.93
Testing Recall : 1.0
Testing F1 Score : 0.964


In [None]:
'''
Comment -> The above model looks optimal,
as its performance on both the training and the testing data is
(A) above 90%, and
(B) within 5% of each other (training vs. testing).
'''