## Dataset Attributes Information

<ul>
    <li>ID number</li>
    <li>Diagnosis (M = malignant, B = benign) (Target Variable)</li>
    <li>radius (mean of distances from center to points on the perimeter)</li>
    <li>texture (standard deviation of gray-scale values)</li>
    <li>perimeter</li>
    <li>area</li>
    <li>smoothness (local variation in radius lengths)</li>
    <li>compactness (perimeter^2 / area - 1.0)</li>
    <li>concavity (severity of concave portions of the contour)</li>
    <li>concave points (number of concave portions of the contour)</li>
    <li>symmetry</li>
    <li>fractal dimension ("coastline approximation" - 1)</li>
</ul>

In [None]:
import math
import os
from collections import Counter

import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

# plot_confusion_matrix was removed in scikit-learn 1.2; keep the name
# importable on older releases without breaking the notebook on newer ones.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Relative path of the input CSV that the next cell loads.
DATA_PATH = "../input/breast-cancer-wisconsin-data/data.csv"

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
data  = pd.read_csv(DATA_PATH)
data.head()


## Exploring the Data

In [None]:
# (rows, columns) of the table as loaded.
data.shape

In [None]:
# Column labels as loaded from the CSV.
data.columns

In [None]:
# Summary of the frame: column dtypes and non-null counts.
data.info()

In [None]:
# Number of distinct ids (missing values, if any, count as one distinct value).
data["id"].nunique(dropna=False)

<div style="background-color:lightgreen;color:black;padding:10px;border-radius:10px;">
<b>Observation:</b> All the ids are unique and the number of ids equals the number of rows, i.e. there are no duplicate records in the data. Let us drop the id column.
</div>

In [None]:
# Count missing values in each column.
data.isnull().sum()

In [None]:
# Remove the row identifier and the "Unnamed: 32" column (presumably an empty
# artifact of the CSV export -- confirm against the missing-value counts).
# errors="ignore" keeps this cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because the columns were already gone.
data = data.drop(columns=["id", "Unnamed: 32"], errors="ignore")
data.head()

## Univariate Analysis

<b>Checking for Imbalance in data</b>

In [None]:
# Share of each diagnosis class, printed as percentages and drawn as a bar chart.
counts = data["diagnosis"].value_counts()
diag_cols = ["B", "M"]
# value_counts() is indexed by label and sorted by frequency, so positional
# access (counts[0]) both relies on a deprecated fallback and silently assumes
# "B" is the larger class. Look each count up by its label instead.
diag_counts = [counts.get(label, 0) for label in diag_cols]
total = sum(diag_counts)

benign = (diag_counts[0] / total) * 100
malignant = (diag_counts[1] / total) * 100

print(f"Benign: {benign}%")
print(f"Malignant: {malignant}%")

print()

plt.figure(figsize=(10, 8))
sns.barplot(x=diag_cols, y=diag_counts)
plt.show()

<div style="background-color:lightgreen;color:black;padding:10px;border-radius:10px;">
<b>Observation:</b> Almost 37% of the total data belong to Malignant Class and 63% belong to Benign class. We can balance this data using Upsampling.
</div>

In [None]:
# Encode the target: malignant -> 1, benign -> 0.
diag_map = {
    "M": 1,
    "B": 0,
}

labels = data["diagnosis"]
# Skip the mapping when the column already holds only encoded values, so that
# re-running this cell does not turn every label into NaN.
if not labels.isin(list(diag_map.values())).all():
    encoded = labels.map(diag_map)
    # map() yields NaN for any label missing from diag_map; fail loudly rather
    # than letting NaN targets reach the models.
    if encoded.isna().any():
        unknown = sorted(set(labels[encoded.isna()].astype(str)))
        raise ValueError(f"Unexpected diagnosis labels: {unknown}")
    data["diagnosis"] = encoded.astype("int64")

In [None]:
# Snapshot of the current column labels as a plain Python list.
all_columns = data.columns.tolist()

In [None]:
# One distribution plot per feature (every column after the first, which is
# the target), laid out on a three-column grid.
cols = all_columns[1:]
n_plot_cols = 3
# Size the grid from the actual number of features instead of assuming 30.
n_plot_rows = math.ceil(len(cols) / n_plot_cols)

plt.figure(figsize=(25, 55))
for position, col in enumerate(cols, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    # histplot with these arguments is seaborn's documented replacement for
    # the deprecated distplot (histogram on a density scale plus a KDE curve).
    sns.histplot(data[col], kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()

In [None]:
# One box plot per feature (every column after the first, which is the
# target), laid out on a three-column grid.
cols = all_columns[1:]
n_plot_cols = 3
# Size the grid from the actual number of features instead of assuming 30.
n_plot_rows = math.ceil(len(cols) / n_plot_cols)

plt.figure(figsize=(25, 55))
for position, col in enumerate(cols, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    # Pass the series as x= explicitly: a positional argument is interpreted
    # differently across seaborn versions (x in old releases, data in new ones).
    sns.boxplot(x=data[col])
plt.show()

## Bivariate Analysis


In [None]:
# Heatmap of the pairwise correlations between all feature columns,
# with the colour scale centred on zero.
feature_corr = data[all_columns[1:]].corr()

plt.figure(figsize=(20, 15))
sns.heatmap(feature_corr, center=0)
plt.show()

In [None]:
# Correlation matrix of the features and a boolean view marking entries >= 0.9.
feature_frame = data[all_columns[1:]]
corr_matrix = feature_frame.corr()
mask = corr_matrix.ge(0.9)
mask

In [None]:
# Collect each pair of distinct features whose correlation is >= 0.9.
# Only the pairs where the second feature comes later in the column order are
# visited, so every pair is recorded once, as (earlier, later).
feature_names = all_columns[1:]
correlated_cols = []
for position, first in enumerate(feature_names):
    for second in feature_names[position + 1:]:
        if corr_matrix.loc[second, first] >= 0.9:
            correlated_cols.append((first, second))

In [None]:
# Display the list of highly correlated feature pairs.
correlated_cols

<div style="background-color:lightgreen;color:black;padding:10px;border-radius:10px;">
<b>Observation:</b> The pairs listed above are highly correlated, with a Pearson correlation coefficient >= 0.9. For each such pair we can eliminate one column, keeping the one that is more strongly associated with the target variable. <br/>
The correlation between categorical and continuous data can be measured in the following ways:<br/>
1. Point biserial Correlation<br/>
2. Logistic Regression<br/>
3. Kruskal-Wallis H Test (Or parametric forms such as t-test or ANOVA)
</div>

## Using Logistic Regression for finding correlation between Continuous and Categorical Data


In [None]:
# Score every feature that appears in a correlated pair by how well a
# single-feature logistic regression predicts the diagnosis.
# sorted() makes the column order reproducible; iterating a bare set of
# strings can yield a different order on every interpreter run.
cols = sorted({name for pair in correlated_cols for name in pair})

# The first `train_rows` rows fit the model; the remaining rows score it.
train_rows = 250
y = data["diagnosis"]

cols_acc = {}
for column in cols:
    # Double brackets keep the single feature two-dimensional, as scikit-learn expects.
    X = data[[column]].to_numpy()
    model = LogisticRegression()
    model.fit(X[:train_rows], y[:train_rows])
    accuracy = accuracy_score(y[train_rows:], model.predict(X[train_rows:]))
    cols_acc[column] = accuracy
cols_acc

In [None]:
# From every correlated pair keep the feature with the higher single-feature
# accuracy; each kept feature is listed once, in order of first appearance.
imp_cols = []
for first, second in correlated_cols:
    # On a tie the first member of the pair wins.
    winner = first if cols_acc[first] >= cols_acc[second] else second
    if winner not in imp_cols:
        imp_cols.append(winner)
imp_cols

In [None]:
# Work on a copy so that the column drops below leave `data` untouched.
data.shape
df = data.copy()
df.shape

In [None]:
# Drop every correlated feature that was not selected into imp_cols,
# then show the reduced shape.
redundant = [name for name in cols if name not in imp_cols]
df.drop(columns=redundant, inplace=True)
df.shape

In [None]:
# Preview the reduced frame.
df.head()

In [None]:
# Feature matrix and target vector.
# Select the features by excluding the target by name rather than by position,
# so the split stays correct if "diagnosis" is not the first column.
X = df.drop(columns="diagnosis")
y = df["diagnosis"]

## Splitting the Data for training and testing


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.2, random_state=0)
print(f"Train Data: {X_train.shape}, {y_train.shape}")
# This line reports the test split; it was previously mislabelled "Train Data".
print(f"Test Data: {X_test.shape}, {y_test.shape}")

## Upsampling using SMOTE


In [None]:
# Class counts in the training labels before resampling.
counter = Counter(y_train)
counter

In [None]:
# Oversample the training split only, so the test split keeps its original
# class distribution. Seeding SMOTE makes the synthetic samples, and every
# metric computed downstream, reproducible across runs.
upsample = SMOTE(random_state=0)
X_train, y_train = upsample.fit_resample(X_train, y_train)
counter = Counter(y_train)
print(counter)

In [None]:
# Number of training rows after resampling (only X_train was resampled).
print(f"Total Data after Upsampling: {len(X_train)}")

In [None]:
# Shapes of both splits after resampling the training data.
print(f"Train Data: {X_train.shape}, {y_train.shape}")
# This line reports the test split; it was previously mislabelled "Train Data".
print(f"Test Data: {X_test.shape}, {y_test.shape}")

## KNN Classifier


In [None]:
# Sweep the neighbour count, recording the test-set error rate for each k,
# then plot the curve and report the k with the lowest error.
k_values = range(1, 50)
error_rate = []
for k in k_values:
    pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy at k = {k} is {accuracy}")
    error_rate.append(np.mean(predictions != y_test))

# k starts at 1, so the list position of the minimum is k - 1.
lowest_error = min(error_rate)
best_k = error_rate.index(lowest_error) + 1

plt.figure(figsize=(10, 6))
plt.plot(
    k_values,
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
print("Minimum error:-", lowest_error, "at K =", best_k)

## SVM Classifier


In [None]:
# Standardise the features, then fit a support-vector classifier.
svm_pipeline = make_pipeline(StandardScaler(), SVC(probability=True))
svm_pipeline.fit(X_train, y_train)

# Report the same metrics on the held-out split and then on the full dataset.
# The full dataset contains the rows the training split was drawn from, so
# its figures are optimistic compared with the test figures.
evaluation_sets = [("Test", X_test, y_test), ("Whole", X.values, y)]
for position, (label, features, target) in enumerate(evaluation_sets):
    if position:
        print()  # blank line between the two reports
    predictions = svm_pipeline.predict(features)
    accuracy = accuracy_score(target, predictions)
    print(f"Accuracy on {label} Data: {accuracy*100}%")
    print(f"Precision Score: {precision_score(target, predictions)}")
    print(f"Recall Score: {recall_score(target, predictions)}")
    print(f"F1 Score: {f1_score(target, predictions)}")
    # plot_confusion_matrix was removed in scikit-learn 1.2; from_estimator is
    # its replacement.
    ConfusionMatrixDisplay.from_estimator(svm_pipeline, features, target)
    plt.title(f"Confusion Matrix for {label} Data")
    plt.show()

## RandomForest Classifier


In [None]:
# Standardise the features, then fit a seeded random forest.
rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=18))
rf_pipeline.fit(X_train, y_train)

# Report the same metrics on the held-out split and then on the full dataset.
# The full dataset contains the rows the training split was drawn from, so
# its figures are optimistic compared with the test figures.
evaluation_sets = [("Test", X_test, y_test), ("Whole", X.values, y)]
for position, (label, features, target) in enumerate(evaluation_sets):
    if position:
        print()  # blank line between the two reports
    predictions = rf_pipeline.predict(features)
    accuracy = accuracy_score(target, predictions)
    print(f"Accuracy on {label} Data: {accuracy*100}%")
    print(f"Precision Score: {precision_score(target, predictions)}")
    print(f"Recall Score: {recall_score(target, predictions)}")
    print(f"F1 Score: {f1_score(target, predictions)}")
    # plot_confusion_matrix was removed in scikit-learn 1.2; from_estimator is
    # its replacement.
    ConfusionMatrixDisplay.from_estimator(rf_pipeline, features, target)
    plt.title(f"Confusion Matrix for {label} Data")
    plt.show()

## XGBoost Classifier


In [None]:
# Standardise the features, then fit a seeded XGBoost classifier.
xgb_pipeline = make_pipeline(StandardScaler(), XGBClassifier(random_state=18))
xgb_pipeline.fit(X_train, y_train)

# Report the same metrics on the held-out split and then on the full dataset.
# The full dataset contains the rows the training split was drawn from, so
# its figures are optimistic compared with the test figures.
evaluation_sets = [("Test", X_test, y_test), ("Whole", X.values, y)]
for position, (label, features, target) in enumerate(evaluation_sets):
    if position:
        print()  # blank line between the two reports
    predictions = xgb_pipeline.predict(features)
    accuracy = accuracy_score(target, predictions)
    print(f"Accuracy on {label} Data: {accuracy*100}%")
    print(f"Precision Score: {precision_score(target, predictions)}")
    print(f"Recall Score: {recall_score(target, predictions)}")
    print(f"F1 Score: {f1_score(target, predictions)}")
    # plot_confusion_matrix was removed in scikit-learn 1.2; from_estimator is
    # its replacement.
    ConfusionMatrixDisplay.from_estimator(xgb_pipeline, features, target)
    plt.title(f"Confusion Matrix for {label} Data")
    plt.show()


## Results

<div style="background-color:lightgreen;color:black;padding:10px;border-radius:10px;">
    
After performing extensive exploratory data analysis, addressing the problems of class imbalance and multicollinearity, and experimenting with different machine learning algorithms, the XGBoost Classifier outperformed the remaining algorithms.<br/>
<b>Performance Metrics of the best model i.e. XGBoost Model</b><br/>

<center>
<div align="center">
<center>
<table style="background-color:lightgreen;color:black;">
    <tr>
        <th colspan=4>XGB Classifier</th>
    </tr>
    <tr>
        <th colspan=4>On Test Data</th>
    </tr>
    <tr>
        <th>Accuracy</th>
        <th>Precision</th>
        <th>Recall</th>
        <th>F1 Score</th>
    </tr>
    <tr>
        <th>96.49%</th>
        <th>0.93</th>
        <th>0.97</th>
        <th>0.95</th>
    </tr>
    <tr>
        <th colspan=4>On Whole Data</th>
    </tr>
    <tr>
        <th>Accuracy</th>
        <th>Precision</th>
        <th>Recall</th>        
        <th>F1 Score</th>
    </tr>
    <tr>
        <th>99.29%</th>
        <th>0.98</th>
        <th>0.99</th>        
        <th>0.99</th>
    </tr>
</table>
    </center>
    </div>
    </center>
</div>