# <div style="text-align: Left"><span style="color:#67636b; font-family:Georgia;">Exploratory Data Analysis and Machine Learning Models on Breast Cancer Dataset</span></div>

<div style="text-align: justify"><span style="color:#000000; font-family:Georgia; font-size:1.2em;"><b>
    Feature Description</b></span></div>

<div style="text-align: justify"><span style="color:#000000; font-family:Georgia; font-size:1.2em;">
   The mean, standard error and "worst" or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features.  </span></div>
&nbsp;


<div style="text-align: justify"><table style="width:80%">
  <tr>
    <th align= "left">Features</th>
    <th align= "left">Description</th>
  </tr>

  <tr>
    <td>ID number </td>
    <td></td>
  </tr>
  <tr>
    <td>Diagnosis</td>
    <td> (M = malignant, B = benign)</td>
  </tr>
  <tr>
    <td>radius</td>
    <td>mean of distances from center to points on the perimeter</td>
  </tr>
  <tr>
    <td>texture</td>
    <td>standard deviation of gray-scale values</td>
  </tr>
  <tr>
    <td>perimeter</td>
    <td></td>
  </tr>
  <tr>
    <td>area</td>
    <td></td>
  </tr>
  <tr>
    <td>smoothness</td>
    <td>local variation in radius lengths</td>
  </tr>
  <tr>
    <td>compactness</td>
    <td>(perimeter^2 / area - 1.0)</td>
  </tr>
  <tr>
    <td>concavity</td>
    <td>severity of concave portions of the contour</td>
  </tr>
  <tr>
    <td>concave points</td>
    <td>number of concave portions of the contour</td>
  </tr>
  <tr>
    <td>symmetry</td>
    <td></td>
  </tr>
  <tr>
    <td>fractal dimension</td>
    <td> ("coastline approximation" - 1)</td>
  </tr>
</table></div>

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

In [None]:
# Location of the Wisconsin breast-cancer CSV, relative to the notebook.
DATA_PATH = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first five rows of the freshly loaded table.
data.head(n=5)

In [None]:
# (rows, columns) of the table as loaded from the CSV.
data.shape

In [None]:
# Distinct labels present in the target column.
data['diagnosis'].unique()

The diagnosis column takes only two values: M for malignant and B for benign. We will use it as the target (output) column when building our machine learning models.

In [None]:
# List the column labels of the loaded table.
data.columns

In [None]:
# Number of missing values in each column.
data.isnull().sum()

Since the `Unnamed: 32` column contains NaN in all 569 rows, it carries no information and we can drop it.

In [None]:
# Confirm which values the suspicious 'Unnamed: 32' column actually holds.
data.loc[:, 'Unnamed: 32'].unique()

In [None]:
# Remove the 'Unnamed: 32' column. Re-binding `data` (instead of
# `inplace=True`) together with `errors='ignore'` keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns='Unnamed: 32', errors='ignore')

Making a copy of the dataset to perform Exploratory Data Analysis

In [None]:
# Independent (deep) copy reserved for the exploratory plots; `copy()` is
# deep by default, so edits to `data` will not show through here.
data_copy = data.copy()

In [None]:
# Separate the target from the predictors.
# The guard and `errors='ignore'` make the cell re-runnable: once the
# columns are gone, executing it again neither raises nor touches `y`.
if 'diagnosis' in data.columns:
    y = data['diagnosis'].copy()
# 'id' is dropped along with the target so that only the measurement
# columns remain in `data`.
data = data.drop(columns=['diagnosis', 'id'], errors='ignore')

In [None]:
# First five rows of the predictor table after removing the target and id.
data.head(n=5)

## Performing Exploratory Data Analysis

In [None]:
# Remove the 'id' column from the EDA copy (the diagnosis column stays,
# it is used as the hue in the plots below).
# Re-binding with `errors='ignore'` keeps the cell idempotent on re-run.
data_copy = data_copy.drop(columns=['id'], errors='ignore')
data_copy.head()

In [None]:
# Apply the seaborn theme used by every plot that follows.
sns.set_theme(style="darkgrid")

# Pairwise view of the three radius measurements, coloured by diagnosis.
# No `plt.figure(...)` call here: `sns.pairplot` creates its own figure, so
# a figure opened beforehand is only rendered as an empty extra output.
radius = data_copy[['radius_mean','radius_se','radius_worst','diagnosis']]
sns.pairplot(radius, hue='diagnosis', markers=["o", "s"])

In [None]:
# Pairwise view of the three area measurements, coloured by diagnosis.
area_features = ['area_mean', 'area_se', 'area_worst', 'diagnosis']
area = data_copy[area_features]
sns.pairplot(data=area, hue='diagnosis', markers=["o", "s"])

In [None]:
# Pairwise view of the three perimeter measurements, coloured by diagnosis.
perimeter_features = ['perimeter_mean', 'perimeter_se', 'perimeter_worst', 'diagnosis']
perimeter = data_copy[perimeter_features]
sns.pairplot(data=perimeter, hue='diagnosis', markers=["o", "s"])

In [None]:
# Pairwise view of the three texture measurements, coloured by diagnosis.
texture_features = ['texture_mean', 'texture_se', 'texture_worst', 'diagnosis']
texture = data_copy[texture_features]
sns.pairplot(data=texture, hue='diagnosis', markers=["o", "s"])

In [None]:
# Pairwise view of the three compactness measurements, coloured by diagnosis.
compactness_features = ['compactness_mean', 'compactness_se', 'compactness_worst', 'diagnosis']
compactness = data_copy[compactness_features]
sns.pairplot(data=compactness, hue='diagnosis', markers=["o", "s"])

In [None]:
# Pairwise view of the three concavity measurements, coloured by diagnosis.
concavity_features = ['concavity_mean', 'concavity_se', 'concavity_worst', 'diagnosis']
concavity = data_copy[concavity_features]
sns.pairplot(data=concavity, hue='diagnosis', markers=["o", "s"])

In [None]:
# Pairwise view of the three symmetry measurements, coloured by diagnosis.
symmetry_features = ['symmetry_mean', 'symmetry_se', 'symmetry_worst', 'diagnosis']
symmetry = data_copy[symmetry_features]
sns.pairplot(data=symmetry, hue='diagnosis', markers=["o", "s"])

In [None]:
# Pairwise view of the three fractal-dimension measurements, coloured by diagnosis.
fractal_dimension_features = [
    'fractal_dimension_mean',
    'fractal_dimension_se',
    'fractal_dimension_worst',
    'diagnosis',
]
fractal_dimension = data_copy[fractal_dimension_features]
sns.pairplot(data=fractal_dimension, hue='diagnosis', markers=["o", "s"])

In [None]:
# Pairwise view of the three smoothness measurements, coloured by diagnosis.
smoothness_features = ['smoothness_mean', 'smoothness_se', 'smoothness_worst', 'diagnosis']
smoothness = data_copy[smoothness_features]
sns.pairplot(data=smoothness, hue='diagnosis', markers=["o", "s"])

In [None]:
# Bar chart of the class balance. The series is passed by keyword because
# newer seaborn releases treat the first positional argument as `data`,
# not as the x variable.
ax = sns.countplot(x=y)

# Look the shares up by label instead of unpacking `value_counts()` by
# position: positional unpacking silently swaps the two numbers whenever
# the malignant class happens to be the more frequent one.
class_share = y.value_counts(normalize = True)
Benign = class_share.get('B', 0.0)
Malignant = class_share.get('M', 0.0)
print(f'The percentage of Benign case is : {Benign*100}\n\n')
print(f'The percentage of Malignant case is : {Malignant*100}\n\n')

In [None]:
# Wrap the target in a one-column DataFrame and display it.
y = pd.DataFrame(data=y)
y

## Feature Selection

In [None]:
# Annotated heatmap of the pairwise feature correlations. The upper
# triangle (diagonal included) is masked so each pair is shown only once.
feature_corr = data.corr()
mask = np.triu(np.ones_like(feature_corr))
plt.figure(figsize=(20, 20))
sns.heatmap(feature_corr, mask=mask, annot=True, cmap="Blues")

Eliminating every column whose correlation with an earlier column is 0.9 or higher.

In [None]:
# Drop every feature that is strongly correlated with a feature appearing
# earlier in the column order.
CORR_THRESHOLD = 0.9
corr = data.corr()

# The sign is ignored: a correlation of -0.9 makes a feature just as
# redundant as +0.9, so the comparison is done on absolute values.
is_strong = corr.abs().to_numpy() >= CORR_THRESHOLD

# Keep only the pairs (i, j) with i < j, then flag column j as soon as any
# earlier column i is strongly correlated with it.
flagged_by_earlier = np.triu(is_strong, k=1).any(axis=0)

# Boolean keep-mask: True marks a retained feature.
columns = ~flagged_by_earlier
selected_columns = data.columns[columns]

# `.copy()` gives `data` its own storage, so later column assignments do
# not write into a slice of the unfiltered frame.
data = data[selected_columns].copy()

In [None]:
# Boolean keep-mask built by the correlation filter: True marks a retained feature.
columns

In [None]:
# Shape after the correlation filter; the column count is the number of retained features.
data.shape

We can see that the number of columns has been reduced from 30 to 20.

In [None]:
# Per-feature distribution of benign vs. malignant samples.
# `sns.distplot` has been deprecated since seaborn 0.11; its documented
# replacement for a histogram plus KDE on a density scale is
# `histplot(..., kde=True, stat='density')`.
n_grid_cols = 4
# Size the grid from the number of features instead of a fixed 6x4 layout,
# so no empty row is reserved.
n_grid_rows = int(np.ceil(len(data.columns) / n_grid_cols))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(20, 5 * n_grid_rows), squeeze=False)
panels = axes.ravel()

# Row masks computed once rather than inside the loop.
is_benign = y['diagnosis'] == 'B'
is_malignant = y['diagnosis'] == 'M'

for panel, feature in zip(panels, data.columns):
    sns.histplot(data.loc[is_benign, feature], kde=True, stat='density',
                 kde_kws=dict(cut=3), color='#86994d', label='Benign', ax=panel)
    sns.histplot(data.loc[is_malignant, feature], kde=True, stat='density',
                 kde_kws=dict(cut=3), color='#ed6a4a', label='Malignant', ax=panel)
    panel.legend(loc='best')

# Hide any panel left over when the feature count is not a multiple of 4.
for panel in panels[len(data.columns):]:
    panel.set_visible(False)

fig.suptitle('Breast Cancer Data Analysis')
fig.tight_layout()
fig.subplots_adjust(top=0.95)
plt.show()

In [None]:
# First five rows of the feature table after the correlation filter.
data.head(n=5)

### We can see that there are some features that have to be normalized before we can use them for building our model.

First, we will encode the diagnosis column of the y dataset.

In [None]:
# Replace the diagnosis labels with the integer codes assigned by LabelEncoder.
le = LabelEncoder()
encoded_diagnosis = le.fit_transform(y['diagnosis'])
y['diagnosis'] = encoded_diagnosis
y.head()

We see that Malignant has been encoded as 1, while Benign has been encoded as 0.

### Now, we will scale the columns so that they can be used to build our model.

In [None]:
# Rescale the two listed features to the [0, 1] range.
scaled_columns = ["radius_mean", "texture_mean"]
min_max_scaler = MinMaxScaler()

# Work on an explicit copy: `data` may be a column slice of an earlier
# frame, and assigning into such a slice triggers pandas'
# SettingWithCopyWarning (which the blanket warnings filter would hide).
data = data.copy()
data[scaled_columns] = min_max_scaler.fit_transform(data[scaled_columns])

# NOTE(review): the scaler is fitted on every row, i.e. before the
# train/test split further down, so the held-out rows influence the
# min/max used for the transform. Consider fitting on the training rows
# only and applying `transform` to the test rows.
data.head()

## Splitting our dataset into training and testing samples.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    random_state=42,
    test_size=0.2,
)

### We will try logistic regression, Random forest classifier, Decision tree classifier and support vector classifier.
We will define a function named model_build to train the models and print the classification report.

In [None]:
def model_build(model, X_train, y_train, X_test, y_test=None):
    """Fit ``model``, print its evaluation metrics and plot its diagnostic curves.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict``, ``predict_proba`` and ``score``.
    X_train, y_train
        Training features and labels.
    X_test
        Held-out features.
    y_test : optional
        Held-out labels. When omitted, the notebook-level ``y_test`` variable
        is used, so existing four-argument calls keep working.

    Returns
    -------
    tuple
        ``(y_predicted, y_prob)``: the predicted labels for ``X_test`` and the
        class-probability matrix returned by ``predict_proba``.

    Raises
    ------
    NameError
        If ``y_test`` is neither passed nor defined at notebook level.
    """
    if y_test is None:
        # Backward compatibility with callers that rely on the global split.
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError(
                "name 'y_test' is not defined; pass y_test explicitly") from None

    # Flatten one-column label frames to 1-D so the estimators do not
    # receive a column vector where they expect a flat array.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    # Second column = probability of the class listed second in model.classes_.
    positive_prob = y_prob[:, 1]

    print("CLASSIFICATION REPORT CHART: ","\n\n",classification_report(y_test, y_predicted),"\n")
    print("CONFUSION MATRIX","\n\n",confusion_matrix(y_test,y_predicted),"\n")
    print('ROC-AUC: ',roc_auc_score(y_test,positive_prob),"\n")
    print("TOTAL ACCURACY IN TRAINING: ","\n",model.score(X_train,y_train),"\n")
    print("TOTAL ACCURACY IN TESTING: ","\n",model.score(X_test,y_test),"\n")

    # One row of two panels; a 2x2 grid would leave the lower half blank.
    fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(15, 5))

    fpr, tpr, _ = roc_curve(y_test, positive_prob)
    ax_roc.plot(fpr, tpr, label='ROC curve')
    ax_roc.plot([0, 1], [0, 1], 'g--', label='Random guess')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('ROC (Receiver operating characteristic) Curve')
    ax_roc.legend()

    precision, recall, thresholds = precision_recall_curve(y_test, positive_prob)
    # precision/recall carry one more entry than thresholds; drop the last
    # point so all three curves share the same x values.
    precision_t = precision[:-1]
    recall_t = recall[:-1]
    pr_sum = precision_t + recall_t
    # F1 is defined as 0 where precision + recall == 0; a plain division
    # would yield NaN and a runtime warning at those thresholds.
    f1_t = np.divide(2 * precision_t * recall_t, pr_sum,
                     out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)

    ax_pr.set_title("Precision-Recall-F1 vs Threshold Chart")
    ax_pr.plot(thresholds, precision_t, "b--", label="Precision")
    ax_pr.plot(thresholds, recall_t, "r--", label="Recall")
    ax_pr.plot(thresholds, f1_t, "g--", label="F1_score")
    ax_pr.set_ylabel("Precision, Recall")
    ax_pr.set_xlabel("Threshold")
    ax_pr.legend(loc="lower left")
    plt.show()

    return y_predicted, y_prob

## Logistic Regression Model

In [None]:
# Logistic regression with default settings, evaluated on the held-out split.
model_lr = LogisticRegression()
y_predicted_lr, y_prob_lr = model_build(
    model=model_lr, X_train=X_train, y_train=y_train, X_test=X_test)

## Random Forest Classifier

In [None]:
# Random forest evaluated on the held-out split.
# A fixed `random_state` makes the bootstrap samples and feature draws -
# and therefore every reported metric - reproducible across runs.
model_rfc = RandomForestClassifier(criterion = 'gini', random_state = 42)
y_predicted_rfc,y_prob_rfc = model_build(model_rfc, X_train, y_train, X_test)

## Decision Tree Classifier

In [None]:
# Single decision tree evaluated on the held-out split.
# A fixed `random_state` pins the estimator's internal randomness so the
# fitted tree and its metrics are the same on every run.
model_dtc = DecisionTreeClassifier(random_state = 42)
y_predicted_dtc,y_prob_dtc = model_build(model_dtc, X_train, y_train, X_test)

## Support Vector Classifier

In [None]:
# Support vector classifier evaluated on the held-out split.
# `probability=True` is required because model_build calls predict_proba;
# the probability calibration is randomised, so `random_state` is fixed to
# keep the probabilities (and the ROC-AUC derived from them) reproducible.
model_svc = SVC(probability=True, random_state=42)
y_predicted_svc,y_prob_svc = model_build(model_svc, X_train, y_train, X_test)

## Performing K-fold Cross Validation to find which model has the highest accuracy

In [None]:
# 10-fold cross-validated score of the logistic regression on the full table.
score_lr = cross_val_score(estimator=model_lr, X=data, y=y, cv=10)
print(f'The average score for Logistic Regression classifier is: {np.average(score_lr)}')

In [None]:
# 10-fold cross-validated score of the random forest on the full table.
score_rfc = cross_val_score(estimator=model_rfc, X=data, y=y, cv=10)
print(f'The average score for Random Forest classifier is: {np.average(score_rfc)}')

In [None]:
# 10-fold cross-validated score of the decision tree on the full table.
score_dtc = cross_val_score(estimator=model_dtc, X=data, y=y, cv=10)
print(f'The average score for Decision Tree classifier is: {np.average(score_dtc)}')

In [None]:
# 10-fold cross-validated score of the support vector classifier on the full table.
score_svc = cross_val_score(estimator=model_svc, X=data, y=y, cv=10)
print(f'The average score for Support Vector classifier is: {np.average(score_svc)}')

We see that, out of all the models, Random forest classifier performs the best.