# Breast Cancer Analysis And Prediction

![](https://miro.medium.com/max/1400/1*pxFCmhRFTighUn88baLcSA.png)

Resources

1. <a href="https://www.kaggle.com/vincentlugat/breast-cancer-analysis-and-prediction#Breast-Cancer-Analysis-and-Prediction">Notebook</a>
2. <a href="https://medium.com/analytics-vidhya/breast-cancer-diagnostic-dataset-eda-fa0de80f15bd">Medium</a>


## Attribute Information:

---

* 1) ID number
* 2) Diagnosis (M = malignant, B = benign)
* 3-32)

Ten real-valued features are computed for each cell nucleus:

* a) radius (mean of distances from center to points on the perimeter)
* b) texture (standard deviation of gray-scale values)
* c) perimeter
* d) area
* e) smoothness (local variation in radius lengths)
* f) compactness (perimeter^2 / area - 1.0)
* g) concavity (severity of concave portions of the contour)
* h) concave points (number of concave portions of the contour)
* i) symmetry
* j) fractal dimension ("coastline approximation" - 1)

The mean, standard error and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.

# Importing Libraries

---

In [None]:
# File operation
import os
import numpy as np
import pandas as pd
import missingno as msno

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix

# Modelling
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Path to the dataset CSV (despite the name, this is a file path, not a directory).
base_dir = "../input/breast-cancer-wisconsin-data/data.csv"

In [None]:
# Load the dataset into a DataFrame.
df = pd.read_csv(base_dir)

# Understanding The Data

---

In [None]:
# Preview the first rows, transposed so that every column is shown as a row.
df.head().T

In [None]:
# (number of rows, number of columns) of the raw data.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Check the missing values with missingno's matrix plot.
# The trailing semicolon keeps the notebook from echoing the call's return value.

msno.matrix(df, figsize=(15,5));

In [None]:
# Drop unnecessary columns: "id" is an identifier, not a feature.
# NOTE(review): "Unnamed: 32" is presumably an empty trailing column produced by
# the CSV export — confirm against the df.info() output.

df = df.drop(["id", "Unnamed: 32"], axis=1)

In [None]:
# Bar chart of the number of samples per diagnosis class.
df.diagnosis.value_counts().plot(kind="bar", 
                                 title="Counts of Diagnosis Types", 
                                 xlabel="Type", 
                                 ylabel="Count", 
                                 colormap="YlGn_r");

In [None]:
# Splitting columns by fields for better analysis.
# NOTE(review): the slices are positional — they assume column 0 is the diagnosis,
# followed by ten "mean", ten "se" and ten "worst" features; confirm against the
# actual column order of df.

mean_columns = df.iloc[:, 1:11]
se_columns = df.iloc[:, 11: 21]
worst_columns = df.iloc[:, 21:31]

In [None]:
# Summary statistics of the "mean" features, one row per feature.
mean_columns.describe().T

In [None]:
# Summary statistics of the "se" (standard error) features, one row per feature.
se_columns.describe().T

In [None]:
# Summary statistics of the "worst" features, one row per feature.
worst_columns.describe().T

In [None]:
# Building blocks for feature column names, which are looked up below as
# "<column>_<field>" (e.g. "radius_mean").
fields = ["mean", "se", "worst"]
columns = ["radius", "texture", "perimeter", "area", "smoothness", "compactness", "concavity", "concave points", "symmetry", "fractal_dimension"]

# Relationship Between Features & Diagnosis

---

In [None]:
# Grid of histograms: one row per base feature, one column per statistic
# (mean / se / worst), each histogram split by diagnosis.
fig, axs = plt.subplots(10, 3, figsize=(25, 45))

for row, feature in enumerate(columns):
    for position, field in enumerate(fields):
        sns.histplot(
            df,
            x=f"{feature}_{field}",
            hue="diagnosis",
            element="poly",
            stat="count",
            palette=["#7158e2", "#32ff7e"],
            ax=axs[row][position],
        )

# Relationship Between Features & Features

---

In [None]:
def heat(data, color, size):
    """Plot the correlation matrix of ``data`` as an annotated heatmap.

    The upper triangle (diagonal included) is masked, so each pair of
    columns appears only once.

    Parameters
    ----------
    data : object with a ``.corr()`` method (the notebook passes DataFrames)
        Source of the correlation matrix.
    color : passed to ``sns.heatmap`` as ``cmap``
    size : passed to ``plt.figure`` as ``figsize``
    """
    corr = data.corr()

    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(corr)] = True

    plt.figure(figsize=size)
    sns.heatmap(corr, mask=mask, annot=True, cmap=color)
    plt.show()

In [None]:
# Correlations among the "mean" features.
heat(mean_columns, "Reds", (10,8))

In [None]:
# Correlations among the "se" features.
heat(se_columns, "Greens", (10,8))

In [None]:
# Correlations among the "worst" features.
heat(worst_columns, "Blues", (10,8))

### With the heatmaps, we can see correlations between these columns:
---

* *perimeter - radius*
* *area - radius*
* *area - perimeter*
* *concavity - compactness*
* *fractal_dimension - compactness*
* *concave points - compactness*
* *concave points - concavity*
* *concave points - radius*
* *concave points - perimeter*
* *concave points - area*

In [None]:
def scatter_columns(feature1, feature2, title):
    """Show a scatter plot of ``feature1`` (x) against ``feature2`` (y).

    One marker trace is added per statistic, using the module-level ``df``
    columns ``"<feature>_<field>"`` for the first three entries of ``fields``.
    The traces are labelled "mean", "se" and "worst".

    Parameters
    ----------
    feature1, feature2 : str
        Base feature names (without the statistic suffix).
    title : str
        Figure title.
    """
    layout = dict(
        title=title,
        width=600,
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
    )

    fig = go.Figure()
    fig.update_layout(**layout)

    for position, label in enumerate(("mean", "se", "worst")):
        suffix = fields[position]
        trace = go.Scatter(
            x=df[feature1 + "_" + suffix],
            y=df[feature2 + "_" + suffix],
            mode="markers",
            name=label,
        )
        fig.add_trace(trace)

    fig.show()

In [None]:
# Perimeter (x) against radius (y) for each statistic.
scatter_columns("perimeter", "radius", "Perimeter & Radius")

In [None]:
# Area (x) against radius (y) for each statistic.
scatter_columns("area", "radius", "Area & Radius")

In [None]:
# Area (x) against perimeter (y) for each statistic.
scatter_columns("area", "perimeter", "Area & Perimeter")

In [None]:
# Concavity (x) against compactness (y) for each statistic.
scatter_columns("concavity", "compactness", "Concavity & Compactness")

In [None]:
# Fractal dimension (x) against compactness (y) for each statistic.
scatter_columns("fractal_dimension", "compactness", "Fractal Dimension & Compactness")

In [None]:
# Five stacked scatter plots: each feature in `cols` (x) against
# "concave points" (y), with one marker trace per statistic.
cols = ["radius", "perimeter", "area", "compactness", "concavity"]

fig = make_subplots(
    rows=5,
    cols=1,
    subplot_titles=[
        "Radius & Concave Points",
        "Perimeter & Concave Points",
        "Area & Concave Points",
        "Compactness & Concave Points",
        "Concavity & Concave Points",
    ],
)

# The figure-level layout does not depend on the subplot, so it is set once.
fig.update_layout(
    width=900,
    height=1600,
    margin=dict(l=40, r=40, t=30, b=0),
)

for row, feature in enumerate(cols, start=1):
    fig.update_xaxes(title_text=feature, row=row, col=1)
    fig.update_yaxes(title_text="concave points", row=row, col=1)

    for position, label in enumerate(("mean", "se", "worst")):
        suffix = fields[position]
        fig.add_trace(
            go.Scatter(
                x=df[feature + "_" + suffix],
                y=df["concave points" + "_" + suffix],
                mode="markers",
                name=label,
            ),
            row=row,
            col=1,
        )

fig.show()

# Preprocessing

---

In [None]:
# Preview the data before preprocessing.
df.head()

### From Outliers to Mean

---

*We can replace the outliers in each column with that column's mean for better modeling*

In [None]:
# Replace the outliers of every feature column with that column's mean.
# A value is an outlier when it lies more than 1.5 * IQR below the first
# quartile or above the third quartile.
for column in columns:
    for field in fields:
        name = f"{column}_{field}"
        values = df[name]

        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers = (values < lower_bound) | (values > upper_bound)

        # Assign through .loc so the write always lands in df itself. The
        # chained form df[name][outliers] = ... may write to a temporary copy
        # and is a silent no-op under pandas Copy-on-Write.
        # The mean is taken over the column as it is before the replacement.
        df.loc[outliers, name] = values.mean()

### Split The Data

---

In [None]:
x = df.drop("diagnosis", axis=1)   # our features
y = df.diagnosis                   # our label

In [None]:
# Encode the diagnosis label as 0/1 for modeling. get_dummies creates one
# indicator column per class ("B" and "M"); dtype=int keeps the indicators as
# integers, because pandas >= 2.0 would otherwise return booleans.

y = pd.get_dummies(y, dtype=int)
# Keep only the "M" column (1 = malignant, 0 = benign); "B" is its complement
# and therefore redundant.
y = y.drop("B", axis=1)

In [None]:
# Preview the feature matrix.
x.head()

In [None]:
# Preview the encoded label.
y.head()

### Standard Scaler

---

> Standardize features by removing the mean and scaling to unit variance

In [None]:
# Standardize every feature column and rebind x to the scaled values.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split, so test-set statistics influence the training features. Consider
# fitting on the training split only and transforming the test split with it.
scaler = StandardScaler()
x = scaler.fit_transform(x)

### Train-Test Split

---

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of every split.
for label, split in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label}: {split.shape}")

In [None]:
# Inspect the scaled training features.
x_train

# Modeling

---

In [None]:
!pip install lightgbm
!pip install xgboost

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Classifier classes (not instances) to benchmark; autoML instantiates each
# one with its default arguments.
algorithms = [LogisticRegression, 
              RidgeClassifier, 
              SVC, 
              LinearSVC, 
              RandomForestClassifier,
              KNeighborsClassifier,
              GaussianNB, 
              Perceptron, 
              SGDClassifier, 
              DecisionTreeClassifier,
              AdaBoostClassifier,
              GradientBoostingClassifier,
              LGBMClassifier,
              XGBClassifier,
             ]

In [None]:
# Empty results table; one row per benchmarked model is added later.
df_algorithms = pd.DataFrame(columns=["Model", "Train Accuracy", "Test Accuracy"])

In [None]:
def autoML(algorithm):
    """Fit a default-configured classifier and report its accuracy.

    The estimator is trained on the module-level ``x_train`` / ``y_train``
    and evaluated on both the training split and ``x_test`` / ``y_test``.

    Parameters
    ----------
    algorithm : class
        Estimator class; it is instantiated without arguments.

    Returns
    -------
    tuple
        ``(model_name, train_acc, test_acc)`` where ``model_name`` is the
        class name, ``train_acc`` is ``model.score`` on the training split
        and ``test_acc`` is ``accuracy_score`` on the test predictions.
    """
    estimator = algorithm().fit(x_train, y_train)

    predictions = estimator.predict(x_test)

    return (
        algorithm.__name__,
        estimator.score(x_train, y_train),
        accuracy_score(y_test, predictions),
    )

In [None]:
# Benchmark every algorithm and add one result row per model.
# DataFrame.append was removed in pandas 2.0, so the rows are collected in a
# list and concatenated onto the results table in one step.
rows = []

for alg in algorithms:
    model_name, train_acc, test_acc = autoML(alg)
    rows.append({"Model": model_name,
                 "Train Accuracy": train_acc,
                 "Test Accuracy": test_acc})

df_algorithms = pd.concat(
    [df_algorithms, pd.DataFrame(rows, columns=df_algorithms.columns)],
    ignore_index=True,
)

In [None]:
# Rank the models by test accuracy, breaking ties by train accuracy (both descending).
df_algorithms.sort_values(by=["Test Accuracy", "Train Accuracy"],ascending=False)

# Tuning

---

In [None]:
# Base estimator for the hyper-parameter search.
model = LinearSVC()

In [None]:
# Hyper-parameter grid for the search; every combination of the values is tried.
# NOTE(review): LinearSVC does not accept every penalty/loss combination (its
# documentation lists penalty="l1" with loss="hinge" as unsupported), so some
# grid points are expected to fail to fit — confirm how the search scores them.
parameters = {"penalty": ["l1","l2"],
              "loss": ["hinge", "squared_hinge"], 
              "C": [0.001, 0.01, 0.1, 1, 10],
              "multi_class": ["ovr", "crammer_singer"], 
              }

In [None]:
# 5-fold cross-validated grid search over `parameters`, run with n_jobs=-1.
cv_model = GridSearchCV(estimator=model, param_grid=parameters, cv=5, n_jobs=-1)
cv_model = cv_model.fit(x_train, y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
cv_model.best_params_

In [None]:
# Fit a fresh LinearSVC with the hyper-parameters selected by the grid search.
# Reading them from cv_model.best_params_ (instead of hand-copying the values
# printed by an earlier run) keeps this cell in sync with the search result.
tuned_model = LinearSVC(**cv_model.best_params_).fit(x_train, y_train)

In [None]:
# Accuracy of the tuned model on the held-out test split.
y_pred = tuned_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Per-class precision / recall / F1. classification_report expects the true
# labels first and the predictions second; swapping them exchanges precision
# and recall.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the tuned model on the test split.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(tuned_model,
                                      x_test,
                                      y_test,
                                      display_labels=["Benign", "Malign"],
                                      cmap=plt.cm.Blues,
                                     )

### Thanks for reading! Hope this notebook helped