# Breast Cancer Prediction

**Task**

Predict `diagnosis` (M = malignant, B = benign)

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline


import warnings
warnings.filterwarnings("ignore")

## Data Overview

In [None]:
# Read the CSV with its `id` column as the row index, then preview the first rows.
df = pd.read_csv(
    "../input/breast-cancer-wisconsin-data/data.csv",
    index_col='id',
)
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

## Exploratory Data Analysis

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Distinct class labels present in the target column.
df['diagnosis'].unique()

* The data has one categorical column and 32 numeric columns;
* Column `diagnosis` is categorical. It's the target column and has class labels M and B (M = malignant, B = benign).

Find Missing Values (NaNs):

In [None]:
# Per-column flag: True where the column holds at least one missing value.
df.isna().any()

In [None]:
# Number of missing values in `Unnamed: 32`, shown next to the total row count.
# `.isna().count()` would count the entries of the boolean mask (i.e. every row,
# whether it is missing or not); `.sum()` counts only the True (missing) entries.
df['Unnamed: 32'].isna().sum(), len(df)

The `Unnamed: 32` column contains only null values, so it carries no information and should be dropped.

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary statistics restricted to the object-typed columns.
df.describe(include=['object'])

In [None]:
# Number of samples per diagnosis label (class balance).
df['diagnosis'].value_counts()

## Preparing Data Set

In [None]:
# Remove the `Unnamed: 32` column.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# Encode the target as integers: 'M' -> 1, every other label -> 0.
# The dtype guard keeps the cell idempotent. Without it, a second run would
# compare the already-encoded 1/0 values against 'M' and silently turn the
# whole column into 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

In [None]:
# Preview the frame after the preparation steps above.
df.head()

## Data Visualization

In [None]:
# Bar chart of the number of samples per diagnosis value.
ax = sns.countplot(data=df, x='diagnosis', palette='pastel')
ax.set_title('Breast Cancer Diagnosis')
ax.grid(axis='y')

**Features histograms**

In [None]:
# One interactive histogram per column of the frame.
for column_name in df:
    fig = px.histogram(data_frame=df, x=column_name)
    fig.show()

**Correlation matrix**

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize=(30, 20))
sns.heatmap(corr, annot=True, fmt='.2f', ax=ax);

Finding highly correlated features

In [None]:
# Greedy scan of the correlation matrix: for each column that has not been
# collected yet, collect (and print) every other column whose correlation with
# it exceeds 0.75 and that has not been collected either.
CorField = []
for col in corr.columns:
    if col in CorField:
        continue
    for other in corr.index[corr[col] > 0.75]:
        if other == col or other in CorField:
            continue
        CorField.append(other)
        print (col, other, corr.loc[other, col])

In [None]:
# Keep the columns whose absolute correlation with `diagnosis` exceeds the
# threshold and show how they correlate with each other.
threshold = 0.75
filter_features = corr["diagnosis"].abs() > threshold
corr_features = corr.columns[filter_features].tolist()

ax = sns.heatmap(df[corr_features].corr(), annot=True, fmt=".2f")
ax.set_title("Correlation Between Features w 0.75 Threshold")
plt.show()

**Pairplots of highly correlated features**

In [None]:
# Scatter-plot matrix of the selected columns, coloured by diagnosis.
sns.pairplot(data=df.loc[:, corr_features], hue="diagnosis")
plt.show()

We can see that these features are linearly dependent

**Boxplots**

In [None]:
# One boxplot per column (skipping the first column of the frame), laid out
# on a 10 x 3 grid.
plt.figure(figsize=(20, 35))
for plotnumber, column in enumerate(df.columns[1:], start=1):
    ax = plt.subplot(10, 3, plotnumber)
    sns.boxplot(data=df, x=column, palette='pastel', ax=ax)
    ax.set_xlabel(column)
plt.show()

There are a lot of outliers in our data

**Violinplots**

Splitting the data (mean, standard error and "worst")

In [None]:
# Group the column names by the token that follows their last underscore.
mean_list = [name for name in df.columns if name.split('_')[-1] == 'mean']
se_list = [name for name in df.columns if name.split('_')[-1] == 'se']
worst_list = [name for name in df.columns if name.split('_')[-1] == 'worst']

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the `*_mean` columns so they share one y-axis, reshape to long
# format and draw split violins (one half per diagnosis value).
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[mean_list])
feat_scaled = pd.DataFrame(scaled_values, index=df.index, columns=mean_list)
data = pd.concat([df['diagnosis'], feat_scaled], axis=1)
df_melt = data.melt(id_vars=['diagnosis'], value_vars=mean_list)

fig, ax = plt.subplots(1, 1, figsize=(20, 8), dpi=300)
sns.violinplot(
    data=df_melt,
    x="variable",
    y="value",
    hue="diagnosis",
    split=True,
    inner="quart",
    palette='pastel',
    ax=ax,
)

In [None]:
# Same split-violin view for the standardised `*_se` columns.
scaled_values = scaler.fit_transform(df[se_list])
feat_scaled = pd.DataFrame(scaled_values, index=df.index, columns=se_list)
data = pd.concat([df['diagnosis'], feat_scaled], axis=1)
df_melt = data.melt(id_vars=['diagnosis'], value_vars=se_list)

fig, ax = plt.subplots(1, 1, figsize=(20, 8), dpi=300)
sns.violinplot(
    data=df_melt,
    x="variable",
    y="value",
    hue="diagnosis",
    split=True,
    inner="quart",
    palette='pastel',
    ax=ax,
)

In [None]:
# Same split-violin view for the standardised `*_worst` columns.
scaled_values = scaler.fit_transform(df[worst_list])
feat_scaled = pd.DataFrame(scaled_values, index=df.index, columns=worst_list)
data = pd.concat([df['diagnosis'], feat_scaled], axis=1)
df_melt = data.melt(id_vars=['diagnosis'], value_vars=worst_list)

fig, ax = plt.subplots(1, 1, figsize=(20, 8), dpi=300)
sns.violinplot(
    data=df_melt,
    x="variable",
    y="value",
    hue="diagnosis",
    split=True,
    inner="quart",
    palette='pastel',
    ax=ax,
)

## Data Preparation

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 hold-out split, stratified on the target so both parts keep the same
# class ratio; the fixed seed makes the split reproducible.
features = df.drop(columns=['diagnosis'])
target = df['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training part only, then apply the same
# transformation to both parts.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Predictive Model 

#### KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score

In [None]:
# Baseline: k-nearest-neighbours with default hyper-parameters, evaluated on
# the held-out test set.
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

reported_metrics = (
    ('Accuracy:', accuracy_score),
    ('F1 score:', f1_score),
    ('Recall:', recall_score),
    ('Precision:', precision_score),
)
for metric_label, metric_fn in reported_metrics:
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Confusion matrix of the baseline KNN predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True Value')
plt.show()

In [None]:
# ROC curve of the baseline KNN, built from the predicted probability in
# column 1 of predict_proba.
probs = knn.predict_proba(X_test_scaled)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='b', label='AUC = %0.4f' % roc_auc)
# Chance-level diagonal; it has no label, so it stays out of the legend.
ax.plot([0, 1], [0, 1], color='r', linestyle='--')
ax.set(
    title='Receiver Operating Characteristic',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    xlim=(0, 1),
    ylim=(0, 1),
)
ax.legend(loc='lower right')
plt.show()

#### Grid-Search

In [None]:
%%time
from sklearn.model_selection import GridSearchCV

param_grid = {
        'n_neighbors': range(1, 100),
        'p': range(1, 10)
}

knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_scaled, y_train)

print("Best CV score: {:.3f}, best CV n_neighbors: {}, best CV p: {}".format(
    grid_search.best_score_, grid_search.best_estimator_.n_neighbors, grid_search.best_estimator_.p)
) 


test_predictions = grid_search.best_estimator_.predict(X_test_scaled)
print("Resulting test score: {:.3f}".format(f1_score(test_predictions, y_test)))

In [None]:
# KNN with fixed hyper-parameters, evaluated on the held-out test set.
# NOTE(review): n_neighbors=5 and p=1 are hard-coded; presumably they were
# copied from the grid-search output. Confirm they still match.
knn = KNeighborsClassifier(n_neighbors=5, p=1).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

reported_metrics = (
    ('Accuracy:', accuracy_score),
    ('F1 score:', f1_score),
    ('Recall:', recall_score),
    ('Precision:', precision_score),
)
for metric_label, metric_fn in reported_metrics:
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Confusion matrix of the current KNN predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True Value')
plt.show()

In [None]:
# ROC curve of the current KNN model, built from the predicted probability in
# column 1 of predict_proba.
probs = knn.predict_proba(X_test_scaled)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='b', label='AUC = %0.4f' % roc_auc)
# Chance-level diagonal; it has no label, so it stays out of the legend.
ax.plot([0, 1], [0, 1], color='r', linestyle='--')
ax.set(
    title='Receiver Operating Characteristic',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    xlim=(0, 1),
    ylim=(0, 1),
)
ax.legend(loc='lower right')
plt.show()

#### LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# 5-fold CV search over the inverse regularisation strength C, selecting on F1.
param_grid = {
    'C': range(1, 200),
}

# Tune the same estimator configuration that the final model cell trains
# (random_state=42, max_iter=1000). With the default max_iter the solver can
# stop before converging, and the notebook silences the resulting warnings, so
# C would be chosen from partially fitted models.
# NOTE: the final model cell hard-codes C; update it if the best C printed
# here differs.
clf_lr = LogisticRegression(random_state=42, max_iter=1000)

grid_search = GridSearchCV(clf_lr, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_scaled, y_train)

print("Best CV score: {:.3f}, best CV C: {}".format(
    grid_search.best_score_, grid_search.best_params_['C'])
)

# best_estimator_ is the winning configuration refitted on the full training set.
test_predictions = grid_search.best_estimator_.predict(X_test_scaled)
# f1_score takes (y_true, y_pred) in that order.
print("Resulting test score: {:.3f}".format(f1_score(y_test, test_predictions)))

In [None]:
# Logistic regression on the standardised features, with a per-class report on
# the held-out test set.
# NOTE(review): C=2 is hard-coded; presumably it was copied from the
# grid-search output. Confirm it still matches.
clf_lr = LogisticRegression(C=2, max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

y_predicted_lr = clf_lr.predict(X_test_scaled)
report = classification_report(y_test, y_predicted_lr, zero_division=0)
print(report)

In [None]:
# Confusion matrix of the logistic-regression predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, y_predicted_lr)
ax = sns.heatmap(cm, annot=True, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True Value')
plt.show()

In [None]:
# ROC curve of the logistic regression on standardised features, built from
# the predicted probability in column 1 of predict_proba.
probs = clf_lr.predict_proba(X_test_scaled)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='b', label='AUC = %0.4f' % roc_auc)
# Chance-level diagonal; it has no label, so it stays out of the legend.
ax.plot([0, 1], [0, 1], color='r', linestyle='--')
ax.set(
    title='Receiver Operating Characteristic',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    xlim=(0, 1),
    ylim=(0, 1),
)
ax.legend(loc='lower right')
plt.show()

**Dropping highly correlated features**

In [None]:
# Greedy selection of columns to drop: walk the correlation matrix column by
# column; for every column that is still kept, mark each other column whose
# correlation with it exceeds 0.85 (and that is not marked yet) for removal.
# NOTE(review): `corr` is the matrix computed earlier from the full frame, and
# this rebinds the `corr_features` name used by the earlier plotting cells.
corr_features = []
for kept in corr.columns:
    if kept in corr_features:
        continue
    for candidate in corr.index[corr[kept] > 0.85]:
        if candidate == kept or candidate in corr_features:
            continue
        corr_features.append(candidate)
corr_features

In [None]:
# Remove the selected highly correlated columns from both parts of the split.
X_train_log = X_train.drop(columns=corr_features)
X_test_log = X_test.drop(columns=corr_features)

In [None]:
# 5-fold CV search over C on the reduced, unscaled feature set, selecting on F1.
param_grid = {
    'C': range(1, 200),
}

# Tune the same estimator configuration that the final model cell trains
# (random_state=42, max_iter=1000). With the default max_iter the solver can
# stop before converging, and the notebook silences the resulting warnings, so
# C would be chosen from partially fitted models.
# NOTE: the final model cell hard-codes C; update it if the best C printed
# here differs.
clf_lr = LogisticRegression(random_state=42, max_iter=1000)

grid_search = GridSearchCV(clf_lr, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_log, y_train)

print("Best CV score: {:.3f}, best CV C: {}".format(
    grid_search.best_score_, grid_search.best_params_['C'])
)

# best_estimator_ is the winning configuration refitted on the full training set.
test_predictions = grid_search.best_estimator_.predict(X_test_log)
# f1_score takes (y_true, y_pred) in that order.
print("Resulting test score: {:.3f}".format(f1_score(y_test, test_predictions)))

In [None]:
# Logistic regression on the reduced, unscaled feature set, with a per-class
# report on the held-out test set.
# NOTE(review): C=159 is hard-coded; presumably it was copied from the
# grid-search output. Confirm it still matches.
clf_lr = LogisticRegression(C=159, max_iter=1000, random_state=42).fit(
    X_train_log, y_train
)

y_predicted_lr = clf_lr.predict(X_test_log)
report = classification_report(y_test, y_predicted_lr, zero_division=0)
print(report)

In [None]:
# Confusion matrix of the logistic-regression predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, y_predicted_lr)
ax = sns.heatmap(cm, annot=True, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True Value')
plt.show()

In [None]:
# ROC curve of the logistic regression on the reduced feature set, built from
# the predicted probability in column 1 of predict_proba.
probs = clf_lr.predict_proba(X_test_log)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='b', label='AUC = %0.4f' % roc_auc)
# Chance-level diagonal; it has no label, so it stays out of the legend.
ax.plot([0, 1], [0, 1], color='r', linestyle='--')
ax.set(
    title='Receiver Operating Characteristic',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    xlim=(0, 1),
    ylim=(0, 1),
)
ax.legend(loc='lower right')
plt.show()

**Without highly correlated features and with Standardization**

In [None]:
# Standardise the reduced feature set: statistics come from the training part
# only and are then applied to both parts.
scaler = StandardScaler().fit(X_train_log)

X_train_log_scaled = scaler.transform(X_train_log)
X_test_log_scaled = scaler.transform(X_test_log)

In [None]:
# 5-fold CV search over C on the reduced, standardised feature set, selecting on F1.
param_grid = {
    'C': range(1, 200),
}

# Tune the same estimator configuration that the final model cell trains
# (random_state=42, max_iter=1000). With the default max_iter the solver can
# stop before converging, and the notebook silences the resulting warnings, so
# C would be chosen from partially fitted models.
# NOTE: the final model cell hard-codes C; update it if the best C printed
# here differs.
clf_lr = LogisticRegression(random_state=42, max_iter=1000)

grid_search = GridSearchCV(clf_lr, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_log_scaled, y_train)

print("Best CV score: {:.3f}, best CV C: {}".format(
    grid_search.best_score_, grid_search.best_params_['C'])
)

# best_estimator_ is the winning configuration refitted on the full training set.
test_predictions = grid_search.best_estimator_.predict(X_test_log_scaled)
# f1_score takes (y_true, y_pred) in that order.
print("Resulting test score: {:.3f}".format(f1_score(y_test, test_predictions)))

In [None]:
# Logistic regression on the reduced, standardised feature set, with a
# per-class report on the held-out test set.
# NOTE(review): C=1 is hard-coded; presumably it was copied from the
# grid-search output. Confirm it still matches.
clf_lr = LogisticRegression(C=1, max_iter=1000, random_state=42).fit(
    X_train_log_scaled, y_train
)

y_predicted_lr = clf_lr.predict(X_test_log_scaled)
report = classification_report(y_test, y_predicted_lr, zero_division=0)
print(report)

In [None]:
# Confusion matrix of the logistic-regression predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, y_predicted_lr)
ax = sns.heatmap(cm, annot=True, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True Value')
plt.show()

In [None]:
# ROC curve of the logistic regression on the reduced, standardised feature
# set, built from the predicted probability in column 1 of predict_proba.
probs = clf_lr.predict_proba(X_test_log_scaled)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='b', label='AUC = %0.4f' % roc_auc)
# Chance-level diagonal; it has no label, so it stays out of the legend.
ax.plot([0, 1], [0, 1], color='r', linestyle='--')
ax.set(
    title='Receiver Operating Characteristic',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    xlim=(0, 1),
    ylim=(0, 1),
)
ax.legend(loc='lower right')
plt.show()

## Conclusion

The model with the best score is LogisticRegression  
Achieved roc-auc: 0.99