In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root_dir, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing libraries 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Load the breast-cancer CSV into a DataFrame.
BC_df = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
# Report (rows, columns), then preview the first five rows as the cell output.
print(f"The shape of our dataset :{BC_df.shape}")
BC_df.head()

In [None]:
# Print the column index of the dataset so every column name is visible.
print(BC_df.columns)

In [None]:
# Drop the 'id' column: it is a row identifier, not a feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once 'id' is gone.
BC_df = BC_df.drop(columns=['id'], errors='ignore')


## Get a statistical insight about our dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset's columns.
BC_df.describe()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
BC_df.info()

## Check for missing values and deal with them

In [None]:
# Number of missing entries in each column (`isna` is the canonical name of `isnull`).
BC_df.isna().sum()

In [None]:
# Visualize the missing values with missingno: the matrix plot draws one
# column per DataFrame column so gaps (nulls) stand out.
# NOTE: missingno is imported here rather than in the import cell at the top.
import missingno as msno
msno.matrix(BC_df)
plt.show()

#### **Note:** The column `Unnamed: 32` has null values, while the other columns do not have any, so we will drop the whole `Unnamed: 32` column.

In [None]:
# Drop the empty 'Unnamed: 32' column.
# how='all' removes only columns in which every value is null, so a feature
# column with a stray missing value is not silently discarded (the default
# how='any' would drop it). Reassigning instead of inplace=True keeps the cell
# safe to re-run.
BC_df = BC_df.dropna(axis=1, how='all')

In [None]:
# Show the (rows, columns) shape of the dataset after dropping the 'Unnamed: 32' column.
BC_df.shape

## Exploratory Data analysis

> ###### Our data has `diagnosis` as the class label, where:
* B -> Benign  
* M -> Malignant

In [None]:
# Collect the distinct diagnosis labels that occur in the data.
diagnosis_set = {label for label in BC_df['diagnosis']}
print(diagnosis_set)

In [None]:
# Check for class imbalance: print the per-class counts and plot them.
BC_count = BC_df['diagnosis'].value_counts()
print(BC_count)
plt.figure(figsize=(10,5))
# Pass the column by keyword (positional data vectors are rejected by recent
# seaborn releases) and pin the bar order explicitly: without `order`, bars
# follow the order in which labels first appear in the data, so the
# hard-coded tick labels below could end up on the wrong bars.
g = sns.countplot(x='diagnosis', data=BC_df, order=['B', 'M'])
g.set_xticklabels(['Benign','Malignant'])
plt.show()

## Define the features (X) and the class label (y)

In [None]:
# Feature matrix: every column except the 'diagnosis' target.
X = BC_df.drop(labels='diagnosis', axis=1)
X

In [None]:
# Target vector: the diagnosis label of each sample.
y = BC_df.loc[:, 'diagnosis']
y

## Dealing with the imbalanced dataset
* Benign -> 357
* Malignant -> 212

In [None]:
# Balance the two classes by oversampling with SMOTE from imbalanced-learn.
# NOTE(review): the whole dataset is resampled here, before the train/test
# split performed later in the notebook, so synthetic rows may be interpolated
# from samples that end up in the test set. Consider resampling the training
# split only.
import imblearn
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
X_res, y_res = smote.fit_resample(X=X, y=y)
class_counts = y_res.value_counts()
print('Resampled dataset shape %s' % class_counts)



In [None]:
# Print the resampled labels returned by smote.fit_resample.
print(y_res)

## Get the data ready for training it via the model

In [None]:
# Encode the string labels as integers: B -> 0, M -> 1.
# NOTE: this cell overwrites y_res with its encoded form, so it should be run
# only once after the resampling cell.
enoder = LabelEncoder()
enoder.fit(y_res)
y_res = enoder.transform(y_res)
y_res

## Splitting the dataset into a training set (80%) and a testing set (20%)

In [None]:
# Hold out 20% of the resampled data for testing; the fixed seed keeps the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    random_state=0,
)

# Report the shape and the number of samples of each split.
print(f'Shape of the training set: {x_train.shape}')
print(f'Size of the training set: {x_train.shape[0]}')

print(f'Shape of the testing set: {x_test.shape}')
print(f'Size of the testing set: {x_test.shape[0]}')

## Preprocessing our training set

In [None]:
# Standardize the features. The scaler is fit on the training split only and
# then applied to both splits, so test-set statistics never influence training.
scaler = StandardScaler()
scaler.fit(x_train)
train_scaled, test_scaled = (scaler.transform(split) for split in (x_train, x_test))


## Build the model 

In [None]:
# Feed-forward binary classifier: two hidden ReLU layers of 32 units each and
# a single sigmoid output unit.
# The input width is read from the scaled training matrix instead of being
# hard-coded to 30, so the model still builds if the feature set changes.
n_features = train_scaled.shape[1]
model = Sequential([
    Dense(32, activation='relu', input_shape=(n_features,)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Configure training: SGD optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)
# Train for 100 epochs with mini-batches of 32 samples.
model.fit(x=train_scaled, y=y_train, batch_size=32, epochs=100)
# Save the trained model to an HDF5 file in the working directory.
model.save('model_breast_cancer.h5')

In [None]:
# Reload the saved model from disk and score it on the held-out test split.
pretrained_model = keras.models.load_model('./model_breast_cancer.h5')
eval_score = pretrained_model.evaluate(test_scaled, y_test)
# evaluate() returns the loss followed by the compiled metrics, in that order.
loss_value, accuracy_value = eval_score[0], eval_score[1]
print(f'The loss of the evaluation : {loss_value!s}')
print(f'The accuracy of the evaluation : {accuracy_value!s}')


## Predict on the test set

In [None]:
# Predict class labels for the test set.
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the documented replacement is to threshold the predicted
# probabilities at 0.5, which yields the same (n_samples, 1) int32 array of
# 0/1 labels.
y_pred = (pretrained_model.predict(test_scaled) > 0.5).astype("int32")
y_pred

## Evaluating the performance of our model

In [None]:
# Per-class precision / recall / F1 as a dictionary; the names follow the
# encoded label order (0 = benign, 1 = malignant).
target_list = ['benign', 'malignant']
classification_report(
    y_test,
    y_pred,
    target_names=target_list,
    output_dict=True,
)

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
CV = confusion_matrix(y_true=y_test,y_pred=y_pred)
# Rows/columns follow the encoded label order: 0 = benign, 1 = malignant.
class_names = ['benign', 'malignant']
# fmt='d' prints the cell counts as plain integers; the default '.2g' format
# switches to scientific notation once a count reaches three digits.
sns.heatmap(CV, annot=True, fmt='d',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()