In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix
import seaborn as sns

import imblearn
from imblearn.over_sampling import SMOTE

import keras
from keras.models import Sequential
from keras.layers import Dense
%matplotlib inline

In [None]:
# Load the breast-cancer-wisconsin CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

In [None]:
# Check empty cells: number of missing values per column.
# Printed explicitly because only the LAST expression of a cell is displayed,
# and this one is followed by further statements.
print(df.isnull().sum())

# Remove columns that are not used as features:
#   'Unnamed: 32' - presumably the all-empty column reported above (TODO confirm)
#   'id'          - dropped so it is not fed to the model as a feature
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
df = df.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

In [None]:
# View dataset: summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

In [None]:
# Flag every row that repeats an earlier row, then report how many there are
duplicate = df.duplicated()
n_duplicate_rows = duplicate.sum()
print(n_duplicate_rows)

In [None]:
# Pie chart of the class balance; slices are ordered and plotted counter-clockwise.
labels = 'Benign', 'Malignant'
counts = df['diagnosis'].value_counts()
# Look the counts up by label ('B' / 'M') rather than by position:
# value_counts() orders by frequency, so positional access silently swaps the
# slices if the class balance changes, and integer keys on a string-labelled
# Series are deprecated positional access in recent pandas.
num_benign = counts['B']
num_malignant = counts['M']
sizes = [num_benign, num_malignant]

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()


In [None]:
# Create function for feature selection
def feature_selection(df, min):
    """Drop numeric columns that are highly correlated with an earlier column.

    For every pair of numeric columns, the later column of the pair is dropped
    when the absolute Pearson correlation lies strictly between ``min`` and 1.
    Non-numeric columns (e.g. a string label column) are ignored when computing
    correlations and are always kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a reduced copy is returned.
    min : float
        Correlation threshold. The name shadows the builtin ``min`` inside this
        function; it is kept so existing keyword callers keep working.

    Returns
    -------
    (int, pandas.DataFrame)
        Number of columns remaining, and the reduced frame.
    """
    # Correlate numeric columns only: a string column makes DataFrame.corr()
    # raise on recent pandas versions.
    corr_matrix = df.select_dtypes(include="number").corr().abs()
    numeric_names = corr_matrix.columns

    # Collect columns by NAME. Positions in corr_matrix do not line up with
    # positions in df whenever df contains non-numeric columns.
    to_drop = []
    for i in range(len(numeric_names)):
        for j in range(i):  # lower triangle only: each pair is visited once
            # Upper bound kept from the original: pairs with |corr| == 1 are not dropped.
            if min < corr_matrix.iloc[i, j] < 1:
                to_drop.append(numeric_names[i])
                break  # one hit is enough to drop column i

    df = df.drop(columns=to_drop)
    input_shape = df.shape[1]
    return input_shape, df

In [None]:
# Run feature_selection with a correlation threshold of 0.95.
# Rebinds df to the reduced frame and input_shape to its number of columns.
input_shape, df = feature_selection(df, 0.95)

In [None]:
# Split dataset into data (features) and target (the 'diagnosis' label)
df_target = df['diagnosis']
df_data = df.drop(columns=['diagnosis'])
# Derive the network input width from the feature frame itself instead of
# decrementing the previous value, so re-running this cell gives the same result.
input_shape = df_data.shape[1]

In [None]:
# Dealing with unbalanced dataset
smote = SMOTE(random_state=0)
df_data,df_target = smote.fit_resample(df_data,df_target)
print(df_target.value_counts())

In [None]:
# Change M and B into 1 and B respectively
encoder = LabelEncoder().fit(df_target)
df_target = encoder.transform(df_target)

In [None]:
# Split dataset into train-set and test-set
train_data, test_data, train_target, test_target = train_test_split(df_data, df_target, test_size=.2, random_state=0)

In [None]:
# Scale values in the train-set and test-set
scaler = StandardScaler().fit(train_data)
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)

In [None]:
# Build the network layer by layer: three ReLU hidden layers (32, 16, 16 units)
# followed by a single sigmoid unit for the binary output.
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(input_shape,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Configure the model for binary classification, train it, and persist it to disk
model.compile(
    loss='binary_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)
model.fit(
    x=train_data,
    y=train_target,
    batch_size=32,
    epochs=200,
)
model.save('model_breast_cancer.h5')

In [None]:
# Calculate loss and accuracy on the test-set using the model reloaded from disk
trained_model = keras.models.load_model('./model_breast_cancer.h5')
evaluation_score = trained_model.evaluate(test_data, test_target)
# print() joins its arguments with a single space, giving the same text as
# concatenating the label, ' ' and str(value).
print('loss : ', evaluation_score[0])
print('accuracy :', evaluation_score[1])

In [None]:
# Predict targets of test_data.
# Sequential.predict_classes() no longer exists in current TensorFlow/Keras;
# for a single sigmoid output the equivalent is thresholding the predicted
# probability at 0.5.
y_predicted = (trained_model.predict(test_data) > 0.5).astype("int32")

In [None]:
# Per-class precision / recall / F1 on the test-set, returned as a nested dict.
# Names are listed in encoded-label order (0, 1).
target = ['benign', 'malignant']
classification_report(
    test_target,
    y_predicted,
    target_names=target,
    output_dict=True,
)

In [None]:
# Evaluate performance of the model: confusion matrix of true vs. predicted labels
CV = confusion_matrix(y_true=test_target, y_pred=y_predicted)
# fmt='d' prints the cell counts as plain integers; the default format would
# switch to scientific notation for counts of 100 or more.
sns.heatmap(CV, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()