In [None]:
#Importing library
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")


import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


## Loading dataset
---

In [None]:
# Load the CSV from the Kaggle input directory into a DataFrame.
row_data = pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")

## Examine the data set
---

In [None]:
# Preview the first five rows.
row_data.head(5)

In [None]:
# List the column labels.
row_data.columns

In [None]:
# (number of rows, number of columns) of the raw frame.
row_data.shape

**The dataset has 569 rows and 33 columns.**

In [None]:
# Summarise the frame's columns (dtypes, non-null counts).
row_data.info()

In [None]:
# Count the missing values in each column.
row_data.isnull().sum()

**`id` and `Unnamed: 32` (which has a lot of null values) will be dropped from the dataset.**

In [None]:
# Remove the "id" and "Unnamed: 32" columns in a single call.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: running it a second time no
# longer raises KeyError because the columns are already gone.
row_data = row_data.drop(columns = ["id" , "Unnamed: 32"] , errors = "ignore")

In [None]:
# Summarise the frame again to confirm which columns remain.
row_data.info()

## Checking the type of entries

In [None]:
# Show the last five rows.
row_data.tail(5)

* **The "diagnosis" column holds categorical values.**
* **All columns apart from "diagnosis" hold numerical values.**

## Visualizing the dataset
---

### Count of entries

In [None]:
# Bar chart of the number of rows per diagnosis value, followed by the raw counts.
# The column is passed as `x=` rather than positionally: from seaborn 0.12 on,
# the only positional argument is `data`, so a positional "diagnosis" next to
# `data=row_data` fails.
sns.countplot(x = "diagnosis" , data = row_data , palette = "inferno")
plt.title("Count of diagnosis")
plt.show()
print(row_data["diagnosis"].value_counts())

In [None]:
# One strip plot per feature column (positions 1..30), split by diagnosis,
# laid out on a 6 x 5 grid.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for i in range(1, 31):
    feature = row_data.columns[i]
    ax = fig.add_subplot(6, 5, i)
    sns.stripplot(
        data=row_data,
        x="diagnosis",
        y=feature,
        palette="inferno",
        alpha=0.5,
        ax=ax,
    )
plt.show()

In [None]:
def _displot(data):
    """Draw one ``sns.distplot`` panel per column of ``data`` in a single figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted one by one.

    The panels are arranged five per row. The grid has at least two rows
    (the original fixed 2 x 5 layout) and grows when ``data`` has more than
    ten columns, instead of failing on the eleventh subplot.
    """
    n_cols = 5
    # Ceiling division; never fewer than 2 rows so <= 10 columns look as before.
    n_rows = max(2 , -(-len(data.columns) // n_cols))

    fig = plt.figure(figsize=(15,15))
    fig.subplots_adjust(hspace = 0.4 , wspace = 0.4)

    for i in range(0 , len(data.columns)):
        ax = fig.add_subplot(n_rows , n_cols , i+1)
        # Target the new axes explicitly rather than relying on pyplot's
        # "current axes" state.
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # consider histplot/displot when the environment is upgraded.
        sns.distplot(data.iloc[:,i] , ax = ax)
    plt.show()

def _violinplot(data):
    """Draw one violin plot per column of ``data``, split by diagnosis.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns to plot. The ``diagnosis`` column of the notebook-level
        ``row_data`` frame is prepended so it can be used as the x variable.

    The panels are arranged five per row. The grid has at least two rows
    (the original fixed 2 x 5 layout) and grows when ``data`` has more than
    ten columns, instead of failing on the eleventh subplot.
    """
    data = pd.concat([row_data["diagnosis"] , data] , axis = 1)

    n_cols = 5
    feature_count = len(data.columns) - 1  # column 0 is the prepended diagnosis
    # Ceiling division; never fewer than 2 rows so <= 10 features look as before.
    n_rows = max(2 , -(-feature_count // n_cols))

    fig = plt.figure(figsize=(15,15))
    fig.subplots_adjust(hspace = 0.4 , wspace = 0.4)

    for i in range(1 , len(data.columns)):
        ax = fig.add_subplot(n_rows , n_cols , i)
        # Target the new axes explicitly rather than relying on pyplot's
        # "current axes" state.
        sns.violinplot(x = "diagnosis" , y = data.iloc[:,i] , data = data , inner = "point" , palette = "inferno" , ax = ax)
    plt.show()
    
def _scatterplot(data):
    """Show pairwise scatter plots of ``data``'s columns, coloured by diagnosis.

    The ``diagnosis`` column of the notebook-level ``row_data`` frame is
    prepended to ``data`` and used as the hue. Only the upper triangle of the
    pair grid is filled.
    """
    labelled = pd.concat([row_data["diagnosis"], data], axis=1)

    grid = sns.PairGrid(labelled, hue="diagnosis", palette="inferno")
    grid.map_upper(sns.scatterplot)
    plt.show()
    

def _visualization(data):
    """Run the three plotting helpers on ``data`` in order.

    Calls ``_displot``, then ``_violinplot``, then ``_scatterplot`` with the
    same frame.
    """
    for draw in (_displot, _violinplot, _scatterplot):
        draw(data)

### Visualizing the MEAN columns
---

In [None]:
# Plot the columns at positions 1-10 of row_data.
_visualization(row_data.iloc[:,1:11])

### Visualizing the SE columns
---

In [None]:
# Plot the columns at positions 11-20 of row_data.
_visualization(row_data.iloc[:,11:21])

### Visualizing the WORST columns

In [None]:
# Plot every column from position 21 to the end of row_data.
_visualization(row_data.iloc[:,21:])

## Train - Test - Split
---

In [None]:
# Separate the label column from the feature columns (everything after
# position 0). Copies are taken so later edits do not touch row_data.
categories = row_data.loc[:, "diagnosis"].copy()
input_data = row_data.iloc[:, 1:].copy()

print(categories.tail(15))

In [None]:
# Encode the diagnosis labels as numbers: "M" -> 0 and "B" -> 1.
categories.loc[categories == "M"] = 0
categories.loc[categories == "B"] = 1

print(categories.tail(15))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    input_data,
    categories,
    test_size=0.33,
    random_state=0,
)

for label, split in (
    ("x_train shape:", x_train),
    ("y_train shape:", y_train),
    ("x_test shape:", x_test),
    ("y_test shape:", y_test),
):
    print(label, split.shape)

## Scaling the features
---

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
st = StandardScaler()
x_train = st.fit_transform(x_train)
x_test = st.transform(x_test)

# Peek at the first five features of row 1 and the shape of each split.
for scaled in (x_train, x_test):
    print(scaled[1, :5], scaled.shape)

In [None]:
# Cast features and targets to 32-bit floats.
x_train, x_test = x_train.astype(np.float32), x_test.astype(np.float32)
y_train, y_test = y_train.astype(np.float32), y_test.astype(np.float32)

## Building Keras Model
---

In [None]:
from keras import models , layers

def build_model(hl , lt):
    """Build and compile a fully connected binary classifier.

    Parameters
    ----------
    hl : int
        Number of units in each hidden layer.
    lt : int
        Number of hidden layers. The first hidden layer is always added, so
        any value below 1 still yields one hidden layer.

    Returns
    -------
    A compiled ``Sequential`` model: ``relu`` hidden layers, a single
    ``sigmoid`` output unit, ``rmsprop`` optimizer, ``binary_crossentropy``
    loss and the ``accuracy`` metric.
    """
    network = models.Sequential()

    # The first hidden layer also declares the input width, which is read from
    # the notebook-level x_train.
    n_features = x_train.shape[1]
    network.add(layers.Dense(hl, activation="relu", input_shape=(n_features,)))

    # Remaining hidden layers (lt - 1 of them).
    for _ in range(lt - 1):
        network.add(layers.Dense(hl, activation="relu"))

    network.add(layers.Dense(1, activation="sigmoid"))
    network.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    return network
    

## Building K-Fold
---
### Why k-fold?
* The dataset must be split into three pieces: training, validation and test. After the first split, 381 rows are left for training. That training set then has to be split again into a training part and a validation part. After all of this, relatively little data remains to train the ML model.
* With so few rows, the validation score can change depending on which rows end up in the validation set.
* In other words, the validation score has high variance with respect to the choice of validation set.
* K-fold cross-validation is used to avoid this situation.

### How does It work?
1. Choose a value for K.
2. Split the dataset into K parts.
3. Pick one part for validation and use the others (K-1 parts) for training.
4. Train the ML model on the K-1 parts and test it on the validation part.
5. Save the validation score.
6. Repeat steps 3 to 5 until every part has been used for validation.
7. Finally, calculate the average of the validation scores saved in step 5.
8. The average validation score shows how well the ML model performs.


In [None]:
def k_fold(k , num_epochs , hl , lt):
    """Run k-fold cross-validation on the notebook-level training split.

    The rows of ``x_train`` / ``y_train`` are cut into ``k`` consecutive folds
    of ``len(x_train) // k`` rows. Each fold is used once as validation data
    while a fresh model from ``build_model(hl, lt)`` is trained on the rest.
    Rows left over by the integer division are never used for validation;
    they are part of the training portion in every fold.

    Parameters
    ----------
    k : int
        Number of folds; must satisfy ``1 <= k <= len(x_train)``.
    num_epochs : int
        Number of epochs each fold's model is trained for.
    hl , lt :
        Forwarded unchanged to ``build_model``.

    Returns
    -------
    dict
        Keys ``"loss"``, ``"accuracy"``, ``"val_loss"`` and
        ``"val_accuracy"``. Each value is a list with one entry per epoch
        holding that metric averaged over the ``k`` folds.

    Raises
    ------
    ValueError
        If ``k`` would produce empty validation folds.
    """
    # Work on plain arrays so every slice below is strictly positional,
    # whatever container type the earlier cells left in x_train / y_train.
    features = np.asarray(x_train)
    targets = np.asarray(y_train)

    if k < 1 or k > len(features):
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "(%d), got %r" % (len(features), k)
        )

    num_val_sample = len(features) // k

    metric_names = ("loss" , "accuracy" , "val_loss" , "val_accuracy")
    # One list of per-epoch curves for each metric, one curve per fold.
    per_fold = {name: [] for name in metric_names}

    for i in range(k):
        print("Process" , i+1)

        start = i * num_val_sample
        stop = (i + 1) * num_val_sample

        val_data = features[start:stop]
        val_target = targets[start:stop]

        # Everything outside [start, stop) is this fold's training data.
        partial_train_data = np.concatenate([features[:start] , features[stop:]] , axis = 0)
        partial_train_target = np.concatenate([targets[:start] , targets[stop:]] , axis = 0)

        # A new model per fold so no weights carry over between folds.
        model = build_model(hl , lt)

        history = model.fit(partial_train_data,
                            partial_train_target,
                            epochs = num_epochs,
                            batch_size = 15,
                            verbose = 0,
                            validation_data = (val_data , val_target))

        for name in metric_names:
            per_fold[name].append(history.history[name])

    # Average the (k, num_epochs) curves over the fold axis.
    return {name: np.mean(per_fold[name] , axis = 0).tolist() for name in metric_names}

In [None]:
def Draw(epochs , model):
    """Plot training vs. validation curves: first accuracy, then loss.

    Parameters
    ----------
    epochs : int
        Number of epochs; the x axis runs from 1 to ``epochs``.
    model : dict
        Mapping with the keys ``"accuracy"``, ``"val_accuracy"``, ``"loss"``
        and ``"val_loss"``, each holding one value per epoch.
    """
    epoch_axis = range(1, epochs + 1)

    # (title, train key, validation key, train label, validation label, y label)
    panels = (
        ("Val and Train Accuracy", "accuracy", "val_accuracy", "Train Acc", "Val Acc", "Accuracy"),
        ("Val and Train Loss", "loss", "val_loss", "Train Loss", "Val Loss", "Loss"),
    )

    for title, train_key, val_key, train_label, val_label, y_label in panels:
        plt.title(title)
        plt.plot(epoch_axis, model[train_key], "bo", label=train_label)
        plt.plot(epoch_axis, model[val_key], "r", label=val_label)
        plt.xlabel("Epochs")
        plt.ylabel(y_label)
        plt.legend()
        plt.show()

In [None]:
# 4 folds, 100 epochs, 3 hidden layers of 32 units each.
k_model = k_fold(k=4, num_epochs=100, hl=32, lt=3)
Draw(epochs=100, model=k_model)

* **These parameters cause overfitting.**
* **The model will be trained with fewer parameters.**

In [None]:
# 4 folds, 100 epochs, 2 hidden layers of 16 units each.
k_model = k_fold(k=4, num_epochs=100, hl=16, lt=2)
Draw(epochs=100, model=k_model)

* **This looks better.**
* **Let's reduce the number of epochs to see more clearly.**

In [None]:
# 4 folds, 30 epochs, 2 hidden layers of 16 units each.
k_model = k_fold(k=4, num_epochs=30, hl=16, lt=2)
Draw(epochs=30, model=k_model)

* **The model starts overfitting after the 12th epoch.**
* **Let's create a new model, train it, and then test it on the test dataset.**

In [None]:
# Train a fresh model (2 hidden layers of 16 units) on the whole training
# split for 12 epochs, then evaluate it on the held-out test split.
last_model = build_model(hl=16, lt=2)
last_model.fit(
    x_train,
    y_train,
    epochs=12,
    batch_size=15,
)
results = last_model.evaluate(x_test, y_test)
print(results)