<div width="100%">
    <img width="100%" src="https://storage.googleapis.com/kaggle-datasets-images/228/482/a520351269b547c89afe790820a1087e/dataset-cover.jpeg"/>
</div>

In [None]:
import numpy as np
import pandas as pd

from sklearn.utils import shuffle

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from IPython.display import clear_output
import tensorflow as tf

<h1 id="dataset" style="color:#301202; background:#d26231; border:0.5px dotted;"> 
    <center>Dataset
        <a class="anchor-link" href="#dataset" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Load the diabetes table from the Kaggle input directory.
path = '../input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(path)

# Replace NaN cells with the mean of their column. The assignment form is
# used instead of inplace=True so the cell never mutates a frame that some
# other name may still reference.
# NOTE(review): fillna only touches NaN; if this CSV encodes missing
# measurements as 0 they pass through unchanged - confirm against the data.
df = df.fillna(df.mean())

# Seed the row shuffle. An unseeded shuffle changes the row order on every
# run, which makes the later seeded train/test split non-reproducible.
df = shuffle(df, random_state=42)
df.head()

In [None]:
# Columns that the scaling cell rescales by their own maximum.
# NOTE(review): seven columns are listed here while the network further down
# declares eight inputs, so at least one input column is left unscaled -
# confirm that this is intended.
features = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "Age",
]

In [None]:
# Max-scale every listed column: each value is divided by the largest value
# found in that column over the whole frame.
for column in features:
    column_max = df[column].max()
    df[column] = df[column] / column_max
df.head()

In [None]:
# Compute the pairwise correlation matrix once and reuse it for both the
# heatmap values and the tick labels (it was previously recomputed three times).
corr = df.corr()

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            ax=ax)
ax.set_title('Feature correlation matrix')
plt.show()

In [None]:
# Split the frame into the target column and everything else.
labels = df['Outcome']
features = df.drop(columns=['Outcome'])

In [None]:
# Hold out 20% of the rows for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

<h1 id="ann" style="color:#301202; background:#d26231; border:0.5px dotted;"> 
    <center>Artificial neural network
        <a class="anchor-link" href="#ann" target="_self">¶</a>
    </center>
</h1>

## Model

In [None]:
# Fully connected network: 8 inputs -> 16 -> 32 -> 64 -> 16 -> 1,
# with a sigmoid activation on every layer including the single output unit.
model = tf.keras.Sequential(
    [tf.keras.layers.Dense(16, activation='sigmoid', input_shape=(8,))]
    + [tf.keras.layers.Dense(width, activation='sigmoid') for width in (32, 64, 16)]
    + [tf.keras.layers.Dense(1, activation='sigmoid')]
)

## Loss Function

In [None]:
# Binary cross-entropy applied to probabilities (from_logits=False),
# with label smoothing switched off.
loss_object = tf.keras.losses.BinaryCrossentropy(
    name='binary_crossentropy',
    label_smoothing=0,
    from_logits=False,
)

In [None]:
def loss(model, x, y, training):
    """Run `model` on `x` and score its output against `y` with `loss_object`.

    Args:
        model: callable invoked as `model(x, training=training)`.
        x: inputs forwarded to the model.
        y: targets passed to `loss_object` as `y_true`.
        training: flag forwarded unchanged to the model call.

    Returns:
        Whatever `loss_object(y_true=y, y_pred=...)` returns for this batch.
    """
    predictions = model(x, training=training)
    return loss_object(y_true=y, y_pred=predictions)

## Gradient function

In [None]:
def grad(model, inputs, targets):
    """Compute the training loss and its gradients for one batch.

    The forward pass runs under a `tf.GradientTape` with `training=True`.

    Args:
        model: model whose `trainable_variables` are differentiated.
        inputs: batch inputs forwarded to `loss`.
        targets: batch targets forwarded to `loss`.

    Returns:
        A `(loss_value, gradients)` pair, where `gradients` is the tape's
        gradient of the loss with respect to `model.trainable_variables`.
    """
    with tf.GradientTape() as tape:
        batch_loss = loss(model, inputs, targets, training=True)
    gradients = tape.gradient(batch_loss, model.trainable_variables)
    return batch_loss, gradients

## Optimizer

In [None]:
# RMSprop with explicitly chosen hyper-parameters.
optimizer = tf.keras.optimizers.RMSprop(
    learning_rate=0.01,
    rho=0.1,
    momentum=0.1,
    epsilon=1e-3,
)

## Training

In [None]:
# Feature matrices as plain NumPy arrays.
X, X_val = X_train.values, X_test.values

# Labels as (n_rows, 1) column vectors.
y_temp = y_train.values
y = y_temp.reshape(-1, 1)

y_temp = y_test.values
y_val = y_temp.reshape(-1, 1)

In [None]:
num_epochs = 3001
batch_size = 8     # approximate rows per mini-batch
log_every = 300    # print a progress line every this many epochs

# Per-epoch history, stored as plain NumPy scalars so it plots directly.
train_loss_results = []
train_accuracy_results = []

test_loss_results = []
test_accuracy_results = []

# The mini-batch index partition is identical for every epoch, so build it
# once. max(1, ...) keeps np.array_split valid when there are fewer than
# batch_size training rows.
n_train_batches = max(1, len(X) // batch_size)
batches = [b.tolist() for b in np.array_split(np.arange(len(X)), n_train_batches)]

for epoch in range(num_epochs):
    # Fresh metric objects per epoch so nothing accumulates across epochs.
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.BinaryAccuracy()

    for batch in batches:
        X_b, y_b = X[batch], y[batch]
        # Optimize the model
        loss_value, grads = grad(model, X_b, y_b)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Track progress
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y_b, model(X_b, training=True))

    # End epoch
    train_loss_results.append(epoch_loss_avg.result().numpy())
    train_accuracy_results.append(epoch_accuracy.result().numpy())

    # Held-out evaluation from a single forward pass.
    # The loss must come from loss_object: calling a Mean metric as
    # metric(y_val, preds) treats preds as sample weights and returns a
    # weighted mean of the labels, not a loss.
    val_pred = model(X_val, training=False)
    test_loss_results.append(loss_object(y_true=y_val, y_pred=val_pred).numpy())

    # A new accuracy metric each epoch; a shared, never-reset metric would
    # report the running average over all epochs seen so far.
    epoch_val_accuracy = tf.keras.metrics.BinaryAccuracy()
    epoch_val_accuracy.update_state(y_val, val_pred)
    test_accuracy_results.append(epoch_val_accuracy.result().numpy())

    if epoch % log_every == 0:
        print("Epoch {:3d}: Train_Loss:{:3.3f}, Train_Accuracy:{:3.3f}, Test_Loss:{:3.3f}, Test_Accuracy:{:3.3f}"
              .format(epoch, train_loss_results[-1], train_accuracy_results[-1],
                      test_loss_results[-1], test_accuracy_results[-1]))

## Analysis

In [None]:
# Two stacked panels sharing the epoch axis: loss on top, accuracy below.
fig, axes = plt.subplots(2, sharex=True, figsize=(14, 8))
loss_ax, acc_ax = axes
fig.suptitle('Training Metrics')

loss_ax.plot(train_loss_results)
loss_ax.set_ylabel("Loss", fontsize=14)

acc_ax.plot(train_accuracy_results)
acc_ax.set_ylabel("Accuracy", fontsize=14)
acc_ax.set_xlabel("Epoch", fontsize=14)
plt.show()

In [None]:
# Two stacked panels sharing the epoch axis: loss on top, accuracy below.
fig, axes = plt.subplots(2, sharex=True, figsize=(14, 8))
loss_ax, acc_ax = axes
fig.suptitle('Testing Metrics')

loss_ax.plot(test_loss_results)
loss_ax.set_ylabel("Loss", fontsize=14)

acc_ax.plot(test_accuracy_results)
acc_ax.set_ylabel("Accuracy", fontsize=14)
acc_ax.set_xlabel("Epoch", fontsize=14)
plt.show()

<h1 id="boosted" style="color:#301202; background:#d26231; border:0.5px dotted;"> 
    <center>Boosted Trees
        <a class="anchor-link" href="#boosted" target="_self">¶</a>
    </center>
</h1>

## Feature columns

In [None]:
fc = tf.feature_column

# Every column except the target is treated as a numeric feature.
NUMERIC_COLUMNS = df.columns.drop('Outcome').tolist()


def one_hot_cat_column(feature_name, vocab):
    """Build an indicator column over a vocabulary-list categorical column.

    Args:
        feature_name: name of the feature the column reads.
        vocab: vocabulary list passed to the categorical column.

    Returns:
        The `fc.indicator_column` wrapping the categorical column.
    """
    categorical = fc.categorical_column_with_vocabulary_list(feature_name, vocab)
    return fc.indicator_column(categorical)


# One float32 numeric column per feature name.
feature_columns = [
    fc.numeric_column(feature_name, dtype=tf.float32)
    for feature_name in NUMERIC_COLUMNS
]

## Boosted Trees Classifier

In [None]:
# The input functions in this notebook emit the entire dataset as a single
# batch, so one batch already carries the statistics for every row. Growing
# a layer after each batch (n_batches_per_layer=1) avoids accumulating the
# same full-dataset statistics several times before each layer is built.
n_batches = 1

est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=n_batches)

## Create dataset

In [None]:
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
    """Create an estimator input function over an in-memory feature/label pair.

    The shuffle buffer and the batch size are taken from `len(y)`, so the
    returned dataset always covers exactly the data passed in rather than
    depending on a module-level row count.

    Args:
        X: mapping-like table of features; converted with `dict(X)`.
        y: labels aligned with `X`.
        n_epochs: number of passes over the data; None repeats indefinitely.
        shuffle: whether to shuffle the rows.

    Returns:
        A zero-argument function that builds the `tf.data.Dataset`.
    """
    n_rows = len(y)

    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            # Buffer as large as the data gives a full shuffle.
            dataset = dataset.shuffle(n_rows)
        # For training, cycle through the dataset as many times as needed
        # (n_epochs=None).
        dataset = dataset.repeat(n_epochs)
        # In-memory training doesn't use mini-batching: one batch holds all rows.
        dataset = dataset.batch(n_rows)
        return dataset
    return input_fn

In [None]:
# Training input: shuffled and repeated indefinitely (the factory defaults).
train_input_fn = make_input_fn(X_train, y_train)
# Evaluation input: a single pass in the original row order.
eval_input_fn = make_input_fn(X_test, y_test, n_epochs=1, shuffle=False)

## Train Boosted Trees

In [None]:
NUM_EXAMPLES = len(y_train)

# Train the boosted-trees estimator, stopping after at most 300 steps.
est.train(input_fn=train_input_fn, max_steps=300)

## Analysis

In [None]:
# Evaluate on the held-out input, then wipe the cell output produced so far
# so that only the metrics summary remains visible.
result = est.evaluate(input_fn=eval_input_fn)
clear_output()

print(pd.Series(result))

In [None]:
# Collect one prediction dict per evaluation row and keep the entry at
# index 1 of each 'probabilities' vector.
pred_dicts = list(est.predict(eval_input_fn))
probs = pd.Series([prediction['probabilities'][1] for prediction in pred_dicts])

# Histogram of those probabilities.
probs.plot.hist(bins=20, title='predicted probabilities')
plt.show()