# Steps to a Machine Learning Pipeline

- Importing Libraries
- Loading and Preprocessing Data 
- Creating a validation set
- Defining the model structure 
- Training the model
- Making predictions 


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset and output paths configured later in this notebook resolve.
from google.colab import drive

drive.mount('/content/drive')

## Uncomment to flush and unmount the drive when finished
# drive.flush_and_unmount()

In [None]:
import os

# Locations on the mounted Google Drive: the CSV index of the dataset, the
# image folder (with a trailing slash so image filenames can be appended),
# and the directory that receives the tuner output.
csv_path = '/content/drive/MyDrive/dataset.csv'
dataset_path = '/content/drive/MyDrive/dataset/'
OUTPUT_PATH = '/content/drive/MyDrive/output/'

# Nothing has been read at this point, so report that the paths are set
# rather than claiming the data is loaded.
print("dataset paths configured")

In [None]:
# Shape of one input image, used as the input_shape of the first conv layer,
# and the number of units in the model's output layer.
INPUT_SHAPE = (28, 28, 1)
NUM_CLASSES = 1

# Training budget: maximum number of epochs, mini-batch size, and how many
# epochs without improvement are tolerated before early stopping kicks in.
EPOCHS, BS, EARLY_STOPPING_PATIENCE = 50, 8, 5

## Loading and Preprocessing

In [None]:
!pip install tensorflow==2.5.0

## Importing libraries for building the model
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.metrics import RootMeanSquaredError

from tensorflow.keras.preprocessing import image
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Read the CSV index of the dataset and display it as a quick sanity check
dataset = pd.read_csv(csv_path)
print(dataset)

## Rescale the label column by 1/100
## (maps it to [0, 1] assuming the raw labels lie in 0-100 — TODO confirm)
dataset['label'] = dataset['label'].div(100)

# Hold out 20% of the rows as the test set. This is a plain random split with
# no stratification, so label frequencies may differ between the two parts.
train, test = train_test_split(dataset, random_state=42, test_size=0.2)

for title, part in (("Train label count", train), ("Test label count", test)):
    print(title)
    print(part['label'].value_counts())

In [None]:
# Loading train images
# Every file named in the 'img' column is read as grayscale, resized to the
# spatial size given by INPUT_SHAPE, and scaled by 1/255.
train_np = train['img'].to_numpy()

# load_img takes target_size as (height, width) only; the number of channels
# follows from color_mode, so only the first two entries of INPUT_SHAPE are
# passed.
train_images = [
    image.img_to_array(
        image.load_img(
            os.path.join(dataset_path, name),
            target_size=INPUT_SHAPE[:2],
            color_mode="grayscale",
        )
    ) / 255  # normalise
    for name in tqdm(train_np)
]

X_train = np.array(train_images)

y_train = train['label'].values

In [None]:
# Loading test images
# Every file named in the 'img' column is read as grayscale, resized to the
# spatial size given by INPUT_SHAPE, and scaled by 1/255.
test_np = test['img'].to_numpy()

# load_img takes target_size as (height, width) only; the number of channels
# follows from color_mode, so only the first two entries of INPUT_SHAPE are
# passed.
test_images = [
    image.img_to_array(
        image.load_img(
            os.path.join(dataset_path, name),
            target_size=INPUT_SHAPE[:2],
            color_mode="grayscale",
        )
    ) / 255  # normalise
    for name in tqdm(test_np)
]

X_test = np.array(test_images)

y_test = test['label'].values

In [None]:
# Creating a validation set
# Carve 10% of the training data off as a validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=44, test_size=0.1)

print("size of train and validation set")
print(y_train.shape, y_val.shape)

print("Train label count")
print(np.unique(y_train, return_counts=True))

# These counts describe the validation split, not the test split.
print("Validation label count")
print(np.unique(y_val, return_counts=True))

# Hyperparameter Tuning

In [None]:
!pip install keras-tuner

from tensorflow.keras.callbacks import EarlyStopping
import kerastuner as kt

In [None]:
def build_model(hp):
    """Build and compile a CNN regressor from a set of tuner hyperparameters.

    The network is one or two Conv2D -> pooling blocks, followed by batch
    normalisation, dropout, a dense hidden layer (again with batch
    normalisation and dropout) and an output layer of NUM_CLASSES units.

    Hyperparameters read from ``hp``: ``conv_blocks``, ``conv_<i>`` and
    ``pooling_<i>`` for each block index ``i`` starting at 0, ``dropout_1``,
    ``dense_units``, ``dropout_2``, ``output_activation``, ``learning_rate``
    and ``optimiser``.

    Args:
        hp: hyperparameter container providing ``Int``, ``Float`` and
            ``Choice`` (presumably a keras-tuner ``HyperParameters``).

    Returns:
        The Sequential model, compiled with MSE loss and a
        RootMeanSquaredError metric.
    """
    net = Sequential()

    # conv -> relu -> pool blocks; how many there are is itself tuned
    n_blocks = hp.Int('conv_blocks', min_value=1, max_value=2, step=1)
    for block_idx in range(n_blocks):
        n_filters = hp.Int(f'conv_{block_idx}', min_value=32, max_value=256, step=32)

        conv_kwargs = dict(kernel_size=(3, 3), activation='relu', padding='same')
        if block_idx == 0:
            # only the first layer of the model declares the input shape
            conv_kwargs['input_shape'] = INPUT_SHAPE
        net.add(Conv2D(n_filters, **conv_kwargs))

        pool_kind = hp.Choice(f'pooling_{block_idx}', values=["max", "avg"])
        pool_layer = MaxPooling2D if pool_kind == 'max' else AveragePooling2D
        net.add(pool_layer(pool_size=(2, 2)))

    net.add(BatchNormalization())
    conv_dropout = hp.Float('dropout_1', 0, 0.5, step=0.1, default=0.5)
    net.add(Dropout(conv_dropout))

    net.add(Flatten())
    hidden_units = hp.Int("dense_units", min_value=128, max_value=512, step=128)
    net.add(Dense(hidden_units, activation='relu'))
    net.add(BatchNormalization())
    dense_dropout = hp.Float('dropout_2', 0, 0.5, step=0.1, default=0.5)
    net.add(Dropout(dense_dropout))

    out_activation = hp.Choice("output_activation", values=["sigmoid", "relu"])
    net.add(Dense(NUM_CLASSES, activation=out_activation))

    # learning rate is sampled on a log scale; the optimiser type is tuned too
    lr = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
    use_adam = hp.Choice("optimiser", values=["adam", "sgd"]) == "adam"
    optimizer_cls = keras.optimizers.Adam if use_adam else keras.optimizers.SGD

    net.compile(
        optimizer=optimizer_cls(learning_rate=lr),
        loss="mse",
        metrics=[RootMeanSquaredError()],
    )

    return net

In [None]:
# Early stopping: once val_loss has gone EARLY_STOPPING_PATIENCE epochs
# without improving, stop training and restore the best weights seen, which
# limits overfitting and time spent on minimal gains.
es = EarlyStopping(
    restore_best_weights=True,
    patience=EARLY_STOPPING_PATIENCE,
    monitor="val_loss",
)

In [None]:
# Hyperband search: random sampling of configurations combined with
# "adaptive resource allocation and early stopping."
# The tuner minimises the RMSE measured on the validation data.
rmse_objective = kt.Objective("val_root_mean_squared_error", direction="min")

tuner = kt.Hyperband(
    build_model,
    objective=rmse_objective,
    max_epochs=EPOCHS,
    factor=3,
    seed=42,
    directory=OUTPUT_PATH,
    project_name="small_search",
)

In [None]:
print("[INFO] performing hyperparameter search...")

# Options forwarded to each trial's fit call.
# NOTE(review): validation here is a 20% validation_split of X_train; the
# separately created X_val / y_val are not used by the search — confirm that
# this is intended.
search_options = dict(
    validation_split=0.2,
    batch_size=BS,
    callbacks=[es],
    epochs=EPOCHS,
)
tuner.search(x=X_train, y=y_train, **search_options)

In [None]:
# Fetch the best hyperparameter set found by the search and report it.
bestHP = tuner.get_best_hyperparameters(num_trials=1)[0]

n_conv_blocks = bestHP.get("conv_blocks")
print("[INFO] optimal number of conv blocks: {}".format(n_conv_blocks))

# build_model registers the per-block hyperparameters as conv_<i> and
# pooling_<i> with i starting at 0, so they are looked up by that 0-based
# index (and shown 1-based). Only the blocks used by the best configuration
# are reported.
for i in range(n_conv_blocks):
    print("[INFO] optimal number of filters in conv_{} layer: {}".format(
        i + 1, bestHP.get("conv_{}".format(i))))
for i in range(n_conv_blocks):
    print("[INFO] optimal type of pooling {} layer: {}".format(
        i + 1, bestHP.get("pooling_{}".format(i))))

print("[INFO] optimal dropout 1 rate: {}".format(
    bestHP.get("dropout_1")))
print("[INFO] optimal number of units in dense layer: {}".format(
    bestHP.get("dense_units")))
print("[INFO] optimal dropout 2 rate: {}".format(
    bestHP.get("dropout_2")))
print("[INFO] optimal output activation function: {}".format(
    bestHP.get("output_activation")))
print("[INFO] optimal learning rate: {:.4f}".format(
    bestHP.get("learning_rate")))
print("[INFO] optimal optimiser: {}".format(
    bestHP.get("optimiser")))

In [None]:
# Build a fresh model from the optimal hyperparameters; it is trained
# afterwards to find the best number of epochs.
model = tuner.hypermodel.build(bestHP)

# summary() prints the layer table itself and returns None, so it is called
# directly rather than wrapped in print().
model.summary()

In [None]:
# Train with the same epoch budget and batch size that the tuner search used,
# so the tuned hyperparameters are exercised under the conditions they were
# selected for. 20% of the training data is held out for validation.
history = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BS, validation_split=0.2)

In [None]:
# Per-epoch validation RMSE recorded during the fit above
val_rmse = history.history['val_root_mean_squared_error']

# Epochs are counted from 1; on ties the earliest epoch with the lowest
# validation RMSE is taken.
best_epoch, _ = min(enumerate(val_rmse, start=1), key=lambda pair: pair[1])
print('Best epoch: %d' % best_epoch)

# Training the Final Model

In [None]:
## Train a fresh model for the tuned number of epochs
hypermodel = tuner.hypermodel.build(bestHP)

# Retrain from scratch with the batch size used during the search; 20% of
# the training data is still held out for validation.
history = hypermodel.fit(X_train, y_train, epochs=best_epoch, batch_size=BS, validation_split=0.2)

# Evaluate on the test set
eval_result = hypermodel.evaluate(X_test, y_test)
print("[test loss, test rmse]:", eval_result)

## Defining the Model

After hyperparameter tuning, define the final model by hand using the tuned values.

In [None]:
# Hand-written model: a single conv block (128 filters, average pooling),
# then batch normalisation, a 256-unit dense layer, batch normalisation and
# a sigmoid output. No second conv block or dropout layers are used.
# Input shape and output width come from the shared constants so this model
# stays in step with the data loading and with build_model.
model = Sequential()
model.add(Conv2D(128, kernel_size=(3, 3), activation='relu', input_shape=INPUT_SHAPE))
model.add(AveragePooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(NUM_CLASSES, activation='sigmoid'))

In [None]:
# Adam with a fixed learning rate of 0.0008
optimizer = keras.optimizers.Adam(learning_rate=0.0008)

# Optimise mean squared error and track RMSE alongside it
rmse_metric = RootMeanSquaredError()
model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=[rmse_metric],
)

## Model Training

In [None]:
# Fit for 20 epochs in mini-batches of 8, validating on the separate
# validation set after each epoch; the returned history is kept for plotting.
self_history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=8,
    epochs=20,
)

## Making Predictions

In [None]:
# Run the trained model over the test images
prediction = model.predict(X_test)

# Show the shape of the prediction array as a sanity check
print(np.shape(prediction))

## Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import math

# Show up to the first 20 predictions next to their targets; slicing keeps
# this safe when the test set holds fewer than 20 samples.
for predicted, actual in zip(prediction[:20], y_test[:20]):
    print(predicted, actual)

# Compute the MSE once and derive the RMSE from it
mse = mean_squared_error(y_test, prediction)

# Regression metrics on the test set (labels are on the rescaled scale)
print("R2 Score: {}".format(r2_score(y_test, prediction)))
print("MSE: {}".format(mse))
print("RMSE: {}".format(math.sqrt(mse)))
print("MAE: {}".format(mean_absolute_error(y_test, prediction)))

## Saving Models

In [None]:
## Save the model to a single file with the '.h5' (HDF5) extension for
## future use.
# NOTE(review): this saves `hypermodel` (the tuner-built model that was
# retrained earlier), not the hand-defined `model` that was evaluated in the
# previous section — confirm which one is meant to be kept.
hypermodel.save('model.h5')

# Copy the saved file to Google Drive
!cp model.h5 "/content/drive/My Drive/"

## Plotting graphs for evaluation

In [None]:
# Draw the training and validation loss curves of the final fit, one point
# per epoch, then write the figure to disk.
for curve in ('loss', 'val_loss'):
    plt.plot(self_history.history[curve])

plt.title('Training Loss Against Epochs')
plt.xlabel('Epoch')
plt.ylabel('Mean Square Error Loss')
plt.legend(['train', 'val'], loc='upper left')

plt.savefig('final.png')

In [None]:
# Copy the saved loss plot to Google Drive
!cp final.png "/content/drive/My Drive/"