This notebook documents the search for a deep neural network model on the preprocessed data using KerasTuner. To run it on Google Colab, the `project_pipeline` module and the data file (`cleaned_mode.csv`) must be stored in Google Drive.

In [1]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import sys

from google.colab import drive

# Attach Google Drive to the Colab runtime's filesystem.
drive.mount('/content/drive')

# Put the Drive folder that holds the project pipeline module at the front of
# the import search path so it can be imported below.
sys.path.insert(0, '/content/drive/My Drive/Colab Notebooks')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Import dependencies.
import pandas as pd
import numpy as np

import tensorflow as tf
import keras_tuner as kt
from keras.metrics import AUC

from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from project_pipeline import preprocess

In [4]:
# Read in the cleaned data.
# NOTE(review): the path is relative, so the CSV is resolved against the
# kernel's current working directory rather than the mounted Drive folder —
# confirm where the file is expected to live.
df = pd.read_csv('cleaned_mode.csv')
# Summarise column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    19158 non-null  object 
 1   city_development_index  19158 non-null  float64
 2   gender                  19158 non-null  object 
 3   relevent_experience     19158 non-null  int64  
 4   enrolled_university     19158 non-null  object 
 5   education_level         19158 non-null  object 
 6   major_discipline        19158 non-null  object 
 7   experience              19158 non-null  object 
 8   company_size            19158 non-null  object 
 9   company_type            19158 non-null  object 
 10  last_new_job            19158 non-null  object 
 11  training_hours          19158 non-null  int64  
 12  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(9)
memory usage: 1.9+ MB


In [5]:
# Preprocess the cleaned data in preparation for model implementation.
# `features` lists the one-hot-encoded column names with the target removed;
# create_model uses only its length as the network's input width.
# NOTE(review): this assumes preprocess() produces the same columns as
# pd.get_dummies(df) — confirm against project_pipeline.
features = pd.get_dummies(df).drop(columns='target').columns.tolist()
# preprocess comes from the project_pipeline module; it returns the train/test
# feature and label splits used by every cell below.
X_train, X_test, y_train, y_test = preprocess(df)

In [6]:
# Create a function that builds a new Sequential model with hyperparameter options.
def create_model(hp):
    """Build and compile a binary classifier whose layer sizes are tuned.

    Args:
        hp: KerasTuner hyperparameter container. The following entries are
            sampled from it:
            ``first_units`` - width of the first dense layer (50-100, step 10),
            ``num_layers``  - number of additional hidden layers (1-2),
            ``units_<i>``   - width of hidden layer ``i`` (10-50, step 10).

    Returns:
        A compiled ``tf.keras`` Sequential model with one sigmoid output unit,
        binary cross-entropy loss, the Adam optimizer and a single AUC metric
        reported as ``auc`` (``val_auc`` on validation data).
    """
    nn_model = tf.keras.models.Sequential()

    # Hidden-layer activation is fixed to ReLU; only the layer sizes are tuned.

    # Allow kerastuner to decide number of neurons in first layer.
    # The input width is taken from the notebook-level `features` list.
    nn_model.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=50, max_value=100, step=10),
        activation='relu',
        input_dim=len(features)))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers.
    for i in range(hp.Int('num_layers', 1, 2)):
        nn_model.add(tf.keras.layers.Dense(
            units=hp.Int('units_' + str(i), min_value=10, max_value=50, step=10),
            activation='relu'))

    # Add output layer.
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model.
    # `metrics` is passed as a list, and the AUC metric is named explicitly so
    # the logged key is always 'auc' / 'val_auc' (the key the tuner objective
    # monitors) instead of relying on Keras' automatic metric naming.
    nn_model.compile(
        loss="binary_crossentropy",
        optimizer='adam',
        metrics=[tf.keras.metrics.AUC(name='auc')])

    return nn_model

In [7]:
# Configure a Hyperband tuner that searches create_model's hyperparameters
# and ranks trials by validation AUC (higher is better).
tuner = kt.Hyperband(
    create_model,
    objective=kt.Objective('val_auc', direction='max'),
    max_epochs=20,
    hyperband_iterations=2,
    # Fixed seed so the tuner's hyperparameter sampling is repeatable between
    # runs (model weight initialisation is not controlled by this seed).
    seed=42)

In [8]:
# Run the kerastuner search for best hyperparameters.
# NOTE(review): the test split is passed as validation_data, so hyperparameters
# are selected on the same data that is later used for the final evaluation;
# the reported test scores are therefore optimistic. Consider carving a
# separate validation split out of the training data — how to do that safely
# depends on what preprocess() does (e.g. any resampling), so confirm first.
# NOTE(review): the best trial reports 'tuner/epochs': 3, so Hyperband appears
# to set the per-trial epoch budget itself — confirm whether epochs=20 has any
# effect here.
tuner.search(X_train, y_train, epochs=20, validation_data=(X_test,y_test))

Trial 60 Complete [00h 00m 42s]
val_auc: 0.790590763092041

Best val_auc So Far: 0.79603111743927
Total elapsed time: 00h 19m 11s


In [9]:
# Fetch the single top-ranked hyperparameter set from the finished search and
# display its values as the cell output.
best_hyper = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hyper.values

{'first_units': 100,
 'num_layers': 2,
 'units_0': 30,
 'units_1': 40,
 'tuner/epochs': 3,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 2,
 'tuner/round': 0}

In [10]:
# Score the tuner's best model on the test split.
# The model is compiled with a single AUC metric, so evaluate() yields the loss
# followed by the AUC; `model_accuracy` therefore holds ROC AUC despite its name.
# This is the same split that served as validation data during the search, so
# the score is not an independent estimate.
best_model = tuner.get_best_models(num_models=1)[0]
model_loss, model_accuracy = best_model.evaluate(x=X_test, y=y_test, verbose=2)
print(f"Loss: {model_loss}, ROC AUC: {model_accuracy}")

150/150 - 0s - loss: 0.5380 - auc: 0.7960 - 482ms/epoch - 3ms/step
Loss: 0.5379987359046936, ROC AUC: 0.79603111743927


In [11]:
# Turn the model's sigmoid outputs into hard 0/1 labels: anything strictly
# above the 0.5 cut-off becomes class 1, everything else class 0.
predictions = best_model.predict(X_test, verbose=0)
classes = (predictions > 0.5).astype(int)

In [12]:
# Print per-class precision, recall and F1 for the thresholded predictions
# (flattened to 1-D to line up with y_test).
print(classification_report(y_true=y_test, y_pred=classes.ravel()))

              precision    recall  f1-score   support

         0.0       0.90      0.75      0.82      3596
         1.0       0.50      0.75      0.60      1194

    accuracy                           0.75      4790
   macro avg       0.70      0.75      0.71      4790
weighted avg       0.80      0.75      0.76      4790

