This notebook documents the training and evaluation of a deep neural network classifier on the preprocessed data.

In [1]:
# Import dependencies.
import pandas as pd
import numpy as np
import tensorflow as tf

from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the cleaned dataset from the relative resources path, then print the
# column names, non-null counts and dtypes as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='../resources/cleaned_mode.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    19158 non-null  object 
 1   city_development_index  19158 non-null  float64
 2   gender                  19158 non-null  object 
 3   relevent_experience     19158 non-null  int64  
 4   enrolled_university     19158 non-null  object 
 5   education_level         19158 non-null  object 
 6   major_discipline        19158 non-null  object 
 7   experience              19158 non-null  object 
 8   company_size            19158 non-null  object 
 9   company_type            19158 non-null  object 
 10  last_new_job            19158 non-null  object 
 11  training_hours          19158 non-null  int64  
 12  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(9)
memory usage: 1.9+ MB


In [39]:
# Use `get_dummies` to one-hot encode all categorical (object) features.
# `dtype=int` pins the indicator columns to 0/1 integers: pandas 2.0 changed
# the default dummy dtype to bool, which would otherwise make the encoded
# frame (and the preview below) depend on the installed pandas version.
df = pd.get_dummies(df, dtype=int)
df.head()

Unnamed: 0,city_development_index,training_hours,target,city_Other,city_city_102,city_city_103,city_city_104,city_city_114,city_city_136,city_city_16,...,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,last_new_job_1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_never
0,0.92,36,1,0,0,1,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,0.776,47,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0.624,83,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0.789,52,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0.767,8,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [40]:
# Separate the features from the label, then hold out a test set; the split
# is stratified on the label and seeded so it can be repeated.
X = df.drop(columns='target')
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Learn the scaling statistics from the training partition only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and the testing features alike.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [41]:
# The target values are imbalanced, so randomly oversample the scaled
# training set; only the training partition is passed to the sampler.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X=X_train_scaled, y=y_train)

In [42]:
# Define the model - a deep neural net with two hidden layers and a sigmoid output.

# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation is repeatable across "Restart & Run All" (same seed value as
# the `random_state` used for the split and the oversampler in this notebook).
# NOTE(review): bit-for-bit determinism may still depend on hardware/backend.
tf.keras.utils.set_random_seed(42)

# Number of input features and hidden nodes for each layer.
number_input_features = len(X.columns)
hidden_nodes_layer1 = 100
hidden_nodes_layer2 = 50

# Instantiate a deep neural network model.
nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit `Input` object; passing `input_dim`
# to the first Dense layer is discouraged by recent Keras releases.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# Add first hidden layer.
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation='relu'))

# Add second hidden layer.
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))

# Add output layer: a single sigmoid unit for the binary target.
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the model structure.
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               5700      
                                                                 
 dense_1 (Dense)             (None, 50)                5050      
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 10,801
Trainable params: 10,801
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Configure training: Adam optimizer with a learning rate of 1e-4, binary
# cross-entropy as the loss, and accuracy as the reported metric.
nn.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [44]:
# Fit the network on the oversampled training data for a fixed number of
# epochs; `fit_model` keeps the object returned by `fit`.
iters = 50
fit_model = nn.fit(x=X_res, y=y_res, epochs=iters)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [45]:
# Score the trained network on the scaled test set (which was not oversampled)
# and report the resulting loss and accuracy.
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f'Loss: {model_loss}, Accuracy: {model_accuracy}')

150/150 - 0s - loss: 0.5927 - accuracy: 0.7213 - 338ms/epoch - 2ms/step
Loss: 0.5927190184593201, Accuracy: 0.7212943434715271


In [46]:
# Get the sigmoid outputs for the test set and turn them into class labels:
# values strictly above 0.5 become 1, everything else becomes 0.
predictions = nn.predict(X_test_scaled, verbose=0)
classes = (predictions > 0.5).astype(int)

In [51]:
# Print per-class precision, recall and F1 for the test set; the predicted
# labels are flattened to one dimension before being compared with `y_test`.
print(classification_report(y_true=y_test, y_pred=classes.ravel()))

              precision    recall  f1-score   support

           0       0.89      0.71      0.79      3596
           1       0.46      0.74      0.57      1194

    accuracy                           0.72      4790
   macro avg       0.68      0.73      0.68      4790
weighted avg       0.79      0.72      0.74      4790

