In [1]:
# Import libraries

import pandas as pd
import numpy as np

import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation

%matplotlib inline 
sns.set(color_codes=True)

# create tables
from tabulate import tabulate

### Import Test data file

In [2]:
# read in the .csv file - of filtered student_info

data_df = pd.read_csv("test_data.csv")

In [3]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749 entries, 0 to 748
Data columns (total 16 columns):
 #   Column                                         Non-Null Count  Dtype
---  ------                                         --------------  -----
 0   id_student                                     749 non-null    int64
 1   gender                                         749 non-null    int64
 2   studied_credits                                749 non-null    int64
 3   tenure                                         749 non-null    int64
 4   highest_education_A Level or Equivalent        749 non-null    int64
 5   highest_education_HE Qualification             749 non-null    int64
 6   highest_education_Lower Than A Level           749 non-null    int64
 7   highest_education_No Formal quals              749 non-null    int64
 8   highest_education_Post Graduate Qualification  749 non-null    int64
 9   age_band_0-35                                  749 non-null    int64
 10  ag

### Algorithm 5 - MPL

In [4]:
# machine learning libraries
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# model layers
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
#from scikeras.wrappers import KerasClassifier, KerasRegressor

In [5]:
# Function to create and return the Keras model

def create_model(optimizer='adam', neurons_layer1=20, neurons_layer2=10):
    
    model = Sequential()
    
    # Hidden layer 1 with neuron count taken from the param_grid values, and relu as the activation function
    model.add(Dense(neurons_layer1, input_dim=X_train.shape[1], activation='relu'))
    
    # Hidden layer 2 with neuron count taken from the param_grid values, and relu as the activation function
    model.add(Dense(neurons_layer2, activation='relu'))
    
    # Final layer with 1 neuron, and sigmoid as the activation function
    # use 'softmax' for multiclass classification
    model.add(Dense(1, activation='sigmoid'))
    
    # compile the model
    # use 'categorical_crossentropy' for multiclass classification
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

In [6]:
# wrap the model using KerasClassifier 

model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)
#model = scikeras(build_fn=create_model, epochs=10, batch_size=32, verbose=0)

  model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)


### Hyperparm tuning - GridSearch CV

In [7]:
# Define hyperparameter grid
param_grid = {
    'optimizer': ['SGD', 'RMSprop', 'Adam'],
    'batch_size': [16, 32, 64],
    'neurons_layer1': [10, 20, 30],
    'neurons_layer2': [5, 10, 15],
    'epochs': [10, 20]
}

In [8]:
# divide the dataset into features (X) and the target variable (y)

# variable 1 = gender
X = data_df.drop(columns=['gender'])  # drop target variable from the dataframe
y = data_df['gender']        # id target variable 

# variable 2 = studied_credits
X2 = data_df.drop(columns=['studied_credits'])
y2 = data_df['studied_credits']

# variable 3 = tenure
X3 = data_df.drop(columns=['tenure'])
y3 = data_df['tenure']

In [9]:
# split the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=0)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=0)

### Fit and tune with model for variable 1 = gender

In [10]:
# Perform grid search using GridSearchCV

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)

grid_result = grid.fit(X_train, y_train)

In [11]:
# Print the best hyperparameters

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.808065 using {'batch_size': 16, 'epochs': 10, 'neurons_layer1': 10, 'neurons_layer2': 5, 'optimizer': 'SGD'}


In [12]:
# Create the best model using the best hyperparameters
best_model = create_model(optimizer=grid_result.best_params_['optimizer'], 
                          neurons_layer1=grid_result.best_params_['neurons_layer1'],
                          neurons_layer2=grid_result.best_params_['neurons_layer2'])

In [13]:
# Fit the best model
best_model.fit(X_train, y_train, epochs=grid_result.best_params_['epochs'], 
               batch_size=grid_result.best_params_['batch_size'], verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x252202fc0d0>

In [14]:
# Evaluate the model on test data
score = best_model.evaluate(X_test, y_test, verbose=0)

In [15]:
print(f'Test loss for gender: {score[0]}')
print(f'Test accuracy gender: {score[1]}')


Test loss for gender: 0.5401294827461243
Test accuracy gender: 0.7933333516120911


### Fit and tune with model for variable 2 - studied_credits¶

In [16]:
# Perform grid search using GridSearchCV

grid2 = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)

grid_result2 = grid.fit(X_train, y_train)

In [17]:
# Print the best hyperparameters

print("Best: %f using %s" % (grid_result2.best_score_, grid_result2.best_params_))

Best: 0.808065 using {'batch_size': 16, 'epochs': 10, 'neurons_layer1': 10, 'neurons_layer2': 5, 'optimizer': 'SGD'}


In [18]:
# Create the best model using the best hyperparameters
best_model2 = create_model(optimizer=grid_result.best_params_['optimizer'], 
                          neurons_layer1=grid_result.best_params_['neurons_layer1'],
                          neurons_layer2=grid_result.best_params_['neurons_layer2'])

In [19]:
# Fit the best model
best_model2.fit(X_train2, y_train2, epochs=grid_result.best_params_['epochs'], 
               batch_size=grid_result.best_params_['batch_size'], verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2523097eac0>

In [20]:
# Evaluate the model on test data
score2 = best_model2.evaluate(X_test2, y_test2, verbose=0)

In [21]:
print(f'Test loss for studied_credits: {score2[0]}')
print(f'Test accuracy studied_credits: {score2[1]}')


Test loss for studied_credits: -4097.91162109375
Test accuracy studied_credits: 0.0


### Fit and tune with model for variable 3 - tenure

In [22]:
# Perform grid search using GridSearchCV

grid3 = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)

grid_result3 = grid3.fit(X_train3, y_train3)

In [23]:
# Print the best hyperparameters

print("Best: %f using %s" % (grid_result3.best_score_, grid_result3.best_params_))

Best: 0.068451 using {'batch_size': 32, 'epochs': 10, 'neurons_layer1': 30, 'neurons_layer2': 5, 'optimizer': 'SGD'}


In [24]:
# Create the best model using the best hyperparameters
best_model3 = create_model(optimizer=grid_result.best_params_['optimizer'], 
                          neurons_layer1=grid_result.best_params_['neurons_layer1'],
                          neurons_layer2=grid_result.best_params_['neurons_layer2'])

In [25]:
# Fit the best model
best_model3.fit(X_train3, y_train3, epochs=grid_result.best_params_['epochs'], 
               batch_size=grid_result.best_params_['batch_size'], verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25239e7eeb0>

In [26]:
# Evaluate the model on test data
score3 = best_model3.evaluate(X_test, y_test, verbose=0)

In [27]:
print(f'Test loss for tenure: {score3[0]}')
print(f'Test accuracy tenure: {score3[1]}')

Test loss for tenure: 6.608107089996338
Test accuracy tenure: 0.7933333516120911


### Display the results of all 3 tests

In [28]:
algorithm_5_results = {'gender': [{score[1]}], 
        'studied_credits': [{score2[1]}], 
        'Tenure':[{score3[1]}]}

print(tabulate(algorithm_5_results, headers='keys', tablefmt='fancy_grid'))

╒══════════════════════╤═══════════════════╤══════════════════════╕
│ gender               │ studied_credits   │ Tenure               │
╞══════════════════════╪═══════════════════╪══════════════════════╡
│ {0.7933333516120911} │ {0.0}             │ {0.7933333516120911} │
╘══════════════════════╧═══════════════════╧══════════════════════╛
