In [None]:
# Import libraries

import os

import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation

%matplotlib inline 
sns.set(color_codes=True)

In [None]:
# Read in the .csv file of filtered student_info.
# The file location can be overridden with the FILTERED_DF2_CSV environment variable so the
# notebook is not tied to one machine; when the variable is unset the original path is used.

DATA_CSV_PATH = os.environ.get(
    "FILTERED_DF2_CSV",
    "C:/Users/sinea/OneDrive/Documents OneDrive/06 - CCT Masters in DA/Capstone - 2023/Capstone_Project_2023/Python workings notebooks/filtered_df2.csv",
)
data_df = pd.read_csv(DATA_CSV_PATH)

In [None]:
#data_df.info()

### One-Hot Encoding - categorical data

In [None]:
# One-hot encode the categorical columns 'highest_education', 'age_band' and 'final_result':
# each listed column is replaced by one indicator column per category value.

data_df_encoded = pd.get_dummies(
    data=data_df,
    columns=['highest_education', 'age_band', 'final_result'],
)

In [None]:
# Cast the one-hot indicator columns to int64 (the original note records them as uint8
# straight out of the encoder).

# Indicator column names follow the '<source column>_<category level>' pattern.
columns_to_convert = [
    f'{source}_{level}'
    for source, levels in (
        ('highest_education', (
            'A Level or Equivalent',
            'HE Qualification',
            'Lower Than A Level',
            'Post Graduate Qualification',
        )),
        ('age_band', ('0-35', '35-55', '55<=')),
        ('final_result', ('Distinction', 'Fail', 'Pass', 'Withdrawn')),
    )
    for level in levels
]

# Overwrite the selected columns with their int64 versions
data_df_encoded[columns_to_convert] = data_df_encoded[columns_to_convert].astype('int64')

In [None]:
# Print a summary of the encoded frame (columns, non-null counts, dtypes) to check the encoding and cast
data_df_encoded.info()

### Algorithm 5 - Multilayer Perceptron (MLP)

In [None]:
# machine learning libraries
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# model layers
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

### Hyperparameter Tuning with GridSearchCV

#### References 
1. https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/
2. https://machinelearningmastery.com/use-keras-deep-learning-models-scikit-learn-python/
3. https://www.geeksforgeeks.org/hyperparameter-tuning-using-gridsearchcv-and-kerasclassifier/

In [None]:
# Function to create and return the Keras model

def create_model(optimizer='adam', neurons_layer1=20, neurons_layer2=10, input_dim=None):
    """Build and compile an MLP with two hidden ReLU layers and one sigmoid output.

    Parameters
    ----------
    optimizer : str
        Optimizer passed to ``model.compile`` (values come from ``param_grid``
        during the grid search).
    neurons_layer1 : int
        Number of units in the first hidden layer.
    neurons_layer2 : int
        Number of units in the second hidden layer.
    input_dim : int, optional
        Number of input features. When omitted, the width of the notebook-level
        ``X_train`` frame is used, which is the previous behaviour; in that case
        ``X_train`` must already exist when this function is called.

    Returns
    -------
    The compiled ``Sequential`` model (loss ``binary_crossentropy``, metric
    ``accuracy``).

    Notes
    -----
    The single sigmoid output with binary cross-entropy is a binary-classification
    setup. NOTE(review): confirm every target this is trained on is encoded as 0/1;
    otherwise switch to 'softmax' / 'categorical_crossentropy' as hinted below.
    """
    # Fall back to the global training frame only when the caller gave no explicit width
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = Sequential()

    # Hidden layer 1 with neuron count taken from the param_grid values, and relu as the activation function
    model.add(Dense(neurons_layer1, input_dim=input_dim, activation='relu'))

    # Hidden layer 2 with neuron count taken from the param_grid values, and relu as the activation function
    model.add(Dense(neurons_layer2, activation='relu'))

    # Final layer with 1 neuron, and sigmoid as the activation function
    # use 'softmax' for multiclass classification
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    # use 'categorical_crossentropy' for multiclass classification
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

In [None]:
# Expose create_model through the scikit-learn estimator interface;
# epochs and batch_size given here are the defaults before any grid override.
model = KerasClassifier(
    build_fn=create_model,
    epochs=10,
    batch_size=32,
    verbose=0,
)


In [None]:
# Hyperparameter search space: optimizer, batch size, hidden-layer widths and epoch count
param_grid = dict(
    optimizer=['SGD', 'RMSprop', 'Adam'],
    batch_size=[16, 32, 64],
    neurons_layer1=[10, 20, 30],
    neurons_layer2=[5, 10, 15],
    epochs=[10, 20],
)

In [None]:
# Build one (features, target) pair per prediction target.
# In every pair the target column itself is removed from the feature frame.

# target 1 = gender
y = data_df_encoded['gender']
X = data_df_encoded.drop('gender', axis='columns')

# target 2 = studied_credits
y2 = data_df_encoded['studied_credits']
X2 = data_df_encoded.drop('studied_credits', axis='columns')

# target 3 = tenure
y3 = data_df_encoded['tenure']
X3 = data_df_encoded.drop('tenure', axis='columns')

In [None]:
# Hold out 20% of the rows as a test set for each target; random_state=0 keeps the split repeatable

(X_train, X_test,
 y_train, y_test) = train_test_split(
    X, y,
    test_size=0.2, random_state=0,
)
(X_train2, X_test2,
 y_train2, y_test2) = train_test_split(
    X2, y2,
    test_size=0.2, random_state=0,
)
(X_train3, X_test3,
 y_train3, y_test3) = train_test_split(
    X3, y3,
    test_size=0.2, random_state=0,
)

### Fit and tune the model for variable 1 = gender

In [None]:
# Search every combination in param_grid with 3-fold cross-validation on the gender training split
grid = GridSearchCV(model, param_grid, cv=3)
grid_result = grid.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the hyperparameters that produced it
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

In [None]:
# Rebuild a fresh network from the architecture/optimizer settings the gender search selected
best_model = create_model(**{
    name: grid_result.best_params_[name]
    for name in ('optimizer', 'neurons_layer1', 'neurons_layer2')
})

In [None]:
# Train the rebuilt network on the gender training split with the tuned epochs and batch size
best_model.fit(
    X_train,
    y_train,
    epochs=grid_result.best_params_['epochs'],
    batch_size=grid_result.best_params_['batch_size'],
    verbose=1,
)

In [None]:
# Compute loss and accuracy on the held-out gender test split
score = best_model.evaluate(x=X_test, y=y_test, verbose=0)

In [None]:
# score holds [loss, accuracy] in the order produced by evaluate()
print(
    f'Test loss for gender: {score[0]}',
    f'Test accuracy gender: {score[1]}',
    sep='\n',
)


### Fit and tune the model for variable 2 = studied_credits

In [None]:
# Perform grid search using GridSearchCV for the studied_credits target

grid2 = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)

# Fit grid2 on the studied_credits training split (X_train2 / y_train2).
# Fitting the gender search object `grid` on X_train / y_train here would only repeat
# the gender search and report its result under the studied_credits name.
grid_result2 = grid2.fit(X_train2, y_train2)

In [None]:
# Report the best cross-validated score and hyperparameters stored in grid_result2
print(f"Best: {grid_result2.best_score_:f} using {grid_result2.best_params_}")

In [None]:
# Create the best model for studied_credits using the hyperparameters from its own
# search result (grid_result2), not the gender result (grid_result)
best_model2 = create_model(optimizer=grid_result2.best_params_['optimizer'], 
                          neurons_layer1=grid_result2.best_params_['neurons_layer1'],
                          neurons_layer2=grid_result2.best_params_['neurons_layer2'])

In [None]:
# Fit the studied_credits model; epochs and batch size come from grid_result2
# (the search result for this target) rather than from the gender search
best_model2.fit(X_train2, y_train2, epochs=grid_result2.best_params_['epochs'], 
               batch_size=grid_result2.best_params_['batch_size'], verbose=1)

In [None]:
# Compute loss and accuracy on the held-out studied_credits test split
score2 = best_model2.evaluate(x=X_test2, y=y_test2, verbose=0)

In [None]:
# score2 holds [loss, accuracy] in the order produced by evaluate()
print(
    f'Test loss for studied_credits: {score2[0]}',
    f'Test accuracy studied_credits: {score2[1]}',
    sep='\n',
)


### Fit and tune the model for variable 3 = tenure

In [None]:
# Search every combination in param_grid with 3-fold cross-validation on the tenure training split
grid3 = GridSearchCV(model, param_grid, cv=3)
grid_result3 = grid3.fit(X_train3, y_train3)

In [None]:
# Report the best cross-validated score and hyperparameters stored in grid_result3
print(f"Best: {grid_result3.best_score_:f} using {grid_result3.best_params_}")

In [None]:
# Create the best model for tenure using the hyperparameters from its own
# search result (grid_result3), not the gender result (grid_result)
best_model3 = create_model(optimizer=grid_result3.best_params_['optimizer'], 
                          neurons_layer1=grid_result3.best_params_['neurons_layer1'],
                          neurons_layer2=grid_result3.best_params_['neurons_layer2'])

In [None]:
# Fit the tenure model; epochs and batch size come from grid_result3
# (the search result for this target) rather than from the gender search
best_model3.fit(X_train3, y_train3, epochs=grid_result3.best_params_['epochs'], 
               batch_size=grid_result3.best_params_['batch_size'], verbose=1)

In [None]:
# Evaluate the tenure model on the tenure test split (X_test3 / y_test3), i.e. the hold-out
# that matches the data best_model3 was trained on; X_test / y_test belong to the gender target
score3 = best_model3.evaluate(X_test3, y_test3, verbose=0)

In [None]:
# score3 holds [loss, accuracy] in the order produced by evaluate()
print(
    f'Test loss for tenure: {score3[0]}',
    f'Test accuracy tenure: {score3[1]}',
    sep='\n',
)

### Plotting the Output

In [None]:
# fit the model and store the output for graphing
#history = model.fit(X_train, y_train, epochs=10, batch_size=32)
#history2 = model.fit(X_train2, y_train2, epochs=10, batch_size=32)
#history3 = model.fit(X_train3, y_train3, epochs=10, batch_size=32)

In [None]:
# Plotting the accuracy
#plt.figure(figsize=(5,5))
#plt.plot(history.history['accuracy'])
#plt.title('Model Accuracy')
#plt.ylabel('Accuracy')
#plt.xlabel('Epoch')
#plt.legend(['Train'])


In [None]:
# Plotting the loss
#plt.figure(figsize=(5,5))
#plt.plot(history.history['loss'])
#plt.title('Model Loss')
#plt.ylabel('Loss')
#plt.xlabel('Epoch')
#plt.legend(['Train'])
#plt.show()