## General Library Imports

In [None]:
#  Install the Tensorflow library using pip as the Python package manager.
!pip install tensorflow

In [None]:
!pip install keras-tuner

In [None]:
!pip install seaborn --upgrade

In [None]:
!pip install --user matplotlib==3.7.3

In [None]:
!pip install scikeras

In [None]:
# Import the necessary modules
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# ChatGPT - DL
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Deep learning libraries
import tensorflow as tf
import kerastuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, ZeroPadding1D
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model

from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score

from tensorflow.keras.layers import Input, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, concatenate
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import itertools

from scikeras.wrappers import KerasClassifier, KerasRegressor
from sklearn.model_selection import GridSearchCV
from keras.optimizers import Adam
from keras.models import load_model

# libraries for models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

# Libraries for Metrics evaluation
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve, RocCurveDisplay

## Data loading

In [None]:
project_data = pd.read_csv("./dataset/Employee Analysis Attrition Report/HR Employee Attrition.csv")

##  Analysis of the Data

In [None]:
project_data.head()

In [None]:
project_data.info()

In [None]:
project_data.describe()

In [None]:
project_data.shape

In [None]:
project_data.columns

In [None]:
project_data.isna().sum()

## Exploratory Data Analysis

### Univariate Analysis

In [None]:
# Columns stored as int64 are the ones treated as numeric in the EDA plots.
numeric_columns = [
    name for name, dtype in project_data.dtypes.items() if dtype == 'int64'
]
print(numeric_columns)

In [None]:
# One small density plot per numeric column, each on its own figure.
for column in numeric_columns:
    _, axis = plt.subplots(figsize=(3, 2))
    sns.kdeplot(data=project_data, x=column, palette="crest", ax=axis)
    plt.show()

In [None]:
# Every column that is not stored as int64 is treated as categorical.
categorical_columns = [
    name for name, dtype in project_data.dtypes.items() if dtype != 'int64'
]
print(categorical_columns)

In [None]:
# Bar chart of category frequencies, one figure per categorical column.
for column in categorical_columns:
    _, axis = plt.subplots(figsize=(6, 4))
    sns.countplot(x=project_data[column], ax=axis)
    plt.show()

### Bivariate Analysis

In [None]:
# Density of each numeric column, split by the Attrition label.
for column in numeric_columns:
    _, axis = plt.subplots(figsize=(3, 2))
    sns.kdeplot(
        data=project_data,
        x=column,
        hue="Attrition",
        fill=True,
        alpha=.5,
        palette="crest",
        ax=axis,
    )
    plt.show()

## Correlation Analysis

In [None]:
# Integer-encode every categorical column so DataFrame.corr() can include it.
# The encoded values go into a new frame (df1); project_data is left untouched.
encoder = LabelEncoder()
encoded_columns = {
    column: encoder.fit_transform(project_data[column])
    for column in categorical_columns
}
df1 = project_data.assign(**encoded_columns)

# Pairwise correlation of all columns, drawn as an annotated heat map.
corr = df1.corr()
plt.figure(figsize=(30, 12))
sns.heatmap(corr, annot=True, cmap="YlGnBu")

## Data Preprocessing and Pipelining

In [None]:
# Separate the predictors from the target label.
features = project_data.drop(columns=["Attrition"])
target = project_data["Attrition"]

# Hold out 30% of the rows for testing.
# - random_state pins the shuffle so every run (and every model compared later)
#   sees the same split; without it results are not reproducible.
# - stratify keeps the Attrition class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)

In [None]:
print('Train dataset shape:',X_train.shape)
print('Test dataset shape', X_test.shape)

print('Train dataset rows: ', len(X_train))
print('Test dataset rows: ', len(X_test))

In [None]:
# Partition the training columns by dtype: `object` columns are categorical,
# everything else is numeric. Both results are pandas Index objects.
numeric_columns = X_train.select_dtypes(exclude='object').columns
categorical_columns = X_train.select_dtypes(include='object').columns

separator = '*' * 100
print(numeric_columns, separator, categorical_columns, sep='\n')

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_features = Pipeline([
    ('handlingmissingvalues', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler(with_mean=True)),
])

print(numeric_features)
print('*'*100)

# Categorical branch: fill gaps with the most frequent value, one-hot encode,
# then scale. with_mean=False is required because the encoder output is sparse.
# handle_unknown='ignore' makes a category that was not present at fit time
# encode as all zeros instead of raising during transform/predict.
categorical_features = Pipeline([
    ('handlingmissingvalues', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
    ('scaling', StandardScaler(with_mean=False)),
])

print(categorical_features)

# Route each column group (selected in the previous cell) to its branch.
processing = ColumnTransformer([
    ('numeric', numeric_features, numeric_columns),
    ('categorical', categorical_features, categorical_columns),
])

processing

## Generic Methods for Model Preparation & Metric Evaluation

In [None]:
def prepare_model_for_ml(name, algorithm):
    """Fit a preprocessing + SVD + estimator pipeline and pickle it to disk.

    The pipeline is fitted on the module-level ``X_train`` / ``y_train`` and
    uses the module-level ``processing`` transformer as its first step.

    Parameters
    ----------
    name : str
        Label for the algorithm; it becomes part of the pickle file name.
    algorithm : estimator
        Estimator instance used as the final ``'modeling'`` step.

    Returns
    -------
    Pipeline
        The fitted pipeline (the same object that was pickled).
    """
    model = Pipeline(steps=[
        ('processing', processing),
        # The step is called 'pca' but the reducer is TruncatedSVD.
        ('pca', TruncatedSVD(n_components=3, random_state=12)),
        ('modeling', algorithm),
    ])
    model.fit(X_train, y_train)

    # Persist the fitted pipeline in the current working directory.
    filename = "Employee Analysis Attrition Report - "+ name +'.pkl'
    print(filename)
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way through.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    return model

In [None]:
def prepare_model_for_dl(name, algorithm):
    """Build the preprocessing + SVD pipeline for the deep-learning models and pickle it.

    Unlike ``prepare_model_for_ml`` the pipeline has no final estimator and is
    NOT fitted here; it is pickled and returned exactly as constructed.

    Parameters
    ----------
    name : str
        Label that becomes part of the pickle file name.
    algorithm : object
        Accepted for signature parity with ``prepare_model_for_ml``; unused.

    Returns
    -------
    Pipeline
        The unfitted two-step pipeline (``'processing'`` then ``'pca'``).
    """
    model = Pipeline(steps=[
        ('processing', processing),
        # The step is called 'pca' but the reducer is TruncatedSVD.
        ('pca', TruncatedSVD(n_components=3, random_state=12)),
    ])

    # Persist the pipeline in the current working directory.
    filename = "Employee Analysis Attrition Report - "+ name +'.pkl'
    print(filename)
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way through.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    return model

In [None]:
# Row 1100 - No
input_data1 = {
    "Age": [40],
    "BusinessTravel": ['Non-Travel'],
    "DailyRate": [1142],
    "Department": ['Research & Development'],
    "DistanceFromHome": [8],
    "Education": [2],
    "EducationField": ['Life Sciences'],
    "EmployeeCount": [1],
    "EmployeeNumber": [1552],
    "EnvironmentSatisfaction": [4],
    "Gender": ['Male'],
    "HourlyRate": [72],
    "JobInvolvement": [3],
    "JobLevel": [2],
    "JobRole": ['Healthcare Representative'],
    "JobSatisfaction": [4],
    "MaritalStatus": ['Divorced'],
    "MonthlyIncome": [4069],
    "MonthlyRate": [8841],
    "NumCompaniesWorked": [3],
    "Over18": ['Y'],
    "OverTime": ['Yes'],
    "PercentSalaryHike": [18],
    "PerformanceRating": [3],
    "RelationshipSatisfaction": [3],
    "StandardHours": [80],
    "StockOptionLevel": [0],
    "TotalWorkingYears": [8],
    "TrainingTimesLastYear": [2],
    "WorkLifeBalance": [3],
    "YearsAtCompany": [2],
    "YearsInCurrentRole": [2],
    "YearsSinceLastPromotion": [2],
    "YearsWithCurrManager": [2]
}

# Row 999 - Yes
input_data2 = {
    "Age": [27],
    "BusinessTravel": ['Travel_Rarely'],
    "DailyRate": [135],
    "Department": ['Research & Development'],
    "DistanceFromHome": [17],
    "Education": [4],
    "EducationField": ['Life Sciences'],
    "EmployeeCount": [1],
    "EmployeeNumber": [1405],
    "EnvironmentSatisfaction": [4],
    "Gender": ['Female'],
    "HourlyRate": [51],
    "JobInvolvement": [3],
    "JobLevel": [2],
    "JobRole": ['Research Scientist'],
    "JobSatisfaction": [3],
    "MaritalStatus": ['Single'],
    "MonthlyIncome": [2394],
    "MonthlyRate": [25681],
    "NumCompaniesWorked": [1],
    "Over18": ['Y'],
    "OverTime": ['Yes'],
    "PercentSalaryHike": [13],
    "PerformanceRating": [3],
    "RelationshipSatisfaction": [4],
    "StandardHours": [80],
    "StockOptionLevel": [0],
    "TotalWorkingYears": [8],
    "TrainingTimesLastYear": [2],
    "WorkLifeBalance": [3],
    "YearsAtCompany": [8],
    "YearsInCurrentRole": [2],
    "YearsSinceLastPromotion": [7],
    "YearsWithCurrManager": [7]
}

# Row 23 - Yes
input_data3 = {
    "Age": [36],
    "BusinessTravel": ['Travel_Rarely'],
    "DailyRate": [1218],
    "Department": ['Sales'],
    "DistanceFromHome": [9],
    "Education": [4],
    "EducationField": ['Life Sciences'],
    "EmployeeCount": [1],
    "EmployeeNumber": [27],
    "EnvironmentSatisfaction": [3],
    "Gender": ['Male'],
    "HourlyRate": [82],
    "JobInvolvement": [2],
    "JobLevel": [1],
    "JobRole": ['Sales Representative'],
    "JobSatisfaction": [1],
    "MaritalStatus": ['Single'],
    "MonthlyIncome": [3407],
    "MonthlyRate": [6986],
    "NumCompaniesWorked": [7],
    "Over18": ['Y'],
    "OverTime": ['No'],
    "PercentSalaryHike": [23],
    "PerformanceRating": [4],
    "RelationshipSatisfaction": [2],
    "StandardHours": [80],
    "StockOptionLevel": [0],
    "TotalWorkingYears": [10],
    "TrainingTimesLastYear": [4],
    "WorkLifeBalance": [3],
    "YearsAtCompany": [5],
    "YearsInCurrentRole": [3],
    "YearsSinceLastPromotion": [0],
    "YearsWithCurrManager": [3]
}

# Row 53 - Yes
input_data4 = {
    "Age": [28],
    "BusinessTravel": ['Travel_Rarely'],
    "DailyRate": [1434],
    "Department": ['Research & Development'],
    "DistanceFromHome": [5],
    "Education": [4],
    "EducationField": ['Technical Degree'],
    "EmployeeCount": [1],
    "EmployeeNumber": [65],
    "EnvironmentSatisfaction": [3],
    "Gender": ['Male'],
    "HourlyRate": [50],
    "JobInvolvement": [3],
    "JobLevel": [1],
    "JobRole": ['Laboratory Technician'],
    "JobSatisfaction": [3],
    "MaritalStatus": ['Single'],
    "MonthlyIncome": [3441],
    "MonthlyRate": [11179],
    "NumCompaniesWorked": [1],
    "Over18": ['Y'],
    "OverTime": ['Yes'],
    "PercentSalaryHike": [13],
    "PerformanceRating": [3],
    "RelationshipSatisfaction": [3],
    "StandardHours": [80],
    "StockOptionLevel": [0],
    "TotalWorkingYears": [2],
    "TrainingTimesLastYear": [3],
    "WorkLifeBalance": [2],
    "YearsAtCompany": [2],
    "YearsInCurrentRole": [2],
    "YearsSinceLastPromotion": [2],
    "YearsWithCurrManager": [2]
}

# Wrap each hand-written sample row in a one-row DataFrame; the list and the
# four individual names refer to the same DataFrame objects.
input_data = [
    pd.DataFrame(sample)
    for sample in (input_data1, input_data2, input_data3, input_data4)
]
input_data_df1, input_data_df2, input_data_df3, input_data_df4 = input_data

In [None]:
# def plot_confusion_matrix(cm, classes, model_name, title='Confusion matrix', cmap=plt.cm.Blues):
#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()
#     tick_marks = np.arange(len(classes))
#     plt.xticks(tick_marks, classes, rotation=45)
#     plt.yticks(tick_marks, classes)

#     fmt = 'd'
#     thresh = cm.max() / 2.
#     for i in range(cm.shape[0]):
#         for j in range(cm.shape[1]):
#             plt.text(j, i, format(cm[i, j], fmt),
#                      ha="center", va="center",
#                      color="white" if cm[i, j] > thresh else "black")
    
#     plt.tight_layout()
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')

#     print(model_name)
#     plt.show()

# Define a function to plot the confusion matrix
def plot_confusion_matrix(cm, classes, model_name):
    """Draw ``cm`` as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D integer array
        Confusion matrix; each cell is written with integer ('d') formatting.
    classes : sequence of str
        Tick labels used on both axes.
    model_name : str
        Inserted into the plot title.

    The function neither creates a figure nor calls ``plt.show()``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix for {model_name}')
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            text_color = "white" if count > cutoff else "black"
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
def prepare_confusion_matrix(algo, models, X_test, y_test):
    """Plot a 0.5-threshold confusion matrix for every model in ``models``.

    Parameters
    ----------
    algo : str
        Label used in the plot titles.
    models : iterable
        Objects exposing ``.name`` and ``.predict``.
    X_test, y_test
        Evaluation features and true binary labels.

    All matrices are drawn on the single figure created at the top.
    """
    plt.figure(figsize=(4, 3))
    plt.title(f'Confusion Matrix for {algo}')
    print('--------------------------------------------------------------------------------')

    for model in models:
        print("model: ", model)

        # Only models named 'model_*' are fed the feature matrix twice (once
        # per input); every other model gets it once.
        takes_two_inputs = model.name.startswith('model_')
        model_input = [X_test, X_test] if takes_two_inputs else X_test
        y_pred = model.predict(model_input)

        # Binarise the scores at 0.5 before tabulating.
        y_pred_binary = (y_pred > 0.5).astype(int)
        cm = confusion_matrix(y_test, y_pred_binary)
        plot_confusion_matrix(cm, classes=['Negative', 'Positive'], model_name=f'{algo}')

In [None]:
def prepare_evaluation(model, model_name, X_test, y_test):
    """Score one model on the test data, plot its confusion matrix and record its metrics.

    Parameters
    ----------
    model : object exposing ``.name`` and ``.predict``
        Models whose name starts with ``'model'`` are fed ``[X_test, X_test]``;
        every other model is fed ``X_test`` once.
    model_name : str
        Key under which the metrics are stored, and the plot title label.
    X_test, y_test
        Evaluation features and true binary labels (0/1).

    Side effects
    ------------
    Creates a matplotlib figure, draws the confusion matrix on it, and writes
    the metrics into the module-level ``ensemble_metrics`` dict.
    NOTE(review): ``ensemble_metrics`` is not created in this function; it must
    already exist at module level when this is called.
    """
    plt.figure(figsize=(12, 4))

    # Make predictions. The final fallback mirrors prepare_confusion_matrix and
    # prepare_roc_curve; without it a model with any other name left y_pred
    # unbound and the function crashed on the next line.
    if model.name.startswith('sequential'):
        y_pred = model.predict(X_test)
        print('model starts with sequential')
    elif model.name.startswith('model'):
        y_pred = model.predict([X_test, X_test])
    else:
        y_pred = model.predict(X_test)

    # Binarise the scores at 0.5.
    y_pred_binary = (y_pred > 0.5).astype(int)

    # labels=[0, 1] forces a 2x2 matrix even when one class is missing from
    # y_test / the predictions, so the four-way unpack below cannot fail.
    cm = confusion_matrix(y_test, y_pred_binary, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    accuracy = accuracy_score(y_test, y_pred_binary)
    actual_negatives = tn + fp
    # Specificity is undefined when there are no actual negatives.
    specificity = tn / actual_negatives if actual_negatives else float('nan')

    plot_confusion_matrix(cm, classes=['Negative', 'Positive'], model_name=model_name)

    # Store metrics in dictionary
    ensemble_metrics[model_name] = {
        'Accuracy': float(accuracy),
        'F1 Score': float(f1_score(y_test, y_pred_binary)),
        'Precision': float(precision_score(y_test, y_pred_binary)),
        'Recall': float(recall_score(y_test, y_pred_binary)),
        'Support': float(tp + fn),  # number of actual positives
        'Specificity': float(specificity),
        'Misclassification Rate': float(1 - accuracy),
    }

In [None]:
# def prepare_classification_report(algo, model):
#     print(algo+' Report :')
# #     pred = model.predict(X_test)
# #     print(type(input_data_df))
#     pred_testrow1 = model.predict(input_data_df1)
#     pred_testrow2 = model.predict(input_data_df2)
#     pred_testrow3 = model.predict(input_data_df3)
#     pred_testrow4 = model.predict(input_data_df4)

# #     print(classification_report(y_test, pred))
#     print("My predictions: ", pred_testrow1,", ", pred_testrow2,", ", pred_testrow3,", ", pred_testrow4)

In [None]:
# (display name, estimator) pairs; the display name ends up in the pickle file name.
algorithms = [
    ('bagging classifier', BaggingClassifier()),
    ('KNN classifier', KNeighborsClassifier()),
    ('Random Forest calssifier', RandomForestClassifier()),
    ('Adaboost classifier', AdaBoostClassifier()),
    ('Gradientboost classifier', GradientBoostingClassifier()),
    ('MLP', MLPClassifier()),
]

trained_models = []
model_and_score = {}

# Fit and pickle one pipeline per algorithm. `model` is rebound on every pass,
# so after the loop it refers to the pipeline of the last entry.
for algorithm_name, estimator in algorithms:
    model = prepare_model_for_ml(algorithm_name, estimator)
#     model_and_score[tup[0]] = str(model.score(X_train,y_train)*100)+"%"
#     trained_models.append((tup[0],model))

    # model = Pipeline(steps=[
    #     ('processing', processing),  # Assuming you have defined preprocessing steps
    #     ('pca', TruncatedSVD(n_components=3, random_state=12))
    # ])
    

In [None]:
def prepare_roc_curve(model, model_name, X_test, y_test):
    """Plot the ROC curve of one model and show it.

    Parameters
    ----------
    model : object exposing ``.name`` and ``.predict``
        Models whose name starts with ``'model'`` are fed ``[X_test, X_test]``;
        all others are fed ``X_test`` once.
    model_name : str
        Label used in the printout, legend and title.
    X_test, y_test
        Evaluation features and true binary labels.
    """
    plt.figure(figsize=(4, 3))

    print("Model: ", model_name)
    print()

    takes_two_inputs = model.name.startswith('model')
    y_score = model.predict([X_test, X_test] if takes_two_inputs else X_test)

    # Curve of the model plus its area under the curve.
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')

    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Random Guess')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'AUC ROC Curve for {model_name}')
    plt.legend(loc='lower right')
    plt.show()

## Model Preparation

In [None]:
# Changes - uncommented these 2 lines
# trained_models = []
# model_and_score = {}

In [None]:
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
def create_fnn(hp):
    """Build and compile a two-hidden-layer feed-forward binary classifier.

    Parameters
    ----------
    hp : keras-tuner HyperParameters
        Supplies the layer widths, dropout rates and Adam learning rate.

    Returns
    -------
    A compiled Sequential model (binary cross-entropy loss, accuracy metric).

    The input width is read from the module-level ``input_dim``.
    """
    # Sample the hyperparameters in the same order the layers are stacked.
    units_first = hp.Int('units_1', min_value=32, max_value=512, step=32)
    dropout_first = hp.Float('dropout_1', min_value=0.0, max_value=0.5, step=0.1)
    units_second = hp.Int('units_2', min_value=32, max_value=256, step=32)
    dropout_second = hp.Float('dropout_2', min_value=0.0, max_value=0.5, step=0.1)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    network = Sequential([
        Dense(units=units_first, activation='relu', input_dim=input_dim),
        Dropout(rate=dropout_first),
        Dense(units=units_second, activation='relu'),
        Dropout(rate=dropout_second),
        Dense(1, activation='sigmoid'),
    ])

    network.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
def create_wide_and_deep_model(hp):
    """Build and compile a two-input wide-and-deep binary classifier.

    Parameters
    ----------
    hp : keras-tuner HyperParameters
        Supplies the width of the wide layer and of the two deep layers.

    Returns
    -------
    A compiled functional Model taking ``[wide_input, deep_input]``.

    Both inputs have the width of the module-level ``X_train`` (its second
    dimension at the time this function is called).
    """
    n_features = X_train.shape[1]

    wide_inputs = Input(shape=(n_features,))
    deep_inputs = Input(shape=(n_features,))

    # Wide branch: a single dense layer.
    wide_units = hp.Int('wide_units', min_value=32, max_value=256, step=32)
    wide_branch = Dense(units=wide_units, activation='relu')(wide_inputs)

    # Deep branch: two stacked dense layers of tunable width.
    deep_branch = deep_inputs
    deep_specs = (('deep_units_1', 32, 256, 32), ('deep_units_2', 16, 128, 16))
    for hp_name, lowest, highest, stride in deep_specs:
        width = hp.Int(hp_name, min_value=lowest, max_value=highest, step=stride)
        deep_branch = Dense(units=width, activation='relu')(deep_branch)

    # Join both branches and map to a single sigmoid output.
    joined = concatenate([wide_branch, deep_branch])
    prediction = Dense(1, activation='sigmoid')(joined)

    network = Model(inputs=[wide_inputs, deep_inputs], outputs=prediction)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [None]:
# `model` is whichever pipeline that name is bound to when this cell runs
# (presumably the last one produced by the classical-ML training loop); only
# its 'processing' and 'pca' steps are reused here.
feature_preprocessor = model.named_steps['processing']
svd_reducer = model.named_steps['pca']

# Fit the preprocessing on the training rows only, then transform both splits.
feature_preprocessor.fit(X_train)
X_train_transformed = feature_preprocessor.transform(X_train)
X_test_transformed = feature_preprocessor.transform(X_test)

# Same pattern for the TruncatedSVD dimensionality reduction.
svd_reducer.fit(X_train_transformed)
X_train_svd = svd_reducer.transform(X_train_transformed)
X_test_svd = svd_reducer.transform(X_test_transformed)

In [None]:
# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Learn the label -> integer mapping from the training targets, then apply
# that same mapping to the test targets.
y_train_encoded, y_test_encoded = (
    label_encoder.fit_transform(y_train),
    label_encoder.transform(y_test),
)

In [None]:
def create_cnn_model(hp):
    """Build and compile a small 1-D convolutional binary classifier.

    Parameters
    ----------
    hp : keras-tuner HyperParameters
        Supplies the number of convolution filters and the kernel size.

    Returns
    -------
    A compiled Sequential model (adam, binary cross-entropy, accuracy).

    The input shape is taken from the last two dimensions of the module-level
    ``X_train_reshaped``, which must exist when this function is called.
    """
    steps, channels = X_train_reshaped.shape[1], X_train_reshaped.shape[2]

    n_filters = hp.Int('filters', min_value=16, max_value=64, step=16)
    kernel_width = hp.Int('kernel_size', min_value=2, max_value=5, step=1)

    network = Sequential([
        # Convolution + pooling front end.
        Conv1D(filters=n_filters,
               kernel_size=kernel_width,
               padding='same',
               activation='relu',
               input_shape=(steps, channels)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        # Fixed-size dense head with light dropout.
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ])

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return network

In [None]:
# Append a trailing axis of length 1 (new axis at position 2) so the 2-D
# SVD output becomes the 3-D array that Conv1D is fed.
X_train_reshaped = X_train_svd[:, :, np.newaxis]
X_test_reshaped = X_test_svd[:, :, np.newaxis]

# Train the CNN Model
# cnn_model = train_cnn_model(X_train_reshaped, y_train_encoded, X_test_reshaped, y_test_encoded)

In [None]:
# Check the shape of the input arrays
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

# Check the content of the input arrays
print("Sample of X_train:")
print(X_train[:5])  # Print the first 5 rows of X_train to verify its content
print("Sample of y_train:")
print(y_train[:5])  # Print the first 5 elements of y_train to verify its content
print("Sample of X_test:")
print(X_test[:5])   # Print the first 5 rows of X_test to verify its content
print("Sample of y_test:")
print(y_test[:5])   # Print the first 5 elements of y_test to verify its content


In [None]:
# From here on the four split names are rebound to the model-ready arrays:
# the SVD-reduced feature matrices and the label-encoded targets. The original
# objects are no longer reachable under these names, so cells above that
# expect the raw split cannot simply be re-run after this one.
X_train = X_train_svd
y_train = y_train_encoded
X_test = X_test_svd
y_test = y_test_encoded

# Check the content of the input arrays
print("Sample of X_train:")
print(X_train[:5])  # Print the first 5 rows of X_train to verify its content
print("Sample of y_train:")
print(y_train[:5])  # Print the first 5 elements of y_train to verify its content
print("Sample of X_test:")
print(X_test[:5])   # Print the first 5 rows of X_test to verify its content
print("Sample of y_test:")
print(y_test[:5])   # Print the first 5 elements of y_test to verify its content

print("-------- FNN ----------")

# Module-level value read by create_fnn (and create_metamodel_fnn) as the
# width of the first Dense layer's input.
input_dim = X_train.shape[1]

# Random search over the create_fnn hyperparameters, ranked by validation accuracy.
fnn_tuner = kt.RandomSearch(
    create_fnn,
    objective='val_accuracy',
    max_trials=50,
    directory='my_dir',
    project_name='fnn_hyperparameter_tuning')

# NOTE(review): the test split doubles as the validation set during tuning, so
# the accuracies reported below are not measured on data unseen by the search.
fnn_tuner.search(X_train, y_train, epochs=100, validation_data=(X_test, y_test))

best_fnn_model = fnn_tuner.get_best_models(num_models=1)[0]
best_fnn_hyperparameters = fnn_tuner.get_best_hyperparameters(num_trials=1)[0]

best_fnn_model.summary()

# The model returned by the tuner is trained for a further 100 epochs.
best_fnn_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# evaluate() returns [loss, accuracy]; index 1 is the accuracy metric.
best_fnn_accuracy = best_fnn_model.evaluate(X_test, y_test, verbose=0)[1]
print("Best FNN Model Accuracy:", best_fnn_accuracy)
print("Best Hyperparameters:", best_fnn_hyperparameters)

print("-------- Wide and Deep ----------")

wd_tuner = RandomSearch(
    create_wide_and_deep_model,
    objective='val_accuracy',
    max_trials=50,
    directory='my_dir',
    project_name='wide_and_deep_hyperparameter_tuning'
)

wd_tuner.search_space_summary()
# The same feature matrix is passed to both the wide and the deep input.
wd_tuner.search([X_train, X_train], y_train, epochs=100, batch_size=32, validation_data=([X_test, X_test], y_test))

best_wd_model = wd_tuner.get_best_models(num_models=1)[0]
best_wd_hyperparameters = wd_tuner.get_best_hyperparameters(num_trials=1)[0]

# As with the FNN, the tuner's best model is trained for a further 100 epochs.
best_wd_model.fit([X_train, X_train], y_train, epochs=100, batch_size=32, validation_data=([X_test, X_test], y_test))

best_wide_and_deep_accuracy = best_wd_model.evaluate([X_test, X_test], y_test, verbose=0)[1]
print("Best Wide & Deep Model Accuracy:", best_wide_and_deep_accuracy)

print("-------- CNN ----------")

# Add a trailing axis of length 1; create_cnn_model reads its input shape from
# X_train_reshaped.
X_train_reshaped = np.expand_dims(X_train_svd, axis=2)
X_test_reshaped = np.expand_dims(X_test_svd, axis=2)

print("X_train_reshaped shape:", X_train_reshaped.shape)
print("X_test_reshaped shape:", X_test_reshaped.shape)

# Unlike the two models above, the CNN is tuned with Hyperband rather than RandomSearch.
cnn_tuner = kt.Hyperband(create_cnn_model,
                     objective='val_accuracy',
                     max_epochs=50,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

# NOTE(review): Hyperband schedules the epochs of each trial itself (bounded by
# max_epochs above), so the epochs=100 passed here is presumably not what the
# trials actually run for - confirm against the keras-tuner documentation.
cnn_tuner.search(X_train_reshaped, y_train, epochs=100, validation_data=(X_test_reshaped, y_test))

best_cnn_hyperparameters=cnn_tuner.get_best_hyperparameters(num_trials=1)[0]

# Here a new model is built from the best hyperparameters and trained by fit()
# below, instead of taking the tuner's stored best model as the FNN and
# wide-and-deep sections do.
best_cnn_model = cnn_tuner.hypermodel.build(best_cnn_hyperparameters)
cnn_history = best_cnn_model.fit(X_train_reshaped, y_train, epochs=100, validation_data=(X_test_reshaped, y_test))

# Validation accuracy recorded after the final epoch (not the best epoch).
best_cnn_accuracy = cnn_history.history['val_accuracy'][-1]
print("Best CNN Model Accuracy:", best_cnn_accuracy)

# The three trained base models, collected in one list.
models = [best_fnn_model, best_wd_model, best_cnn_model] 

print()
print()
print("Accuracy for FNN:", "{:.2%}".format(best_fnn_accuracy))
print("Accuracy for Wide and Deep:", "{:.2%}".format(best_wide_and_deep_accuracy))
print("Accuracy for CNN:", "{:.2%}".format(best_cnn_accuracy))
print()
print()

In [None]:
# # Use the same dataset for training all models
# X_train_all = X_train  # Assuming X_train is properly preprocessed

# # Split the data consistently
# X_train_fnn, X_train_wd, X_train_cnn = X_train, X_train, X_train_cnn

# # Fit each model with its corresponding training data
# best_fnn_model.fit(X_train_fnn, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))
# best_wd_model.fit([X_train_wd, X_train_wd], y_train, epochs=100, batch_size=32, validation_data=([X_test, X_test], y_test))
# best_cnn_model.fit(X_train_cnn, y_train, epochs=100, batch_size=32, validation_data=(X_test_cnn, y_test))

# # Wrap each model with KerasClassifier/KerasRegressor
# fnn_classifier = KerasClassifier(build_fn=lambda: best_fnn_model, epochs=100, batch_size=32, verbose=0)
# wd_classifier = KerasClassifier(build_fn=lambda: best_wd_model, epochs=100, batch_size=32, verbose=0)
# cnn_classifier = KerasClassifier(build_fn=lambda: best_cnn_model, epochs=100, batch_size=32, verbose=0)

# # Define the voting classifier with the wrapped models
# voting_clf = VotingClassifier(
#     estimators=[('fnn', fnn_classifier), ('wd', wd_classifier), ('cnn', cnn_classifier)],
#     voting='soft'  # Use soft voting for probabilities
# )

# # Fit the voting classifier
# voting_clf.fit(X_train_all, y_train)

# # Evaluate the voting classifier on the test set
# voting_accuracy = voting_clf.score([X_test, X_test, X_test_cnn], y_test)
# print("Voting Classifier Accuracy:", voting_accuracy)


In [None]:
# keras_estimator_fnn = KerasClassifier(build_fn=create_fnn, hp=best_fnn_hyperparameters, epochs=100, batch_size=32, verbose=0)
# keras_estimator_wd = KerasClassifier(build_fn=create_wide_and_deep_model, hp=best_wd_hyperparameters, epochs=100, batch_size=32, verbose=0)
# keras_estimator_cnn = KerasClassifier(build_fn=create_cnn_model, hp=best_cnn_hyperparameters, epochs=100, batch_size=32, verbose=0)

# # Define a list of (name, estimator) tuples for the VotingClassifier
# estimators = [
#     ('fnn', keras_estimator_fnn),
#     # ('wide_and_deep', keras_estimator_wd),
#     ('cnn', keras_estimator_cnn)
# ]

# keras_estimator_fnn.fit(X_train, y_train)
# keras_estimator_cnn.fit(X_train, y_train)

# print("Shape of X_train:", X_train.shape)
# print("Shape of y_train:", y_train.shape)

# X_train_wd = [X_train, X_train]
# print("Shape of X_train_wd[0]:", X_train_wd[0].shape)
# print("Shape of X_train_wd[1]:", X_train_wd[1].shape)

# # Fit the wide and deep model
# # keras_estimator_wd.fit(X_train_wd, y_train)

# # Define a list of (name, estimator) tuples for the VotingClassifier
# estimators = [
#     ('fnn', keras_estimator_fnn),
#     # ('wide_and_deep', keras_estimator_wd),
#     ('cnn', keras_estimator_cnn)
# ]

# voting_classifier = VotingClassifier(estimators=estimators, voting='hard')

# # Fit the VotingClassifier on the training data
# voting_classifier.fit(X_train, y_train)

# # Evaluate the ensemble classifier
# ensemble_accuracy = voting_classifier.score(X_test, y_test)
# print("Ensemble Accuracy:", ensemble_accuracy)

In [None]:
# # y_train = y_train.replace({'Yes': 1, 'No': 0})
# # y_test = np.where(y_test == 'Yes', 1, 0)
# from sklearn.ensemble import VotingClassifier, StackingClassifier

# # Create a voting classifier with the trained models
# voting_classifier = VotingClassifier(estimators=models, voting='hard')

# # Fit the voting classifier on the training data
# voting_classifier.fit(X_train, y_train)

In [None]:
def evaluate_ensemble_model(y_true, y_pred, model_name):
    """Print classification metrics for a set of binary predictions.

    Parameters
    ----------
    y_true : array-like of 0/1
        True labels.
    y_pred : array-like of 0/1
        Predicted labels.
    model_name : str
        Label shown in the header line.

    Nothing is returned; the metrics are only printed.
    """
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    # With 0/1 labels the sum is the number of actual positives.
    support = np.sum(y_true)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    report = (
        ("Accuracy", accuracy),
        ("F1 Score", f1),
        ("Precision", precision),
        ("Recall", recall),
        ("Support", support),
        ("Specificity", tn / (tn + fp)),
        ("Misclassification Rate", 1 - accuracy),
    )

    print(f"Evaluation Metrics for {model_name}:")
    for label, value in report:
        print(f"{label}: {value}")

In [None]:
# def create_metamodel_fnn(input_dim):
#     print(input_dim)
#     model = Sequential([
#         Dense(128, activation='relu', input_dim=input_dim),
#         Dropout(0.2),
#         Dense(64, activation='relu'),
#         Dropout(0.2),
#         Dense(1, activation='sigmoid')  # Use 'sigmoid' activation for binary classification
#     ])
#     model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
#     return model
from keras.callbacks import ReduceLROnPlateau
from keras import regularizers


def create_metamodel_fnn(hp):
    """Build and compile the feed-forward meta-model used for stacking.

    Parameters
    ----------
    hp : keras-tuner HyperParameters
        Supplies the two layer widths, the two dropout rates (0.2-0.5) and the
        Adam learning rate.

    Returns
    -------
    A compiled Sequential model (binary cross-entropy loss, accuracy metric).

    NOTE(review): the input width comes from the module-level ``input_dim``,
    not from the data the meta-model is trained on - confirm that the two agree
    wherever this is called.
    """
    model = Sequential()
    model.add(Dense(units=hp.Int('units_1', min_value=32, max_value=512, step=32),
                    activation='relu',
                    input_dim=input_dim))
    model.add(Dropout(rate=hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(units=hp.Int('units_2', min_value=32, max_value=256, step=32),
                    activation='relu'))
    model.add(Dropout(rate=hp.Float('dropout_2', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    # The ReduceLROnPlateau callback that used to be instantiated here was never
    # attached to anything, so it has been dropped; a callback only takes effect
    # when passed to fit()/search() via `callbacks=[...]`.
    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

def tune_metamodel(X_train, y_train):
    """Random-search the meta-model hyperparameters and return the best set.

    Parameters
    ----------
    X_train, y_train
        Training data for the meta-model; 20% is held out for validation
        during the search.

    Returns
    -------
    The top-ranked HyperParameters object (by validation accuracy).
    """
    search_settings = {
        'objective': 'val_accuracy',
        'max_trials': 150,
        'executions_per_trial': 3,
        'directory': 'stacking_hyperparameter_tuning',
        'project_name': 'stacking',
    }
    searcher = RandomSearch(create_metamodel_fnn, **search_settings)
    searcher.search(X_train, y_train, epochs=100, validation_split=0.2)

    return searcher.get_best_hyperparameters(num_trials=1)[0]

def ensemble_predict(models, X_test, X_train, method):
    """Combine base-model predictions with the requested ensemble method.

    Parameters
    ----------
    models : list
        Trained Keras models. A ``Sequential`` model is fed the feature
        matrix once; any other ``Model`` is fed it twice, as a two-input
        network.
    X_test : array-like
        Features to produce the ensemble prediction for.
    X_train : array-like
        Training features; only used by ``'stacking'`` to build the
        meta-model's training set.
    method : str
        ``'simple_average'``, ``'voting'`` or ``'stacking'``.

    Returns
    -------
    numpy.ndarray
        Integer 0/1 predictions for ``X_test``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names, or a model is
        neither a ``Sequential`` nor a ``Model``.

    Notes
    -----
    The function reads the module-level ``y_train`` and ``y_test``. The
    ``'stacking'`` branch also reads ``ensemble_metrics``, calls
    ``prepare_evaluation`` and writes ``stacking_ensemble_model.h5``.
    """
    if method not in ('simple_average', 'voting', 'stacking'):
        raise ValueError("Invalid ensemble method provided.")

    # Only stacking needs predictions on the training set; skipping them for
    # the other methods avoids a full extra inference pass per model.
    needs_train_predictions = method == 'stacking'

    test_predictions = []
    train_predictions = []

    for model in models:
        # Sequential must be tested first: it is itself a subclass of Model.
        if isinstance(model, Sequential):
            X_test_input = X_test
            X_train_input = X_train
        elif isinstance(model, Model):
            X_test_input = (X_test, X_test)
            X_train_input = (X_train, X_train)
        else:
            raise ValueError("Invalid model type provided.")

        test_predictions.append(model.predict(X_test_input))
        if needs_train_predictions:
            train_predictions.append(model.predict(X_train_input))

    if method == 'simple_average':
        test_predictions = np.array(test_predictions)
        # Average the probabilities over the model axis, then round to 0/1.
        ensemble_predictions = np.round(np.mean(test_predictions, axis=0)).astype(int)
        evaluate_ensemble_model(y_test, ensemble_predictions, "Simple Average")
        return ensemble_predictions

    if method == 'voting':
        test_predictions = np.array(test_predictions)
        num_models = test_predictions.shape[0]
        # Flatten to (num_models, num_samples) so each row stays one model's
        # output. Reshaping straight to (num_samples, num_models) would mix
        # values from different samples.
        per_model = test_predictions.reshape(num_models, -1)
        # Hard vote: each model casts 0/1 at the 0.5 threshold and the
        # positive class wins on a strict majority (ties go to 0).
        hard_votes = (per_model > 0.5).astype(int)
        positive_votes = hard_votes.sum(axis=0)
        final_prediction = (2 * positive_votes > num_models).astype(int).reshape(-1, 1)
        evaluate_ensemble_model(y_test, final_prediction, "Voting")
        return final_prediction

    # method == 'stacking': base-model outputs become the meta-model features,
    # one column per base model.
    meta_X_train = np.concatenate(train_predictions, axis=1)
    meta_X_test = np.concatenate(test_predictions, axis=1)

    best_hps = tune_metamodel(meta_X_train, y_train)
    print(best_hps)
    print("Best Hyperparameters:")
    for param, value in best_hps.values.items():
        print(f"{param}: {value}")

    # Rebuild a fresh model from the winning hyperparameters and train it.
    meta_model = create_metamodel_fnn(best_hps)

    meta_model.fit(meta_X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)
    model_name = 'Stacking'
    prepare_evaluation(meta_model, model_name, meta_X_test, y_test)
    df_ensemble_metrics = pd.DataFrame(ensemble_metrics).T
    print('df_ensemble_metrics')
    print(df_ensemble_metrics)
    print()
    meta_model.save("stacking_ensemble_model.h5")
    meta_predictions = meta_model.predict(meta_X_test)
    return np.round(meta_predictions).astype(int)

# Module-level result holders; later cells read these names.
stacking_accuracy = voting_accuracy = simple_average_accuracy = 0
ensemble_accuracy = 0
ensemble_metrics = {}
meta_model = Sequential()
meta_X_test = []

ensemble_methods = ['stacking', 'voting', 'simple_average']
accuracy_by_method = {}

# Run every ensemble method on the test set and record its accuracy.
for i, method in enumerate(ensemble_methods, 1):

    print(f"\n-------- Ensemble Method: {method} ----------")
    ensemble_predictions = ensemble_predict(models, X_test, X_train, method)
    ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
    ensemble_metrics[method] = ensemble_accuracy
    accuracy_by_method[method] = ensemble_accuracy

    # The accuracy line is printed twice, separated by one blank line.
    accuracy_report = f"Accuracy of {method} method: {ensemble_accuracy}"
    print(accuracy_report)
    print()
    print(accuracy_report)
    # print(f"Predictions for {method} method:", ensemble_predictions)  # Print predictions for each method
    ensemble_accuracy = 0

# Publish the per-method results under their individual names.
stacking_accuracy = accuracy_by_method.get('stacking', stacking_accuracy)
voting_accuracy = accuracy_by_method.get('voting', voting_accuracy)
simple_average_accuracy = accuracy_by_method.get('simple_average', simple_average_accuracy)

# Persist the three base models next to the notebook.
base_model_files = (
    (best_fnn_model, "fnn_model.h5"),
    (best_wd_model, "wide_and_deep_model.h5"),
    (best_cnn_model, "cnn_model.h5"),
)
for trained_model, filename in base_model_files:
    trained_model.save(filename)

# Accuracy summary, framed by two blank lines above and below.
accuracy_summary = (
    ("Accuracy for Stacking:", stacking_accuracy),
    ("Accuracy for Voting:", voting_accuracy),
    ("Accuracy for Simple Average:", simple_average_accuracy),
)
print("\n")
for heading, accuracy_value in accuracy_summary:
    print(heading, f"{accuracy_value:.2%}")
print("\n")

## Model Evaluation

In [None]:
# Inspect `model_and_score` (defined outside this section of the notebook).
print(model_and_score)

In [None]:
# Inspect the list of base models that was passed to ensemble_predict.
print(models)

In [None]:
# Reload the trained models that earlier cells saved as HDF5 (.h5) files
from tensorflow.keras.models import load_model

stacking_model = load_model("./stacking_ensemble_model.h5")
# simple_average_model = np.load("./simple_average_ensemble_model.npy")
# voting_model = np.load("./voting_ensemble_model.npy")

fnn_model = load_model("./fnn_model.h5")
wide_and_deep_model = load_model("./wide_and_deep_model.h5")
cnn_model = load_model("./cnn_model.h5")

# The evaluation cells below work on the in-memory base models; the reloaded
# copies above are kept as the commented-out alternative.
# dl_base_models = [fnn_model, wide_and_deep_model, cnn_model]
models = [best_fnn_model, best_wd_model, best_cnn_model]
dl_base_models = models

# Use the reloaded stacking meta-model here. The module-level `meta_model` is
# still the empty Sequential() placeholder, because ensemble_predict binds the
# trained meta-model to a local variable only and saves it to disk.
dl_ensemble_models = [stacking_model]

In [None]:
# Display the list of ensemble models built in the previous cell
# (an earlier version of this cell showed `trained_models` instead).

dl_ensemble_models

In [None]:
# Display the base models that the evaluation loop iterates over.
dl_base_models

In [None]:
# Display `models`; after the reload cell it is the same list object as dl_base_models.
models

In [None]:
# Length (first dimension) of X_test_svd, the features used to evaluate the base models.
len(X_test_svd)

In [None]:
# Prepare Confusion Matrix

# Encode y_test as integer class labels for the metric functions below
encoder = LabelEncoder()
encoder.fit(y_test)
y_test_encoded = encoder.transform(y_test)

# Define a function to prepare confusion matrix and evaluation metrics for base models
def prepare_evaluation(model, model_name, X_test, y_test):
    """Score a binary classifier on test data and plot its confusion matrix.

    Parameters
    ----------
    model : Sequential or Model
        Trained Keras model. A ``Sequential`` model receives ``X_test`` once;
        any other ``Model`` receives ``[X_test, X_test]`` (two-input network).
    model_name : str
        Key under which the metrics are stored and the title of the plot.
    X_test : array-like
        Test features.
    y_test : array-like
        True labels, encoded as 0 (negative) and 1 (positive).

    Raises
    ------
    ValueError
        If ``model`` is neither a ``Sequential`` nor a ``Model``.

    Notes
    -----
    Nothing is returned. The metrics are written to the module-level
    ``ensemble_metrics[model_name]`` and a new matplotlib figure is created
    for the confusion matrix.
    """
    plt.figure(figsize=(12, 4))

    # Dispatch on the model type, as ensemble_predict does. The previous
    # name-prefix check left y_pred_proba undefined for any model whose name
    # did not start with 'sequential' or 'model'.
    if isinstance(model, Sequential):
        y_pred_proba = model.predict(X_test)  # Default for single-input models
    elif isinstance(model, Model):
        y_pred_proba = model.predict([X_test, X_test])
    else:
        raise ValueError("Invalid model type provided.")

    # Convert probabilities to binary predictions using a threshold of 0.5;
    # flatten the (n, 1) network output to the 1-D shape the metrics expect.
    y_pred_binary = (y_pred_proba > 0.5).astype(int).ravel()

    # Fixing the label set keeps the matrix 2x2 even when only one class
    # occurs, so the four-way unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred_binary, labels=[0, 1])

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_binary)

    # Calculate other metrics
    f1 = f1_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    misclassification_rate = 1 - accuracy

    # Specificity = TN / (TN + FP); reported as 0.0 when there are no
    # negative samples, instead of dividing by zero.
    tn, fp, fn, tp = cm.ravel()
    negatives = tn + fp
    specificity = tn / negatives if negatives else 0.0

    # Update ensemble metrics dictionary
    ensemble_metrics[model_name] = {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Precision': precision,
        'Recall': recall,
        'Support': tp + fn,  # number of true positive-class samples
        'Specificity': specificity,
        'Misclassification Rate': misclassification_rate
    }

    # Plot confusion matrix
    plot_confusion_matrix(cm, classes=['Negative', 'Positive'], model_name=model_name)


df_base_model_metrics = []
ensemble_metrics = {}

# Display names keyed by the 1-based position of each base model.
base_model_labels = {1: 'FNN', 2: 'Wide and Deep', 3: 'CNN'}

# Evaluate every base model; prepare_evaluation fills ensemble_metrics.
for i, base_model in enumerate(dl_base_models, 1):
    print(base_model)
    if i in base_model_labels:
        model_name = base_model_labels[i]
    prepare_evaluation(base_model, model_name, X_test_svd, y_test_encoded)

# One row per model, one column per metric, framed by blank lines.
df_base_model_metrics = pd.DataFrame(ensemble_metrics).transpose()
print(f"\n{df_base_model_metrics}\n")

In [None]:
# Define a function to plot the confusion matrix
def plot_confusion_matrix(cm, classes, model_name):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    Parameters
    ----------
    cm : 2-D array of integer counts
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label'.
    classes : sequence of str
        Tick labels, used for both axes.
    model_name : str
        Inserted into the plot title.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix for {model_name}')
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    # Cells above half the maximum count get white text, the rest black.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            count = cm[row, col]
            text_color = "white" if count > half_max else "black"
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# ensemble_metrics = {}  

# print(dl_ensemble_models)
# for i, ensemble_model in enumerate(dl_ensemble_models, 1):
#     # print(ensemble_model)
#     if i == 1:
#         model_name = 'Stacking'
#     elif i == 2:
#         model_name = 'Voting'
#     elif i == 3:
#         model_name = 'Simple Average'
#     print(model_name)
#     prepare_evaluation(ensemble_model, model_name, meta_X_test, y_test)

# df_ensemble_metrics = pd.DataFrame(ensemble_metrics).T
# print()
# print(df_ensemble_metrics)
# print()

In [None]:
# for index, tup in enumerate(trained_models):
#     prepare_classification_report(tup[0], tup[1])
#     print("\n")

In [None]:
# Prepare ROC curve

# Ensure y_test is encoded properly
encoder = LabelEncoder()
y_test_encoded = encoder.fit_transform(y_test)

# Plot one ROC curve per base model; names and models are paired by position.
# The loop variable is deliberately not called `model`: the classification
# report cell reads `model.named_steps[...]` from a module-level `model`
# (presumably a fitted sklearn Pipeline -- confirm), and looping with that
# name used to replace it with the last Keras model.
roc_model_names = ['FNN', 'Wide and Deep', 'CNN']
for model_name, base_model in zip(roc_model_names, models):
    prepare_roc_curve(base_model, model_name, X_test_svd, y_test_encoded)


# for i, model in enumerate(dl_ensemble_models, 1):
#     print(model)
#     if i == 1:
#         model_name = 'Stacking'
#     elif i == 2:
#         model_name = 'Voting'
#     elif i == 3:
#         model_name = 'Simple Average'
#     prepare_roc_curve(model, model_name, X_test_svd, y_test_encoded)

In [None]:
# Prepare Classification Report

# Run the raw input frames through the two fitted steps of `model`:
# first 'processing', then 'pca'.
input_data = [input_data_df1, input_data_df2, input_data_df3, input_data_df4]

processing_step = model.named_steps['processing']
input_data_transformed = [processing_step.transform(frame) for frame in input_data]

reduction_step = model.named_steps['pca']
input_data_svd = [reduction_step.transform(processed) for processed in input_data_transformed]

# Define a function to convert probabilities to Yes/No labels
def convert_to_yes_no(predictions):
    """Map each prediction to "Yes" when it is above 0.5, otherwise "No".

    Returns a list with one label per element of ``predictions``, in order.
    """
    labels = []
    for score in predictions:
        if score > 0.5:
            labels.append("Yes")
        else:
            labels.append("No")
    return labels

# Empty result lists, one per ensemble method, filled by the loops below
ensemble_predictions = {
    method_key: [] for method_key in ('simple_avg', 'voting', 'stacking')
}

# ensemble_predictions = {
#     'simple_avg': [np.mean([model.predict(svd) for model in dl_ensemble_models_simple_avg], axis=0) for svd in input_data_svd],
#     'voting': [np.mean([model.predict(svd) for model in dl_ensemble_models_voting], axis=0) for svd in input_data_svd],
#     'stacking': [np.mean([model.predict(svd) for model in dl_ensemble_models_stacking], axis=0) for svd in input_data_svd]
# }

# for model in dl_ensemble_models_simple_avg:
#     if isinstance(model, Sequential):  # Adjust 'CNNModel' to the actual class name of your CNN model
#         input_data_reshaped = [np.expand_dims(svd, axis=2) for svd in input_data_svd]
#         predictions = model.predict(input_data_reshaped)
#     else:
#         predictions = [model.predict(svd) for svd in input_data_svd]
#     ensemble_predictions['simple_avg'].append(np.mean(predictions, axis=0))


# for model in dl_ensemble_models_voting:
#     if isinstance(model, Sequential):  # Adjust 'CNNModel' to the actual class name of your CNN model
#         input_data_reshaped = [np.expand_dims(svd, axis=2) for svd in input_data_svd]
#         predictions = model.predict(input_data_reshaped)
#     else:
#         predictions = [model.predict(svd) for svd in input_data_svd]
#     ensemble_predictions['voting'].append(np.mean(predictions, axis=0))

# for model in dl_ensemble_models_stacking:
#     if isinstance(model, Sequential):  # Adjust 'CNNModel' to the actual class name of your CNN model
#         input_data_reshaped = [np.expand_dims(svd, axis=2) for svd in input_data_svd]
#         predictions = model.predict(input_data_reshaped)
#     else:
#         predictions = [model.predict(svd) for svd in input_data_svd]
#     ensemble_predictions['stacking'].append(np.mean(predictions, axis=0))

# Collect predictions from every model of every ensemble group. Each model
# contributes one entry: a list holding one prediction array per input frame.
ensemble_groups = (
    ('simple_avg', dl_ensemble_models_simple_avg),
    ('voting', dl_ensemble_models_voting),
    ('stacking', dl_ensemble_models_stacking),
)

for group_key, group_models in ensemble_groups:
    # The loop variable is not called `model`, so the module-level `model`
    # whose `named_steps` this cell reads earlier is left untouched.
    for ensemble_model in group_models:
        if isinstance(ensemble_model, Sequential):
            # Sequential models get a trailing axis added to their input.
            # Each frame is predicted on its own: a Sequential model takes a
            # single input, and passing the whole list to predict() would be
            # read as several inputs.
            predictions = [ensemble_model.predict(np.expand_dims(svd, axis=2))
                           for svd in input_data_svd]
        else:
            predictions = [ensemble_model.predict(svd) for svd in input_data_svd]
        ensemble_predictions[group_key].append(predictions)

# Turn every stored prediction entry into Yes/No labels, per ensemble method
ensemble_labels = {
    method_key: [convert_to_yes_no(entry) for entry in method_predictions]
    for method_key, method_predictions in ensemble_predictions.items()
}

# Heading printed for each ensemble method, in display order
display_rows = (
    ("Simple Average Ensemble:", 'simple_avg'),
    ("Voting Ensemble:", 'voting'),
    ("Stacking Ensemble:", 'stacking'),
)

# Display the predictions
# NOTE(review): entries are appended once per model, so index `i` below picks
# the i-th model's entry rather than the i-th input frame -- confirm intent.
for i, data in enumerate(input_data):
    print(f"\nPredictions for input_data{i+1}:")
    for heading, method_key in display_rows:
        print(heading, ensemble_labels[method_key][i], " Chance of Leaving: ", ensemble_predictions[method_key][i])


In [None]:
# # Prepare your input data (input_data_df1, input_data_df2, input_data_df3, input_data_df4)
# input_data = [input_data_df1, input_data_df2, input_data_df3, input_data_df4]

# # Preprocess the input data to match the expected input shapes of the models
# # (You might need to adjust this based on your actual preprocessing steps)
# input_data_transformed = [model.named_steps['processing'].transform(data) for data in input_data]
# input_data_svd = [model.named_steps['pca'].transform(data_transformed) for data_transformed in input_data_transformed]

# # Define a function to convert probabilities to Yes/No labels
# def convert_to_yes_no(predictions):
#     return ["Yes" if pred > 0.5 else "No" for pred in predictions]

# # Make predictions using the deep learning ensemble models
# ensemble_predictions = {
#     'simple_avg': [],
#     'voting': [],
#     'stacking': []
# }

# # Iterate over each ensemble method
# for ensemble_method in dl_ensemble_models:
#     method_name = ensemble_method['method']
#     models = ensemble_method['models']
    
#     # Iterate over each model in the ensemble
#     for model in models:
#         if isinstance(model, Sequential):  
#             # Adjust input data shape for sequential models
#             input_data_reshaped = [np.expand_dims(svd, axis=2) for svd in input_data_svd]
#             predictions = model.predict(input_data_reshaped)
#         else:
#             predictions = [model.predict(svd) for svd in input_data_svd]
        
#         ensemble_predictions[method_name].append(predictions)

# # Convert ensemble predictions to Yes/No labels
# ensemble_labels = {
#     'simple_avg': [convert_to_yes_no(predictions) for predictions in ensemble_predictions['simple_avg']],
#     'voting': [convert_to_yes_no(predictions) for predictions in ensemble_predictions['voting']],
#     'stacking': [convert_to_yes_no(predictions) for predictions in ensemble_predictions['stacking']]
# }

# # Display the predictions
# for i, data in enumerate(input_data):
#     print(f"\nPredictions for input_data{i+1}:")
#     print("Simple Average Ensemble:", ensemble_labels['simple_avg'][i], " Chance of Leaving: ", ensemble_predictions['simple_avg'][i])
#     print("Voting Ensemble:", ensemble_labels['voting'][i], " Chance of Leaving: ", ensemble_predictions['voting'][i])
#     print("Stacking Ensemble:", ensemble_labels['stacking'][i], " Chance of Leaving: ", ensemble_predictions['stacking'][i])