#### **1. Import and Load Dataset**

In [1]:
# Importing necessary libraries
import xgboost as xgb  # For fast and effective gradient boosting, excelling in classification and regression tasks, especially with large datasets.
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For creating plots and visualizations
import numpy as np  # For numerical computations
import seaborn as sns  # For statistical data visualization
import tensorflow as tf  # For building and training neural networks
from sklearn.impute import KNNImputer  # For imputing missing values using KNN algorithm
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV  # For splitting data into train and test sets, and for hyperparameter tuning
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder  # For encoding categorical variables, scaling numerical features to a specified range, and imputing missing values
from sklearn.linear_model import LogisticRegression  # For building logistic regression model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report  # For evaluating model performance using various metrics
from ucimlrepo import fetch_ucirepo  # For fetching datasets from UCI Machine Learning Repository
from sklearn.svm import SVC  # For building Support Vector Machine models
from sklearn.ensemble import RandomForestClassifier  # For building Random Forest Classifier models
from sklearn.neural_network import MLPClassifier  # For building Multilayer Perceptron Classifier models
from tensorflow.keras.models import Sequential  # For building sequential neural network models
from tensorflow.keras.layers import Dense  # For adding dense layers to neural network models
from scikeras.wrappers import KerasClassifier  # For using Keras models with scikit-learn interface
from sklearn.compose import ColumnTransformer  # For applying transformations to different columns in the dataset
from scipy.stats import randint  # For generating random integer values

ModuleNotFoundError: No module named 'ucimlrepo'

In [None]:
# Same import as in the first cell; repeated here so this cell does not depend on it.
from ucimlrepo import fetch_ucirepo  # For fetching datasets from the UCI Machine Learning Repository

# Fetch dataset
spambase = fetch_ucirepo(id=94)  # Fetch the Spambase dataset with ID 94

# Data (as pandas dataframes)
# NOTE: the names X and y are rebound later (after imputation) to the engineered features/target.
X = spambase.data.features  # Features of the Spambase dataset
y = spambase.data.targets  # Target labels of the Spambase dataset

# Metadata
print(spambase.metadata)  # Print metadata information about the Spambase dataset

# Variable information
print(spambase.variables)  # Print information about the variables/features in the Spambase dataset


In [None]:
# Build the working frame: features and target side by side (axis=1).
# join='inner' keeps only the row labels present in both X and y.
df = pd.concat([X, y], axis=1, join='inner')
display(df)  # Rich display of the combined DataFrame

#### **2. EDA**

In [None]:
# Quick sanity check of the combined frame: plain-text dump of the first rows (head() defaults to 5)
print(df.head())

In [None]:
df.isna().sum()  # Per-column count of missing values in the raw combined frame

In [None]:
# Viewing general information about the dataframe (column names, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
df.info()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns, if any
print(df.describe(include='all'))

In [None]:
# Collect the names of the float/int columns of df (the result is a pandas Index, not a list).
# NOTE: `numerical` is rebound in the Feature Engineering section using np.number.
numerical = df.select_dtypes(include=['float', 'int']).columns

In [None]:
# Subset of df restricted to the numerical columns; used below to drive the per-feature plots
numerical_df = df[numerical]

In [None]:
# One figure per numerical column: each value plotted against its row position.
# This is a value-vs-index scatter (useful for spotting extreme values), not a histogram.
for feature in numerical_df.columns:
    plt.figure(figsize=(10, 6))
    plt.scatter(range(len(df)), df[feature], alpha=0.7)  # x = row position, y = raw feature value
    plt.title(f'Distribution of {feature}')  # Title of the plot
    plt.xlabel('Index')  # X-axis label
    plt.ylabel(feature)  # Y-axis label
    plt.show()  # Render this figure before starting the next one

#### **3. Feature Engineering**

**Prepare the features set and target variable**

In [None]:
# Show the rows whose target label ("Class") is missing; an empty result means every row is labelled
df[df["Class"].isna()]

In [None]:
# Fraction (0.0-1.0) of missing values per numerical column: isna() gives booleans, mean() averages them
df[numerical].isna().mean()

In [None]:
# Check the dtype of every column in the DataFrame
df.dtypes

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numerical columns only
df[numerical].describe()

In [None]:
# Numerical-only selection of df.
# NOTE: this name is rebound a few cells below with an explicit .copy(); that later copy is the one
# that gets modified by the outlier-masking step.
df_numeric = df[numerical]

In [None]:
# Fail fast with a clear message if the target column is missing, since every later step reads 'Class'
if 'Class' not in df.columns:
    raise KeyError("The 'Class' column is not present in the DataFrame")

# Rows whose target label is NaN (kept in class_nan_rows for inspection)
class_nan_rows = df[df["Class"].isna()]
print("Class NaN Rows:\n", class_nan_rows)

In [None]:
# Rebind `numerical` as a plain list of every numeric column (np.number covers all numeric dtypes),
# then compute the fraction of missing values per column.
# NOTE(review): df still contains the target, so if 'Class' is numeric it is part of `numerical`
# and is carried into the outlier and imputation steps below - confirm this is intended.
numerical = df.select_dtypes(include=[np.number]).columns.tolist()
missing_proportion = df[numerical].isna().mean()
print("Missing Proportion:\n", missing_proportion)

In [None]:
# Descriptive statistics for the numerical columns, printed as plain text
print("Descriptive Statistics:\n", df[numerical].describe())

In [None]:
# Independent copy of the numerical columns: the outlier step writes NaN into df_numeric in place,
# and .copy() keeps those writes from touching (or warning about) the original df.
df_numeric = df[numerical].copy()


In [None]:
# IQR-based outlier fences for a single column
def calculate_outliers_bound(df: pd.DataFrame, col: str) -> (float, float):
    """
    Compute the Tukey fences of one numerical column.

    The fences sit 1.5 interquartile ranges below the first quartile and
    1.5 interquartile ranges above the third quartile; values outside them
    are conventionally treated as outliers.

    Parameters:
        df (pd.DataFrame): Frame that holds the column.
        col (str): Name of the numerical column to analyse.

    Returns:
        (float, float): ``(lower_bound, upper_bound)``. When the two quartiles
        coincide (IQR of 0) both bounds equal that quartile value.
    """
    column = df[col]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    return (first_quartile - whisker, third_quartile + whisker)


In [None]:
# Detect outliers using the Interquartile Range (IQR) method and replace them with NaN values
# so that the imputation step can fill them in afterwards.
outliers_bound = {col: calculate_outliers_bound(df_numeric, col) for col in numerical}
for col in numerical:
    # The target is a label, not a measurement: never null it out. (With an imbalanced
    # target the IQR can be 0, which would flag every minority label as an "outlier".)
    if col == 'Class':
        continue
    lower_bound, upper_bound = outliers_bound[col]
    df_numeric.loc[(df_numeric[col] < lower_bound) | (df_numeric[col] > upper_bound), col] = np.nan

# NOTE(review): for any feature whose 25th and 75th percentiles are equal (IQR == 0), both bounds
# collapse to that value and every differing entry is masked above. For sparse features this can
# remove most of the informative values - confirm this is the intended treatment.

# Check the number of missing values for each numerical column after outlier detection
print("Missing values before imputation:\n", df_numeric.isna().sum())



In [None]:
# Using KNNImputer to fill the NaNs left by the outlier-masking step.
# Only feature columns are passed to the imputer: including the target would let the label
# influence the neighbour search and leak into the imputed feature values.
feature_cols = [col for col in numerical if col != 'Class']
imputer = KNNImputer()
df_imputed_array = imputer.fit_transform(df_numeric[feature_cols])
# Keep the original row labels so later joins/plots line up with df
df_imputed = pd.DataFrame(df_imputed_array, columns=feature_cols, index=df_numeric.index)

# NOTE(review): the imputer is fitted on all rows before the train/test split further down,
# so test rows contribute to the imputed values - consider fitting on the training split only.

# Add the untouched 'Class' column from the original frame as the last column
df_imputed['Class'] = df['Class'].values

# Checking the distribution of the target variable
frequencies = df_imputed['Class'].value_counts()
proportions = df_imputed['Class'].value_counts(normalize=True) * 100  # percentages
print(frequencies)
print(proportions)

In [None]:
# Preview the first rows of the engineered frame (imputed numerical columns plus 'Class')
print("Here's the engineered DataFrame:\n", df_imputed.head())


In [None]:
# Same value-vs-index scatter as in the EDA section, now on the imputed data, so the effect of
# outlier masking + imputation can be compared per column. One figure per entry of `numerical`.
for feature in numerical:
    plt.figure(figsize=(10, 6))
    plt.scatter(df_imputed.index, df_imputed[feature], alpha=0.7)  # x = row label, y = imputed value
    plt.title(f'Distribution of {feature}')
    plt.xlabel('Index')
    plt.ylabel(feature)
    plt.show()  # Render this figure before starting the next one

In [None]:
# Assigning the target variable.
# NOTE: this rebinds y (and X below), which previously held the raw frames from the fetch cell.
y = df_imputed["Class"]

# Removing the target column from the feature dataset
X = df_imputed.drop(["Class"], axis=1)

# Counting the frequency and proportion (in percent) of each category
frequencies = y.value_counts()
proportions = y.value_counts(normalize=True) * 100
print(frequencies)
print(proportions)

In [None]:
# Bar chart of the number of rows per target class, to visualise class balance
sns.countplot(x=y)
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.title('Distribution of Class')
plt.show()

#### **4. Feature Scaling**

In [None]:

def scale_numerical_features(X_train, X_test, numerical_cols):
    """
    Standardise the selected columns with a StandardScaler fitted on the training split.

    The scaler learns its statistics from ``X_train`` only and the same
    transformation is then applied to ``X_test``, so no information from the
    test split enters the fit. Neither input frame is modified.

    Parameters:
    X_train (pd.DataFrame): Training features.
    X_test (pd.DataFrame): Testing features.
    numerical_cols (list): Names of the columns to scale; only these columns
        appear in the returned frames.

    Returns:
    pd.DataFrame: Scaled training columns, indexed like ``X_train``.
    pd.DataFrame: Scaled testing columns, indexed like ``X_test``.
    """
    scaler = StandardScaler()

    # Fit on train, then reuse the fitted statistics for test
    train_values = scaler.fit_transform(X_train[numerical_cols])
    test_values = scaler.transform(X_test[numerical_cols])

    # The scaler returns bare arrays; restore column names and the original row labels
    scaled_train = pd.DataFrame(train_values, columns=numerical_cols, index=X_train.index)
    scaled_test = pd.DataFrame(test_values, columns=numerical_cols, index=X_test.index)
    return scaled_train, scaled_test

In [None]:
# Columns to scale: every float64/int64 column of the feature frame
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns

# Hold out 20% of the rows for testing; random_state fixes the shuffle for reproducibility.
# NOTE(review): the split is not stratified on y - consider stratify=y if class balance matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features (scaler fitted on the training split only)
X_train_scaled_df, X_test_scaled_df = scale_numerical_features(X_train, X_test, numerical_cols)

# Verify the scaled features
print("Scaled training data:\n", X_train_scaled_df.head())
print("Scaled testing data:\n", X_test_scaled_df.head())


#### **5. Model Training**

In [None]:
# Create an empty results table: one row per evaluation metric, one column per model (added later).
# The row order matches the order in which the model cells supply their values
# (accuracy, precision, recall, F1), so positional assignment labels each value correctly.
df_final_test = pd.DataFrame(index=['Accuracy', 'Precision', 'Recall', 'F1-score'])

##### **Random Forest**

**Without Feature Importance**

In [None]:
# Initialize the RandomForestClassifier (fixed seed for reproducible trees)
rf_model = RandomForestClassifier(random_state=42)

# Define a refined parameter distribution.
# scipy's randint(low, high) samples integers in [low, high), i.e. `high` itself is excluded.
rf_param_dist_refined = {
    'n_estimators': randint(150, 250),
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': randint(2, 6),
    'min_samples_leaf': randint(1, 3),
    'bootstrap': [False],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': [None]
}

# Re-initialize and run RandomizedSearchCV with refined parameters
rf_random_search_refined = RandomizedSearchCV(estimator=rf_model,
                                              param_distributions=rf_param_dist_refined,
                                              n_iter=100,  # 100 sampled candidates x 5 folds = 500 fits
                                              cv=5,
                                              n_jobs=-1,  # use all available cores
                                              verbose=2,
                                              random_state=42)
rf_random_search_refined.fit(X_train_scaled_df, y_train)

# Best parameters and model evaluation on the held-out test split
print("Best parameters for Random Forest (Refined Randomized Search):", rf_random_search_refined.best_params_)
rf_best_model_refined = rf_random_search_refined.best_estimator_
y_pred_rf_refined = rf_best_model_refined.predict(X_test_scaled_df)
print("\nRandom Forest (Refined Randomized Search) Classification Report:")
print(classification_report(y_test, y_pred_rf_refined))
print("Random Forest (Refined Randomized Search) Accuracy:", accuracy_score(y_test, y_pred_rf_refined))

In [None]:
# Calculating evaluation metrics for the Random Forest model tuned on all features.
# (The "_reduced" suffix is historical; these values describe the all-features model.)
accuracy_rf_reduced = accuracy_score(y_test, y_pred_rf_refined)  # Calculating accuracy
precision_rf_reduced = precision_score(y_test, y_pred_rf_refined)  # Calculating precision
recall_rf_reduced = recall_score(y_test, y_pred_rf_refined)  # Calculating recall
f1_rf_reduced = f1_score(y_test, y_pred_rf_refined)  # Calculating F1 score

# Printing evaluation metrics
print("Random Forest Accuracy with All Features (Tuned):", accuracy_rf_reduced)
print("Random Forest Precision with All Features (Tuned):", precision_rf_reduced)
print("Random Forest Recall with All Features (Tuned):", recall_rf_reduced)
print("Random Forest F1 Score with All Features (Tuned):", f1_rf_reduced)

# Appending evaluation metrics to 'df_final_test' for visualization.
# A label-keyed Series is aligned on df_final_test's row labels, so each value lands in the
# row named after its metric regardless of the order of the table's index.
df_final_test["Random Forest with All Features (Tuned)"] = pd.Series({
    'Accuracy': accuracy_rf_reduced,
    'Precision': precision_rf_reduced,
    'Recall': recall_rf_reduced,
    'F1-score': f1_rf_reduced,
})


**Feature Importance**

In [None]:
# Define a more comprehensive parameter distribution for the feature-importance run.
# scipy's randint(low, high) samples integers in [low, high).
rf_param_dist_extended = {
    'n_estimators': randint(100, 1000),
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2']
}

# Initialize and run RandomizedSearchCV with more iterations
# (200 sampled candidates x 5 folds = 1000 fits; this cell is expensive to re-run)
rf_model = RandomForestClassifier(random_state=42)
rf_random_search_extended = RandomizedSearchCV(estimator=rf_model,
                                               param_distributions=rf_param_dist_extended,
                                               n_iter=200,
                                               cv=5,
                                               n_jobs=-1,
                                               verbose=2,
                                               random_state=42)
rf_random_search_extended.fit(X_train_scaled_df, y_train)

# Best model found by the search (fitted on all features at this point)
rf_best_model_randomized_extended = rf_random_search_extended.best_estimator_

# Feature Importance: capture the all-features importances and rank column positions, largest first
rf_feature_importances = rf_best_model_randomized_extended.feature_importances_
sorted_idx = np.argsort(rf_feature_importances)[::-1]

# Select top features (the 10 columns with the highest importance)
X_train_important = X_train_scaled_df.iloc[:, sorted_idx[:10]]
X_test_important = X_test_scaled_df.iloc[:, sorted_idx[:10]]

# Train and evaluate using top features.
# NOTE: this refits the same estimator object that the search holds as best_estimator_, so after
# this line it expects the 10 selected columns; hyperparameters are those tuned on all features.
rf_best_model_randomized_extended.fit(X_train_important, y_train)
y_pred_rf = rf_best_model_randomized_extended.predict(X_test_important)

print("\nRandom Forest with Important Features Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest with Important Features Accuracy:", accuracy_score(y_test, y_pred_rf))


In [None]:
# Calculate evaluation metrics for Random Forest with top features
accuracy_rf_important = accuracy_score(y_test, y_pred_rf)  # Calculating accuracy
precision_rf_important = precision_score(y_test, y_pred_rf)  # Calculating precision
recall_rf_important = recall_score(y_test, y_pred_rf)  # Calculating recall
f1_rf_important = f1_score(y_test, y_pred_rf)  # Calculating F1 score

# Printing evaluation metrics
print("Random Forest Accuracy with Top Features (Tuned):", accuracy_rf_important)
print("Random Forest Precision with Top Features (Tuned):", precision_rf_important)
print("Random Forest Recall with Top Features (Tuned):", recall_rf_important)
print("Random Forest F1 Score with Top Features (Tuned):", f1_rf_important)

# Appending evaluation metrics to the results table for visualization.
# A label-keyed Series is aligned on df_final_test's row labels, so each value lands in the
# row named after its metric regardless of the order of the table's index.
df_final_test["Random Forest (Top Features, Tuned)"] = pd.Series({
    'Accuracy': accuracy_rf_important,
    'Precision': precision_rf_important,
    'Recall': recall_rf_important,
    'F1-score': f1_rf_important,
})


In [None]:
# Creating a DataFrame of Random Forest feature importances, most important first.
# rf_feature_importances and sorted_idx were captured from the all-features fit in the search cell,
# so they line up with the columns of X_train_scaled_df.
feature_names = X_train_scaled_df.columns  # pandas Index of column names
feature_importances_df = pd.DataFrame({
    'Feature': feature_names[sorted_idx],
    'Importance': rf_feature_importances[sorted_idx]
})

print("Feature Importances for Random Forest:\n", feature_importances_df)

##### **XGBoost**

**Without Feature Importance**

In [None]:
# Convert DataFrames to NumPy arrays (column names are dropped; column order is preserved)
X_train_scaled = X_train_scaled_df.values
X_test_scaled = X_test_scaled_df.values

# Define parameter grid for GridSearchCV
# (3 * 3 * 3 * 2 * 2 = 108 candidates, x 5 folds = 540 fits)
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'max_depth': [3, 6, 10],  # Maximum tree depth for base learners
    'learning_rate': [0.01, 0.1, 0.2],  # Boosting learning rate
    'subsample': [0.8, 1.0],  # Subsample ratio of the training instances
    'colsample_bytree': [0.8, 1.0]  # Subsample ratio of columns when constructing each tree
}

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)  # Fixed seed for reproducibility

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)  # Exhaustive search with 5-fold cross-validation

# Train the model using GridSearchCV
grid_search.fit(X_train_scaled, y_train)

# Display the best parameters
print(f"Best parameters: {grid_search.best_params_}")  # Best parameter combination found by the search
print(f"Best XGBoost Validation Accuracy: {grid_search.best_score_}")  # Mean cross-validated score of that combination

# Use the best model for prediction
best_xgb_model = grid_search.best_estimator_  # Select the best model found by GridSearchCV
y_pred_xgb_tuning = best_xgb_model.predict(X_test_scaled)  # Predictions on the held-out test split

# Model evaluation
accuracy = accuracy_score(y_test, y_pred_xgb_tuning)  # Calculate the accuracy
print(f"XGBoost Accuracy: {accuracy}")  # Print the accuracy
print("Classification Report:")  # Header for the report below
print(classification_report(y_test, y_pred_xgb_tuning))  # Per-class precision/recall/F1


In [None]:
# Calculate evaluation metrics for XGBoost with all features tuned
accuracy_xgb = accuracy_score(y_test, y_pred_xgb_tuning)  # Calculating accuracy
precision_xgb = precision_score(y_test, y_pred_xgb_tuning)  # Calculating precision
recall_xgb = recall_score(y_test, y_pred_xgb_tuning)  # Calculating recall
f1_xgb = f1_score(y_test, y_pred_xgb_tuning)  # Calculating F1 score

# Printing evaluation metrics
print("XGBoost Accuracy with All Features (Tuned):", accuracy_xgb)
print("XGBoost Precision with All Features (Tuned):", precision_xgb)
print("XGBoost Recall with All Features (Tuned):", recall_xgb)
print("XGBoost F1 Score with All Features (Tuned):", f1_xgb)

# Appending evaluation metrics to the results table for visualization.
# A label-keyed Series is aligned on df_final_test's row labels, so each value lands in the
# row named after its metric regardless of the order of the table's index.
df_final_test["XGBOOST with All Features (Tuned)"] = pd.Series({
    'Accuracy': accuracy_xgb,
    'Precision': precision_xgb,
    'Recall': recall_xgb,
    'F1-score': f1_xgb,
})


**Feature Importance**

In [None]:
# Train an initial, untuned model on all features purely to obtain feature importances
initial_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
initial_model.fit(X_train_scaled, y_train)

# Get feature importances from the initial model.
# X_train_scaled is the .values of X_train_scaled_df, so importances follow that column order.
feature_importances = initial_model.feature_importances_
feature_names = X_train_scaled_df.columns

# Create a DataFrame to hold feature importances, most important first
importance_xgb_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_xgb_df = importance_xgb_df.sort_values(by='Importance', ascending=False)

# Select top important features (the 10 highest-ranked column names)
xgb_model_top_features = importance_xgb_df.head(10)['Feature'].tolist()
print(f"Top Important Features: {xgb_model_top_features}")

# Define X_train_top_features and X_test_top_features using top important features
X_train_top_features = X_train_scaled_df[xgb_model_top_features]
X_test_top_features = X_test_scaled_df[xgb_model_top_features]

# Define a smaller parameter grid for GridSearchCV (2^5 = 32 candidates, x 5 folds = 160 fits).
# NOTE: param_grid, xgb_model, grid_search and best_xgb_model below rebind the names used by the
# all-features XGBoost cell; after this cell they refer to the top-features run.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Train the model using GridSearchCV
grid_search.fit(X_train_top_features, y_train)

# Display the best parameters and best (mean cross-validated) score
print(f"Best parameters (Top Features): {grid_search.best_params_}")
print(f"Best XGBoost Validation Accuracy (Top Features): {grid_search.best_score_}")

# Use the best model for prediction on the held-out test split
best_xgb_model = grid_search.best_estimator_
y_pred_xgb = best_xgb_model.predict(X_test_top_features)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy with Top Features (Tuned): {accuracy}")
print("Classification Report with Top Features (Tuned):")
print(classification_report(y_test, y_pred_xgb))


In [None]:
# Calculate performance metrics for XGBoost with top features
accuracy_xgb_top = accuracy_score(y_test, y_pred_xgb)  # Calculating accuracy
precision_xgb_top = precision_score(y_test, y_pred_xgb)  # Calculating precision
recall_xgb_top = recall_score(y_test, y_pred_xgb)  # Calculating recall
f1_xgb_top = f1_score(y_test, y_pred_xgb)  # Calculating F1 score

# Print the performance metrics
print("XGBoost Accuracy with Top Features (Tuned):", accuracy_xgb_top)
print("XGBoost Precision with Top Features (Tuned):", precision_xgb_top)
print("XGBoost Recall with Top Features (Tuned):", recall_xgb_top)
print("XGBoost F1 Score with Top Features (Tuned):", f1_xgb_top)

# Append performance metrics to df_final_test DataFrame.
# A label-keyed Series is aligned on df_final_test's row labels, so each value lands in the
# row named after its metric regardless of the order of the table's index.
df_final_test["XGBOOST (Top Features, Tuned)"] = pd.Series({
    'Accuracy': accuracy_xgb_top,
    'Precision': precision_xgb_top,
    'Recall': recall_xgb_top,
    'F1-score': f1_xgb_top,
})


In [None]:
# Rebuild the XGBoost importance table from feature_names / feature_importances (both set in the
# XGBoost feature-importance cell) and sort it, most important first.
importance_xgb_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_xgb_df = importance_xgb_df.sort_values(by='Importance', ascending=False)

# Display the DataFrame with feature importances
print("Feature Importances DataFrame:\n", importance_xgb_df)

##### **Logistic Regression**

**Without Feature Importance**

In [None]:
# Baseline: fit an untuned Logistic Regression and keep its test predictions for reference
logreg_model = LogisticRegression()

# Train the model on the training data
logreg_model.fit(X_train_scaled_df, y_train)

# Predict on the test data
y_pred_logreg = logreg_model.predict(X_test_scaled_df)

# Candidate hyperparameter values for the tuned model
param_grid_lr = {
    'penalty': ['l1', 'l2', 'elasticnet'],  # Regularization penalty
    'C': [0.01, 0.1, 1, 10],  # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers that support L1 and elasticnet regularization
    'max_iter': [100, 200, 300],  # Maximum number of iterations
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]  # Mixing parameter for elasticnet
}

# Initialize Logistic Regression model
lr_model = LogisticRegression(random_state=42)

# Build only the valid penalty/solver combinations as two sub-grids:
#   * l1 / l2 work with every listed solver and do not use l1_ratio;
#   * elasticnet is only supported by 'saga' and additionally searches l1_ratio.
# Both sub-grids span every C and max_iter value, so elasticnet is tuned over the same
# regularization strengths and iteration budgets as the other penalties.
non_elasticnet_penalties = [p for p in param_grid_lr['penalty'] if p != 'elasticnet']
elasticnet_solvers = [s for s in param_grid_lr['solver'] if s == 'saga']
valid_param_grid_lr = [
    {
        'penalty': non_elasticnet_penalties,
        'C': param_grid_lr['C'],
        'solver': param_grid_lr['solver'],
        'max_iter': param_grid_lr['max_iter'],
    },
    {
        'penalty': ['elasticnet'],
        'C': param_grid_lr['C'],
        'solver': elasticnet_solvers,
        'max_iter': param_grid_lr['max_iter'],
        'l1_ratio': param_grid_lr['l1_ratio'],
    },
]

# Initialize GridSearchCV with the valid parameter grid.
# error_score='raise' surfaces any failing fit immediately instead of scoring it as NaN.
grid_search_lr = GridSearchCV(estimator=lr_model, param_grid=valid_param_grid_lr, cv=5, n_jobs=-1, verbose=2, error_score='raise')

# Train the model using GridSearchCV
grid_search_lr.fit(X_train_scaled_df, y_train)

# Display the best parameters
print(f"Best parameters: {grid_search_lr.best_params_}")
print(f"Best Logistic Regression Validation Accuracy: {grid_search_lr.best_score_}")

# Use the best model for prediction
best_lr_model = grid_search_lr.best_estimator_
y_pred_lr_tuning = best_lr_model.predict(X_test_scaled_df)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_lr_tuning)
print(f"Logistic Regression Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred_lr_tuning))

In [None]:
# Calculate performance metrics for Logistic Regression with all features tuned
accuracy_lr_tuning = accuracy_score(y_test, y_pred_lr_tuning)  # Calculating accuracy
precision_lr_tuning = precision_score(y_test, y_pred_lr_tuning)  # Calculating precision
recall_lr_tuning = recall_score(y_test, y_pred_lr_tuning)  # Calculating recall
f1_lr_tuning = f1_score(y_test, y_pred_lr_tuning)  # Calculating F1 score

# Print the performance metrics
print("Logistic Regression Accuracy with All Features (Tuned):", accuracy_lr_tuning)
print("Logistic Regression Precision with All Features (Tuned):", precision_lr_tuning)
print("Logistic Regression Recall with All Features (Tuned):", recall_lr_tuning)
print("Logistic Regression F1 Score with All Features (Tuned):", f1_lr_tuning)

# Append performance metrics to df_final_test DataFrame.
# A label-keyed Series is aligned on df_final_test's row labels, so each value lands in the
# row named after its metric regardless of the order of the table's index.
df_final_test["Logistic Regression with All Features (Tuned)"] = pd.Series({
    'Accuracy': accuracy_lr_tuning,
    'Precision': precision_lr_tuning,
    'Recall': recall_lr_tuning,
    'F1-score': f1_lr_tuning,
})


**Feature Importance**

In [None]:
def get_top_features(X_train_scaled_df, y_train):
    """
    Rank features by the magnitude of their logistic-regression coefficient.

    A default LogisticRegression (random_state=42) is fitted on the given data
    and the absolute value of each coefficient is used as that feature's
    importance score.

    Parameters:
    X_train_scaled_df (pd.DataFrame): Training features; column names become the feature labels.
    y_train: Training target values.

    Returns:
    list: Names of the 10 highest-ranked features, most important first.
    pd.DataFrame: All features with columns 'Feature' and 'Importance',
        sorted by 'Importance' in descending order.
    """
    ranking_model = LogisticRegression(random_state=42)
    ranking_model.fit(X_train_scaled_df, y_train)

    # coef_[0] is the coefficient row of the fitted model; its sign is irrelevant for ranking
    importance_lr_df = pd.DataFrame({
        'Feature': X_train_scaled_df.columns,
        'Importance': abs(ranking_model.coef_[0]),
    }).sort_values(by='Importance', ascending=False)

    lr_model_top_features = importance_lr_df.head(10)['Feature'].tolist()
    return lr_model_top_features, importance_lr_df


# Rank the features using the scaled training split defined in the Feature Scaling section.
# Returns the 10 top feature names and the full, sorted importance table.
top_features, importance_lr_df = get_top_features(X_train_scaled_df, y_train)

# Display the DataFrame with feature importances
print("DataFrame with Feature Importances:")
print(importance_lr_df.to_string(index=False))  # Full table as plain text, without the row index


In [None]:
# Get top features using the previously defined function
# (recomputed here so this cell does not depend on the display cell above having been run)
top_features, _ = get_top_features(X_train_scaled_df, y_train)

# Select only the top features for training and testing datasets
X_train_top_features_lr = X_train_scaled_df[top_features]
X_test_top_features_lr = X_test_scaled_df[top_features]

# Define a smaller parameter grid for GridSearchCV
# (2 * 4 * 2 * 3 = 48 candidates, x 5 folds = 240 fits)
param_grid_lr_top = {
    'penalty': ['l1', 'l2'],  # Regularization penalty
    'C': [0.01, 0.1, 1, 10],  # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers that support L1 regularization
    'max_iter': [100, 200, 300]  # Maximum number of iterations
}

# Initialize Logistic Regression model.
# No penalty is set here: the grid assigns 'penalty' for every candidate, so a constructor value
# would never be used (and the string 'none' is not accepted by current scikit-learn releases).
lr_model_top = LogisticRegression(random_state=42)

# Initialize GridSearchCV with 5-fold cross-validation.
# error_score='raise' surfaces any failing fit immediately instead of scoring it as NaN.
grid_search_lr_top = GridSearchCV(estimator=lr_model_top, param_grid=param_grid_lr_top, cv=5, n_jobs=-1, verbose=2, error_score='raise')

# Train the model using GridSearchCV
grid_search_lr_top.fit(X_train_top_features_lr, y_train)

# Display the best parameters and best score
print(f"Best parameters (Top Features): {grid_search_lr_top.best_params_}")  # Best parameter combination found by the search
print(f"Best Logistic Regression Validation Accuracy (Top Features): {grid_search_lr_top.best_score_}")  # Mean cross-validated score of that combination

# Use the best model for prediction
best_lr_model_top = grid_search_lr_top.best_estimator_  # Select the best model found by GridSearchCV
y_pred_lr_top = best_lr_model_top.predict(X_test_top_features_lr)  # Predictions on the held-out test split

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_lr_top)  # Calculate the accuracy
print(f"Logistic Regression Accuracy with Top Features (Tuned): {accuracy}")  # Print the accuracy
print("Classification Report with Top Features (Tuned):")  # Print the classification report header
print(classification_report(y_test, y_pred_lr_top))  # Print the classification report


In [None]:
# Calculate performance metrics for Logistic Regression with top features tuned
accuracy_lr_top_tuning = accuracy_score(y_test, y_pred_lr_top)  # Calculating accuracy
precision_lr_top_tuning = precision_score(y_test, y_pred_lr_top)  # Calculating precision
recall_lr_top_tuning = recall_score(y_test, y_pred_lr_top)  # Calculating recall
f1_lr_top_tuning = f1_score(y_test, y_pred_lr_top)  # Calculating F1 score

# Print the performance metrics
print("Logistic Regression Accuracy with Top Features (Tuned):", accuracy_lr_top_tuning)
print("Logistic Regression Precision with Top Features (Tuned):", precision_lr_top_tuning)
print("Logistic Regression Recall with Top Features (Tuned):", recall_lr_top_tuning)
print("Logistic Regression F1 Score with Top Features (Tuned):", f1_lr_top_tuning)

# Append performance metrics to df_final_test DataFrame.
# A label-keyed Series is aligned on df_final_test's row labels, so each value lands in the
# row named after its metric regardless of the order of the table's index.
df_final_test["Logistic Regression (Top Features, Tuned)"] = pd.Series({
    'Accuracy': accuracy_lr_top_tuning,
    'Precision': precision_lr_top_tuning,
    'Recall': recall_lr_top_tuning,
    'F1-score': f1_lr_top_tuning,
})


##### **Neural Network**

**Without Feature Importance**

In [None]:
# Model factory handed to KerasClassifier; the grid search calls it once per candidate/fold
def create_model(optimizer='adam', init='uniform'):
    """
    Build and compile a small feed-forward binary classifier.

    Architecture: Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid).
    The input width is read from the notebook-level ``X_train_scaled_df`` at
    call time, so the network always expects as many inputs as that frame has columns.

    Parameters:
    optimizer: Optimizer passed to ``model.compile`` (default 'adam').
    init: Kernel initializer applied to every Dense layer (default 'uniform').

    Returns:
    The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    n_inputs = X_train_scaled_df.shape[1]
    layers = [
        Dense(64, input_shape=(n_inputs,), kernel_initializer=init, activation='relu'),  # input + first hidden layer
        Dense(32, kernel_initializer=init, activation='relu'),  # second hidden layer
        Dense(1, kernel_initializer=init, activation='sigmoid'),  # single probability output
    ]
    model = Sequential(layers)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Use every scaled feature for this run (these are plain aliases of the scaled frames)
X_train_all_features_nn = X_train_scaled_df
X_test_all_features_nn = X_test_scaled_df

# Create the KerasClassifier for GridSearchCV with all features.
# `model=` is the current scikeras keyword; the former `build_fn=` spelling is
# deprecated, and `model=` matches the top-features run later in this notebook.
model_all_features = KerasClassifier(
    model=create_model,
    optimizer='adam',
    init='uniform',
    verbose=0
)

# Define the parameter grid for GridSearchCV
param_grid = {
    'batch_size': [10, 20, 40],  # Batch size for training
    'epochs': [10, 50, 100],  # Number of epochs for training
    'optimizer': ['adam', 'rmsprop'],  # Optimizers to use
    'init': ['uniform', 'normal', 'glorot_uniform']  # Weight initialization methods
}

# Initialize GridSearchCV for all features (3-fold CV, all available cores)
grid_search_all_features = GridSearchCV(estimator=model_all_features, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Train the model using GridSearchCV with all features
grid_search_all_features.fit(X_train_all_features_nn, y_train)

# Display the best parameters and best score
print(f"Best parameters (All Features): {grid_search_all_features.best_params_}")  # Print the best parameters found by GridSearchCV
print(f"Best Neural Network Validation Accuracy (All Features): {grid_search_all_features.best_score_}")  # Print the best validation accuracy obtained by GridSearchCV

# Use the best model for prediction with all features
best_nn_model_all = grid_search_all_features.best_estimator_  # Select the best model found by GridSearchCV
# NOTE(review): scikeras documents KerasClassifier.predict as returning class labels rather than
# probabilities, so the 0.5 threshold is presumably a no-op for 0/1 labels - confirm before relying on it.
y_pred_nn_all = (best_nn_model_all.predict(X_test_all_features_nn) > 0.5).astype("int32")

# Evaluate the model with all features
accuracy_nn_all = accuracy_score(y_test, y_pred_nn_all)  # Calculate the accuracy
print(f"Neural Network Accuracy with All Features (Tuned): {accuracy_nn_all}")  # Print the accuracy
print("Classification Report with All Features (Tuned):")  # Print the classification report header
print(classification_report(y_test, y_pred_nn_all))  # Print the classification report

In [None]:
# Score the tuned all-feature Neural Network predictions on the test set.
# The scorers run in a fixed order: accuracy, precision, recall, F1.
(
    accuracy_nn_all_tuned,
    precision_nn_all_tuned,
    recall_nn_all_tuned,
    f1_nn_all_tuned,
) = [
    metric_fn(y_test, y_pred_nn_all)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

# Echo each metric on its own line
print("Neural Network Accuracy with All Features (Tuned):", accuracy_nn_all_tuned)
print("Neural Network Precision with All Features (Tuned):", precision_nn_all_tuned)
print("Neural Network Recall with All Features (Tuned):", recall_nn_all_tuned)
print("Neural Network F1 Score with All Features (Tuned):", f1_nn_all_tuned)

# Store the four scores as a new column of the comparison table
# (same order as above: accuracy, precision, recall, F1)
df_final_test["Neural Network (All Features, Tuned)"] = [
    accuracy_nn_all_tuned,
    precision_nn_all_tuned,
    recall_nn_all_tuned,
    f1_nn_all_tuned,
]


**Feature Importance**

In [None]:
# Fit a small baseline network; its first-layer weights are used below as a rough importance signal
initial_model = Sequential()
initial_model.add(Dense(64, input_shape=(X_train_scaled_df.shape[1],), activation='relu'))  # 64 ReLU units fed by every scaled feature
initial_model.add(Dense(32, activation='relu'))  # 32 ReLU hidden units
initial_model.add(Dense(1, activation='sigmoid'))  # Single sigmoid unit for the binary target

# Binary cross-entropy loss with accuracy tracking, then a short 10-epoch fit in batches of 32
initial_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
initial_model.fit(X_train_scaled_df, y_train, epochs=10, batch_size=32, verbose=1)

# Importance proxy: absolute first-layer weights summed over axis 1, giving one value per input feature
feature_importances_nn = np.abs(initial_model.layers[0].get_weights()[0]).sum(axis=1)
feature_names = X_train_scaled_df.columns  # Column labels paired with the importances below

# Pair each feature with its score and rank from most to least important
importance_nn_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances_nn}
).sort_values(by='Importance', ascending=False)

# Keep the ten highest-ranked feature names for the reduced-feature model
nn_model_top_features = importance_nn_df['Feature'].head(10).tolist()
print(f"Top Important Features: {nn_model_top_features}")

# Show the full ranking
print(importance_nn_df)

In [None]:
# Define a function to create the NN model, required for KerasClassifier
def create_model(optimizer='adam', init='uniform', input_dim=None):
    """Build and compile the 64-32-1 binary-classification network.

    Args:
        optimizer: Optimizer handed to ``model.compile``.
        init: Kernel initializer applied to every Dense layer.
        input_dim: Number of input features the first layer expects.
            It keeps its ``None`` default for signature compatibility,
            but a concrete value must be supplied.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``input_dim`` is ``None``.
    """
    # Fail early with a clear message instead of building a layer with input_shape=(None,)
    if input_dim is None:
        raise ValueError("create_model requires input_dim (number of input features); got None")

    model = Sequential()
    model.add(Dense(64, input_shape=(input_dim,), kernel_initializer=init, activation='relu'))  # Input layer with 64 neurons, ReLU activation function
    model.add(Dense(32, kernel_initializer=init, activation='relu'))  # Hidden layer with 32 neurons, ReLU activation function
    model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))  # Output layer with 1 neuron, sigmoid activation function for binary classification
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])  # Compile the model with binary crossentropy loss function and accuracy metric
    return model

# Restrict both splits to the columns selected as top features
X_train_top_features_nn = X_train_scaled_df[nn_model_top_features]
X_test_top_features_nn = X_test_scaled_df[nn_model_top_features]

# scikit-learn compatible wrapper around the Keras builder;
# input_dim is set to the number of columns kept by the filter above
model_top_features = KerasClassifier(
    model=create_model,
    input_dim=X_train_top_features_nn.shape[1],
    optimizer='adam',
    init='uniform',
    verbose=0,
)

# Hyper-parameter grid explored by the search
param_grid = dict(
    batch_size=[10, 20, 40],  # Batch size for training
    epochs=[10, 50, 100],  # Number of epochs for training
    optimizer=['adam', 'rmsprop'],  # Optimizers to use
    init=['uniform', 'normal', 'glorot_uniform'],  # Weight initialization methods
)

# 3-fold grid search over the grid above, using all available cores
grid_search_top_features = GridSearchCV(
    estimator=model_top_features,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_top_features.fit(X_train_top_features_nn, y_train)

# Report the selected configuration and its validation accuracy
print(
    f"Best parameters (Top Features): {grid_search_top_features.best_params_}",
    f"Best Neural Network Validation Accuracy (Top Features): {grid_search_top_features.best_score_}",
    sep="\n",
)

# Predict the test set with the best estimator found by the search
best_nn_model_top = grid_search_top_features.best_estimator_
# NOTE(review): scikeras documents KerasClassifier.predict as returning class labels rather than
# probabilities, so the 0.5 threshold is presumably a no-op for 0/1 labels - confirm.
y_pred_nn_top = (best_nn_model_top.predict(X_test_top_features_nn) > 0.5).astype("int32")

# Score the predictions: overall accuracy first, then the per-class report
accuracy_nn_top = accuracy_score(y_test, y_pred_nn_top)
print(f"Neural Network Accuracy with Top Features (Tuned): {accuracy_nn_top}")
print("Classification Report with Top Features (Tuned):")
print(classification_report(y_test, y_pred_nn_top))

In [None]:
# Score the tuned top-feature Neural Network predictions on the test set.
# The scorers run in a fixed order: accuracy, precision, recall, F1.
(
    accuracy_nn_top_tuned,
    precision_nn_top_tuned,
    recall_nn_top_tuned,
    f1_nn_top_tuned,
) = [
    metric_fn(y_test, y_pred_nn_top)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

# Echo each metric on its own line
print("Neural Network Accuracy with Top Features (Tuned):", accuracy_nn_top_tuned)
print("Neural Network Precision with Top Features (Tuned):", precision_nn_top_tuned)
print("Neural Network Recall with Top Features (Tuned):", recall_nn_top_tuned)
print("Neural Network F1 Score with Top Features (Tuned):", f1_nn_top_tuned)

# Store the four scores as a new column of the comparison table
# (same order as above: accuracy, precision, recall, F1)
df_final_test["Neural Network (Top Features, Tuned)"] = [
    accuracy_nn_top_tuned,
    precision_nn_top_tuned,
    recall_nn_top_tuned,
    f1_nn_top_tuned,
]


#### **6. Metric Evaluation**

In [None]:
# Confusion matrix for the predictions stored in y_pred_xgb_tuning.
# It is computed once and reused for both the text dump and the heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_xgb_tuning)
print("Confusion Matrix:")  # Print the confusion matrix header
print(conf_matrix)  # Print the confusion matrix

# Plot the confusion matrix as an annotated heatmap (rows = actual class, columns = predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Negative', 'Predicted Positive'], yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

#### **7. Performance Metrics Per Model**

In [None]:
# Create a figure with a size of 10x8 inches
plt.figure(figsize=(10, 8))

# Plot a heatmap of 'df_final_test', whose columns hold each model's
# [accuracy, precision, recall, F1] scores. Annotations are formatted to four
# decimal places, use the 'coolwarm' colormap, and have text size 8.
sns.heatmap(df_final_test, annot=True, fmt=".4f", cmap="coolwarm", annot_kws={"size": 8})

# Title the plot for what it shows: evaluation metrics per model, not a confusion matrix
plt.title("Performance Metrics Per Model", fontsize=16)

# Adjust layout to prevent clipping of the title or axis labels
plt.tight_layout()

# Display the plot
plt.show()