In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc


# Business Objective
**The primary objective is to identify the most effective classification model for predicting customer subscription to bank products based on historical campaign data. This will enable the bank to focus its marketing resources on the most promising leads, thereby maximizing the return on investment for marketing activities.**

In [None]:
# Load the raw campaign data; the file is semicolon-delimited rather than comma-delimited.
df = pd.read_csv(
    './data/bank-additional-full.csv',
    sep=';',
)

# Analyze quality of the data and clean up data

 * Two data frames are used: df and df_cleaned. df is the original data without dropping records or imputation. df_cleaned is the data frame after data clean-up.
 * The models are applied to both to check whether the data clean-up has any impact on feature engineering and modeling accuracy.

In [None]:
# --- Missing values ---
# Count nulls per column and keep only the columns that actually contain some.
missing_values = df.isnull().sum()
missing_columns = missing_values[missing_values > 0]

if missing_columns.empty:
    print("No missing values in any column.")
else:
    print("Columns with Missing Values:\n", missing_columns)

# --- Duplicate rows ---
# duplicated() flags every repeat of an earlier row (the first occurrence is not flagged).
duplicate_mask = df.duplicated()
duplicate_rows = duplicate_mask.sum()

if duplicate_rows > 0:
    print("\nNumber of Duplicate Rows: ", duplicate_rows)
    # Show the flagged rows themselves for inspection.
    duplicate_data = df[duplicate_mask]
    print("\nDuplicate Rows:\n", duplicate_data)
else:
    print("\nNo duplicate rows found.")

In [None]:
# Removing duplicate rows.
# An explicit copy makes df_cleaned an independent frame: it is modified in
# place later (imputation via .loc), and without the copy pandas may treat it
# as a derived slice of df and emit SettingWithCopyWarning.
df_cleaned = df.drop_duplicates().copy()

In [None]:
# Share of 'unknown' placeholders in each object-typed column of df.
object_columns = df.select_dtypes(include=['object']).columns

# Map column name -> percentage of its entries equal to 'unknown'.
unknown_summary = {
    column: ((df[column] == 'unknown').sum() / len(df[column])) * 100
    for column in object_columns
}

# Report one line per column, rounded to two decimals.
print("Percentage of unknown values per object column:\n")
for column in unknown_summary:
    print(f"{column}: {unknown_summary[column]:.2f}%")

In [None]:
# Understand how 'unknown' values are distributed across rows.
# Check for unknown values in all object columns
object_columns = df.select_dtypes(include=['object']).columns

# Number of 'unknown' entries per row, computed with a vectorised comparison.
# The counts are kept in a standalone Series rather than written into df:
# a diagnostic column stored on df would be carried into the feature matrix
# that is later built from df.
unknown_count = (df[object_columns] == 'unknown').sum(axis=1)

# Count rows with different numbers of unknown values
unknown_count_distribution = unknown_count.value_counts().sort_index()

# Total number of records
total_records = len(df)

# Report, for every possible count from 1 up to the number of object columns,
# how many rows contain that many 'unknown' values.
for i in range(1, len(object_columns) + 1):
    count = unknown_count_distribution.get(i, 0)
    percentage = (count / total_records) * 100
    print(f"Number of rows with {i} column(s) having 'unknown' value: {count} ({percentage:.2f}%)")


In [None]:
# Try to handle unknown values with df_cleaned data
# Impute function for mode
def impute_mode(df, column, placeholder='unknown'):
    """Replace placeholder entries in ``df[column]`` with the column's mode, in place.

    The mode is taken over the non-placeholder values only, so the placeholder
    can never be chosen as its own replacement even when it is the most
    frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the column to impute.
    placeholder : str, default 'unknown'
        Value that marks a missing entry.

    Returns
    -------
    None
        If the column holds no non-placeholder values, it is left unchanged.
    """
    is_placeholder = df[column] == placeholder
    known_modes = df.loc[~is_placeholder, column].mode()
    if known_modes.empty:
        # Nothing to impute from; leave the column as it is.
        return
    # mode() returns its values sorted, so taking the first is deterministic on ties.
    df.loc[is_placeholder, column] = known_modes.iloc[0]

# Imputation helper that relabels unknowns as a category of their own
def impute_new_category(df, column, new_category):
    """Overwrite every 'unknown' entry of ``df[column]`` with ``new_category``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the column to relabel.
    new_category : object
        Value written wherever the column currently equals 'unknown'.
    """
    unknown_rows = df[column].eq('unknown')
    df.loc[unknown_rows, column] = new_category

# Apply the chosen strategy per column of df_cleaned:
# these columns have their 'unknown' entries replaced by the column mode ...
for column in ('job', 'marital', 'education', 'housing', 'loan'):
    impute_mode(df_cleaned, column)

# ... while 'default' keeps its unknowns as a separate, explicitly named category.
impute_new_category(df_cleaned, 'default', 'unknown_default')



# Preprocessing Data for Training

In [None]:
# Convert object columns to numerical for df
# Helper: does a column hold numeric data?
def is_numerical(column):
    """Return True when ``column`` has a dtype that pandas classifies as numeric.

    Parameters
    ----------
    column : pandas.Series or array-like or dtype
        The object whose dtype is inspected.

    Returns
    -------
    bool
    """
    numeric_check = pd.api.types.is_numeric_dtype
    return numeric_check(column)

# Partition the columns of df by dtype in a single pass.
numerical_columns, categorical_columns = [], []
for col in df.columns:
    bucket = numerical_columns if is_numerical(df[col]) else categorical_columns
    bucket.append(col)

# Expand every non-numeric column into indicator columns; drop_first=True
# omits the first level of each column.
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Run the 'y_yes' column through a LabelEncoder so the target is stored as
# integer class codes.
label_encoder = LabelEncoder().fit(df['y_yes'])
df['y_yes'] = label_encoder.transform(df['y_yes'])



In [None]:
# Convert object columns to numerical for df_cleaned
# Partition the columns of df_cleaned by dtype in a single pass.
numerical_columns, categorical_columns = [], []
for col in df_cleaned.columns:
    bucket = numerical_columns if is_numerical(df_cleaned[col]) else categorical_columns
    bucket.append(col)

# Expand every non-numeric column into indicator columns; drop_first=True
# omits the first level of each column.
df_cleaned = pd.get_dummies(df_cleaned, columns=categorical_columns, drop_first=True)

# Run the 'y_yes' column through a LabelEncoder so the target is stored as
# integer class codes.
label_encoder = LabelEncoder().fit(df_cleaned['y_yes'])
df_cleaned['y_yes'] = label_encoder.transform(df_cleaned['y_yes'])


In [None]:
# Handle outliers in df_cleaned by clipping to the 1st and 99th percentiles.
numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns

# Only clip columns that are genuinely multi-valued. Binary indicator columns
# and the target 'y_yes' are excluded: clipping a rare 0/1 indicator at its
# 99th percentile turns it into a constant, and the target must never be altered.
clip_cols = [
    col for col in numeric_cols
    if col != 'y_yes' and df_cleaned[col].nunique() > 2
]

if clip_cols:
    # Determine the lower and upper thresholds for clipping (1st and 99th percentiles)
    lower_bound = df_cleaned[clip_cols].quantile(0.01)
    upper_bound = df_cleaned[clip_cols].quantile(0.99)

    # Clip the selected columns and write them back, keeping every other column
    # (indicator columns of any dtype and the target) in the frame untouched.
    clipped = df_cleaned[clip_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)
    df_cleaned = df_cleaned.copy()
    df_cleaned[clip_cols] = clipped

# NOTE(review): the percentiles are computed on the full data set before the
# train/test split; consider deriving them from the training split only.


In [None]:
# Separate the encoded target from the predictors of df.
target_column = 'y_yes'
y = df[target_column]
X = df.drop(columns=[target_column])

# Remember the predictor names so they can be looked up later.
X_feature_names = X.columns


In [None]:
# Separate the encoded target from the predictors of df_cleaned.
target_column = 'y_yes'
y_cleaned = df_cleaned[target_column]
X_cleaned = df_cleaned.drop(columns=[target_column])

# Remember the predictor names so they can be looked up later.
X_cleaned_feature_names = X_cleaned.columns


In [None]:
# Stratified 80/20 split for both the original and the cleaned data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
X_cleaned_train, X_cleaned_test, y_cleaned_train, y_cleaned_test = train_test_split(
    X_cleaned, y_cleaned, test_size=0.2, random_state=42, stratify=y_cleaned
)

# Standardise each data set with its own scaler. The scaler is fitted on the
# training split only and then merely applied to the test split, so that no
# test-set statistics influence the transformation and train/test share the
# same scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

scaler_cleaned = StandardScaler()
X_cleaned_train_scaled = scaler_cleaned.fit_transform(X_cleaned_train)
X_cleaned_test_scaled = scaler_cleaned.transform(X_cleaned_test)

In [None]:
# Fit a full PCA on the scaled training data of the original data set.
pca = PCA()
pca.fit(X_train_scaled)

# Calculate cumulative explained variance
cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()

# Smallest number of components whose cumulative explained variance reaches
# 98%, 95% and 90%. The cumulative curve is non-decreasing, so searchsorted
# yields the first 0-based index at or above the threshold; +1 converts that
# index into a component count.
components_98, components_95, components_90 = (
    int(np.searchsorted(cumulative_explained_variance, threshold)) + 1
    for threshold in (0.98, 0.95, 0.90)
)

# Plot against the 1-based component count so the x-axis matches its label
# and the vertical marker sits exactly at components_95.
component_counts = np.arange(1, len(cumulative_explained_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_explained_variance, marker='o')
plt.axhline(y=0.95, color='r', linestyle='--')
plt.axvline(x=components_95, color='g', linestyle='--')
plt.title('Cumulative Explained Variance by Number of Principal Components for data not cleaned')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid()
plt.show()
print(f'Number of components for 98% explained variance for df : {components_98}')
print(f'Number of components for 95% explained variance for df: {components_95}')
print(f'Number of components for 90% explained variance for df: {components_90}')


In [None]:
# Fit a full PCA on the scaled training data of the cleaned data set.
pca = PCA()
pca.fit(X_cleaned_train_scaled)

# Calculate cumulative explained variance
cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()

# Smallest number of components whose cumulative explained variance reaches
# 98%, 95% and 90%. The cumulative curve is non-decreasing, so searchsorted
# yields the first 0-based index at or above the threshold; +1 converts that
# index into a component count.
components_98, components_95, components_90 = (
    int(np.searchsorted(cumulative_explained_variance, threshold)) + 1
    for threshold in (0.98, 0.95, 0.90)
)

# Plot against the 1-based component count so the x-axis matches its label
# and the vertical marker sits exactly at components_95.
component_counts = np.arange(1, len(cumulative_explained_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_explained_variance, marker='o')
plt.axhline(y=0.95, color='r', linestyle='--')
plt.axvline(x=components_95, color='g', linestyle='--')
plt.title('Cumulative Explained Variance by Number of Principal Components for data after cleaned')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid()
plt.show()
print(f'Number of components for 98% explained variance for df_cleaned after preprocessing: {components_98}')
print(f'Number of components for 95% explained variance for df_cleaned after preprocessing: {components_95}')
print(f'Number of components for 90% explained variance for df_cleaned after preprocessing: {components_90}')

# Feature Engineering


In [None]:
# Initialize LassoCV with 10-fold cross-validation.
# `alphas` is left at its default so the candidate grid is generated
# automatically; passing alphas=None explicitly is deprecated in recent
# scikit-learn releases (NOTE(review): confirm against the installed version).
lasso_cv = LassoCV(cv=10, max_iter=10000)

# Fit on the scaled training split of the original data
lasso_cv.fit(X_train_scaled, y_train)

# Regularisation strength chosen by cross-validation
best_alpha = lasso_cv.alpha_

print(f"Best alpha: {best_alpha}")

In [None]:
# Refit a plain Lasso at the cross-validated alpha on the original data.
lasso = Lasso(alpha=best_alpha, max_iter=10000).fit(X_train_scaled, y_train)

# One coefficient per input feature; features shrunk to exactly zero are dropped below.
coefficients = lasso.coef_

# Visualise the coefficients by feature position, with a zero reference line.
plt.figure(figsize=(10, 6))
plt.plot(np.arange(len(coefficients)), coefficients, marker='o', linestyle='-', linewidth=2)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Lasso Coefficients for data not cleaned')
plt.xlabel('Feature Index')
plt.ylabel('Coefficient Value')
plt.grid(True)
plt.show()

# Keep the names of the features whose coefficient is non-zero.
nonzero_mask = coefficients != 0
selected_features_lasso = X_feature_names[nonzero_mask]

print("Selected features by Lasso with best alpha:")
print(selected_features_lasso)

In [None]:
# Initialize LassoCV with 10-fold cross-validation.
# `alphas` is left at its default so the candidate grid is generated
# automatically; passing alphas=None explicitly is deprecated in recent
# scikit-learn releases (NOTE(review): confirm against the installed version).
lasso_cv = LassoCV(cv=10, max_iter=10000)

# Fit on the scaled training split of the cleaned data
lasso_cv.fit(X_cleaned_train_scaled, y_cleaned_train)

# Regularisation strength chosen by cross-validation
best_alpha_cleaned = lasso_cv.alpha_

print(f"Best alpha for cleaned data: {best_alpha_cleaned}")

In [None]:
# Refit a plain Lasso at the cross-validated alpha on the cleaned data.
lasso = Lasso(alpha=best_alpha_cleaned, max_iter=10000).fit(X_cleaned_train_scaled, y_cleaned_train)

# One coefficient per input feature; features shrunk to exactly zero are dropped below.
coefficients = lasso.coef_

# Visualise the coefficients by feature position, with a zero reference line.
plt.figure(figsize=(10, 6))
plt.plot(np.arange(len(coefficients)), coefficients, marker='o', linestyle='-', linewidth=2)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Lasso Coefficients for data cleaned')
plt.xlabel('Feature Index')
plt.ylabel('Coefficient Value')
plt.grid(True)
plt.show()

# Keep the names of the features whose coefficient is non-zero.
nonzero_mask = coefficients != 0
selected_features_lasso_cleaned = X_cleaned_feature_names[nonzero_mask]

print("Selected features by Lasso with best alpha for cleaned data:")
print(selected_features_lasso_cleaned)

# Create new data set with top features from Lasso


In [None]:

# Restrict both data sets to the features that Lasso kept.
X_selected = X[selected_features_lasso]
X_selected_cleaned = X_cleaned[selected_features_lasso_cleaned]

# Stratified 80/20 split for both data sets.
X_selected_train, X_selected_test, y_selected_train, y_selected_test = train_test_split(X_selected, y, test_size=0.2, random_state=42,stratify=y)
X_selected_cleaned_train, X_selected_cleaned_test, y_selected_cleaned_train, y_selected_cleaned_test = train_test_split(
    X_selected_cleaned, y_cleaned, test_size=0.2, random_state=42, stratify=y_cleaned
)

# Standardise each data set with its own scaler. The scaler is fitted on the
# training split only and then merely applied to the test split, so that no
# test-set statistics influence the transformation and train/test share the
# same scale.
scaler = StandardScaler()
X_selected_train_scaled = scaler.fit_transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

scaler_selected_cleaned = StandardScaler()
X_selected_cleaned_train_scaled = scaler_selected_cleaned.fit_transform(X_selected_cleaned_train)
X_selected_cleaned_test_scaled = scaler_selected_cleaned.transform(X_selected_cleaned_test)

# Compute the baseline

In [None]:
# Majority-class baseline: always predict the most frequent training label and
# measure how often that is right on the test labels. Evaluated first for the
# original data, then for the cleaned data (whose values remain in
# majority_class / baseline_accuracy afterwards).
baseline_cases = (
    (
        y_selected_train,
        y_selected_test,
        "Majority class in the training set: ",
        "Baseline accuracy (majority class classifier): ",
    ),
    (
        y_selected_cleaned_train,
        y_selected_cleaned_test,
        "Majority class in the training set with selected fature after clean : ",
        "Baseline accuracy (majority class classifier) with selected fature after clean : ",
    ),
)

for train_labels, test_labels, class_message, accuracy_message in baseline_cases:
    majority_class = pd.Series(train_labels).mode()[0]
    baseline_accuracy = (test_labels == majority_class).mean()

    print(class_message, majority_class)
    print(accuracy_message, baseline_accuracy)

# Use Logistic Regression to build a basic model on your data and score the model

In [None]:


# Fit logistic regression on the scaled, Lasso-selected training features.
model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_selected_train_scaled, y_selected_train
)

# Hard class predictions for the held-out split.
y_pred = model.predict(X_selected_test_scaled)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_selected_test, y_pred)
class_report = classification_report(y_selected_test, y_pred)
cm = confusion_matrix(y_selected_test, y_pred)

print("Logistic Regression Model Accuracy: ", accuracy)
print("\nClassification Report:\n", class_report)

# Draw the confusion matrix as an annotated heatmap (rows: true, columns: predicted).
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Retrieve the coefficients of the fitted logistic regression (single row for
# a binary target).
coefficients = model.coef_[0]

# Label each bar with the actual column name instead of a positional
# "Feature i" placeholder, so the plot can be read without a lookup table.
# The scaled matrix keeps the column order of X_selected_train.
feature_names = [str(name) for name in X_selected_train.columns]

# Plot the coefficients
plt.figure(figsize=(10, 6))
plt.bar(feature_names, coefficients)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Logistic Regression Coefficients')
plt.xlabel('Feature')
plt.ylabel('Coefficient Value')
# Right-align the rotated labels so each one ends under its own bar.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Predicted probability of the positive class on the held-out test split.
# The ROC curve is evaluated on test data, like the accuracy and confusion
# matrix above; scoring the training data would give an optimistic curve.
y_prob = model.predict_proba(X_selected_test_scaled)[:, 1]
# Compute ROC curve and AUC
fpr, tpr, _ = roc_curve(y_selected_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC curve together with the chance diagonal for reference
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Fit logistic regression on the scaled, Lasso-selected training features of the cleaned data.
model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_selected_cleaned_train_scaled, y_selected_cleaned_train
)

# Hard class predictions for the held-out split.
y_pred = model.predict(X_selected_cleaned_test_scaled)

# Score the predictions against the true test labels.
class_report = classification_report(y_selected_cleaned_test, y_pred)
conf_matrix = confusion_matrix(y_selected_cleaned_test, y_pred)
accuracy = accuracy_score(y_selected_cleaned_test, y_pred)

print("cleaned data - Logistic Regression Model Accuracy: ", accuracy)
print("\nCleaned data - Confusion Matrix:\n", conf_matrix)
print("\nCleaned data -  Classification Report:\n", class_report)

# Model Comparisons with default model 

Now, we aim to compare the performance of the Logistic Regression model to our KNN algorithm, Decision Tree, and SVM models. Using the default settings for each of the models, fit and score each. Also, be sure to compare the fit time of each of the models. Present your findings in a DataFrame similar to that below:

| Model | Train Time | Train Accuracy | Test Accuracy |
| ----- | ---------- | -------------- | ------------- |

In [None]:
# Default-setting classifiers compared on the Lasso-selected features of the
# original data. The decision tree is seeded so repeated runs give the same scores.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Support Vector Machine': SVC()
}

# One record per model; the DataFrame is built once after the loop instead of
# being grown by repeated concatenation onto an empty frame.
records = []

# Train, evaluate, and record the results for each model
for model_name, model in models.items():
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()
    model.fit(X_selected_train_scaled, y_selected_train)
    train_time = time.perf_counter() - start_time

    train_accuracy = accuracy_score(y_selected_train, model.predict(X_selected_train_scaled))
    test_accuracy = accuracy_score(y_selected_test, model.predict(X_selected_test_scaled))

    records.append({
        'Model': model_name,
        'Train Time': train_time,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
    })

results = pd.DataFrame(records, columns=['Model', 'Train Time', 'Train Accuracy', 'Test Accuracy'])

# Display the results
print(results)

In [None]:
# Default-setting classifiers compared on the Lasso-selected features of the
# cleaned data. The decision tree is seeded so repeated runs give the same scores.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Support Vector Machine': SVC()
}

# One record per model; the DataFrame is built once after the loop instead of
# being grown by repeated concatenation onto an empty frame.
records = []

# Train, evaluate, and record the results for each model
for model_name, model in models.items():
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()
    model.fit(X_selected_cleaned_train_scaled, y_selected_cleaned_train)
    train_time = time.perf_counter() - start_time

    train_accuracy = accuracy_score(y_selected_cleaned_train, model.predict(X_selected_cleaned_train_scaled))
    test_accuracy = accuracy_score(y_selected_cleaned_test, model.predict(X_selected_cleaned_test_scaled))

    records.append({
        'Model': model_name,
        'Train Time': train_time,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
    })

results = pd.DataFrame(records, columns=['Model', 'Train Time', 'Train Accuracy', 'Test Accuracy'])

# Display the results
print(results)

In [None]:
# Default-setting classifiers compared on all features of the original data.
# The decision tree is seeded so repeated runs give the same scores.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Support Vector Machine': SVC()
}

# One record per model; the DataFrame is built once after the loop instead of
# being grown by repeated concatenation onto an empty frame.
records = []

# Train, evaluate, and record the results for each model
for model_name, model in models.items():
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    train_time = time.perf_counter() - start_time

    train_accuracy = accuracy_score(y_train, model.predict(X_train_scaled))
    test_accuracy = accuracy_score(y_test, model.predict(X_test_scaled))

    records.append({
        'Model': model_name,
        'Train Time': train_time,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
    })

results = pd.DataFrame(records, columns=['Model', 'Train Time', 'Train Accuracy', 'Test Accuracy'])

# Display the results
print(results)

# Using GridSearchCV to find the best hyperparameters and compare

In [None]:

# Estimators paired with the hyperparameter grid to search, evaluated on the
# Lasso-selected features of the original data. Estimators with a stochastic
# component are seeded so repeated runs give the same scores. The grid's
# max_iter overrides the value passed to the estimator constructor.
models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=42), {
        'C': [0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
        'max_iter': [5000]
    }),
    'K-Nearest Neighbors': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    }),
    'Decision Tree': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }),
    'Support Vector Machine': (SVC(), {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'max_iter': [5000]
    })
}

# One record per model; the DataFrame is built once after the loop instead of
# being grown by repeated concatenation onto an empty frame.
records = []

# Search, evaluate, and record the results for each model
for model_name, (model, param_grid) in models.items():
    # 'Train Time' covers the whole 5-fold grid search, including the final refit.
    start_time = time.perf_counter()
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_selected_train_scaled, y_selected_train)
    train_time = time.perf_counter() - start_time

    # Estimator refitted on the full training split with the best parameters found.
    best_model = grid_search.best_estimator_
    train_accuracy = accuracy_score(y_selected_train, best_model.predict(X_selected_train_scaled))
    test_accuracy = accuracy_score(y_selected_test, best_model.predict(X_selected_test_scaled))

    records.append({
        'Model': model_name,
        'Train Time': train_time,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
    })

results = pd.DataFrame(records, columns=['Model', 'Train Time', 'Train Accuracy', 'Test Accuracy'])

# Display the results
print(results)

In [None]:
# Estimators paired with the hyperparameter grid to search, evaluated on all
# features of the original data. Estimators with a stochastic component are
# seeded so repeated runs give the same scores. The grid's max_iter overrides
# the value passed to the estimator constructor.
models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=42), {
        'C': [0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
        'max_iter': [5000]
    }),
    'K-Nearest Neighbors': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    }),
    'Decision Tree': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }),
    'Support Vector Machine': (SVC(), {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'max_iter': [5000]
    })
}

# One record per model; the DataFrame is built once after the loop instead of
# being grown by repeated concatenation onto an empty frame.
records = []

# Search, evaluate, and record the results for each model
for model_name, (model, param_grid) in models.items():
    # 'Train Time' covers the whole 5-fold grid search, including the final refit.
    start_time = time.perf_counter()
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train_scaled, y_train)
    train_time = time.perf_counter() - start_time

    # Estimator refitted on the full training split with the best parameters found.
    best_model = grid_search.best_estimator_
    train_accuracy = accuracy_score(y_train, best_model.predict(X_train_scaled))
    test_accuracy = accuracy_score(y_test, best_model.predict(X_test_scaled))

    records.append({
        'Model': model_name,
        'Train Time': train_time,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
    })

results = pd.DataFrame(records, columns=['Model', 'Train Time', 'Train Accuracy', 'Test Accuracy'])

# Display the results
print(results)

In [None]:
# Estimators paired with the hyperparameter grid to search, evaluated on the
# Lasso-selected features of the cleaned data. Estimators with a stochastic
# component are seeded so repeated runs give the same scores. The grid's
# max_iter overrides the value passed to the estimator constructor.
models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=42), {
        'C': [0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
        'max_iter': [5000]
    }),
    'K-Nearest Neighbors': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    }),
    'Decision Tree': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }),
    'Support Vector Machine': (SVC(), {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'max_iter': [5000]
    })
}

# One record per model; the DataFrame is built once after the loop instead of
# being grown by repeated concatenation onto an empty frame.
records = []

# Search, evaluate, and record the results for each model
for model_name, (model, param_grid) in models.items():
    # 'Train Time' covers the whole 5-fold grid search, including the final refit.
    start_time = time.perf_counter()
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_selected_cleaned_train_scaled, y_selected_cleaned_train)
    train_time = time.perf_counter() - start_time

    # Estimator refitted on the full training split with the best parameters found.
    best_model = grid_search.best_estimator_
    train_accuracy = accuracy_score(y_selected_cleaned_train, best_model.predict(X_selected_cleaned_train_scaled))
    test_accuracy = accuracy_score(y_selected_cleaned_test, best_model.predict(X_selected_cleaned_test_scaled))

    records.append({
        'Model': model_name,
        'Train Time': train_time,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
    })

results = pd.DataFrame(records, columns=['Model', 'Train Time', 'Train Accuracy', 'Test Accuracy'])

# Display the results
print(results)