In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import seaborn as sns
from scipy.stats import f_oneway
import scipy.stats as stats

In [None]:
pip install statsmodels


In [None]:
!pip install statsmodels


In [None]:
# Read the CSV from the Cleaned data folder into the base frame and preview it
file_path = "../Data/Cleaned/income2024_cleaned.csv"
data_log_1 = pd.read_csv(filepath_or_buffer=file_path)

display(data_log_1.head(n=5))


In [None]:
# Replace the numeric delivery codes in 'label' with readable names.
# Series.map leaves NaN for any value that is not a key of the dictionary.
label_mapping = {-1: "Early Delivery", 0: "On-time Delivery", 1: "Delayed Delivery"}

readable_labels = data_log_1["label"].map(label_mapping)
data_log_1["label"] = readable_labels
data_log_1.head()

## UNIVARIATE ANALYSIS ##



In [None]:
# Build data_log_2 from data_log_1 with two columns requested for removal.
# NOTE(review): "shipping_daet" and "oder_date" look like misspellings of
# "shipping_date" / "order_date". With errors='ignore', labels that do not exist
# are skipped silently, so this call may remove nothing. Later cells read
# data_log_2['order_date'] and data_log_2['shipping_date'], so correcting the
# spelling here would break them — confirm which behavior is intended.
data_log_2 = data_log_1.drop(columns=["shipping_daet", "oder_date"], errors ='ignore')
display(data_log_2.head())

In [None]:
# Split the column labels by dtype: 64-bit numeric columns vs. object columns
numeric_frame = data_log_2.select_dtypes(include=['float64', 'int64'])
categorical_frame = data_log_2.select_dtypes(include=['object'])
num_cols, cat_cols = numeric_frame.columns, categorical_frame.columns

In [None]:
# Descriptive statistics for the numeric columns
print("\nSummary statistics for numerical data: ")
numeric_summary = data_log_2.loc[:, num_cols].describe()
display(numeric_summary)

In [None]:
# Frequency table for every categorical column
print("\nValue counts for categorical data: ")
for cat_name in cat_cols:
    frequencies = data_log_2[cat_name].value_counts()
    print(f"\n{cat_name}:\n", frequencies)

In [None]:
# One histogram (30 bins, KDE overlay) per numeric column
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data_log_1[feature], bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")
    plt.show()

In [None]:
# Boxplots for numerical features: check for outliers
for col in num_cols:
    # Open a new 10x6 figure for each column so every boxplot gets its own canvas
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=data_log_1[col])
    plt.title(f"{col}")
    plt.show()

In [None]:
# Bar chart of the ten most frequent values of each categorical column
for cat_name in cat_cols:
    top_ten = data_log_1[cat_name].value_counts().nlargest(10)

    plt.figure(figsize=(10, 5))
    top_ten.plot(kind='bar', color='skyblue')
    plt.title(f"{cat_name}")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# How does the number of orders differ by shipping mode?
temp = data_log_2['shipping_mode'].value_counts()

plt.figure(figsize=(6, 6))
ax = sns.barplot(x=temp.index, y=temp.values, palette='viridis')

# Write each bar's count just above the bar
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_top)}', (bar_mid, bar_top),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

ax.set_xlabel("shipping_mode")
ax.set_ylabel("Count of orders")
ax.set_title("Number of orders per shipping_mode")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure 'order_date' is in datetime format (values that cannot be parsed become NaT)
data_log_2['order_date'] = pd.to_datetime(data_log_2['order_date'], errors='coerce')

# Copy the dataset and extract the year from 'order_date'
temp = data_log_2.copy()
temp['year'] = temp['order_date'].dt.year

# Define categorical and numerical columns for analysis
categories = ['market', 'customer_segment', 'department_id', 'category_id']
numeric_metrics = ['order_item_quantity', 'sales', 'profit_per_order']

# Total sales for every combination of the four category columns
total_sales_by_category = temp.groupby(categories)['sales'].sum().reset_index()
print(total_sales_by_category)

# sns.barplot aggregates all rows sharing an (x, hue) pair with its default
# estimator (the mean) and draws error bars. The four-column table above has many
# rows per (market, customer_segment), so plotting it would show averages of
# subtotals, not totals. Aggregate to one row per pair so bar height = total sales.
sales_by_market_segment = (
    temp.groupby(['market', 'customer_segment'])['sales'].sum().reset_index()
)

plt.figure(figsize=(12, 6))
sns.barplot(x='market', y='sales', hue='customer_segment', data=sales_by_market_segment)
plt.title("Total Sales by Market and Customer Segment")
plt.xlabel("Market")
plt.ylabel("Total Sales")
plt.legend(title="Customer Segment")
plt.xticks(rotation=45)
plt.show()

# One row of three panels (one per metric) for each category column.
for category in categories:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # The id columns are restricted to their four largest levels per metric
    top_four_only = category in ['department_id', 'category_id']

    for idx, metric in enumerate(numeric_metrics):
        if top_four_only:
            top_values = temp.groupby(category)[metric].sum().nlargest(4).index
            filtered_temp = temp[temp[category].isin(top_values)]
            source = filtered_temp
            panel_title = f'Yearly {metric} for Top 4 {category}s'
        else:
            source = temp
            panel_title = f'Yearly {metric} per {category}'

        # Rows = year, columns = category level, values = yearly total of the metric
        grouped_data = source.groupby([category, 'year'])[metric].sum().unstack(category)
        grouped_data.plot(kind='bar', ax=axes[idx])

        axes[idx].set_title(panel_title)
        axes[idx].set_ylabel(f'Total {metric}')

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sales distribution across countries: the 20 countries with the largest total sales
temp = data_log_2.copy()

top_countries = (
    temp.groupby('order_country')['sales']
    .sum()
    .reset_index()
    .sort_values(by='sales', ascending=False)
    .head(20)
)

plt.figure(figsize=(19, 6))
ax = sns.barplot(data=top_countries, x='order_country', y='sales', palette='viridis')

# Write the (truncated) sales total above every bar
for patch in ax.patches:
    bar_top = patch.get_height()
    bar_mid = patch.get_x() + patch.get_width() / 2.
    ax.annotate(f'{int(bar_top)}', (bar_mid, bar_top),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Break multi-word country names onto separate lines so the tick labels fit
wrapped_countries = [name.replace(' ', '\n') for name in top_countries['order_country'].unique()]
ax.set_xticklabels(wrapped_countries)

plt.xlabel("Country")
plt.ylabel("Total Sales")
plt.title("Total Sales by Country")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Total sales per department, largest first
department_sales = (
    data_log_2.groupby('department_name')['sales']
    .sum()
    .reset_index()
    .sort_values(by='sales', ascending=False)
)

plt.figure(figsize=(18.5, 6))
ax = sns.barplot(data=department_sales, x='department_name', y='sales', palette='viridis')

# Write the (truncated) sales total above every bar
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_top)}', (bar_mid, bar_top),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Break multi-word department names onto separate lines so the tick labels fit
wrapped_departments = [name.replace(' ', '\n') for name in department_sales['department_name']]
ax.set_xticklabels(wrapped_departments)

plt.xlabel("Department")
plt.ylabel("Total Sales")
plt.title("Total Sales by Department")

plt.show()


In [None]:
# Columns carried into the store-level subset, in display order
store_columns = [
    "department_name",
    "latitude",
    "longitude",
    "sales",
    "profit_per_order",
    "category_name",
    "order_item_discount",
    "order_item_discount_rate",
    "order_item_product_price",
    "order_item_quantity",
    "order_item_total_amount",
    "order_profit_per_order",
    "order_city",
    "order_country",
    "order_status",
    "product_name",
    "product_price",
    "shipping_mode",
    "label",
]

store_data = data_log_2[store_columns]

                    

In [None]:
#Visualizing store locations according to number of purchases 


## BIVARIATE ANALYSIS ##

In [None]:
# Pairs of categorical variables to test for independence
cat_var = [('customer_segment', 'order_status'), ('payment_type', 'order_status')]

for var1, var2 in cat_var:
    # Contingency table of the pair, then Pearson's chi-square test of independence
    crosstab = pd.crosstab(data_log_2[var1], data_log_2[var2])
    chi2, p, dof, expected = chi2_contingency(crosstab)

    # Report inside the loop so the result of every pair is printed, not only the last
    print(f"\nChi-Square test for {var1} vs {var2}")
    print(f"Chi-square statistics: {chi2}, p-value: {p}")


* This test indicates a strong dependency between payment_type and order_status, which suggests that order status is strongly affected by the payment type.

In [None]:
# Distribution of sales_per_customer within each customer segment
plt.figure(figsize=(10,2))
sns.boxplot(x=data_log_2["customer_segment"], y=data_log_2["sales_per_customer"])
plt.title("Sales per Customer by Segment")
plt.xticks(rotation=45)
plt.show()

# One-way ANOVA: one sample of sales_per_customer (NaNs removed) per segment value
segments = data_log_2["customer_segment"].unique()
groups = []
for seg in segments:
    in_segment = data_log_2["customer_segment"] == seg
    groups.append(data_log_2[in_segment]["sales_per_customer"].dropna())
anova_result = f_oneway(*groups)

f_stat, p_val = anova_result.statistic, anova_result.pvalue
print("ANOVA Test for Customer Segment vs Sales per Customer")
print(f"F-Statistic: {f_stat}, P-Value: {p_val}")


* There is no statistically significant difference in sales_per_customer across the different customer segments.
* The differences in sales across customer segments are the result of random variation.
* This suggests that customer segment does not strongly influence sales per customer in this dataset.

In [None]:
# Product price against ordered quantity: scatter plot, then Pearson correlation
price = data_log_2["order_item_product_price"]
quantity = data_log_2["order_item_quantity"]

plt.figure(figsize=(8,5))
sns.scatterplot(x=price, y=quantity)
plt.title("Product Price vs Order Quantity")
plt.show()

corr, p_value = stats.pearsonr(price, quantity)
print(f"Pearson Correlation: {corr}, P-Value: {p_value}")


In [None]:
# Test the delivery label against shipping mode and order status, one pair at a time
cat_vars = [('label', 'shipping_mode'), ('label', 'order_status')]

for row_var, col_var in cat_vars:
    crosstab = pd.crosstab(data_log_2[row_var], data_log_2[col_var])
    chi2, p, *_ = chi2_contingency(crosstab)

    print(f"\nChi-Square Test for {row_var} vs {col_var}")
    print(f"Chi-Square Statistic: {chi2}, P-Value: {p}")

    # Annotated heatmap of the observed counts
    plt.figure(figsize=(8,5))
    heat_ax = sns.heatmap(crosstab, annot=True, cmap="Blues", fmt="d")
    heat_ax.set_title(f"Heatmap of {row_var} vs {col_var}")
    plt.show()


In [None]:
import scipy.stats as stats

# Order total against discount, coloured by delivery label, then Pearson correlation
total_amount = data_log_2["order_item_total_amount"]
discount = data_log_2["order_item_discount"]

plt.figure(figsize=(8,5))
sns.scatterplot(x=total_amount, y=discount, hue=data_log_2["label"])
plt.title("Total Order Amount vs Discount by Delivery Status")
plt.show()

corr, p_value = stats.pearsonr(total_amount, discount)
print(f"Pearson Correlation: {corr}, P-Value: {p_value}")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway, chi2_contingency
import statsmodels.api as sm
import statsmodels.formula.api as smf

# df is a second name for data_log_2 (no copy is made), so the casts below
# also change data_log_2.
df = data_log_2

# Cast the label, shipping mode and coordinate columns to str
for text_col in ("label", "shipping_mode", "latitude", "longitude"):
    df[text_col] = df[text_col].astype(str)


In [None]:
# List the column labels currently present on df
print(df.columns)


In [None]:
# Whole days between the order date and the shipping date of each row
shipped_on = pd.to_datetime(df["shipping_date_only"])
ordered_on = pd.to_datetime(df["order_date_only"])
df["order_processing_time"] = (shipped_on - ordered_on).dt.days


In [None]:
# Compute order_processing_time only when the column is not already on df
if "order_processing_time" not in df.columns:
    shipped_on = pd.to_datetime(df["shipping_date_only"])
    ordered_on = pd.to_datetime(df["order_date_only"])
    df["order_processing_time"] = (shipped_on - ordered_on).dt.days

# Processing time distribution for each delivery label
plt.figure(figsize=(8,5))
box_ax = sns.boxplot(x=df["label"], y=df["order_processing_time"])
box_ax.set_title("Order Processing Time vs Shipping Delay Status")
box_ax.set_xlabel("Delivery Status")
box_ax.set_ylabel("Order Processing Time (days)")
plt.show()

# One-way ANOVA: one sample of processing times (NaNs removed) per label value
groups = []
for lbl in df["label"].unique():
    groups.append(df[df["label"] == lbl]["order_processing_time"].dropna())
anova_result = f_oneway(*groups)

print("ANOVA Test Results for Order Processing Time vs Shipping Delays")
print(f"F-Statistic: {anova_result.statistic:.4f}, P-Value: {anova_result.pvalue:.4f}")


In [None]:
# Observed counts of delivery label (rows) by shipping mode (columns)
crosstab = pd.crosstab(index=df["label"], columns=df["shipping_mode"])

# Chi-square test of independence on the contingency table
chi2, p, dof, expected = chi2_contingency(crosstab)

print("Chi-Square Test for Shipping Method vs Shipping Delay")
print(f"Chi-Square Statistic: {chi2:.4f}, P-Value: {p:.4f}")

# Annotated heatmap of the observed counts
plt.figure(figsize=(8,5))
heat_ax = sns.heatmap(crosstab, annot=True, cmap="Blues", fmt="d")
heat_ax.set_title("Shipping Method vs Delivery Status")
heat_ax.set_xlabel("Shipping Method")
heat_ax.set_ylabel("Delivery Status")
plt.show()


In [None]:
# Parse 'order_date' as datetimes; values that cannot be parsed become NaT
data_log_2['order_date'] = pd.to_datetime(data_log_2['order_date'], errors='coerce')

# Work on a copy that carries the order year as an extra column
temp = data_log_2.copy()
temp['year'] = temp['order_date'].dt.year

# Category columns to break down by, and the metrics summed per year
categories = ['market', 'customer_segment', 'department_id', 'category_id']
numeric_metrics = ['order_item_quantity', 'sales', 'profit_per_order']

# One row of three panels (one per metric) for each category column
for category in categories:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # The id columns are restricted to their four largest levels per metric
    top_four_only = category in ['department_id', 'category_id']

    for idx, metric in enumerate(numeric_metrics):
        if top_four_only:
            top_values = temp.groupby(category)[metric].sum().nlargest(4).index
            filtered_temp = temp[temp[category].isin(top_values)]
            source = filtered_temp
            panel_title = f'Yearly {metric} for Top 4 {category}s'
        else:
            source = temp
            panel_title = f'Yearly {metric} per {category}'

        # Rows = year, columns = category level, values = yearly total of the metric
        grouped_data = source.groupby([category, 'year'])[metric].sum().unstack(category)
        grouped_data.plot(kind='bar', ax=axes[idx])

        axes[idx].set_title(panel_title)
        axes[idx].set_ylabel(f'Total {metric}')

    plt.tight_layout()
    plt.show()


## Trend Analysis ##

In [None]:
# Two panels: share of orders per delivery label (left) and
# order counts per order status split by label (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: pie of order counts per delivery label, largest slice first
delayed_counts = data_log_2.groupby("label")["sales"].count().sort_values(ascending=False)
ax[0].pie(
    delayed_counts,
    labels=delayed_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette('Blues'),
)
ax[0].set_title("Percentage of Delayed Orders")

# Right panel: horizontal bars of order counts per (order_status, label)
delayed_counts_df = data_log_2.groupby(["order_status","label"])[["sales"]].count().reset_index()
status_order = ["COMPLETE", "PENDING_PAYMENT", "PROCESSING", "PENDING", "CLOSED", "ON_HOLD", "PAYMENT_REVIEW"]
status_bars = sns.barplot(
    data=delayed_counts_df,
    y="order_status",
    x="sales",
    hue="label",
    palette="tab20",
    ax=ax[1],
    order=status_order,
    width=0.6,
)
status_bars.set(ylabel="", title="Delay According to Order Status")

# Smaller font for the status names on the y-axis
ax[1].tick_params(axis="y", labelsize=6)

# Space the panels apart, write the figure to disk, then show it
plt.subplots_adjust(wspace=0.3)
plt.savefig("percentage_of_delayed_orders.png")
plt.show()


In [None]:
# Number of non-null 'sales' entries for each (order_status, label) combination
data_log_2.groupby(["order_status", "label"])[["sales"]].count()

In [None]:
# Count the delayed orders directly from the label column instead of multiplying
# the row count by a hard-coded share, so the figure stays correct when the data changes.
count_delayed_orders = int((data_log_2["label"] == "Delayed Delivery").sum())

count_delayed_orders

In [None]:
# Drop rows with missing order_date or shipping_date. .copy() makes the result an
# independent frame, so the column assignments below never write into a derived slice
# (avoids pandas' SettingWithCopyWarning).
data_log_2 = data_log_2.dropna(subset=["order_date", "shipping_date"]).copy()

# Parse both date columns as timezone-aware (UTC) datetimes; unparseable values become NaT
for date_col in ("order_date", "shipping_date"):
    data_log_2[date_col] = pd.to_datetime(data_log_2[date_col], errors="coerce", utc=True)

# Parsing can introduce new NaT values, so filter again before subtracting the dates
clean_data = data_log_2.dropna(subset=["order_date", "shipping_date"]).copy()

# Delay = whole days between the order date and the shipping date
clean_data["delay_days"] = (clean_data["shipping_date"] - clean_data["order_date"]).dt.days

# Keep orders labelled as delayed that took more than 7 days and are not
# waiting on payment or otherwise pending
delayed_orders = clean_data[
    (clean_data["delay_days"] > 7) &
    (clean_data["label"] == "Delayed Delivery") &
    (clean_data["order_status"] != "PENDING_PAYMENT") &
    (clean_data["order_status"] != "PENDING")
]

sample = delayed_orders[["delay_days", "product_name", "order_status", "order_country", "label"]]

count_treatable_delays = sample.shape[0]

display(count_treatable_delays)

# Share of such orders among the rows of data_log_2 that have both dates present
percent_treatable_delays = (sample.shape[0] / data_log_2.shape[0]) * 100

display(percent_treatable_delays)

sample

In [None]:
# Median delay (in days) of the delayed orders per destination country, longest first
delay_by_country = (
    delayed_orders.groupby('order_country')['delay_days']
    .median()
    .sort_values(ascending=False)
)

display(delay_by_country)

In [None]:
# Ten countries with the largest median shipping days among delayed products.
# delay_by_country holds per-country medians, so the labels say "Median", not "Average".
fig, ax = plt.subplots(figsize=(10, 6))

top_delays = delay_by_country[:10]

sns.barplot(
    y=top_delays.index,
    x=top_delays.values,
    palette="Blues_r",
    ax=ax
).set(
    ylabel="Order Country",
    xlabel="Median Number of Shipping Days",
    title="Top 10 Countries With Longest Median Shipping (Delayed Products)"
)

plt.subplots_adjust(wspace=0.5)
plt.show()


## Modelling Delays

In [None]:
from sklearn.datasets import  fetch_california_housing
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Base plotting library
import seaborn as sns # Main plotting library
import geopandas as gpd
import folium # Mapping library
from IPython.display import display, IFrame # To display maps as cell outputs
from folium.plugins import MarkerCluster
from sklearn.model_selection import KFold # For dividing the dataset
from sklearn.preprocessing import StandardScaler # For scaling features
from sklearn.preprocessing import OneHotEncoder # For hot encoding data
from sklearn.preprocessing import LabelEncoder # For simpler data encoding
from sklearn.utils.class_weight import compute_class_weight # For dealing with different sizes of classification categories
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # For cross-validation evaluation



In [None]:
# Preview the first rows of data_log_2 before modelling
data_log_2.head()

In [None]:
# Column labels available on the delayed_orders subset
delayed_orders.columns

In [None]:
# Keep only the modelling columns, then drop the ROWS that have a missing value in
# any of them. (Dropping with axis=1 would remove whole columns instead, and the
# column selection would then raise KeyError if a selected column had a NaN.)
model_columns = ["order_country", "order_status", "shipping_mode", "payment_type", "label", "department_name"]
data = clean_data[model_columns].dropna(axis=0, how="any")

data.head()

In [None]:
# Target is 'label'; every other column of `data` is a feature
y = data['label']
X = data.drop(columns=['label'])

# One-hot encode the object columns (first level of each dropped); keep the rest as-is
onehot = OneHotEncoder(sparse_output=False, drop='first')
categorical_part = X.select_dtypes(include=['object'])
X_numeric = X.select_dtypes(exclude=['object'])
X_encoded = onehot.fit_transform(categorical_part)

# Feature matrix: non-object columns followed by the one-hot columns
X = np.hstack((X_numeric, X_encoded))

# Encode the target labels as integers
le_label = LabelEncoder()
y = le_label.fit_transform(y)

# 'balanced' class weights, stored as {class index: weight} for the model's class_weight argument
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(enumerate(class_weights))
display(class_weights)

# Standardise every feature column
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Shuffled 5-fold splitter with a fixed seed for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Keras names used by the network below. They are imported here because no other
# cell of this notebook brings tf / Sequential / Dense / Dropout into scope, so the
# cell would otherwise fail with NameError on a fresh kernel.
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Lists to store metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Loop over each fold: train a fresh network on the training part and score it
# on the held-out part
for train_index, val_index in kf.split(X_scaled):
    # Split the data into training and validation sets for this fold
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Five dense layers of decreasing width with LeakyReLU activations, each followed
    # by 30% dropout, ending in a 3-way softmax
    model = Sequential([
        Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.01), input_shape=(X_train.shape[1],)),
        Dropout(0.3),
        Dense(384, activation=tf.keras.layers.LeakyReLU(alpha=0.01)),
        Dropout(0.3),
        Dense(256, activation=tf.keras.layers.LeakyReLU(alpha=0.01)),
        Dropout(0.3),
        Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.01)),
        Dropout(0.3),
        Dense(64, activation=tf.keras.layers.LeakyReLU(alpha=0.01)),
        Dropout(0.3),
        Dense(3, activation='softmax')  # Multiclass Classification
    ])

    # Integer-encoded targets -> sparse categorical cross-entropy
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # class_weight applies the per-class weights computed from y during training
    history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_val, y_val), verbose=0, class_weight=class_weights)

    # Predicted class = index of the largest softmax probability
    y_pred_prob = model.predict(X_val)
    y_pred = np.argmax(y_pred_prob, axis=1)

    # Support-weighted metrics for this fold
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average="weighted")
    recall = recall_score(y_val, y_pred, average="weighted")
    f1 = f1_score(y_val, y_pred, average="weighted")

    # Store the metrics for this fold
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)


# Calculate average metrics across all folds
avg_accuracy = np.mean(accuracy_scores)
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)

# Display the results
print(f"Average Accuracy: {avg_accuracy * 100:.2f}%")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")

In [None]:
# Start with empty per-fold metric lists for the tuned model
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []


# Define a function that builds the model with hyperparameters
def create_model(hp):
    """Build and compile a dense softmax classifier from a hyperparameter search space.

    Parameters
    ----------
    hp
        Hyperparameter object passed in by the tuner; only its ``Int`` and
        ``Float`` methods are used here.

    Returns
    -------
    A compiled ``tf.keras.Sequential`` model with a 3-unit softmax output, using
    the Adam optimizer, sparse categorical cross-entropy loss and an accuracy metric.
    """
    model = tf.keras.Sequential()

    # Take the feature count from the full scaled matrix instead of X_train, which
    # only exists once a cross-validation loop has run. Every fold is a row slice
    # of X_scaled, so the number of columns is the same.
    n_features = X_scaled.shape[1]

    # Tune the number of units in the first Dense layer
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(Dense(units=hp_units, activation='relu', input_shape=(n_features,)))

    # Tune dropout rate
    hp_dropout = hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)
    model.add(Dropout(rate=hp_dropout))

    # Tune number of additional hidden layers (between 1 and 8), each with its own
    # width and dropout rate
    for i in range(hp.Int('num_layers', 1, 8)):
        model.add(Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32), activation='relu'))
        model.add(Dropout(rate=hp.Float(f'dropout_{i}', min_value=0.2, max_value=0.5, step=0.1)))

    # Output layer
    model.add(Dense(3, activation='softmax'))  # multiclass classification

    # Tune the learning rate on a log scale
    hp_learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model


# Define the tuner
# NOTE(review): RandomSearch is not imported in any cell visible in this notebook —
# presumably keras_tuner's RandomSearch; confirm and add the import.
tuner = RandomSearch(
    create_model,
    objective='val_accuracy',  # Optimize for validation accuracy
    max_trials=5,  # Number of different models to try
    executions_per_trial=3,  # Number of times to train each model for robustness
    directory='my_dir',  # Directory to save logs and results
    project_name='tuning_delay_prediction'
)


# Loop over each fold
# NOTE(review): the same tuner object (and the same directory/project) is searched
# once per fold. Confirm whether folds after the first actually run new trials or
# just reuse the results found on the first fold's validation split.
for train_index, val_index in kf.split(X_scaled):
    # Split the data into training and validation sets for this fold
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    # Perform the hyper-parameter search
    # NOTE(review): no class_weight is passed here, while the final fit below uses
    # class_weight=class_weights — confirm this difference is intended.
    tuner.search(X_train, y_train, epochs=20, validation_data=(X_val, y_val))

    # Retrieve the best model and hyperparameters
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    
    # Print the best hyper-parameters
    print(f"The optimal number of units in the first hidden layer is {best_hps.get('units')}")
    print(f"The optimal learning rate is {best_hps.get('learning_rate')}")
    
    # Build and train the model using the best hyper-parameters
    model = tuner.hypermodel.build(best_hps)
    model.fit(X_train, y_train, epochs=20, validation_data=(X_val, y_val),  verbose=0, class_weight=class_weights)
    
#     model = create_model()
#     history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_val, y_val), verbose=0, class_weight=class_weights)
    
    # Predict on the validation set: predicted class = index of the largest probability
    y_pred_prob = model.predict(X_val)
    y_pred = np.argmax(y_pred_prob, axis=1)


    # Calculate metrics (support-weighted averages across classes)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average="weighted")
    recall = recall_score(y_val, y_pred, average="weighted")
    f1 = f1_score(y_val, y_pred, average="weighted")
    
    # Store the metrics for this fold
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate average metrics across all folds
avg_accuracy = np.mean(accuracy_scores)
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)

# Display the results
print(f"Average Accuracy: {avg_accuracy * 100:.2f}%")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1:.4f}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

# One seed shared by the split and by every estimator, so rerunning the cell
# reports the same scores (the estimators were previously unseeded).
SEED = 1

xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=SEED)

# The three classifiers to compare; names kept so they can be inspected afterwards
model1 = DecisionTreeClassifier(ccp_alpha=0.001, class_weight='balanced', random_state=SEED)
model2 = RandomForestClassifier(n_estimators=100, ccp_alpha=0.01, class_weight='balanced', random_state=SEED)
model3 = AdaBoostClassifier(n_estimators=50, algorithm='SAMME', random_state=SEED)

# Fit each model, then print train/test accuracy and a classification report for both splits
for short_name, clf in (("Decision Tree", model1), ("RandomForest", model2), ("AdaBoost", model3)):
    clf.fit(xtrain, ytrain)

    print(f"Train Accuracy, {short_name}", clf.score(xtrain, ytrain))
    print(f"Test Accuracy, {short_name}", clf.score(xtest, ytest))
    print(type(clf).__name__)
    print(classification_report(ytrain, clf.predict(xtrain)))
    print(classification_report(ytest, clf.predict(xtest)))