In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [8]:
# Directory containing the folders
base_dir = "../tables"

# Output file
output_file = "hyper_csv_metrics.csv"

In [4]:
# Function to calculate confidence intervals
def calculate_confidence_intervals(predictions, alpha=0.05):
    mean_pred = np.mean(predictions)
    std_pred = np.std(predictions)
    z_score = 1.96  # 95% confidence
    margin_of_error = z_score * (std_pred / np.sqrt(len(predictions)))
    lower_bound = predictions - margin_of_error
    upper_bound = predictions + margin_of_error
    return lower_bound, upper_bound

# Function to calculate prediction intervals
def calculate_prediction_intervals(actual, predictions, alpha=0.05):
    residuals = actual - predictions
    std_residual = np.std(residuals)
    z_score = 1.96  # 95% confidence
    margin_of_error = z_score * std_residual
    lower_bound = predictions - margin_of_error
    upper_bound = predictions + margin_of_error
    return lower_bound, upper_bound

# Function to calculate percent overlap
def calculate_overlap(lower1, upper1, lower2, upper2):
    overlap_count = sum((u1 >= l2) & (l1 <= u2) for l1, u1, l2, u2 in zip(lower1, upper1, lower2, upper2))
    percent_overlap = (overlap_count / len(lower1)) * 100
    return percent_overlap

In [13]:
file = '3month_predictionresults_batch_8_loss_mean_squared_error_epochs_100.csv'
folder_name = os.path.basename(root)
file_name = file
batch_size = int(folder_name.split("_")[1])
loss_type = folder_name.split("_")[3]
epochs = int(folder_name.split("_")[-1])
lookback = int(file_name.split("month")[0])

ValueError: invalid literal for int() with base 10: 'checkpoints'

In [10]:
# Initialize a list to store results
results = []

# Traverse the directory structure
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith(".csv"):
            print(file)
            folder_name = os.path.basename(root)
            file_name = file
            batch_size = int(folder_name.split("_")[1])
            loss_type = folder_name.split("_")[3]
            epochs = int(folder_name.split("_")[-1])
            lookback = int(file_name.split("month")[0])
            break

3month_predictionresults_batch_8_loss_mean_squared_error_epochs_100.csv
6month_predictionresults_batch_16_loss_mean_squared_error_epochs_50.csv
5month_predictionresults_batch_1_loss_mean_squared_error_epochs_50.csv
5month_predictionresults_batch_8_loss_mean_squared_error_epochs_50.csv
3month_predictionresults_batch_16_loss_mean_squared_error_epochs_100.csv
5month_predictionresults_batch_1_loss_mean_squared_error_epochs_100.csv
3month_predictionresults_batch_1_loss_mean_squared_error_epochs_100-checkpoint.csv


ValueError: invalid literal for int() with base 10: 'checkpoints'

In [15]:
def create_hyper_csv(input_directory, output_file):
    # Prepare the final hyper CSV structure
    results = []

    for file in os.listdir(input_directory):
        if file.endswith('.csv'):
            filepath = os.path.join(input_directory, file)
            try:
                # Parse hyperparameters from filename (adjusting to ignore 'loss')
                parts = file.split('_')
                lookback_period = int(parts[0].replace('month', ''))
                batch_size = int(parts[3].replace('batch', ''))
                epochs = int(parts[-1].replace('epochs', '').replace('.csv', ''))

                # Read the CSV
                df = pd.read_csv(filepath)
                original = df["Deaths"]
                lstm = df["LSTM Predictions"]
                sarima = df["SARIMA Predictions"]

                # Calculate metrics for LSTM and SARIMA
                lstm_rmse, lstm_mape = calculate_metrics(original, lstm)
                sarima_rmse, sarima_mape = calculate_metrics(original, sarima)

                # Confidence intervals and prediction intervals
                lower_lstm, upper_lstm = calculate_prediction_intervals(original, lstm)
                lower_sarima, upper_sarima = calculate_prediction_intervals(original, sarima)

                # Calculate percent overlap
                percent_overlap = calculate_overlap(lower_lstm, upper_lstm, lower_sarima, upper_sarima)

                # Append results
                results.append({
                    "Lookback Period": lookback_period,
                    "Batch Size": batch_size,
                    "Epochs": epochs,
                    "LSTM RMSE": lstm_rmse,
                    "LSTM MAPE": lstm_mape,
                    "SARIMA RMSE": sarima_rmse,
                    "SARIMA MAPE": sarima_mape,
                    "Percent Overlap": percent_overlap
                })
            except Exception as e:
                print(f"Error processing {file}: {e}")

    # Save results to a new CSV file
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file, index=False)
    print(f"Hyper CSV created at {output_file}")

In [21]:
input_directory = "../tables/batch_1_loss_mse_epoch_50"  # Replace with your directory
output_file = "hyper_results_1_50.csv"
create_hyper_csv(input_directory, output_file)

Hyper CSV created at hyper_results_1_50.csv


In [23]:
import os
import pandas as pd

def stack_csvs(input_directory):
    """
    Stacks all CSV files in a directory into one DataFrame.
    
    Args:
        input_directory (str): Path to the directory containing the CSV files.

    Returns:
        pd.DataFrame: Combined DataFrame containing data from all CSV files.
    """
    csv_list = []  # List to store individual DataFrames

    for file in os.listdir(input_directory):
        if file.endswith('.csv'):  # Check if the file is a CSV
            filepath = os.path.join(input_directory, file)
            try:
                # Read the CSV file and append to the list
                df = pd.read_csv(filepath)
                csv_list.append(df)
            except Exception as e:
                print(f"Error reading {file}: {e}")

    # Combine all DataFrames in the list into one
    combined_df = pd.concat(csv_list, ignore_index=True)
    return combined_df

# Example usage
input_directory = "results"  # Replace with the path to your directory
combined_dataframe = stack_csvs(input_directory)

# Save the combined DataFrame to a new CSV file (optional)
combined_dataframe.to_csv("combined_output.csv", index=False)

print("All CSVs have been stacked into one DataFrame.")

All CSVs have been stacked into one DataFrame.
