In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense
from keras.optimizers import Adam
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from keras.callbacks import EarlyStopping

# Load and preprocess data
store_data = pd.read_feather('store_data.feather')
store_data.to_csv("store_data.csv", index=False)

# Remove rows from 2021 (COVID time)
store_data = store_data[store_data['date'] >= '2023-01-01']

# Ensure 'date' is in datetime format
store_data['date'] = pd.to_datetime(store_data['date'])

# One-hot encoding for region
store_data = pd.get_dummies(data=store_data, columns=["region"], drop_first=True)

# Convert boolean columns to binary
def convert_boolean_to_binary(dummy):
    for column in dummy.select_dtypes(include='bool').columns:
        dummy[column] = dummy[column].astype(int)
    return dummy

store_data = convert_boolean_to_binary(store_data)

# Add season feature
store_data = store_data.sort_values(by='date')
store_data['year'] = store_data['date'].dt.year
store_data['month'] = store_data['date'].dt.month
store_data['week'] = store_data['date'].dt.isocalendar().week

def get_season_from_month(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

store_data['season'] = store_data['month'].apply(get_season_from_month)

# Log transform the target variable to stabilize variance
store_data['turnover'] = np.log1p(store_data['turnover'])

# Remove rows with any NaN values in the dataframe
store_data.dropna(inplace=True)

# Feature selection
features = ['latitude', 'store_area', 'competitor_count', 'footfall', 
            'avg_temperature', 'precipitation_mm', 'wind_direction_degrees', 
            'peak_wind_gust_kmh', 'region_Bayern', 'region_Berlin', 
            'region_Brandenburg', 'region_Bremen', 'region_Hamburg', 'region_Hessen', 
            'region_Mecklenburg-vorpommern', 'region_Niedersachsen', 'region_Nordrhein-westfalen', 
            'region_Rheinland-pfalz', 'region_Saarland', 'region_Sachsen', 'region_Sachsen-anhalt', 
            'region_Schleswig-holstein', 'region_Thüringen', 'season', 'year', 'month', 'week']
target = 'turnover'

x = store_data[features]
y = store_data[target]

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Normalize and one-hot encode
numerical_features = ['latitude', 'store_area', 'competitor_count', 'footfall', 'avg_temperature', 
                      'precipitation_mm', 'wind_direction_degrees', 'peak_wind_gust_kmh', 
                      'year', 'month', 'week']
categorical_features = ['region_Bayern', 'region_Berlin', 'region_Brandenburg', 'region_Bremen', 
                        'region_Hamburg', 'region_Hessen', 'region_Mecklenburg-vorpommern', 'region_Niedersachsen', 
                        'region_Nordrhein-westfalen', 'region_Rheinland-pfalz', 'region_Saarland', 'region_Sachsen', 
                        'region_Sachsen-anhalt', 'region_Schleswig-holstein', 'region_Thüringen', 'season']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    sparse_threshold=0
)

x_train_processed = preprocessor.fit_transform(x_train)
x_test_processed = preprocessor.transform(x_test)

# Reshape for LSTM/BiLSTM
def reshape_for_lstm(x_data, timesteps=10):
    x_reshaped = []
    for i in range(len(x_data) - timesteps):
        x_reshaped.append(x_data[i:i+timesteps])
    return np.array(x_reshaped)

timesteps = 10
x_train_lstm = reshape_for_lstm(x_train_processed, timesteps)
x_test_lstm = reshape_for_lstm(x_test_processed, timesteps)
y_train_lstm = y_train.iloc[timesteps:].values
y_test_lstm = y_test.iloc[timesteps:].values

# Build and train BiLSTM model with EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

bilstm_model = Sequential([
    Bidirectional(LSTM(50, return_sequences=False), input_shape=(x_train_lstm.shape[1], x_train_lstm.shape[2])),
    Dense(1)
])

bilstm_model.compile(optimizer=Adam(), loss='mse')
bilstm_model.fit(x_train_lstm, y_train_lstm, epochs=100, batch_size=32, 
                 validation_data=(x_test_lstm, y_test_lstm), callbacks=[early_stopping], verbose=1)  # Keep verbose output

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(x_train_processed, y_train)

# Generate future predictions for all stores (next 14 days)
future_predictions = []

for store_id in store_data['store_no'].unique():
    latest_data = store_data[store_data['store_no'] == store_id].iloc[-1:]
    latest_data = latest_data.drop(['date', 'turnover'], axis=1)
    latest_data_processed = preprocessor.transform(latest_data[features])

    if latest_data_processed.shape[0] < timesteps:
        latest_data_processed = np.tile(latest_data_processed, (timesteps, 1))
        latest_data_processed = latest_data_processed[-timesteps:]

    latest_data_lstm = np.reshape(latest_data_processed, (1, timesteps, latest_data_processed.shape[1]))
    bilstm_future_preds = bilstm_model.predict(latest_data_lstm, verbose=0)  # Suppress verbose output for prediction
    rf_future_preds = rf_model.predict(latest_data_processed)
    ensemble_future_preds = (bilstm_future_preds.flatten() + rf_future_preds) / 2
    future_turnover_ensemble = np.expm1(ensemble_future_preds.flatten())
    future_turnover_ensemble = np.tile(future_turnover_ensemble, 14)

    future_dates = pd.date_range(start=store_data['date'].max() + pd.Timedelta(days=1), periods=14, freq='D')
    future_df = pd.DataFrame({
        'store_no': [store_id] * 14,
        'date': future_dates,
        'Predicted Turnover (Ensemble)': future_turnover_ensemble[:14]
    })

    future_predictions.append(future_df)

# Combine all store predictions into one DataFrame
future_predictions_df = pd.concat(future_predictions, ignore_index=True)

# Ensure columns are separate and properly formatted
future_predictions_df = future_predictions_df[['date', 'store_no', 'Predicted Turnover (Ensemble)']]

# Rename columns to the desired format
future_predictions_df.rename(columns={'Predicted Turnover (Ensemble)': 'turnover'}, inplace=True)

# Save to Excel with proper column separation
future_predictions_df.to_excel('future_turnover_predictions_all_stores_BILSTM_AND_RANDOM_FOREST.xlsx', index=False)

# Print the first few rows of the predictions for inspection
print(future_predictions_df.head())

#---------------------------------------------------------------

# Adjust indices for Random Forest predictions
rf_predictions_test = rf_model.predict(x_test_processed[timesteps:])  # Use aligned data for RF predictions

# BiLSTM predictions
bilstm_predictions_test = bilstm_model.predict(x_test_lstm, verbose=0).flatten()

# Combine predictions from BiLSTM and Random Forest
ensemble_predictions_test = (bilstm_predictions_test + rf_predictions_test) / 2

# Reverse the log1p transformation for predictions and actuals
ensemble_predictions_test_exp = np.expm1(ensemble_predictions_test)
y_test_lstm_exp = np.expm1(y_test_lstm)  # Ensure y_test_lstm is used as it matches the BiLSTM alignment

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test_lstm_exp, ensemble_predictions_test_exp))

# Calculate turnover mean
turnover_mean = np.mean(y_test_lstm_exp)

# Calculate error percentage
error_percentage = (rmse / turnover_mean) * 100

# Print results
print(f"RMSE: {rmse:.2f}")
print(f"Mean Turnover: {turnover_mean:.2f}")
print(f"Error Percentage: {error_percentage:.2f}%")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
        date  store_no   turnover
0 2023-12-31      2607  89.610743
1 2024-01-01      2607  89.610743
2 2024-01-02      2607  89.610743
3 2024-01-03      2607  89.610743
4 2024-01-04      2607  89.610743
RMSE: 38.97
Mean Turnover: 132.74
Error Percentage: 29.36%
