In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import prophet
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU, RepeatVector, TimeDistributed
from keras.callbacks import EarlyStopping

In [None]:
# Read the daily receipts CSV and normalise the '# Date' header to 'Date'
data = pd.read_csv('data_daily.csv').rename(columns={'# Date': 'Date'})

In [None]:
# Sanity check: prints True if any cell of the frame is missing, False otherwise
print(data.isna().to_numpy().any())

False


In [None]:
# Fit a RobustScaler on the receipt counts, passed as a single-column (n, 1) array;
# the fitted scaler is kept so forecasts can be mapped back to receipt counts
scaler = RobustScaler()
scaled_data = scaler.fit_transform(data[['Receipt_Count']].to_numpy())

# Convert the series into supervised (input window, target window) pairs
def create_dataset(dataset, look_back=90, forecast_horizon=30):
    """Slice a series into sliding input/target windows.

    Sample ``i`` uses rows ``i .. i + look_back - 1`` of column 0 as input and
    the following ``forecast_horizon`` rows of column 0 as target.

    Parameters
    ----------
    dataset : 2-D array-like
        Time-ordered observations; only column 0 is read.
    look_back : int, default 90
        Number of past steps in each input window.
    forecast_horizon : int, default 30
        Number of future steps in each target window.

    Returns
    -------
    (X, Y) : tuple of np.ndarray
        ``X`` has shape ``(n_samples, look_back)`` and ``Y`` has shape
        ``(n_samples, forecast_horizon)``. When the series is too short to
        form a single pair, ``n_samples`` is 0 but both arrays stay 2-D so
        downstream reshapes keep working.

    Raises
    ------
    ValueError
        If ``look_back`` or ``forecast_horizon`` is smaller than 1.
    """
    if look_back < 1 or forecast_horizon < 1:
        raise ValueError("look_back and forecast_horizon must both be >= 1")

    series = np.asarray(dataset)[:, 0]
    # Clamp at zero so a too-short series yields empty, correctly shaped arrays
    n_samples = max(len(series) - look_back - forecast_horizon + 1, 0)

    X = np.empty((n_samples, look_back), dtype=series.dtype)
    Y = np.empty((n_samples, forecast_horizon), dtype=series.dtype)
    for start in range(n_samples):
        split = start + look_back
        X[start] = series[start:split]
        Y[start] = series[split:split + forecast_horizon]
    return X, Y

# Build the supervised windows and give each array a trailing axis of size 1,
# so both end up shaped [samples, time steps, features]
X, y = (
    windows.reshape(windows.shape[0], windows.shape[1], 1)
    for windows in create_dataset(scaled_data)
)

# GRU encoder-decoder model (sequence in, sequence out)
model = Sequential()
# Encoder: reads the whole look-back window and emits a single feature vector
model.add(GRU(20, activation='relu', input_shape=(X.shape[1], X.shape[2])))
# Repeat the encoded vector once per forecast step; the count is read from y
# so it always matches the target length instead of a hard-coded 30
model.add(RepeatVector(y.shape[1]))
# Decoder: produces one hidden state per forecast step
model.add(GRU(10, activation='relu', return_sequences=True))
# Map every decoder step to a single predicted value
model.add(TimeDistributed(Dense(1)))
model.compile(optimizer='adam', loss='mse')

# Stop once validation loss has not improved for 5 epochs and keep the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(X, y, epochs=100, batch_size=1, verbose=1, callbacks=[early_stopping], validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


<keras.src.callbacks.History at 0x7a1bab81d570>

In [None]:
look_back_period = 90  # Length of the input window fed to the model on each step

# Target dates for the forecast; their count drives how many predictions are kept
forecast_dates = pd.date_range(start="2022-01-01", end="2022-12-31", freq='D')

predictions_2022 = []

# Start with the last 'look_back_period' scaled observations
last_data = scaled_data[-look_back_period:]

# Forecast recursively: each pass predicts one horizon-length chunk, which is
# then fed back in as input until every target date is covered
while len(predictions_2022) < len(forecast_dates):
    forecast = model.predict(last_data.reshape(1, look_back_period, 1))
    forecast_scaled = forecast[0]  # drop the batch axis -> (horizon, 1), still in scaled units
    forecast_original = scaler.inverse_transform(forecast_scaled)

    # The final chunk usually overshoots the target range; keep only what is still needed
    remaining = len(forecast_dates) - len(predictions_2022)
    predictions_2022.extend(forecast_original[:remaining].flatten())

    # Slide the window forward using the scaled forecast directly; this avoids an
    # inverse_transform -> transform round trip and the rounding error it adds
    last_data = np.vstack((last_data, forecast_scaled))[-look_back_period:]

# Ensure we have exactly one prediction per forecast date
assert len(predictions_2022) == len(forecast_dates)

# Convert to DataFrame
forecast_df = pd.DataFrame({'Date': forecast_dates, 'Predictions': predictions_2022})

print(forecast_df)

# Monthly aggregates, labelled by month end. An offset object is used instead of
# the 'M' string alias, which newer pandas versions deprecate in favour of 'ME'
monthly_predictions = forecast_df.resample(pd.offsets.MonthEnd(), on='Date').sum()
print(monthly_predictions)

          Date  Predictions
0   2022-01-01    9662330.0
1   2022-01-02    9749521.0
2   2022-01-03    9750557.0
3   2022-01-04    9748738.0
4   2022-01-05    9750662.0
..         ...          ...
360 2022-12-27    9579056.0
361 2022-12-28    9679143.0
362 2022-12-29    9690580.0
363 2022-12-30    9694700.0
364 2022-12-31    9700257.0

[365 rows x 2 columns]
            Predictions
Date                   
2022-01-31  303711584.0
2022-02-28  273874816.0
2022-03-31  302701568.0
2022-04-30  292704192.0
2022-05-31  302232096.0
2022-06-30  292634688.0
2022-07-31  302308000.0
2022-08-31  302317408.0
2022-09-30  292626144.0
2022-10-31  302320608.0
2022-11-30  292625824.0
2022-12-31  302326080.0


In [None]:
# Display the loaded receipts frame as the cell output for a quick visual check
data

Unnamed: 0,Date,Receipt_Count
0,2021-01-01,7564766
1,2021-01-02,7455524
2,2021-01-03,7095414
3,2021-01-04,7666163
4,2021-01-05,7771289
...,...,...
360,2021-12-27,10350408
361,2021-12-28,10219445
362,2021-12-29,10313337
363,2021-12-30,10310644


In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Create the base line plot for the observed 2021 daily data
fig = px.line(data, x='Date', y='Receipt_Count', title='Daily Receipts for 2021 and Predicted Daily Receipts for 2022',
              labels={'Receipt_Count': 'Receipt Count'},
              template="plotly_dark")

# Overlay the 2022 forecast as a red line. forecast_df holds one row per day,
# so the trace is labelled as daily (not monthly) predictions
fig.add_trace(go.Scatter(x=forecast_df['Date'], y=forecast_df['Predictions'], mode='lines',
                         line=dict(color='red', width=2),
                         name='Predicted Daily Receipts for 2022'))

# Enhance the layout: shared hover tooltip across both traces and explicit axis titles
fig.update_layout(showlegend=True,
                  xaxis_title="Date",
                  yaxis_title="Receipt Count",
                  hovermode="x unified")

fig.show()


In [None]:
# Persist the trained model to disk. NOTE(review): the .h5 extension selects the
# HDF5 format, which Keras reports as legacy (see the warning below); switching to
# the native '.keras' format would change the artifact name, so confirm with any
# consumers of this file first.
model.save("fetchChallenge.h5")


You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.

