In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from datetime import datetime
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Load the training history and the hold-out frame to forecast for.
train = pd.read_csv('train (3).csv')
test = pd.read_csv('test_sfo_processed-2.csv')

# NOTE(review): the *100 implies `week_beg` is stored in units of 100 seconds;
# the values are later fed to a seconds-based epoch conversion -- confirm
# against the data export.
timestamps = train['week_beg'].values * 100

# One forecast timestamp per test row, stepping one week at a time past the
# last training week. The horizon follows len(test) instead of a hard-coded 29
# so the date axis always matches the number of predictions.
SECONDS_PER_WEEK = 604800
test_timestamps = [timestamps[-1] + SECONDS_PER_WEEK * i for i in range(1, len(test) + 1)]


def _blanks_to_zero(frame):
    """Cast the object-dtype columns of ``frame`` to float, in place.

    Cells holding a single space ``' '`` are replaced by 0 before the cast.
    Any other non-numeric text makes ``astype('float')`` raise ``ValueError``.
    """
    text_cols = frame.select_dtypes('object').columns
    frame[text_cols] = frame.loc[:, text_cols].replace(' ', 0).astype('float')


_blanks_to_zero(train)
_blanks_to_zero(test)

# The row index doubles as a numeric "week number" column; being numeric, it is
# picked up as a model feature by the later select_dtypes-based column list.
train['week_number'] = train.index
train['revenue'] = train['revenue'].astype(float)

In [None]:
# Feature columns: every numeric column except the date key and the target.
# list.index raises ValueError if either name is missing from the numeric set.
num_cols = list(train.select_dtypes(include=[np.number]).columns)
del num_cols[num_cols.index('week_beg')]
del num_cols[num_cols.index('revenue')]

# Standardize the features in place; `ss` keeps the fitted statistics.
ss = StandardScaler()
train[num_cols] = ss.fit_transform(train[num_cols])

# The target gets its own scaler (`ss_target`), fitted on revenue alone.
ss_target = StandardScaler()
train[['revenue']] = ss_target.fit_transform(train[['revenue']])

# Both training targets below are the *standardized* revenue column.
y_train_sarima = train['revenue']

X_train_catboost = train.drop(columns=['week_beg', 'revenue'])
y_train_catboost = train['revenue']


In [None]:
class TimeSeriesDataset(Dataset):
    """Sliding-window view over two aligned sequences.

    Item ``i`` pairs the ``window`` entries of ``X`` starting at position ``i``
    with up to ``num_preds`` entries of ``y`` that immediately follow that
    window.

    Args:
        X: Sliceable sequence of inputs (supports ``len`` and slicing).
        y: Sliceable sequence of targets, aligned with ``X``.
        window: Number of consecutive ``X`` entries per item.
        num_preds: Maximum number of ``y`` entries returned per item.
    """

    def __init__(self, X, y, window=52, num_preds=29):
        self.X, self.y = X, y
        self.window, self.num_preds = window, num_preds

    def __len__(self):
        # Count of start positions at which a complete input window fits.
        return len(self.X) - self.window + 1

    def __getitem__(self, index):
        split = index + self.window
        inputs = self.X[index:split]
        # Slicing clips at the end of ``y``: windows near the tail get fewer
        # than ``num_preds`` targets, and the very last window gets none.
        targets = self.y[split:split + self.num_preds]
        return (inputs, targets)

# Each LSTM input is a window of 104 consecutive rows of the training frame.
sliding_window = 104

# Features and target are the standardized columns of `train`; num_preds is
# left at the dataset's default.
train_dataset = TimeSeriesDataset(
    X=torch.tensor(train[num_cols].values),
    y=torch.tensor(train['revenue'].values),
    window=sliding_window,
)

# batch_size=1 with shuffle=False yields the windows one by one, in order.
train_loader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=False)


class LstmTimeSeries(nn.Module):
    """LSTM over a sequence followed by a linear head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced per input sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LstmTimeSeries, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a ``(batch, seq_len, input_size)`` tensor to ``(batch, output_size)``.

        The zero initial states are created on the input's own device and in
        its dtype rather than on a module-level ``device`` global, so the model
        works wherever the input lives and does not depend on notebook state.
        """
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, _ = self.lstm(x, (h0, c0))
        # batch_first=True: index 1 is time, so [:, -1, :] is the last step.
        return self.fc(out[:, -1, :])

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# LSTM hyper-parameters: one input feature per scaled numeric column, a single
# 128-unit layer, and one output value per window.
input_size = len(num_cols)
hidden_size, num_layers, output_size = 128, 1, 1


In [None]:
# --- Load / fit the three base models ---------------------------------------
lstm_model = LstmTimeSeries(input_size, hidden_size, num_layers, output_size).to(device)
# map_location remaps the checkpoint's tensors onto `device`, so weights saved
# on a GPU still load on a CPU-only machine.
lstm_model.load_state_dict(torch.load('lstm_model.pth', map_location=device))
lstm_model.eval()

catboost_model = CatBoostRegressor()
catboost_model.fit(X_train_catboost, y_train_catboost)

sarima_model = SARIMAX(y_train_sarima, order=(1, 1, 1), seasonal_order=(0, 1, 1, 52))
sarima_result = sarima_model.fit()
# Out-of-sample forecast: one step per test row, starting right after the
# training range. Computed once and reused below.
sarima_pred = sarima_result.predict(start=len(train), end=len(train) + len(test) - 1, dynamic=False)


def _to_revenue_units(scaled_values):
    """Undo ``ss_target``'s standardization and return a flat float array."""
    column = np.asarray(scaled_values, dtype=float).reshape(-1, 1)
    return ss_target.inverse_transform(column).ravel()


# --- LSTM predictions --------------------------------------------------------
# NOTE(review): this yields one prediction per *training* window, not one per
# test week; it only lines up with the other models when
# len(train) - sliding_window + 1 == len(test). Confirm this is the intended
# forecast. Also presumes the checkpoint was trained on the standardized target.
lstm_pred_scaled = []
with torch.no_grad():  # inference only, no autograd graph needed
    for x, _ in train_loader:
        x = x.float().to(device)
        lstm_pred_scaled.extend(lstm_model(x).cpu().numpy().flatten())
lstm_pred_scaled = np.array(lstm_pred_scaled)

if len(lstm_pred_scaled) != len(test):
    raise ValueError(
        f"LSTM produced {len(lstm_pred_scaled)} predictions but the test set has "
        f"{len(test)} rows; the ensemble needs one prediction per test row."
    )

# --- CatBoost predictions ----------------------------------------------------
# The model was fitted on standardized features, so the test matrix must have
# the same columns, in the same order, passed through the same fitted scaler.
# NOTE(review): assumes the test CSV holds raw (unscaled) features -- confirm.
X_test_catboost = test.drop(columns=['week_beg']).copy()
if 'week_number' not in X_test_catboost.columns:
    # Continue the training row counter across the forecast horizon.
    X_test_catboost['week_number'] = np.arange(len(train), len(train) + len(test))
X_test_catboost = X_test_catboost[X_train_catboost.columns.tolist()].copy()
X_test_catboost[num_cols] = ss.transform(X_test_catboost[num_cols])

# --- Back to original revenue units, then average ----------------------------
# All three models predict the standardized target, so each forecast is mapped
# back through ss_target before it is called "unscaled".
lstm_pred_unscaled = _to_revenue_units(lstm_pred_scaled)
catboost_pred_unscaled = _to_revenue_units(catboost_model.predict(X_test_catboost))
sarima_pred_unscaled = pd.Series(_to_revenue_units(sarima_pred), index=sarima_pred.index)

ensemble_pred_unscaled = (sarima_pred_unscaled + lstm_pred_unscaled + catboost_pred_unscaled) / 3.0

# --- Plot ---------------------------------------------------------------------
# Epoch seconds -> 'YYYY-MM-DD' labels (UTC), without the deprecated
# datetime.utcfromtimestamp.
weeks = pd.to_datetime(timestamps, unit='s').strftime('%Y-%m-%d').tolist()
test_weeks = pd.to_datetime(test_timestamps, unit='s').strftime('%Y-%m-%d').tolist()

# train['revenue'] was standardized in place earlier; undo that for the plot.
historical_revenue = _to_revenue_units(train['revenue'])

plt.figure(figsize=(13, 10))
plt.plot(weeks, historical_revenue / 10**6, label='Historical revenue')
plt.plot(test_weeks, ensemble_pred_unscaled / 10**6, color='purple', label='Ensemble prediction')
plt.title('Historical and Ensemble Predicted Revenue')
plt.xlabel('Date')
plt.ylabel('Revenue (Millions)')
plt.legend()
plt.xticks((weeks + test_weeks)[::10], fontsize=9, rotation=45)
plt.show()
