In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from datetime import datetime

# Load the training history and the already-processed feature rows for the
# forecast horizon.
train = pd.read_csv('train (3).csv')
test = pd.read_csv('test_sfo_processed-2.csv')

# NOTE(review): 'week_beg' is multiplied by 100 and later fed to a
# seconds-based timestamp conversion — confirm the unit the CSV stores.
timestamps = train['week_beg'].values * 100

# One future timestamp per test row, spaced one week apart after the last
# training week. The horizon is derived from len(test) (it used to be a
# hard-coded 29) so it always matches the number of forecast points that are
# later plotted against these timestamps.
SECONDS_PER_WEEK = 604800
test_timestamps = [timestamps[-1] + SECONDS_PER_WEEK * i for i in range(1, len(test) + 1)]

# Object-typed columns: a single-space cell is replaced by 0, then the whole
# column is cast to float.
train_object_cols = train.select_dtypes('object').columns
train[train_object_cols] = train[train_object_cols].replace(' ', 0).astype('float')

test_object_cols = test.select_dtypes('object').columns
test[test_object_cols] = test[test_object_cols].replace(' ', 0).astype('float')

# Positional week counter taken from the row index, placed as the first column.
train.insert(0, 'week_number', train.index.values)


In [None]:
# Collapse the per-competitor columns (prefixes 1..13) into a single summed
# column per metric, then discard the per-competitor originals.
competitor_metrics = (
    '_competitor_rating',
    '_competitor_coverage',
    '_competitor_money',
    '_competitor_digital',
    '_competitor_sponsorship_money',
    '_competitor_oon_money',
    '_competitor_radio_money',
    '_competitor_total',
)

for col in competitor_metrics:
    per_competitor_cols = [f'{i}{col}' for i in range(1, 14)]
    # NOTE(review): the target name keeps only the text after the last '_',
    # so the four '*_money' metrics all write to 'competitors_money' and each
    # one overwrites the previous sum. Confirm this is intended before
    # changing it — downstream column names depend on it.
    aggregate_name = 'competitors_' + col.rsplit('_', 1)[-1]
    train[aggregate_name] = train[per_competitor_cols].sum(axis=1)
    train = train.drop(columns=per_competitor_cols)

# The TV-reg metric is only summed over this subset of competitor prefixes.
col = '_competitor_tv_reg'
tv_reg_cols = [f'{i}{col}' for i in (3, 5, 6, 7, 9, 12)]
train['competitors_tv_reg'] = train[tv_reg_cols].sum(axis=1)
train = train.drop(columns=tv_reg_cols)


In [None]:
# Remove the per-prefix video columns (prefixes 1..4) for each listed metric.
video_cols = []
for col in ('_video_rating', '_video_money', '_video_coverage_5'):
    video_cols.extend(f'{i}{col}' for i in range(1, 5))
train = train.drop(columns=video_cols)

# Every remaining column except the week timestamp and the target is a feature.
num_cols = [name for name in train.columns if name not in ('week_beg', 'revenue')]



In [None]:
# Keep only the feature columns followed by the target (this drops 'week_beg').
target_col = ['revenue']
model_cols = num_cols + target_col
train = train[model_cols]

# Standardise the features; the fitted scaler is kept in `ss`.
# NOTE(review): `ss` is not applied to `test` anywhere in the visible code —
# presumably the processed test file is already on this scale; confirm.
ss = StandardScaler()
ss.fit(train[num_cols])
train[num_cols] = ss.transform(train[num_cols])

# The target gets its own scaler so predictions can be inverse-transformed later.
ss_target = StandardScaler()
ss_target.fit(train[target_col])
train[target_col] = ss_target.transform(train[target_col])



In [None]:
# --- SARIMA: univariate forecast on the scaled revenue series ---------------
X_train, y_train = train.drop(columns=['revenue']), train['revenue']

sarima_model = SARIMAX(y_train, order=(1, 1, 1), seasonal_order=(0, 1, 1, 52))
sarima_result = sarima_model.fit()

# Out-of-sample forecast: one step per test row, starting right after the
# last training observation.
sarima_pred = sarima_result.predict(start=len(train), end=len(train) + len(test) - 1, dynamic=False)

# --- CatBoost: regression on the scaled feature columns ---------------------
X_train_catboost, y_train_catboost = train.drop(columns=['revenue']), train['revenue']

catboost_model = CatBoostRegressor()
catboost_model.fit(X_train_catboost, y_train_catboost)

catboost_pred = catboost_model.predict(test.drop(columns=['week_beg']))

# --- Back to the original revenue scale -------------------------------------
# np.asarray() is required for the SARIMA forecast: fitted on a pandas Series,
# statsmodels hands back a pandas Series, which has no .reshape method.
sarima_pred_unscaled = ss_target.inverse_transform(np.asarray(sarima_pred).reshape(-1, 1)).flatten()
catboost_pred_unscaled = ss_target.inverse_transform(np.asarray(catboost_pred).reshape(-1, 1)).flatten()
train_revenue = ss_target.inverse_transform(train['revenue'].values.reshape(-1, 1)).flatten()

# Timestamps are interpreted as seconds since the epoch (UTC) and rendered as
# 'YYYY-MM-DD' labels; plain lists so they can be concatenated for the ticks.
weeks = pd.to_datetime(timestamps, unit='s').strftime('%Y-%m-%d').tolist()
test_weeks = pd.to_datetime(test_timestamps, unit='s').strftime('%Y-%m-%d').tolist()

# --- Plot history and both forecasts, in millions ---------------------------
plt.figure(figsize=(13,10))
plt.plot(weeks, train_revenue/10**6, label='Historical revenue')
plt.plot(test_weeks, sarima_pred_unscaled/10**6, color='blue', label='SARIMA prediction')
plt.plot(test_weeks, catboost_pred_unscaled/10**6, color='green', label='CatBoost prediction')
plt.title('Historical and Predicted Revenue')
plt.xlabel('Date')
plt.ylabel('Revenue (Millions)')
plt.legend()
# Label every 10th week only, to keep the date axis readable.
plt.xticks((weeks+test_weeks)[::10] , fontsize=9, rotation=45)
plt.show()