In [18]:
!pip install numpy==1.26.4
!pip install pmdarima
import warnings
warnings.filterwarnings("ignore")



In [19]:
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import seaborn as sns
from pmdarima import auto_arima
import os
%matplotlib inline

In [21]:
csv_path = '../data/cleaned/final_merged_dataset_2.csv'
df = pd.read_csv(csv_path)
print(df.head(10))


    ticker        date                title    distributor   gross  \
0     PARA  2016-06-02  10 Cloverfield Lane  Paramount Pi…   11414   
1  Private  2006-09-04          10th & Wolf      ThinkFilm    1791   
2     6758  2009-05-25                   12  Sony Picture…     344   
3     6758  2009-05-25                   12  Sony Picture…     344   
4      DIS  2009-05-25            12 Rounds  20th Century…    4832   
5      WBD  2018-03-29            12 Strong   Warner Bros.    4502   
6     SONY  2004-06-03       13 Going On 30  Sony Pictures  115000   
7     AMZN  2007-09-03                 1408            MGM   38250   
8      WBD  2001-04-05           15 Minutes       New Line   89000   
9      WBD  2001-04-05           15 Minutes       New Line   89000   

   percent_yd  percent_lw  theaters  per_theater  total_gross  ...  \
0        0.32       -0.12     120.0         95.0     72082999  ...   
1        0.00        0.00       6.0        299.0        49783  ...   
2        0.00      

In [16]:
# Time Series Forecasting Total Daily Theater Demand

# 1. Prepare data: aggregate daily total theater demand (sum revenue or admissions)
df['date'] = pd.to_datetime(df['date'])
df_daily = df.groupby('date').agg({'revenue': 'sum'}).reset_index()
df_daily = df_daily.sort_values('date')

# 2. Plot historical total demand
plt.figure(figsize=(14, 5))
plt.plot(df_daily['date'], df_daily['revenue'])
plt.title("Total Daily Theater Demand (All Theaters)")
plt.xlabel("Date")
plt.ylabel("Total Revenue")
plt.show()

# 3. Decompose time series to observe trend/seasonality
decompose = sm.tsa.seasonal_decompose(df_daily.set_index('date')['revenue'], model='additive', period=7)
decompose.plot()
plt.suptitle('Additive Decomposition of Total Daily Theater Demand')
plt.show()

# 4. Fit SARIMA using auto_arima to detect order & seasonality
model = auto_arima(
    df_daily['revenue'],
    start_p=1, start_q=1,
    max_p=3, max_q=3,
    m=7,                        # weekly seasonality
    start_P=0, seasonal=True,
    d=1, D=1, trace=True,
    error_action='ignore',  
    suppress_warnings=True, 
    stepwise=True
)

print("Best SARIMA model:", model.summary())

# 5. Forecast the next 30 days
n_periods = 30
forecast, conf_int = model.predict(n_periods=n_periods, return_conf_int=True)

future_dates = pd.date_range(df_daily['date'].iloc[-1] + pd.Timedelta(days=1), periods=n_periods)

# 6. Plot historical + forecast demand
plt.figure(figsize=(14, 5))
plt.plot(df_daily['date'], df_daily['revenue'], label='Historical', color='blue')
plt.plot(future_dates, forecast, label='Forecast', color='red')
plt.fill_between(future_dates, conf_int[:, 0], conf_int[:, 1], color='pink', alpha=0.2, label='95% CI')
plt.xlabel('Date')
plt.ylabel('Total Revenue')
plt.title('Forecast of Total Daily Theater Demand (Next 30 Days)')
plt.legend()
plt.show()


KeyError: 'date'