In [10]:
'''
import pmdarima as pm
import os
import pandas as pd

clean_path = os.path.join("..", "data", "processed")
tickers = [ticker for ticker in os.listdir(clean_path) if ticker.endswith('.csv')]
forecast_results = {}

# Forecasting next n days for each ticker
for ticker in tickers:
    file_path = os.path.join(clean_path, ticker)
    cols = pd.read_csv(file_path, nrows=0).columns
    if ticker == "import_data.csv":
        continue
    data = pd.read_csv(file_path, parse_dates=["Date"], index_col="Date", low_memory=False)
    data = data.asfreq('B').ffill()
    returns = data["Return"].dropna()
    model = pm.auto_arima(returns, start_p=0, d=None, start_q=0, max_p=3, max_q=3, seasonal=False, stepwise=True, suppress_warnings=True, n_jobs=1)
    forecast_vals = model.predict(steps=10)
    forecast_i = pd.bdate_range(start=returns.index[-1] + pd.offsets.BDay(1), periods=10, freq='B')
    forecast = pd.Series(forecast_vals, index=forecast_i)
    forecast_results[ticker] = forecast
    print("finished!")
print(forecast_results)

# Saving the forecast results to a CSV file 
forecast_df = pd.DataFrame(forecast_results)
output_path = os.path.join("..", "data", "forecasts", "forecasted_returns.csv")
forecast_df.to_csv(output_path)  
    
# Getting the diagram for how it looks
# Printing a prediction for whether or not its a good idea to invest or not
'''



In [11]:
## use matplotlib to plot the results on a graph to be used in the dashboard
## on dashboard when user selects a ticker, show the graph of the returns and the forecast



In [12]:
import os, json, numpy as np, pandas as pd
from tqdm import tqdm
import pmdarima as pm

DATA = "../data/processed"              # your existing cleaned files
ART  = "../artifacts/forecasts"         # where we’ll save outputs
os.makedirs(ART, exist_ok=True)

H = 10                                  # forecast horizon (business days)



In [13]:
def load_returns_and_price(ticker: str):
    """Load Return + last Close; enforce BDay index; forward-fill gaps."""
    df = pd.read_csv(os.path.join(DATA, f"{ticker}.csv"),
                     parse_dates=["Date"], index_col="Date", low_memory=False)
    df = df.sort_index()
    s = df["Return"].asfreq("B").ffill().dropna()
    p0 = df["Close"].iloc[-1]
    return s, float(p0)

def arima_mean_path(returns: pd.Series, h: int = H) -> pd.Series:
    """
    Fast baseline: ARIMA(1,0,1) on returns (stationary). 
    If anything fails, fall back to zeros (μ≈0).
    """
    try:
        # Statsmodels-style ARIMA through pmdarima’s wrapper (fast)
        model = pm.ARIMA(order=(1,0,1)).fit(returns.values)
        mu = model.predict(n_periods=h)  # point forecasts of daily returns
    except Exception:
        mu = np.zeros(h)

    idx = pd.bdate_range(returns.index[-1] + pd.offsets.BDay(1), periods=h)
    return pd.Series(mu, index=idx)

def list_tickers():
    return sorted([f[:-4] for f in os.listdir(DATA) if f.endswith(".csv")])


In [14]:
tickers = list_tickers()
mean_paths = {}   # {ticker: Series length H}
last_close = {}   # {ticker: float}

for t in tqdm(tickers, desc="Forecasting", ncols=80):
    r, p0 = load_returns_and_price(t)
    mu = arima_mean_path(r, H)
    mean_paths[t] = mu
    last_close[t] = p0

# Assemble into tidy DataFrames
mean_df = pd.DataFrame(mean_paths).T         # (ticker x date_index)
# Give simple column names T+1..T+H for convenience
mean_df.columns = [f"T+{i+1}" for i in range(mean_df.shape[1])]
last_close_s = pd.Series(last_close).rename("LastClose")
universe = mean_df.index.tolist()

print(f"Tickers: {len(universe)}; Horizon: {H}")
mean_df.head()


Forecasting:   0%|                                      | 0/109 [00:00<?, ?it/s]


KeyError: 'Close'

In [None]:
mean_path = f"{ART}/mean_returns_10d.parquet"
last_close_path = f"{ART}/last_close.parquet"
diag_path = f"{ART}/diag.json"

mean_df.to_parquet(mean_path)     # (ticker x H) daily return forecasts
last_close_s.to_parquet(last_close_path)

with open(diag_path, "w") as f:
    json.dump({
        "horizon": H,
        "method": "ARIMA(1,0,1) on daily returns",
        "universe": universe
    }, f, indent=2)

print("Saved:")
print("  -", mean_path)
print("  -", last_close_path)
print("  -", diag_path)


In [None]:
import matplotlib.pyplot as plt

sample = universe[0]
mu = mean_df.loc[sample]
plt.figure()
mu.T.plot(marker="o")
plt.title(f"{sample} — 10-Day Return Forecast (ARIMA 1,0,1)")
plt.ylabel("Daily expected return")
plt.xlabel("Horizon (T+)")
plt.grid(True)
plt.show()


In [None]:
'''
plt.figure(figsize=(26, 6))
data['Adj Close'].plot(x = 250, y = 10, linewidth = 0.8)
plt.show()

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
acf_original = plot_acf(data['Adj Close'])
pacf_original = plot_pacf(data['Adj Close'])

# adf test
from statsmodels.tsa.stattools import adfuller
adf_result = adfuller(data['Adj Close'])
print("ADF Statistic:", adf_result[1])
# fails to reject null hypothesis at 5% sig level, non-stationary

transforming to stationary (stabilise mean)
df_train_diff = data['Adj Close'].diff().dropna()
df_train_diff.plot()

acf_diff = plot_acf(df_train_diff)
pacf_diff = plot_pacf(df_train_diff)

adf_result_diff = adfuller(df_train_diff)
print("ADF Statistic (differenced):", adf_result_diff[1])
# reject null hypothesis at a 5% significance level, stationary

# make time series predictions

import matplotlib.pyplot as plt
residuals = model_fit.resid[1:]
fig, ax = plt.subplots(1,2)
residuals.plot(title = "Residuals", ax = ax[0])
residuals.plot(kind = 'kde', title = "Density", ax = ax[1])
plt.show()


plot_acf(residuals)
plot_pacf(residuals)
plt.show()

'''

'\nplt.figure(figsize=(26, 6))\ndata[\'Adj Close\'].plot(x = 250, y = 10, linewidth = 0.8)\nplt.show()\n\nfrom statsmodels.graphics.tsaplots import plot_acf, plot_pacf\nacf_original = plot_acf(data[\'Adj Close\'])\npacf_original = plot_pacf(data[\'Adj Close\'])\n\n# adf test\nfrom statsmodels.tsa.stattools import adfuller\nadf_result = adfuller(data[\'Adj Close\'])\nprint("ADF Statistic:", adf_result[1])\n# fails to reject null hypothesis at 5% sig level, non-stationary\n\ntransforming to stationary (stabilise mean)\ndf_train_diff = data[\'Adj Close\'].diff().dropna()\ndf_train_diff.plot()\n\nacf_diff = plot_acf(df_train_diff)\npacf_diff = plot_pacf(df_train_diff)\n\nadf_result_diff = adfuller(df_train_diff)\nprint("ADF Statistic (differenced):", adf_result_diff[1])\n# reject null hypothesis at a 5% significance level, stationary\n\n# make time series predictions\n\nimport matplotlib.pyplot as plt\nresiduals = model_fit.resid[1:]\nfig, ax = plt.subplots(1,2)\nresiduals.plot(title = "Re