In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides convergence warnings emitted while fitting the models below.
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator so reruns are repeatable.
np.random.seed(42)

In [None]:
# Location of the input CSV.
# NOTE(review): hard-coded absolute path — presumably a mounted Google Drive
# folder in Colab; confirm and adjust when running elsewhere.
DATA_PATH = Path("/content/drive/MyDrive/final_dataset.csv")
# Raw, unmodified table as read from disk.
df = pd.read_csv(DATA_PATH)

In [None]:
# Column holding the period label, and the column that is forecast.
time_col, target_col = "YearMonth", "Recreation Visits"

In [None]:
# Echo the configured column names so the run log records them.
print(f"Time column: {time_col}")
print(f"Chosen target column for forecasting: {target_col}")

In [None]:
def parse_yearmonth(s):
    """Parse a year-month label into a ``pd.Timestamp``.

    Recognised inputs are ``YYYY-MM``, ``YYYY/MM`` and compact ``YYYYMM``
    (as text or as a number). Anything else is handed to pandas' general
    date parser as a fallback.

    Parameters
    ----------
    s : object
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    pd.Timestamp or pd.NaT
        The parsed timestamp (first day of the month for year-month inputs),
        or ``pd.NaT`` when the value cannot be interpreted as a date.
    """
    # A YYYYMM column that contains blanks is read by pandas as float, so a
    # cell arrives as 202305.0; without this step it would stringify to
    # "202305.0" and miss the compact-format branch below.
    if isinstance(s, float) and s.is_integer():
        s = int(s)

    s = str(s).strip().replace('/', '-')

    # handle YYYYMM like 202305
    if len(s) == 6 and s.isdigit():
        s = s[:4] + "-" + s[4:]

    try:
        return pd.to_datetime(s, format="%Y-%m")
    except Exception:
        # Not a plain year-month: let pandas infer the format instead.
        try:
            return pd.to_datetime(s)
        except Exception:
            return pd.NaT

# Convert the period labels to timestamps; unparseable values become NaT.
parsed_dates = df[time_col].apply(parse_yearmonth)
df[time_col] = parsed_dates

# Report how many rows lost their date so the later drop is not silent.
missing_dates = parsed_dates.isna().sum()
if missing_dates > 0:
    print(f"Warning: {missing_dates} rows could not be parsed into dates and will be dropped.")

In [None]:
# Keep only rows that have both a parsed date and a target value.
df_clean = df.dropna(subset=[time_col, target_col]).copy()

# Total the target per calendar month (month-start labels).
month_grouper = pd.Grouper(key=time_col, freq="MS")
monthly_totals = df_clean.groupby(month_grouper)[target_col].sum()

# One row per month on a regular month-start index; gaps are filled with 0.
monthly = monthly_totals.sort_index().to_frame().asfreq("MS")
monthly[target_col] = monthly[target_col].fillna(0)

In [None]:
# Line chart of the aggregated monthly series.
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(monthly.index, monthly[target_col])
ax.set_title(f"Monthly {target_col}")
ax.set_xlabel("Date")
ax.set_ylabel(target_col)
fig.tight_layout()
plt.show()

In [None]:
# Hold out the last 12 months when at least 36 months exist; otherwise hold
# out 20% of the series (at least one month).
n_months = len(monthly)
test_periods = 12 if n_months >= 36 else max(1, int(n_months * 0.2))

# Chronological split: everything before the hold-out window is training data.
train = monthly.iloc[:-test_periods].copy()
test = monthly.iloc[-test_periods:].copy()
print(f"Train length: {len(train)}, Test length: {len(test)} (test_periods={test_periods})")


In [None]:
# --- Evaluation helper (RMSE is taken as the square root of the MSE) ---
def evaluate(true, pred):
    """Score a forecast against the observed values.

    Both inputs are converted to float arrays, so they are compared
    position by position; any pandas index they carry is ignored.

    Parameters
    ----------
    true : array-like
        Observed values.
    pred : array-like
        Forecast values, in the same order as ``true``.

    Returns
    -------
    dict
        ``{"MAE": ..., "RMSE": ...}``.
    """
    observed = np.array(true).astype(float)
    forecast = np.array(pred).astype(float)
    squared_error = mean_squared_error(observed, forecast)
    return {
        "MAE": mean_absolute_error(observed, forecast),
        "RMSE": float(np.sqrt(squared_error)),
    }

In [None]:
# Score per model name, and the names of the models that ran to completion.
results, models_tried = {}, []

# Baseline: Naive forecast (repeat the last training observation)
try:
    last_val = float(train[target_col].iloc[-1])
    naive_forecast = pd.Series(np.full(len(test), last_val), index=test.index)
    results["Naive"] = evaluate(test[target_col], naive_forecast)
    models_tried.append("Naive")
except Exception as e:
    print("Naive baseline failed:", e)

# ETS (Exponential Smoothing) with additive trend and additive 12-month seasonality
try:
    # Imported here so a missing statsmodels install only skips this model.
    from statsmodels.tsa.holtwinters import ExponentialSmoothing
    ets_model = ExponentialSmoothing(
        train[target_col], trend="add", seasonal="add", seasonal_periods=12
    ).fit(optimized=True)
    ets_forecast = ets_model.forecast(steps=len(test))
    # Re-label the forecast positionally with the test index. Passing a Series
    # straight to pd.Series(..., index=...) would reindex by label instead, so
    # any mismatch between the forecast's index and test.index would turn the
    # values into NaN and make the evaluation fail.
    ets_forecast = pd.Series(np.asarray(ets_forecast, dtype=float), index=test.index)
    results["ETS"] = evaluate(test[target_col], ets_forecast)
    models_tried.append("ETS")
except Exception as e:
    print("ETS unavailable or failed:", e)

# SARIMAX (basic seasonal ARIMA): order (1,1,1) with a (1,1,1) seasonal part of period 12
try:
    # Imported here so a missing statsmodels install only skips this model.
    import statsmodels.api as sm
    sarimax_label = "SARIMAX(1,1,1)(1,1,1,12)"
    sarimax_model = sm.tsa.SARIMAX(train[target_col], order=(1,1,1), seasonal_order=(1,1,1,12),
                                   enforce_stationarity=False, enforce_invertibility=False)
    sarimax_res = sarimax_model.fit(disp=False)
    sarimax_forecast = sarimax_res.forecast(steps=len(test))
    # Re-label the forecast positionally with the test index. Passing a Series
    # straight to pd.Series(..., index=...) would reindex by label instead, so
    # any mismatch between the forecast's index and test.index would turn the
    # values into NaN and make the evaluation fail.
    sarimax_forecast = pd.Series(np.asarray(sarimax_forecast, dtype=float), index=test.index)
    results[sarimax_label] = evaluate(test[target_col], sarimax_forecast)
    models_tried.append(sarimax_label)
except Exception as e:
    print("SARIMAX unavailable or failed:", e)

# RandomForest on lag features, scored with a recursive multi-step forecast.
# The forest is fitted on training-period rows only and then rolled forward one
# month at a time, feeding its own predictions back in as lag values. Building
# the test features from the full series instead would hand the model the real
# test-period observations (a one-step-ahead forecast), which is not comparable
# with the other models: they forecast the whole hold-out window from the
# training data alone.
try:
    def make_lag_features(series, lags=(1, 2, 3, 6, 12), rolling_windows=(3, 6)):
        """Build a supervised-learning frame from a univariate series.

        Parameters
        ----------
        series : pd.Series
            Values in chronological order.
        lags : iterable of int
            For each ``k`` a column ``lag_k`` holds the value ``k`` rows earlier.
        rolling_windows : iterable of int
            For each ``w`` a column ``roll_mean_w`` holds the mean of up to
            ``w`` values ending at the previous row.

        Returns
        -------
        pd.DataFrame
            Column ``y`` plus the feature columns; rows containing any NaN
            (the first ``max(lags)`` rows) are dropped.
        """
        df_l = pd.DataFrame({"y": series})
        for lag in lags:
            df_l[f"lag_{lag}"] = df_l["y"].shift(lag)
        for rw in rolling_windows:
            # shift(1) keeps the current row's own value out of its feature.
            df_l[f"roll_mean_{rw}"] = df_l["y"].shift(1).rolling(window=rw, min_periods=1).mean()
        return df_l.dropna()

    rf_lags = (1, 2, 3, 6, 12)
    rf_windows = (3, 6)

    # Features are derived from the training window only.
    df_lags_train = make_lag_features(train[target_col], lags=rf_lags, rolling_windows=rf_windows)

    if len(df_lags_train) > 0 and len(test) > 0:
        X_train = df_lags_train.drop(columns=["y"])
        y_train = df_lags_train["y"]
        rf = RandomForestRegressor(n_estimators=200, random_state=42)
        rf.fit(X_train, y_train)

        # A non-empty training frame means train has more rows than the
        # largest lag, so every history[-lag] lookup below is valid.
        history = [float(v) for v in train[target_col]]
        step_rows = []
        step_preds = []
        for _ in range(len(test)):
            row = {f"lag_{lag}": history[-lag] for lag in rf_lags}
            for rw in rf_windows:
                row[f"roll_mean_{rw}"] = float(np.mean(history[-rw:]))
            # Same column order as at fit time.
            X_step = pd.DataFrame([row], columns=X_train.columns)
            y_hat = float(rf.predict(X_step)[0])
            step_rows.append(row)
            step_preds.append(y_hat)
            history.append(y_hat)  # the prediction becomes the next step's lag_1

        X_test = pd.DataFrame(step_rows, columns=X_train.columns, index=test.index)
        y_test = test[target_col]
        rf_pred = pd.Series(step_preds, index=test.index)
        results["RandomForest_lags"] = evaluate(y_test, rf_pred)
        models_tried.append("RandomForest_lags")
    else:
        print("Not enough data for lag-feature RandomForest (after shifting/index alignment). Skipping RF.")
except Exception as e:
    print("RandomForest approach failed:", e)

# --- Build the comparison table (falls back to an empty frame when nothing ran) ---
if results:
    results_df = (
        pd.DataFrame(results).T
        .reset_index()
        .rename(columns={"index":"Model"})
        # make sure both metrics are plain floats before sorting
        .astype({"MAE": float, "RMSE": float})
        .sort_values("MAE")
        .reset_index(drop=True)
    )
else:
    print("\nNo models completed successfully. Check earlier error messages.\n")
    results_df = pd.DataFrame(columns=["Model", "MAE", "RMSE"])

# Plain-text report, best (lowest MAE) model first
print("\nModel evaluation results (lower MAE better):\n")
print(results_df.to_string(index=False))
print("\nModels attempted:", models_tried)