In [6]:
import os
import traceback
from datetime import timedelta

import tkinter as tk
from tkinter import filedialog, messagebox, ttk

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# ----------------- Metrics -----------------
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The mean squared error is computed by scikit-learn's
    ``mean_squared_error`` and then square-rooted with NumPy.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values. Any array-like is accepted (lists,
        tuples, NumPy arrays, pandas Series); both are converted to float
        arrays so the element-wise arithmetic below is always positional.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    # Plain Python sequences do not support element-wise subtraction, so
    # normalise both inputs to NumPy arrays first.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Replace (near-)zero actuals with a tiny constant to avoid dividing by zero.
    denom = np.where(np.abs(y_true) < 1e-9, 1e-9, y_true)
    return np.mean(np.abs((y_true - y_pred) / denom)) * 100

# ----------------- Forecast helper (robust) -----------------
def forecast_next_days(model, df_full, feature_cols, n_days=30):
    """Recursively forecast the next ``n_days`` closing prices.

    Each step builds one feature row from the running close history
    (lags, 7/30-value means, 7-value standard deviation, last one-step
    return and the last known volume), asks ``model`` for a prediction and
    appends that prediction to the history so the next step can use it.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that returns an indexable result;
        it is called with a one-row DataFrame whose columns are
        ``feature_cols`` in that order.
    df_full : pandas.DataFrame
        Must contain a ``Close`` column. ``Volume`` is optional; when it is
        absent ``vol_lag1`` is set to 0.
    feature_cols : sequence of str
        Feature names, in the order the model expects. Names this function
        does not compute are filled with 0.0.
    n_days : int, default 30
        Number of steps to forecast. Values <= 0 yield an empty list.

    Returns
    -------
    list of float
        One prediction per forecast step.

    Raises
    ------
    ValueError
        If ``Close`` is missing, or if it is empty while ``n_days`` > 0.
    """
    if "Close" not in df_full.columns:
        raise ValueError("Dataframe must contain 'Close' column for forecasting.")

    steps = int(n_days)
    close_history = list(df_full["Close"].values)
    if steps > 0 and not close_history:
        raise ValueError("Dataframe has no 'Close' values to forecast from.")

    # Volume cannot be forecast here, so the last observed value is reused
    # for every step.
    last_volume = int(df_full["Volume"].iloc[-1]) if "Volume" in df_full.columns else 0
    columns = list(feature_cols)
    lags = (1, 2, 3, 7, 14, 30)

    preds = []
    for _ in range(steps):
        window_30 = close_history[-30:]
        window_7 = window_30[-7:]

        # When the history is shorter than a lag, fall back to the oldest value.
        row = {
            f"lag_{lag}": close_history[-lag] if len(close_history) >= lag else close_history[0]
            for lag in lags
        }
        row["MA_7"] = float(np.mean(window_7))
        row["MA_30"] = float(np.mean(window_30))
        # Sample standard deviation (ddof=1): this is the estimator pandas'
        # rolling(...).std() uses by default, which is how the STD_7 training
        # feature is built. np.std's default (ddof=0) would feed the model a
        # systematically smaller value than it was trained on.
        row["STD_7"] = float(np.std(window_7, ddof=1)) if len(window_7) >= 2 else 0.0
        if len(window_30) >= 2:
            row["return_1"] = float((window_30[-1] - window_30[-2]) / (window_30[-2] + 1e-9))
        else:
            row["return_1"] = 0.0
        row["vol_lag1"] = last_volume

        # Align to the model's column order; unknown features become 0.0.
        temp_df = pd.DataFrame([row]).reindex(columns=columns, fill_value=0.0)

        pred = float(model.predict(temp_df)[0])
        preds.append(pred)
        close_history.append(pred)

    return preds

# ----------------- Deterministic feature ordering -----------------
def sorted_feature_cols(cols):
    """Return ``cols`` sorted into a fixed, deterministic feature order.

    Ordering rules:
      * ``lag_<n>`` columns come first, ascending by ``n``; a ``lag_`` column
        whose suffix is not an integer sorts after the numeric lags.
      * then ``MA_7``, ``MA_30``, ``STD_7``, ``return_1``, ``vol_lag1``.
      * any other name sorts last; the sort is stable, so such names keep
        their relative input order.

    Parameters
    ----------
    cols : iterable of str
        Column names to order.

    Returns
    -------
    list of str
    """
    # Built once instead of on every key() call.
    order_map = {
        "MA_7": (1, 7),
        "MA_30": (1, 30),
        "STD_7": (2, 7),
        "return_1": (3, 1),
        "vol_lag1": (4, 1),
    }

    def key(c):
        if c.startswith("lag_"):
            try:
                return (0, int(c.split("_")[1]))
            except ValueError:
                # Only a non-integer suffix is expected here; a bare
                # ``except`` would also swallow KeyboardInterrupt/SystemExit.
                return (0, 999)
        return order_map.get(c, (9, 0))

    return sorted(cols, key=key)



In [6]:

# ----------------- Simple Tkinter App -----------------
class SimpleCrudeApp:
    """Tkinter window that drives a CSV -> features -> XGBoost -> forecast workflow.

    Each button maps to one method: upload a CSV, build lag/rolling features,
    train an ``XGBRegressor`` on a chronological split, save/load the model
    with joblib, print a recursive forecast to the log box, and show plots.
    Every handler catches its own exceptions, shows them in a message box and
    writes the traceback to the log box.
    """

    # Row offsets used for the lag_* features.
    LAGS = (1, 2, 3, 7, 14, 30)
    # Share of rows (taken from the start of the frame) used for training.
    TRAIN_FRACTION = 0.8
    # Horizon used for the forecast plot and when the entry box is invalid.
    DEFAULT_HORIZON = 30
    # Column-name prefixes that mark a column as a model feature.
    FEATURE_PREFIXES = ("lag_", "MA_", "STD_", "return_")

    def __init__(self, root):
        """Store the Tk root, initialise state and build the widgets."""
        self.root = root
        self.root.title("Crude Oil Forecast — Simple macOS UI")
        self.raw_df = None        # CSV as uploaded; preprocess() always starts from this
        self.df = None            # working frame (raw after upload, features after preprocess)
        self.model = None
        self.feature_cols = None
        self.model_path = "xgb_crude_model.joblib"
        self._build_ui()

    def _build_ui(self):
        """Create the title, action buttons, forecast controls and log box."""
        frame = ttk.Frame(self.root, padding=12)
        frame.pack(fill=tk.BOTH, expand=True)

        ttk.Label(frame, text="Crude Oil Forecasting", font=("Arial", 16)).pack(pady=(0, 8))

        actions = (
            ("Upload CSV", self.upload_csv),
            ("Preprocess", self.preprocess),
            ("Train Model", self.train_model),
            ("Save Model", self.save_model),
            ("Load Model", self.load_model),
        )
        for label, handler in actions:
            ttk.Button(frame, text=label, command=handler).pack(fill=tk.X, pady=4)

        param_frame = ttk.Frame(frame)
        param_frame.pack(fill=tk.X, pady=(8, 6))
        ttk.Label(param_frame, text="Days to forecast:").pack(side=tk.LEFT)
        self.day_entry = ttk.Entry(param_frame, width=6)
        self.day_entry.insert(0, "30")
        self.day_entry.pack(side=tk.LEFT, padx=(6, 10))
        ttk.Button(param_frame, text="Generate Forecast", command=self.generate_forecast).pack(side=tk.LEFT, padx=4)
        ttk.Button(param_frame, text="Show Plots", command=self.show_plots).pack(side=tk.LEFT, padx=4)

        self.log_box = tk.Text(frame, height=14, width=96)
        self.log_box.pack(pady=(8, 0), fill=tk.BOTH, expand=True)

    def log(self, msg):
        """Append ``msg`` as a new line to the log box and scroll to the end."""
        self.log_box.insert(tk.END, msg + "\n")
        self.log_box.see(tk.END)

    # ----------------- Internal helpers -----------------
    @staticmethod
    def _to_number(series):
        """Return ``series`` as numbers; text such as ``"$1,234.50"`` is parsed.

        Numeric input is returned unchanged. For anything else, ``$``, commas
        and whitespace are stripped and values that still cannot be parsed
        become NaN.
        """
        if pd.api.types.is_numeric_dtype(series):
            return series
        cleaned = series.astype(str).str.replace(r"[$,\s]", "", regex=True)
        return pd.to_numeric(cleaned, errors="coerce")

    @classmethod
    def _build_features(cls, raw_df):
        """Build the model-ready frame from a raw upload without mutating it.

        Steps: normalise column names (strip, ``/`` -> ``_``, remove spaces),
        accept ``Close_Last`` as ``Close``, require ``Date``/``Close``/``Volume``,
        parse dates, drop rows with unparseable dates, sort by date, keep rows
        with ``Close > 0``, add lag / rolling / return / volume features that
        only use earlier rows, and drop rows containing any NaN.

        Raises
        ------
        ValueError
            If a required column is missing.
        """
        df = raw_df.copy()
        df.columns = (
            df.columns.str.strip()
            .str.replace("/", "_", regex=False)
            .str.replace(" ", "", regex=False)
        )
        # "Close/Last" has already become "Close_Last" by this point.
        if "Close_Last" in df.columns and "Close" not in df.columns:
            df = df.rename(columns={"Close_Last": "Close"})

        required = {"Date", "Close", "Volume"}
        missing = required - set(df.columns)
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

        df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
        df["Close"] = cls._to_number(df["Close"])
        df["Volume"] = cls._to_number(df["Volume"])

        # Unparseable dates become NaT. Left in, they sort to the end, pick up
        # lag features from real rows and leave NaT as the last index value.
        df = df.dropna(subset=["Date"]).sort_values("Date").set_index("Date")
        # .copy() so the column assignments below act on an independent frame.
        df = df[df["Close"] > 0].copy()

        close = df["Close"]
        for lag in cls.LAGS:
            df[f"lag_{lag}"] = close.shift(lag)

        # shift(1) keeps every feature strictly in the past of the row's Close.
        df["MA_7"] = close.rolling(7).mean().shift(1)
        df["MA_30"] = close.rolling(30).mean().shift(1)
        df["STD_7"] = close.rolling(7).std().shift(1)
        df["return_1"] = close.pct_change().shift(1)
        df["vol_lag1"] = df["Volume"].shift(1)

        return df.dropna()

    def _split_index(self, n_rows):
        """Return the row position separating train (head) from test (tail)."""
        return int(n_rows * self.TRAIN_FRACTION)

    def _feature_frame(self):
        """Return a copy of ``self.df`` after checking it holds the features.

        Raises
        ------
        ValueError
            If no frame is loaded, no feature list is known, or the frame
            lacks ``Close`` or any of the feature columns (for example a CSV
            uploaded after the last preprocess).
        """
        if self.df is None or not self.feature_cols:
            raise ValueError("Preprocess the data first.")
        needed = list(self.feature_cols) + ["Close"]
        if any(col not in self.df.columns for col in needed):
            raise ValueError("Preprocess the data first.")
        return self.df.copy()

    # ----------------- Button handlers -----------------
    def upload_csv(self):
        """Ask for a CSV file and load it as both the raw and working frame."""
        try:
            path = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv"), ("All files", "*.*")])
            if not path:
                return
            raw = pd.read_csv(path)
            self.raw_df = raw
            self.df = raw
            self.log(f"Loaded CSV: {os.path.basename(path)} (rows: {len(self.df)})")
        except Exception as e:
            messagebox.showerror("Error", f"Failed to load CSV:\n{e}")
            self.log(traceback.format_exc())

    def preprocess(self):
        """Build features from the raw upload and store them in ``self.df``.

        Always starts from ``self.raw_df``, so pressing the button repeatedly
        yields the same result instead of failing on the already-indexed frame.
        """
        try:
            if self.raw_df is None:
                # Frame was assigned to self.df directly rather than through
                # upload_csv(); remember it as the raw source.
                self.raw_df = self.df
            if self.raw_df is None:
                raise ValueError("Upload dataset first.")

            df = self._build_features(self.raw_df)
            chosen = [
                c for c in df.columns
                if c.startswith(self.FEATURE_PREFIXES) or c == "vol_lag1"
            ]
            self.feature_cols = sorted_feature_cols(chosen)
            self.df = df

            self.log(f"Preprocessing completed. Rows after preprocess: {len(df)}")
            self.log(f"Features: {self.feature_cols}")
        except Exception as e:
            messagebox.showerror("Error", str(e))
            self.log(traceback.format_exc())

    def train_model(self):
        """Fit an XGBRegressor on the first 80% of rows and log test metrics."""
        try:
            df = self._feature_frame()
            X = df[self.feature_cols].values
            y = df["Close"].values
            split = self._split_index(len(df))
            if split < 10:
                raise ValueError("Not enough data after preprocess to train.")
            # Chronological split: rows are date-sorted, so the tail is the test set.
            X_train, X_test = X[:split], X[split:]
            y_train, y_test = y[:split], y[split:]

            self.log("Training XGBoost model... (this may take a moment)")
            model = XGBRegressor(
                n_estimators=300,
                learning_rate=0.05,
                max_depth=5,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                verbosity=0,
            )
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            self.model = model
            self.log("Model trained successfully.")
            self.log(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
            self.log(f"RMSE: {rmse(y_test, y_pred):.4f}")
            self.log(f"MAPE: {mape(y_test, y_pred):.4f}")
            self.log(f"R²: {r2_score(y_test, y_pred):.4f}")
        except Exception as e:
            messagebox.showerror("Error", str(e))
            self.log(traceback.format_exc())

    def save_model(self):
        """Dump the model and its feature list to ``self.model_path``."""
        try:
            if self.model is None:
                raise ValueError("Train a model first.")
            joblib.dump({"model": self.model, "feature_cols": self.feature_cols}, self.model_path)
            self.log(f"Model saved to {self.model_path}")
        except Exception as e:
            messagebox.showerror("Error", str(e))
            self.log(traceback.format_exc())

    def load_model(self):
        """Load the model and feature list previously written by save_model()."""
        try:
            if not os.path.exists(self.model_path):
                raise FileNotFoundError(f"No saved model at {self.model_path}")
            # SECURITY: joblib.load unpickles the file, which can execute
            # arbitrary code. Only load files this application wrote itself.
            data = joblib.load(self.model_path)
            if not isinstance(data, dict) or data.get("model") is None:
                raise ValueError(f"{self.model_path} does not contain a saved model.")
            self.model = data.get("model")
            self.feature_cols = data.get("feature_cols")
            self.log("Model loaded.")
            self.log(f"Feature cols loaded: {self.feature_cols}")
        except Exception as e:
            messagebox.showerror("Error", str(e))
            self.log(traceback.format_exc())

    def generate_forecast(self):
        """Log a recursive forecast for the number of days in the entry box.

        Anything other than a positive whole number in the entry box falls
        back to ``DEFAULT_HORIZON``; the fallback is reported in the log.
        """
        try:
            if self.model is None:
                raise ValueError("Train or load a model first.")
            if self.df is None:
                raise ValueError("Upload dataset first.")
            if not self.feature_cols:
                raise ValueError("Preprocess the data first.")

            entered = self.day_entry.get().strip()
            # isdecimal() only accepts characters int() can parse.
            if entered.isdecimal() and int(entered) > 0:
                days = int(entered)
            else:
                days = self.DEFAULT_HORIZON
                self.log(f"Invalid day count {entered!r}; using {days}.")

            preds = forecast_next_days(self.model, self.df, self.feature_cols, days)
            self.log(f"Forecast for next {days} days:")
            for i, p in enumerate(preds, 1):
                self.log(f"Day {i}: {p:.2f}")
        except Exception as e:
            messagebox.showerror("Error", str(e))
            self.log(traceback.format_exc())

    def show_plots(self):
        """Plot test-set actual vs predicted, then recent closes plus a forecast."""
        try:
            if self.model is None:
                raise ValueError("Train or load model first.")
            df = self._feature_frame()
            X = df[self.feature_cols].values
            y = df["Close"].values
            split = self._split_index(len(df))
            test_dates = df.index[split:]
            y_test = y[split:]
            y_pred = self.model.predict(X[split:])

            plt.figure(figsize=(12, 6))
            plt.plot(test_dates, y_test, label="Actual", linewidth=2)
            plt.plot(test_dates, y_pred, label="Predicted", linestyle="--")
            plt.title("Actual vs Predicted")
            plt.xlabel("Date")
            plt.ylabel("Close")
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            plt.show()

            # Forecast plot: recent history followed by the projected values.
            horizon = self.DEFAULT_HORIZON
            preds = forecast_next_days(self.model, df, self.feature_cols, horizon)
            future_dates = pd.date_range(df.index[-1] + timedelta(days=1), periods=horizon)

            last_n = min(60, len(df))
            recent = df["Close"].iloc[-last_n:]

            plt.figure(figsize=(12, 6))
            plt.plot(recent.index, recent.values, label=f"Last {last_n} Days")
            plt.plot(future_dates, preds, label=f"{horizon}-Day Forecast", linestyle="--")
            plt.title(f"{horizon}-Day Forecast")
            plt.xlabel("Date")
            plt.ylabel("Close")
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            plt.show()
        except Exception as e:
            messagebox.showerror("Error", str(e))
            self.log(traceback.format_exc())


In [None]:
# ----------------- Run app -----------------
if __name__ == "__main__":
    # Create the top-level window, attach the application to it, and hand
    # control to the Tk event loop (blocks until the window is closed).
    root = tk.Tk()
    app = SimpleCrudeApp(root)
    app.root.mainloop()
