In [4]:
# Creating and demonstrating the requested functions for precomputed time-series analysis.
# This code will:
# 1. Define load_data(file_path, col, date_col='Date') that loads the CSV and prepares a DataFrame.
# 2. Define precompute_basic_features(df, price_col, windows=[7,30], rsi_window=14) that computes:
#    - rolling mean, median, std for the windows
#    - daily returns and cumulative returns
#    - SMA (rolling mean) and EMA
#    - rolling correlation with Volume (if present)
#    - volatility (std of returns)
#    - RSI and MACD
# 3. Define save_pre_analysis(df, original_path) to save to a new CSV with suffix "_pre_analysis".
# 4. Demonstrate running these functions on /mnt/data/goog_stock_data.csv using 'Close' if present.
# Note: this environment has no internet access. This will run locally and produce a CSV in /mnt/data.

import pandas as pd
import numpy as np
import os
from typing import List

def load_data(file_path: str, col: str, date_col: str = "Date", parse_dates: bool = True) -> pd.DataFrame:
    """
    Load CSV into a pandas DataFrame, set the date column as a datetime index (if present),
    and ensure the requested column exists.
    
    Parameters
    ---------
    file_path : str
        Path to the CSV file.
    col : str
        Name of the price column to focus on (e.g., 'Close').
    date_col : str
        Name of the date column. Defaults to 'Date'.
    parse_dates : bool
        Whether to parse the date column as datetime.
    
    Returns
    -------
    pd.DataFrame
        Loaded DataFrame with a datetime index (if date_col exists) and containing the requested column.
    """
    df = pd.read_csv(file_path)
    df["Date"] = pd.to_datetime(df["Date"], utc=True)
    df = df.set_index("Date").sort_index()
    # parse date column and set index if exists
    if parse_dates and date_col in df.columns:
        df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
        if df[date_col].isnull().all():
            # failed parsing, keep as-is
            df = df.copy()
        else:
            df = df.set_index(date_col).sort_index()
    # ensure the requested column exists; if not, try to pick a sensible default
    if col not in df.columns:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        fallback = numeric_cols[0] if numeric_cols else None
        raise ValueError(f"Column '{col}' not found in file. Numeric columns available: {numeric_cols}. "
                         f"Consider using one of those (e.g. '{fallback}').")
    return df

def _ema(series: pd.Series, span: int) -> pd.Series:
    return series.ewm(span=span, adjust=False).mean()

def precompute_basic_features(df: pd.DataFrame, price_col: str,
                              windows: List[int] = [7, 30],
                              rsi_window: int = 14) -> pd.DataFrame:
    """
    Compute basic statistical/time-series features described by the user:
      - rolling mean, median, std for windows
      - daily returns and cumulative returns
      - SMA (rolling mean) and EMA
      - rolling correlations with Volume if present
      - volatility (std of returns)
      - RSI and MACD
    
    Returns a new DataFrame with added columns.
    """
    df = df.copy()
    price = df[price_col].astype(float)
    
    # Daily returns
    df['daily_return'] = price.pct_change()
    # Cumulative return from start
    df['cumulative_return'] = (1 + df['daily_return']).cumprod() - 1

    # Rolling stats and MAs/EMAs
    for w in windows:
        df[f'rolling_mean_{w}'] = price.rolling(window=w, min_periods=1).mean()
        df[f'rolling_median_{w}'] = price.rolling(window=w, min_periods=1).median()
        df[f'rolling_std_{w}'] = price.rolling(window=w, min_periods=1).std(ddof=0)
        # EMA and SMA (SMA = rolling mean)
        df[f'ema_{w}'] = _ema(price, span=w)
        df[f'sma_{w}'] = df[f'rolling_mean_{w}']

    # Volatility: std of returns over windows
    for w in windows:
        df[f'volatility_{w}'] = df['daily_return'].rolling(window=w, min_periods=1).std(ddof=0)

    # Rolling correlation with volume if volume exists
    if 'Volume' in df.columns:
        for w in windows:
            df[f'rolling_corr_price_volume_{w}'] = price.rolling(window=w, min_periods=1).corr(df['Volume'])

    # RSI implementation (Wilder's smoothing)
    delta = price.diff()
    up = delta.clip(lower=0.0)
    down = -1 * delta.clip(upper=0.0)
    # Use exponential moving average of gains/losses (Wilder's)
    roll_up = up.ewm(alpha=1/rsi_window, adjust=False, min_periods=1).mean()
    roll_down = down.ewm(alpha=1/rsi_window, adjust=False, min_periods=1).mean()
    rs = roll_up / (roll_down.replace(0, np.nan))
    df['rsi'] = 100 - (100 / (1 + rs))
    df['rsi'] = df['rsi'].fillna(0)

    # MACD: EMA12 - EMA26 and signal line 9-day EMA of MACD
    ema_short = _ema(price, span=12)
    ema_long = _ema(price, span=26)
    df['macd'] = ema_short - ema_long
    df['macd_signal'] = _ema(df['macd'].fillna(0), span=9)
    df['macd_histogram'] = df['macd'] - df['macd_signal']
    
    # Momentum: difference between current price and price n periods ago (we'll include for windows)
    for w in windows:
        df[f'momentum_{w}'] = price - price.shift(w)
    
    # Clean up infinite values if any and keep numeric columns consistent
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    return df

def save_pre_analysis(df: pd.DataFrame, original_path: str) -> str:
    """
    Save dataframe to a new CSV with suffix "_pre_analysis" before the file extension.
    Returns the path to the saved CSV.
    """
    base, ext = os.path.splitext(original_path)
    new_path = f"{base}_pre_analysis{ext}"
    df.to_csv(new_path, index=True)
    return new_path


In [5]:
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from langchain_core.documents import Document
import warnings
warnings.filterwarnings("ignore")

# -------------------------
# B. Trend & Seasonality
# -------------------------
def compute_trend_seasonality(df: pd.DataFrame, price_col: str, model: str = 'additive', period: int = 30) -> pd.DataFrame:
    df = df.copy()
    df = df.dropna(subset=[price_col])
    result = seasonal_decompose(df[price_col], model=model, period=period, extrapolate_trend='freq')
    df['trend'] = result.trend
    df['seasonal'] = result.seasonal
    df['residual'] = result.resid
    return df

# -------------------------
# C. Anomaly Detection
# -------------------------
def detect_anomalies(df: pd.DataFrame, price_col: str, z_thresh: float = 3.0) -> pd.DataFrame:
    df = df.copy()
    # Z-score method
    df['zscore'] = (df[price_col] - df[price_col].mean()) / df[price_col].std(ddof=0)
    df['zscore_anomaly'] = df['zscore'].abs() > z_thresh

    # Isolation Forest
    scaler = StandardScaler()
    scaled_vals = scaler.fit_transform(df[[price_col]].fillna(0))
    iso = IsolationForest(contamination=0.01, random_state=42)
    df['isolation_anomaly'] = iso.fit_predict(scaled_vals) == -1
    return df

# # -------------------------
# # D. Forecasting
# # -------------------------
# def forecast_arima(df: pd.DataFrame, price_col: str, steps: int = 7):
#     df = df.dropna(subset=[price_col])
#     model = ARIMA(df[price_col], order=(5,1,0))
#     model_fit = model.fit()
#     forecast = model_fit.forecast(steps=steps)
#     forecast_df = pd.DataFrame({
#         'forecast_date': pd.date_range(df.index[-1] + pd.Timedelta(days=1), periods=steps),
#         'forecast_arima': forecast.values
#     })
#     return forecast_df

# def forecast_lstm(df: pd.DataFrame, price_col: str, steps: int = 7, epochs: int = 5):
#     from sklearn.preprocessing import MinMaxScaler
#     from numpy import array

#     prices = df[price_col].dropna().values.reshape(-1, 1)
#     scaler = MinMaxScaler()
#     scaled = scaler.fit_transform(prices)

#     X, y = [], []
#     for i in range(len(scaled)-1):
#         X.append(scaled[i])
#         y.append(scaled[i+1])
#     X, y = np.array(X), np.array(y)
#     X = X.reshape((X.shape[0], 1, X.shape[1]))

#     model = Sequential([
#         LSTM(50, activation='relu', input_shape=(1,1)),
#         Dense(1)
#     ])
#     model.compile(optimizer='adam', loss='mse')
#     model.fit(X, y, epochs=epochs, verbose=0)

#     # forecast
#     last_val = scaled[-1].reshape((1,1,1))
#     preds = []
#     for _ in range(steps):
#         next_pred = model.predict(last_val, verbose=0)
#         preds.append(next_pred[0,0])
#         last_val = next_pred.reshape((1,1,1))
#     preds = scaler.inverse_transform(np.array(preds).reshape(-1,1)).flatten()
#     forecast_df = pd.DataFrame({
#         'forecast_date': pd.date_range(df.index[-1] + pd.Timedelta(days=1), periods=steps),
#         'forecast_lstm': preds
#     })
#     return forecast_df

# =====================================================
# E. Summary for Embedding (with Forecast Integration)
# =====================================================
from langchain_core.documents import Document
import pandas as pd
import numpy as np
import os

def create_langchain_summaries(
    df: pd.DataFrame,
    price_col: str,
    freq: str = 'M',
    arima_forecast: pd.DataFrame = None,
    sarima_forecast: pd.DataFrame = None,
    lstm_forecast: pd.DataFrame = None
) -> list:
    """
    Create multiple LangChain Documents summarizing trends/anomalies per time chunk (e.g., month),
    and integrate model forecasts (ARIMA, SARIMA, LSTM) corresponding to the same or next period.

    Enhancements:
    - Adds explicit time tags (Year, Month, Period Range)
    - Includes volatility interpretation (low/moderate/high)
    - Merges forecasts if their dates fall within or just after the period
    - Automatically attaches visualization file names (if exist)
    """

    docs = []

    # ✅ Ensure DataFrame has DatetimeIndex
    if not isinstance(df.index, pd.DatetimeIndex):
        date_cols = [c for c in df.columns if 'date' in c.lower()]
        if date_cols:
            df[date_cols[0]] = pd.to_datetime(df[date_cols[0]], errors='coerce')
            df = df.set_index(date_cols[0])
        else:
            raise ValueError("No DatetimeIndex or 'Date' column found for resampling.")

    # -------------------------------
    # Group data by time frequency
    # -------------------------------
    grouped = df.resample(freq)

    # Helper: safely fetch forecast values for a given date range
    def get_forecast_for_period(forecast_df, start_date, end_date):
        if forecast_df is None or "forecast_date" not in forecast_df.columns:
            return None
        mask = (forecast_df["forecast_date"] >= pd.Timestamp(start_date)) & (
            forecast_df["forecast_date"] <= pd.Timestamp(end_date)
        )
        if mask.any():
            values = forecast_df.loc[mask, "forecast_value"]
            if not values.empty:
                return round(values.mean(), 4)
        return None

    # -------------------------------
    # Iterate through each monthly chunk
    # -------------------------------
    for period, group in grouped:
        if len(group) < 5:
            continue

        # --- Basic stats ---
        mean_price = group[price_col].mean()
        vol = group["daily_return"].std() if "daily_return" in group else group[price_col].pct_change().std()
        anomalies = group["zscore_anomaly"].sum() if "zscore_anomaly" in group else 0
        trend_desc = "increasing" if group["trend"].iloc[-1] > group["trend"].iloc[0] else "decreasing"

        # --- Time context ---
        start_date = group.index.min().date()
        end_date = group.index.max().date()
        year = period.year
        month = period.month
        month_name = period.strftime("%B")

        # --- Interpret volatility level ---
        if vol < 0.01:
            vol_level = "low"
        elif vol < 0.03:
            vol_level = "moderate"
        else:
            vol_level = "high"

        # --- Attach model forecasts for this period or the next ---
        arima_val = get_forecast_for_period(arima_forecast, start_date, end_date)
        sarima_val = get_forecast_for_period(sarima_forecast, start_date, end_date)
        lstm_val = get_forecast_for_period(lstm_forecast, start_date, end_date)

        forecast_text_parts = []
        if any([arima_val, sarima_val, lstm_val]):
            forecast_text_parts.append("\nForecasts:")
            if arima_val:
                forecast_text_parts.append(f"- ARIMA predicted average {price_col} ≈ {arima_val:.2f}")
            if sarima_val:
                forecast_text_parts.append(f"- SARIMA predicted average {price_col} ≈ {sarima_val:.2f}")
            if lstm_val:
                forecast_text_parts.append(f"- LSTM predicted average {price_col} ≈ {lstm_val:.2f}")
        forecast_text = "\n".join(forecast_text_parts)

        # --- Enhanced textual summary ---
        text = (
            f"Period Summary: {month_name} {year}\n"
            f"Date Range: {start_date} to {end_date}\n"
            f"Year: {year}, Month: {month:02d}\n"
            f"Trend: The {price_col} trend during this period was {trend_desc}.\n"
            f"Average {price_col}: {mean_price:.2f}\n"
            f"Volatility: {vol:.4f} ({vol_level})\n"
            f"Detected anomalies: {anomalies}\n"
            f"{forecast_text}\n\n"
            f"Summary: In {month_name} {year}, the {price_col} showed a {trend_desc} trend with "
            f"{vol_level} volatility (std {vol:.4f}). The mean price was {mean_price:.2f}, and "
            f"{anomalies} anomalies were detected between {start_date} and {end_date}."
        )

        # --- Metadata for retrieval & filtering ---
        meta = {
            "year": int(year),
            "month": int(month),
            "month_name": str(month_name),
            "period_start": str(start_date),
            "period_end": str(end_date),
            "mean_price": float(round(mean_price, 4)),
            "volatility": float(round(vol, 4)),
            "volatility_level": str(vol_level),
            "num_anomalies": int(anomalies),
            "trend": str(trend_desc),
            "forecast_arima": float(arima_val) if arima_val is not None else 0.0,
            "forecast_sarima": float(sarima_val) if sarima_val is not None else 0.0,
            "forecast_lstm": float(lstm_val) if lstm_val is not None else 0.0,
        }


        # --- Optional image metadata if files exist ---
        img_dir = "images"
        for img_name in [
            "trend_seasonality.png", "volatility.png", "forecast_arima.png",
            "forecast_sarima.png", "forecast_lstm.png", "acf_plot.png",
            "pacf_plot.png", "rsi_macd.png"
        ]:
            img_path = os.path.join(img_dir, img_name)
            if os.path.exists(img_path):
                meta[f"image_{img_name.split('.')[0]}"] = img_path

        docs.append(Document(page_content=text, metadata=meta))

    print(f"✅ Created {len(docs)} LangChain summaries (with forecasts & visuals).")
    return docs




In [6]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# ==========================================================
# 1️⃣ ARIMA (with in-sample predictions)
# ==========================================================
def forecast_arima(df: pd.DataFrame, price_col: str, steps: int = 7):
    """
    ARIMA(5,1,0) model producing both in-sample predictions and out-of-sample forecasts.
    """
    df = df.dropna(subset=[price_col]).copy()

    model = ARIMA(df[price_col], order=(5, 1, 0))
    model_fit = model.fit()

    # In-sample predictions
    pred_in_sample = model_fit.predict(start=1, end=len(df))

    # Out-of-sample forecast
    forecast_out = model_fit.forecast(steps=steps)

    # Combine into one DataFrame
    df_pred = pd.DataFrame({
        "forecast_date": df.index,
        "forecast_model": "ARIMA",
        "forecast_type": "in-sample",
        "forecast_value": pred_in_sample.values
    })

    df_forecast = pd.DataFrame({
        "forecast_date": pd.date_range(df.index[-1] + pd.Timedelta(days=1), periods=steps),
        "forecast_model": "ARIMA",
        "forecast_type": "out-of-sample",
        "forecast_value": forecast_out.values
    })
    print("[INFO] ARIMA forecast generated.")

    return pd.concat([df_pred, df_forecast], ignore_index=True)


# ==========================================================
# 2️⃣ SARIMA (with in-sample predictions)
# ==========================================================
def forecast_sarima(
    df: pd.DataFrame,
    price_col: str,
    steps: int = 7,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12)
):
    """
    SARIMA model producing both in-sample predictions and out-of-sample forecasts.
    """
    df = df.dropna(subset=[price_col]).copy()

    model = SARIMAX(df[price_col], order=order, seasonal_order=seasonal_order)
    model_fit = model.fit(disp=False)

    # In-sample predictions
    pred_in_sample = model_fit.predict(start=1, end=len(df))

    # Out-of-sample forecast
    forecast_out = model_fit.forecast(steps=steps)

    df_pred = pd.DataFrame({
        "forecast_date": df.index,
        "forecast_model": "SARIMA",
        "forecast_type": "in-sample",
        "forecast_value": pred_in_sample.values
    })

    df_forecast = pd.DataFrame({
        "forecast_date": pd.date_range(df.index[-1] + pd.Timedelta(days=1), periods=steps),
        "forecast_model": "SARIMA",
        "forecast_type": "out-of-sample",
        "forecast_value": forecast_out.values
    })
    print("[INFO] SARIMA forecast generated.")

    return pd.concat([df_pred, df_forecast], ignore_index=True)


# ==========================================================
# 3️⃣ LSTM (with in-sample predictions)
# ==========================================================
def forecast_lstm(df: pd.DataFrame, price_col: str, steps: int = 7, epochs: int = 5):
    """
    LSTM model producing both in-sample predictions (reconstructed) and out-of-sample forecasts.
    """
    from numpy import array

    df = df.dropna(subset=[price_col]).copy()

    prices = df[price_col].values.reshape(-1, 1)
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(prices)

    # Prepare supervised data
    X, y = [], []
    for i in range(len(scaled) - 1):
        X.append(scaled[i])
        y.append(scaled[i + 1])
    X, y = np.array(X), np.array(y)
    X = X.reshape((X.shape[0], 1, X.shape[1]))

    # Model
    model = Sequential([
        LSTM(50, activation="relu", input_shape=(1, 1)),
        Dense(1)
    ])
    model.compile(optimizer="adam", loss="mse")
    model.fit(X, y, epochs=epochs, verbose=0)

    # In-sample predictions
    preds_in = model.predict(X, verbose=0)
    preds_in = scaler.inverse_transform(preds_in).flatten()

    df_pred = pd.DataFrame({
        "forecast_date": df.index[1:],  # skip first (no prediction)
        "forecast_model": "LSTM",
        "forecast_type": "in-sample",
        "forecast_value": preds_in
    })

    # Out-of-sample forecasts
    last_val = scaled[-1].reshape((1, 1, 1))
    preds_out = []
    for _ in range(steps):
        next_pred = model.predict(last_val, verbose=0)
        preds_out.append(next_pred[0, 0])
        last_val = next_pred.reshape((1, 1, 1))

    preds_out = scaler.inverse_transform(np.array(preds_out).reshape(-1, 1)).flatten()

    df_forecast = pd.DataFrame({
        "forecast_date": pd.date_range(df.index[-1] + pd.Timedelta(days=1), periods=steps),
        "forecast_model": "LSTM",
        "forecast_type": "out-of-sample",
        "forecast_value": preds_out
    })
    print("[INFO] LSTM forecast generated.")

    return pd.concat([df_pred, df_forecast], ignore_index=True)


In [7]:
# -----------------------------------------------------
# 2️⃣ Enhanced Visualization Generator (Metadata-Linked)
# -----------------------------------------------------
import matplotlib.pyplot as plt
import os
from datetime import datetime
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

def generate_timeseries_visuals(
    df,
    price_col="Close",
    arima_forecast=None,
    sarima_forecast=None,
    lstm_forecast=None,
    output_dir="images",
    run_id=None
):
    """
    Generate and save time-series visualizations with filenames synchronized
    to metadata keys used in LangChain summaries (e.g., image_forecast_arima).
    
    Each file is versioned with run_id for uniqueness.
    Returns:
        dict: { "image_forecast_arima": "images/forecast_arima_20250101_123000.png", ... }
    """

    os.makedirs(output_dir, exist_ok=True)

    # -----------------------
    # Unique run suffix
    # -----------------------
    if run_id is None:
        run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    suffix = f"_{run_id}"

    # Helper to save and return full metadata key mapping
    def save_fig(fig, base_name, label=None):
        """
        Saves figure and returns (metadata_key, file_path)
        """
        file_name = f"{base_name}{suffix}.png"
        file_path = os.path.join(output_dir, file_name)
        plt.tight_layout()
        fig.savefig(file_path, dpi=200)
        plt.close(fig)

        metadata_key = f"image_{base_name}" if label is None else f"image_{label}"
        return metadata_key, file_path

    saved_files = {}

    # -----------------------
    # 1️⃣ Trend & Seasonality
    # -----------------------
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(df.index, df[price_col], label="Price", color="black", linewidth=1)
    if "trend" in df.columns:
        ax.plot(df.index, df["trend"], label="Trend", color="blue", linewidth=2)
    if "seasonal" in df.columns:
        ax.plot(df.index, df["seasonal"], label="Seasonality", color="forestgreen", alpha=0.6)
    if "residual" in df.columns:
        ax.plot(df.index, df["residual"], label="Residual", color="orange", linestyle="--", alpha=0.7)
    ax.set_title("Trend, Seasonality and Residual Decomposition")
    ax.legend()
    k, v = save_fig(fig, "trend_seasonality")
    saved_files[k] = v

    # -----------------------
    # 2️⃣ Volatility over Time
    # -----------------------
    vol_cols = [c for c in df.columns if c.startswith("volatility_")]
    if vol_cols:
        fig, ax = plt.subplots(figsize=(10, 4))
        for col in vol_cols:
            ax.plot(df.index, df[col], label=col)
        ax.set_title("Volatility (Rolling Std of Returns)")
        ax.legend()
        k, v = save_fig(fig, "volatility")
        saved_files[k] = v

    # -----------------------
    # 3️⃣ Forecast Plots
    # -----------------------
    if arima_forecast is not None and not arima_forecast.empty:
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.plot(df.index, df[price_col], label="Historical", color="red")
        ax.plot(
            arima_forecast["forecast_date"],
            arima_forecast["forecast_value"],
            label="ARIMA Forecast",
            color="darkgreen"
        )
        ax.set_title("ARIMA Forecast")
        ax.legend()
        k, v = save_fig(fig, "forecast_arima")
        saved_files[k] = v

    if sarima_forecast is not None and not sarima_forecast.empty:
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.plot(df.index, df[price_col], label="Historical", color="navy")
        ax.plot(
            sarima_forecast["forecast_date"],
            sarima_forecast["forecast_value"],
            label="SARIMA Forecast",
            color="crimson"
        )
        ax.set_title("SARIMA Forecast (Seasonal Model)")
        ax.legend()
        k, v = save_fig(fig, "forecast_sarima")
        saved_files[k] = v

    if lstm_forecast is not None and not lstm_forecast.empty:
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.plot(df.index, df[price_col], label="Historical", color="black")
        ax.plot(
            lstm_forecast["forecast_date"],
            lstm_forecast["forecast_value"],
            label="LSTM Forecast",
            color="purple"
        )
        ax.set_title("LSTM Forecast")
        ax.legend()
        k, v = save_fig(fig, "forecast_lstm")
        saved_files[k] = v

    # -----------------------
    # 4️⃣ ACF & PACF
    # -----------------------
    if len(df) > 20:
        fig, ax = plt.subplots(figsize=(8, 4))
        plot_acf(df[price_col].dropna(), ax=ax, lags=30)
        ax.set_title("Autocorrelation Function (ACF)")
        k, v = save_fig(fig, "acf_plot")
        saved_files[k] = v

        fig, ax = plt.subplots(figsize=(8, 4))
        plot_pacf(df[price_col].dropna(), ax=ax, lags=30)
        ax.set_title("Partial Autocorrelation Function (PACF)")
        k, v = save_fig(fig, "pacf_plot")
        saved_files[k] = v

    # -----------------------
    # 5️⃣ RSI & MACD
    # -----------------------
    if "rsi" in df.columns or "macd" in df.columns:
        fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
        if "rsi" in df.columns:
            axes[0].plot(df.index, df["rsi"], color="teal")
            axes[0].set_title("Relative Strength Index (RSI)")
            axes[0].axhline(70, color="red", linestyle="--", alpha=0.7)
            axes[0].axhline(30, color="green", linestyle="--", alpha=0.7)
        if "macd" in df.columns and "macd_signal" in df.columns:
            axes[1].plot(df.index, df["macd"], label="MACD", color="purple")
            axes[1].plot(df.index, df["macd_signal"], label="Signal", color="orange")
            axes[1].set_title("MACD Indicator")
            axes[1].legend()
        k, v = save_fig(fig, "rsi_macd")
        saved_files[k] = v

    print(f"✅ Saved all analysis visualizations (run_id={run_id}) to: {output_dir}")
    return saved_files



In [8]:
def slice_dataframe(df, slice_range=None):
    """
    Dynamically slice the dataframe if a range is provided.
    Example: slice_range=(750, None) or slice_range=(None, 500)
    """
    if slice_range is not None:
        start, end = slice_range
        return df[start:end]
    return df

In [9]:
# =====================================================
# MAIN ANALYSIS PIPELINE (ENHANCED)
# =====================================================

# Imports
import pandas as pd
from pathlib import Path

# Assuming all previous helper functions are imported:
# load_data, precompute_basic_features, compute_trend_seasonality, detect_anomalies
# forecast_arima, forecast_sarima, forecast_lstm, create_langchain_summaries, generate_timeseries_visuals

# -----------------------------------------------------
# 1️⃣ Load & Prepare Data
# -----------------------------------------------------
col = "Close"
data_path = "../data/goog_stock_data.csv"
df = load_data(data_path, col=col)

print("✅ Data loaded. Index preview:", df.index[:3])

df = precompute_basic_features(df, col)
df = compute_trend_seasonality(df, col)
df = detect_anomalies(df, col)

# -----------------------------------------------------
# 2️⃣ Forecasts (ARIMA, SARIMA, LSTM)
# -----------------------------------------------------
sliced_df = slice_dataframe(df, slice_range=(750, None))
print("⏳ Running forecasts...")
arima_forecast = forecast_arima(sliced_df, col, steps=500)
sarima_forecast = forecast_sarima(sliced_df, col, steps=500, order=(2,1,10), seasonal_order=(1,1,1,12))
lstm_forecast = forecast_lstm(sliced_df, col, epochs=15, steps=500)

# Merge forecasts (optional consolidated view)
df_forecasts = (
    arima_forecast[["forecast_date", "forecast_value"]]
    .rename(columns={"forecast_value": "forecast_arima"})
    .merge(
        sarima_forecast[["forecast_date", "forecast_value"]].rename(columns={"forecast_value": "forecast_sarima"}),
        on="forecast_date",
        how="outer"
    )
    .merge(
        lstm_forecast[["forecast_date", "forecast_value"]].rename(columns={"forecast_value": "forecast_lstm"}),
        on="forecast_date",
        how="outer"
    )
)
print("✅ Forecasts merged. Preview:")
print(df_forecasts.head())

✅ Data loaded. Index preview: DatetimeIndex(['2020-10-26 04:00:00+00:00', '2020-10-27 04:00:00+00:00',
               '2020-10-28 04:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='Date', freq=None)
⏳ Running forecasts...
[INFO] ARIMA forecast generated.
[INFO] SARIMA forecast generated.
[INFO] LSTM forecast generated.
✅ Forecasts merged. Preview:
              forecast_date  forecast_arima  forecast_sarima  forecast_lstm
0 2023-10-19 04:00:00+00:00      138.035369       138.035330            NaN
1 2023-10-20 04:00:00+00:00      135.834679       135.810699     161.454224
2 2023-10-23 04:00:00+00:00      136.888480       136.962572     160.607162
3 2023-10-24 04:00:00+00:00      139.340873       139.167528     161.045227
4 2023-10-25 04:00:00+00:00      125.990361       125.808958     161.887131


In [10]:
# -----------------------------------------------------
# 3️⃣ Generate and Save Visuals
# -----------------------------------------------------
run_id = "002"
output_dir = Path(f"visualizations/{run_id}")
output_dir.mkdir(parents=True, exist_ok=True)

visuals = generate_timeseries_visuals(
    df,
    price_col=col,
    arima_forecast=arima_forecast,
    sarima_forecast=sarima_forecast,
    lstm_forecast=lstm_forecast,
    output_dir=str(output_dir),
    run_id=run_id
)

# -----------------------------------------------------
# 4️⃣ Create LangChain Summaries (auto-links visuals)
# -----------------------------------------------------
docs = create_langchain_summaries(df, price_col=col, freq="M")
for d in docs:
    d.metadata.update(visuals)

print("✅ Example Document Summary:")
print("─────────────────────────────")
print(docs[-1].page_content)
print("─────────────────────────────")
print("Metadata:", docs[-1].metadata)

# -----------------------------------------------------
# 5️⃣ Optional: Save intermediate data
# -----------------------------------------------------
merged_output = Path("../data/processed_forecasts.csv")
df_forecasts.to_csv(merged_output, index=False)
print(f"📁 Saved forecast results to: {merged_output.resolve()}")

print("🎯 Pipeline complete. Ready for embedding or query stage.")


✅ Saved all analysis visualizations (run_id=002) to: visualizations\002
✅ Created 61 LangChain summaries (with forecasts & visuals).
✅ Example Document Summary:
─────────────────────────────
Period Summary: October 2025
Date Range: 2025-10-01 to 2025-10-24
Year: 2025, Month: 10
Trend: The Close trend during this period was increasing.
Average Close: 249.20
Volatility: 0.0154 (moderate)
Detected anomalies: 10


Summary: In October 2025, the Close showed a increasing trend with moderate volatility (std 0.0154). The mean price was 249.20, and 10 anomalies were detected between 2025-10-01 and 2025-10-24.
─────────────────────────────
Metadata: {'year': 2025, 'month': 10, 'month_name': 'October', 'period_start': '2025-10-01', 'period_end': '2025-10-24', 'mean_price': 249.1978, 'volatility': 0.0154, 'volatility_level': 'moderate', 'num_anomalies': 10, 'trend': 'increasing', 'forecast_arima': 0.0, 'forecast_sarima': 0.0, 'forecast_lstm': 0.0, 'image_trend_seasonality': 'visualizations\\002\\t

In [11]:
# ==============================================
# vector_pipeline.py
# ==============================================

from typing import List, Union
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions

# -------------------------------------------------
# 1️⃣ Embedding Engine
# -------------------------------------------------
class EmbeddingEngine:
    """
    A wrapper around SentenceTransformer for encoding LangChain Documents.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", device: str = "cpu"):
        self.model_name = model_name
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)
        print(f"✅ Loaded embedding model: {model_name} on {device}")

    def embed_documents(self, docs: List[Document]) -> np.ndarray:
        """
        Encodes a list of LangChain Document objects into dense vectors.
        """
        texts = [doc.page_content for doc in docs]
        embeddings = self.model.encode(texts, batch_size=32, show_progress_bar=True, convert_to_numpy=True)
        return embeddings

    def embed_query(self, query: str) -> np.ndarray:
        """
        Encodes a single query string into a dense vector.
        """
        return self.model.encode([query], convert_to_numpy=True)[0]


# -------------------------------------------------
# 2️⃣ Vector Store (ChromaDB)
# -------------------------------------------------
class VectorStore:
    """
    A persistent vector store using ChromaDB.
    """

    def __init__(self, persist_dir: str = "./chroma_store", collection_name: str = "timeseries_docs", model_name: str = "all-MiniLM-L6-v2"):
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.model_name = model_name
        self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_fn
        )
        print(f"✅ Connected to ChromaDB collection: {collection_name}")

    def add_documents(self, docs: List[Document], embeddings: np.ndarray):
        """
        Adds documents + embeddings to Chroma collection.
        """
        ids = [f"doc_{i}" for i in range(len(docs))]
        metadatas = [doc.metadata for doc in docs]
        texts = [doc.page_content for doc in docs]
        self.collection.add(documents=texts, embeddings=embeddings.tolist(), metadatas=metadatas, ids=ids)
        print(f"📚 Added {len(docs)} documents to ChromaDB")

    def query(self, query_embedding: np.ndarray, top_k: int = 3):
        """
        Retrieves top-k similar documents given a query embedding.
        """
        results = self.collection.query(query_embeddings=[query_embedding.tolist()], n_results=top_k)
        hits = [
            {"text": results["documents"][0][i], "score": results["distances"][0][i], "metadata": results["metadatas"][0][i]}
            for i in range(len(results["documents"][0]))
        ]
        return hits

    def clear(self):
        """Clears all documents in the ChromaDB collection safely."""
        all_ids = self.collection.get()["ids"]
        if all_ids:
            self.collection.delete(ids=all_ids)
            print(f"🧹 Cleared {len(all_ids)} documents from ChromaDB collection.")
        else:
            print("🧹 Collection already empty.")


# -------------------------------------------------
# 3️⃣ Example Pipeline Usage
# -------------------------------------------------
# if __name__ == "__main__":
#     # Assuming 'docs' is your list of LangChain Documents
#     from my_pipeline import docs   # Replace this with your own import

# Step 1: Embed
embedder = EmbeddingEngine(model_name="all-MiniLM-L6-v2")
doc_embeddings = embedder.embed_documents(docs)

# Step 2: Store in Chroma
store = VectorStore(persist_dir="./chroma_store", collection_name="goog_timeseries")
store.clear()
store.add_documents(docs, doc_embeddings)

    # Step 3: Query
user_query = "Explain the volatility trend for March 2024"
query_vec = embedder.embed_query(user_query)
results = store.query(query_vec, top_k=3)

for r in results:
    print("\n🧩 Match:")
    print("Text:", r["text"])
    print("Score:", round(r["score"], 4))
    print("Metadata:", r["metadata"])



✅ Loaded embedding model: all-MiniLM-L6-v2 on cpu


Batches: 100%|██████████| 2/2 [00:01<00:00,  1.84it/s]


✅ Connected to ChromaDB collection: goog_timeseries
🧹 Cleared 61 documents from ChromaDB collection.
📚 Added 61 documents to ChromaDB

🧩 Match:
Text: Period Summary: March 2025
Date Range: 2025-03-03 to 2025-03-31
Year: 2025, Month: 03
Trend: The Close trend during this period was decreasing.
Average Close: 166.96
Volatility: 0.0223 (moderate)
Detected anomalies: 0


Summary: In March 2025, the Close showed a decreasing trend with moderate volatility (std 0.0223). The mean price was 166.96, and 0 anomalies were detected between 2025-03-03 and 2025-03-31.
Score: 0.2475
Metadata: {'month_name': 'March', 'image_forecast_sarima': 'visualizations\\002\\forecast_sarima_002.png', 'period_start': '2025-03-03', 'period_end': '2025-03-31', 'forecast_arima': 0.0, 'forecast_lstm': 0.0, 'image_trend_seasonality': 'visualizations\\002\\trend_seasonality_002.png', 'month': 3, 'image_pacf_plot': 'visualizations\\002\\pacf_plot_002.png', 'trend': 'decreasing', 'forecast_sarima': 0.0, 'year': 2025, 'vol

In [12]:
"""
chat_groq_client.py

A lightweight, task-specific class to interact with ChatGroq Llama-3.3-70B-Versatile.
It takes a context (retrieved from Chroma) and a user query, builds a clear prompt,
and returns the model's answer.

Usage:
    from chat_groq_client import ChatGroqClient

    llm = ChatGroqClient(api_key="YOUR_GROQ_API_KEY")
    answer = llm.ask(context_str, user_query)
    print(answer)
"""

import os
from typing import Optional
from langchain_groq.chat_models import ChatGroq


class ChatGroqClient:
    """
    Minimal wrapper for Groq's Llama-3.3-70B-Versatile model via LangChain.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "llama-3.3-70b-versatile",
        temperature: float = 0.2,
        max_tokens: int = 512,
    ):
        self.api_key = api_key or os.getenv("GROQ_API_KEY")
        if not self.api_key:
            raise ValueError("❌ GROQ_API_KEY is not set. Provide it via constructor or environment variable.")
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        # Initialize Groq model through LangChain wrapper
        self.llm = ChatGroq(
            api_key=self.api_key,
            model=self.model,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
        )

    def build_prompt(self, context: str, query: str) -> str:
        """
        Builds a focused prompt using retrieved context and user query.
        """
        instruction = (
            "You are a financial analysis assistant. "
            "Use the provided context (stock trends, volatility, and anomalies) "
            "to answer the user query accurately and concisely."
        )
        return f"{instruction}\n\nCONTEXT:\n{context}\n\nQUESTION:\n{query}\n\nAnswer clearly."

    def ask(self, context: str, query: str) -> str:
        """
        Sends the prompt to Groq and returns the model's reply.
        """
        prompt = self.build_prompt(context, query)
        response = self.llm.invoke(prompt)
        # Depending on LangChain version, ChatGroq may return a dict or AIMessage
        if hasattr(response, "content"):
            return response.content
        if isinstance(response, dict):
            return response.get("content") or str(response)
        return str(response)


In [13]:
queries = [
    "Explain the volatility trend for March 2024",
    "What was the average closing price in 2021?",
    "Did the SARIMA model detect any seasonal shifts in 2024?",
    "Compare LSTM and ARIMA forecasts for early 2025"
]
groq_llm = ChatGroqClient()

for query in queries:
    query_vec = embedder.embed_query(query)
    results = store.query(query_vec, top_k=3)
    # Combine retrieved texts into a single context string
    context = "\n---\n".join([r["text"] for r in results])
    answer = groq_llm.ask(context, query)
    print(f"\n💬 User Query: {query}")
    print("💬 LLM Answer:")
    print(answer)


💬 User Query: Explain the volatility trend for March 2024
💬 LLM Answer:
The volatility trend for March 2024 was moderate, with a standard deviation of 0.0163. This indicates a relatively stable period with moderate price fluctuations.

💬 User Query: What was the average closing price in 2021?
💬 LLM Answer:
To calculate the average closing price for the entire year of 2021 based on the provided monthly averages, we first need to calculate the total sum of the monthly averages and then divide by the number of months provided.

Given averages:
- October 2021: 139.49
- November 2021: 146.43
- December 2021: 144.66

Total sum = 139.49 + 146.43 + 144.66 = 430.58

Number of months provided = 3

Average closing price for the given months = Total sum / Number of months = 430.58 / 3 = 143.53

However, since the question asks for the average closing price in 2021 and we only have data for October, November, and December, we cannot accurately calculate the average for the entire year without data

In [14]:
# from chat_groq_client import ChatGroqClient

# # --- Step 1: Retrieve relevant context from Chroma ---
# user_query = "Explain the volatility trend for March 2024"
# query_vec = embedder.embed_query(user_query)
# results = store.query(query_vec, top_k=3)

# # Combine retrieved texts into a single context string
# context = "\n---\n".join([r["text"] for r in results])

# # --- Step 2: Use ChatGroq to generate an answer ---
# groq_llm = ChatGroqClient()  # or set GROQ_API_KEY in env
# answer = groq_llm.ask(context, user_query)

# print("\n💬 LLM Answer:")
# print(answer)


In [18]:

def chat_query(query: str, groq_llm: ChatGroqClient):
    # groq_llm = ChatGroqClient()
    # for query in queries:
        embedder = EmbeddingEngine(model_name="all-MiniLM-L6-v2")
        query_vec = embedder.embed_query(query)
        store = VectorStore(persist_dir="./chroma_store", collection_name="goog_timeseries")
        results = store.query(query_vec, top_k=50)
        # Combine retrieved texts into a single context string
        context = "\n---\n".join([r["text"] for r in results])
        answer = groq_llm.ask(context, query)
        print(f"\n💬 User Query: {query}")
        print(f"\n💬 Context: {results}")
        print("💬 LLM Answer:")
        print(answer)
        return answer
    
answer = chat_query(query="Compare LSTM and ARIMA forecasts for early 2025", groq_llm = ChatGroqClient())
answer

✅ Loaded embedding model: all-MiniLM-L6-v2 on cpu
✅ Connected to ChromaDB collection: goog_timeseries

💬 User Query: Compare LSTM and ARIMA forecasts for early 2025

💬 Context: [{'text': 'Period Summary: February 2025\nDate Range: 2025-02-03 to 2025-02-28\nYear: 2025, Month: 02\nTrend: The Close trend during this period was decreasing.\nAverage Close: 185.52\nVolatility: 0.0208 (moderate)\nDetected anomalies: 0\n\n\nSummary: In February 2025, the Close showed a decreasing trend with moderate volatility (std 0.0208). The mean price was 185.52, and 0 anomalies were detected between 2025-02-03 and 2025-02-28.', 'score': 0.6455801725387573, 'metadata': {'year': 2025, 'month': 2, 'image_rsi_macd': 'visualizations\\002\\rsi_macd_002.png', 'image_pacf_plot': 'visualizations\\002\\pacf_plot_002.png', 'image_acf_plot': 'visualizations\\002\\acf_plot_002.png', 'num_anomalies': 0, 'image_forecast_arima': 'visualizations\\002\\forecast_arima_002.png', 'forecast_arima': 0.0, 'mean_price': 185.525, 

"To compare LSTM and ARIMA forecasts for early 2025, we need to analyze the provided context. \n\nThe context provides period summaries for various months from 2020 to 2025, including trends, average closes, volatility, and detected anomalies. \n\nFor early 2025, we have the following period summaries:\n- January 2025: decreasing trend, average close of 196.20, moderate volatility (0.0167), and 0 anomalies.\n- February 2025: decreasing trend, average close of 185.52, moderate volatility (0.0208), and 0 anomalies.\n- March 2025: decreasing trend, average close of 166.96, moderate volatility (0.0223), and 0 anomalies.\n- April 2025: increasing trend, average close of 156.38, high volatility (0.0306), and 0 anomalies.\n- May 2025: increasing trend, average close of 165.41, moderate volatility (0.0236), and 0 anomalies.\n\nLSTM (Long Short-Term Memory) and ARIMA (AutoRegressive Integrated Moving Average) are both forecasting models. \n\nLSTM is a type of Recurrent Neural Network (RNN) that

In [16]:
import pandas as pd

df = pd.read_csv("../data/goog_stock_data.csv")
df["Date"] = pd.to_datetime(df["Date"], utc=True)
df = df.set_index("Date").sort_index()
print(df.head())
print(df.columns)
print(df.dtypes)

                                Open       High        Low      Close  \
Date                                                                    
2020-10-26 04:00:00+00:00  80.698262  81.355265  78.289241  78.982002   
2020-10-27 04:00:00+00:00  79.241239  79.796187  78.601118  79.667816   
2020-10-28 04:00:00+00:00  77.456942  77.536893  75.216279  75.315598   
2020-10-29 04:00:00+00:00  75.600633  79.143890  75.594677  77.829384   
2020-10-30 04:00:00+00:00  83.037261  83.776700  79.677751  80.499626   

                             Volume  Dividends  Stock Splits  
Date                                                          
2020-10-26 04:00:00+00:00  37066000        0.0           0.0  
2020-10-27 04:00:00+00:00  24580000        0.0           0.0  
2020-10-28 04:00:00+00:00  36680000        0.0           0.0  
2020-10-29 04:00:00+00:00  40062000        0.0           0.0  
2020-10-30 04:00:00+00:00  86582000        0.0           0.0  
Index(['Open', 'High', 'Low', 'Close', 'Volume'

In [17]:
print(df.index)

DatetimeIndex(['2020-10-26 04:00:00+00:00', '2020-10-27 04:00:00+00:00',
               '2020-10-28 04:00:00+00:00', '2020-10-29 04:00:00+00:00',
               '2020-10-30 04:00:00+00:00', '2020-11-02 05:00:00+00:00',
               '2020-11-03 05:00:00+00:00', '2020-11-04 05:00:00+00:00',
               '2020-11-05 05:00:00+00:00', '2020-11-06 05:00:00+00:00',
               ...
               '2025-10-13 04:00:00+00:00', '2025-10-14 04:00:00+00:00',
               '2025-10-15 04:00:00+00:00', '2025-10-16 04:00:00+00:00',
               '2025-10-17 04:00:00+00:00', '2025-10-20 04:00:00+00:00',
               '2025-10-21 04:00:00+00:00', '2025-10-22 04:00:00+00:00',
               '2025-10-23 04:00:00+00:00', '2025-10-24 04:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='Date', length=1256, freq=None)
