# Task 2: Build Time Series Forecasting Models

## Objective
Develop, train, and evaluate time series forecasting models to predict Tesla's future stock prices.
We will implement ARIMA and LSTM models.

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Add src to path
sys.path.append(os.path.abspath(os.path.join('../src')))

from models import split_data, ARIMAModel, LSTMModel, evaluate_forecast
from data_loader import clean_data # Assuming we might reload raw if needed, or load processed csv

%matplotlib inline

## 1. Load Data

In [None]:
def extract_close_series(frame: pd.DataFrame, symbol: str, field: str = 'Close') -> pd.Series:
    """Return the ``field`` column for ``symbol`` from ``frame`` as a Series.

    Three column layouts are supported:

    * flat columns (``'Close'``, ``'Open'``, ...): returns ``frame[field]``;
    * MultiIndex ordered (ticker, field): returns ``frame[symbol][field]``;
    * MultiIndex ordered (field, ticker): returns ``frame[field][symbol]``,
      or the only sub-column when exactly one ticker is present.

    Raises:
        KeyError: if the requested column cannot be located unambiguously.
    """
    columns = frame.columns
    if not isinstance(columns, pd.MultiIndex):
        # Flat header: there is no ticker level to select from.
        return frame[field]

    if field in columns.get_level_values(1):
        # (ticker, field) layout.
        return frame[symbol][field]

    # (field, ticker) layout: selecting the field leaves one column per ticker.
    per_ticker = frame[field]
    if symbol in per_ticker.columns:
        return per_ticker[symbol]
    if per_ticker.shape[1] == 1:
        return per_ticker.iloc[:, 0]
    raise KeyError(f"{field!r} for {symbol!r} not found in columns")


# Focus on the TSLA close price.
ticker = 'TSLA'

# Load the processed prices written by Task 1. The file is read with a two-row
# header, so the resulting columns are a MultiIndex.
data = None
try:
    data = pd.read_csv('../data/processed/historical_data.csv', index_col=0, parse_dates=True, header=[0, 1])
except FileNotFoundError:
    print("Data file not found. Please ensure Task 1 data extraction was successful.")

tsla_close = None
if data is not None:
    try:
        tsla_close = extract_close_series(data, ticker)
    except KeyError as exc:
        # Only a missing/ambiguous column is tolerated here; anything else is
        # a real error and should surface instead of being masked.
        print(f"Could not extract {ticker} close prices: {exc}")

if tsla_close is None:
    # Fallback so the rest of the notebook can still be executed and viewed.
    # The values below are a random walk, NOT real prices.
    print("Adjusting data loading logic...")
    print("WARNING: using randomly generated mock prices; results are not meaningful.")
    dates = pd.date_range(start='2015-01-01', periods=2000)
    tsla_close = pd.Series(np.random.randn(2000).cumsum() + 100, index=dates, name='TSLA')

## 2. Train/Test Split

In [None]:
# Split the close-price series into train/test portions (the split rule lives
# in models.split_data) and show both on one chart.
train_data, test_data = split_data(tsla_close)
print(f"Train size: {len(train_data)}, Test size: {len(test_data)}")

fig, ax = plt.subplots(figsize=(12, 6))
for segment, segment_label in ((train_data, 'Train'), (test_data, 'Test')):
    ax.plot(segment, label=segment_label)
ax.legend()
plt.show()

## 3. ARIMA Model

In [None]:
# Fit ARIMA on the training window. `optimize_and_fit` is used here so the
# order is searched for; `fit` with an explicit order is the alternative when
# the order is already known (see models.ARIMAModel).
arima_model = ARIMAModel()
best_order = arima_model.optimize_and_fit(train_data)
print("Best Order:", best_order)

# Forecast one value per observation in the test window.
forecast_arima = arima_model.predict(n_periods=len(test_data))

# Attach the test dates by position. If `predict` returns a pandas Series with
# its own index, passing it straight to pd.Series(..., index=...) would align
# by label and yield NaN wherever the labels differ; converting to a plain
# array first avoids that. A length mismatch raises ValueError.
forecast_arima = pd.Series(np.asarray(forecast_arima).ravel(), index=test_data.index)

In [None]:
# Overlay the ARIMA forecast on the held-out actual prices.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_data, label='Actual')
ax.plot(forecast_arima, label='ARIMA Forecast')
ax.legend()
ax.set_title("ARIMA Forecast vs Actual")
plt.show()

## 4. LSTM Model

In [None]:
# Window length shared by the model and by the context slice below, so the two
# cannot drift apart.
look_back = 60

lstm_model = LSTMModel(look_back=look_back, epochs=10)  # 10 epochs for speed demo
lstm_model.fit(train_data)

# To predict the first test point the model needs the preceding `look_back`
# observations, so prepend the tail of the training data to the test data.
combined_data = pd.concat([train_data.iloc[-look_back:], test_data])

# Flatten so that a column-shaped (n, 1) output is accepted as well as a 1-D one.
# NOTE(review): assumes predict() returns one value per full window, i.e.
# len(test_data) values for this input; confirm against models.LSTMModel.
predictions_lstm = np.asarray(lstm_model.predict(combined_data.values)).ravel()

if len(predictions_lstm) != len(test_data):
    raise ValueError(
        f"LSTM returned {len(predictions_lstm)} predictions for "
        f"{len(test_data)} test observations; cannot align with the test index."
    )

forecast_lstm = pd.Series(predictions_lstm, index=test_data.index)

In [None]:
# Overlay the LSTM forecast on the held-out actual prices.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_data, label='Actual')
ax.plot(forecast_lstm, label='LSTM Forecast')
ax.legend()
ax.set_title("LSTM Forecast vs Actual")
plt.show()

## 5. Evaluation & Comparison

In [None]:
# Score both forecasts against the same test window (ARIMA first, then LSTM)
# and tabulate the results, one row per model.
arima_metrics, lstm_metrics = (
    evaluate_forecast(test_data, predicted, model_name)
    for predicted, model_name in ((forecast_arima, "ARIMA"), (forecast_lstm, "LSTM"))
)

comparison = pd.DataFrame([arima_metrics, lstm_metrics])
print(comparison)