# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt

# Read the preprocessed Task 1 outputs: the returns table and the TSLA price column.
returns = pd.read_csv('../data/processed/returns.csv', index_col=0, parse_dates=True)
tsla_data = pd.read_csv(
    '../data/processed/tsla_prices.csv',
    index_col=0,
    parse_dates=True,
)['TSLA']

# Chronological split: the first 80% of rows train the models, the rest is held out.
train_size = int(0.8 * len(tsla_data))
train_data = tsla_data.iloc[:train_size]
test_data = tsla_data.iloc[train_size:]
print(f"Training data length: {len(train_data)}")
print(f"Testing data length: {len(test_data)}")

# --- ARIMA Model Implementation ---
print("\n--- Training ARIMA Model ---")
# Let auto_arima pick the (p, d, q) order using its stepwise search on the
# training data only. Seasonal terms are disabled; the search can take a while.
model_arima_auto = pm.auto_arima(train_data, seasonal=False, suppress_warnings=True, stepwise=True, trace=True)
print(f"Optimal ARIMA parameters: {model_arima_auto.order}")

# Refit a statsmodels ARIMA with the selected order so the statsmodels summary
# and forecasting API can be used.
p, d, q = model_arima_auto.order
model_arima = ARIMA(train_data, order=(p, d, q))
model_arima_fit = model_arima.fit()
print(model_arima_fit.summary())

# Forecast one value per held-out observation.
arima_forecast = model_arima_fit.forecast(steps=len(test_data))
# Relabel the forecast with the test dates. statsmodels ignores a date index
# that has no frequency information (likely here if the CSV holds trading
# days only) and then returns the forecast under a non-date index, which
# would not line up with test_data in pandas arithmetic or on the plot.
arima_forecast = pd.Series(np.asarray(arima_forecast), index=test_data.index)

# --- LSTM Model Implementation ---
print("\n--- Training LSTM Model ---")
# Scale the data. The scaler is fitted on the training portion only so that
# the test period's minimum/maximum cannot leak into training; the full
# series is then transformed with those training statistics. Test values may
# therefore fall outside the (0, 1) range.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_data.values.reshape(-1, 1))
scaled_data = scaler.transform(tsla_data.values.reshape(-1, 1))

# Create sequences for LSTM
def create_sequences(data, seq_length):
    """Build sliding windows and their next-step targets.

    Window ``k`` is ``data[k:k + seq_length]`` and its target is
    ``data[k + seq_length]``. One window is produced for every start
    position that still has a following element.

    Args:
        data: Indexable, sliceable sequence (list or array) of observations.
        seq_length: Number of consecutive observations in each window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding the windows and the targets.
    """
    window_count = len(data) - seq_length
    starts = range(window_count)
    windows = [data[start:start + seq_length] for start in starts]
    targets = [data[start + seq_length] for start in starts]
    return np.array(windows), np.array(targets)

seq_length = 60  # Using 60 days as sequence length
X, y = create_sequences(scaled_data, seq_length)

# Split at the same chronological boundary as train_data / test_data.
# Window i targets observation i + seq_length, so only windows with
# i < train_size - seq_length have a target inside the training period.
# Splitting at 80% of the number of windows instead would put the first test
# observations into y_train and misalign X_test with test_data.
split_idx = train_size - seq_length
if split_idx <= 0:
    raise ValueError(
        f"Need more than {seq_length} training observations to build LSTM "
        f"windows, got {train_size}"
    )
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# Reshape for LSTM [samples, time steps, features]. scaled_data is a single
# column, so this only makes the expected 3-D layout explicit.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Build LSTM model: two stacked LSTM layers followed by two dense layers that
# reduce to a single next-step value.
model_lstm = Sequential([
    LSTM(50, return_sequences=True, input_shape=(seq_length, 1)),
    LSTM(50, return_sequences=False),
    Dense(25),
    Dense(1)
])
model_lstm.compile(optimizer='adam', loss='mean_squared_error')
model_lstm.fit(X_train, y_train, batch_size=32, epochs=50)

# Forecast with LSTM (recursive multi-step: each prediction is fed back in as
# the newest input for the next step).
if train_size < seq_length:
    raise ValueError(
        f"Need at least {seq_length} training observations to seed the LSTM "
        f"forecast, got {train_size}"
    )
# Seed with the last seq_length training observations so the first prediction
# corresponds to test_data.index[0] and no actual test value is used as input.
current_batch = scaled_data[train_size - seq_length:train_size].reshape(1, seq_length, 1)

lstm_predictions = []
for _ in range(len(test_data)):
    # verbose=0 avoids printing a progress bar for every single-step predict.
    next_step = model_lstm.predict(current_batch, verbose=0).reshape(1, 1, 1)
    lstm_predictions.append(next_step[0, 0, 0])
    # Slide the window: drop the oldest step, append the new prediction.
    current_batch = np.concatenate([current_batch[:, 1:, :], next_step], axis=1)

# Inverse scale the predictions and label them with the test dates
lstm_predictions = scaler.inverse_transform(np.array(lstm_predictions).reshape(-1, 1))
lstm_predictions = pd.Series(lstm_predictions.flatten(), index=test_data.index)


# --- Model Evaluation ---
def evaluate_model(y_true, y_pred, model_name):
    """Print and return MAE, RMSE and MAPE for a forecast.

    The two inputs are compared element by element in their given order.
    pandas objects are converted to plain arrays first, so differing index
    labels cannot turn the errors into NaN through index alignment.

    Args:
        y_true: Actual values (list, NumPy array or pandas Series).
        y_pred: Predicted values, same number of elements as ``y_true``.
        model_name: Label used in the printed report header.

    Returns:
        Dict with keys ``'MAE'``, ``'RMSE'`` and ``'MAPE'`` (MAPE in percent).
        MAPE is averaged over observations whose actual value is non-zero and
        is NaN when every actual value is zero.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {actual.size} and {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    errors = actual - predicted
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors ** 2))

    # Percentage error is undefined where the actual value is zero; skip those
    # points instead of producing inf/NaN for the whole metric.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs(errors[nonzero] / actual[nonzero])) * 100
    else:
        mape = float('nan')

    print(f"\n--- {model_name} Evaluation ---")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAPE: {mape:.2f}%")
    return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape}

# Both forecasts are compared with the test prices by position and drawn
# against the test dates. The ARIMA forecast is converted to a plain array
# because its index may not be the test dates (statsmodels ignores a date
# index without frequency information), which would break label-aligned
# arithmetic and put the curve on a different x-axis.
arima_values = np.asarray(arima_forecast)

# Evaluate ARIMA
arima_metrics = evaluate_model(test_data.values, arima_values, "ARIMA")

# Evaluate LSTM
lstm_metrics = evaluate_model(test_data.values, lstm_predictions.values, "LSTM")

# Compare and visualize
plt.figure(figsize=(15, 7))
plt.plot(test_data.index, test_data.values, label='Actual Prices')
plt.plot(test_data.index, arima_values, label='ARIMA Forecast', alpha=0.7)
plt.plot(test_data.index, lstm_predictions.values, label='LSTM Forecast', alpha=0.7)
plt.title('ARIMA vs LSTM Forecasts on Test Data')
plt.xlabel('Date')
plt.ylabel('TSLA Price')
plt.legend()
plt.show()

# Discussion:
# ARIMA captures linear dependencies and handles trends through differencing (the "d" term), so it
# works well when the series is stationary after differencing, but it might struggle with
# non-linear trends and complex patterns. LSTM, a deep learning model, is capable of learning
# long-term dependencies and non-linear patterns in data, often leading to better performance
# on complex financial time series, although it requires more data and computational resources.
# The choice depends on the specific dataset and the balance between accuracy and interpretability.