### Importing the required modules

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from alpha_vantage.timeseries import TimeSeries
from alpha_vantage.foreignexchange import ForeignExchange
from alpha_vantage.commodities import Commodities
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from xgboost import XGBRegressor
import os

### Alpha Vantage Setup (Trading API)

In [2]:
# api_key = 'GLVZ9GJN4IW7GRUB'
api_key = 'K757OWEW19L34ML9'
# symbols = ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'AMZN']  # Multiple stocks for analysis
ts = TimeSeries(key=api_key, output_format='pandas')
fx = ForeignExchange(key=api_key, output_format='pandas')

assets = [
    {'symbol': 'AAPL', 'type': 'stock'},
    {'symbol': 'MSFT', 'type': 'stock'},
    {'symbol': 'TSLA', 'type': 'stock'},
    {'symbol': 'AMZN', 'type': 'stock'},
    {'symbol': 'EURUSD', 'type': 'forex', 'from_symbol': 'EUR', 'to_symbol': 'USD'},
    {'symbol': 'USDJPY', 'type': 'forex', 'from_symbol': 'USD', 'to_symbol': 'JPY'},
    # {'symbol': 'XAUUSD', 'type': 'commodity'},  # Gold
    # {'symbol': 'XAGUSD', 'type': 'commodity'},  # Silver
    {'symbol': 'COPPER', 'type': 'commodity', 'api_method': 'get_copper'},
    {'symbol': 'NATURAL_GAS', 'type': 'commodity', 'api_method': 'get_natural_gas'}
]

### Global Variables

In [3]:
results = {}
stock_stats = []

### Data Preparation

In [4]:
def prepare_data(data):
    columns = data.columns

    # Case 1: Stock or forex style
    if '1. open' in columns:
        data = data.rename(columns={
            '1. open': 'open',
            '2. high': 'high',
            '3. low': 'low',
            '4. close': 'close',
            '5. volume': 'volume'
        })[['open', 'high', 'low', 'close', 'volume']]

    # Case 2: Commodity style (likely just a 'value' column)
    elif 'value' in columns:
        data = data.rename(columns={'value': 'close'})
        data['open'] = data['close']
        data['high'] = data['close']
        data['low'] = data['close']
        data['volume'] = 0
        data = data[['open', 'high', 'low', 'close', 'volume']]

    else:
        raise ValueError("Unknown data format: cannot parse columns:", list(columns))

    return data[::-1].reset_index(drop=True)  # sort oldest to newest


### Load the locally stored data or Fetch the data using the API
Better to load the data as the API has a limit on the number of requests per day. 

In [5]:
def load_or_fetch_data(asset, api_key, directory="asset_data"):
    import os
    from alpha_vantage.timeseries import TimeSeries
    from alpha_vantage.foreignexchange import ForeignExchange
    from alpha_vantage.commodities import Commodities

    os.makedirs(directory, exist_ok=True)
    symbol = asset['symbol']
    filepath = os.path.join(directory, f"{symbol}.csv")

    if os.path.exists(filepath):
        print(f"Loading cached data for {symbol}")
        df = pd.read_csv(filepath)
        df = prepare_data(df)
    else:
        print(f"Fetching data for {symbol} from Alpha Vantage...")

        if asset['type'] == 'stock':
            ts = TimeSeries(key=api_key, output_format='pandas')
            df, _ = ts.get_daily(symbol=symbol, outputsize='full')

        elif asset['type'] == 'forex':
            fx = ForeignExchange(key=api_key, output_format='pandas')
            df, _ = fx.get_currency_exchange_daily(
                from_symbol=asset['from_symbol'],
                to_symbol=asset['to_symbol'],
                outputsize='full'
            )
            df['5. volume'] = 0

        elif asset['type'] == 'commodity':
            cm = Commodities(key=api_key, output_format='pandas')
            # Dynamically call the correct commodity method
            method_name = asset['api_method']  # e.g., 'get_copper', 'get_gold'
            if hasattr(cm, method_name):
                func = getattr(cm, method_name)
                df, _ = func(interval='monthly')
                df['5. volume'] = 0
            else:
                raise ValueError(f"Commodities API has no method: {method_name}")

        df.to_csv(filepath)
        print(f"Saved {symbol} data to {filepath}")
        df = prepare_data(df)

    return df


In [20]:
# data = load_or_fetch_data(symbols[4], api_key)
data_dict = {}
for asset in assets:
    df = load_or_fetch_data(asset, api_key)
    data_dict[asset['symbol']] = df


print(data_dict[assets[6]['symbol']])
df.head()  # Display the first few rows of the data

Loading cached data for AAPL
Loading cached data for MSFT
Loading cached data for TSLA
Loading cached data for AMZN
Loading cached data for EURUSD
Loading cached data for USDJPY
Loading cached data for COPPER
Loading cached data for NATURAL_GAS
                 open              high               low             close  \
0                   .                 .                 .                 .   
1                   .                 .                 .                 .   
2                   .                 .                 .                 .   
3                   .                 .                 .                 .   
4                   .                 .                 .                 .   
..                ...               ...               ...               ...   
541          9330.975          9330.975          9330.975          9330.975   
542  9735.82333333334  9735.82333333334  9735.82333333334  9735.82333333334   
543  9172.69590909091  9172.69590909091  917

Unnamed: 0,open,high,low,close,volume
0,3.45,3.45,3.45,3.45,0
1,2.15,2.15,2.15,2.15,0
2,1.89,1.89,1.89,1.89,0
3,2.03,2.03,2.03,2.03,0
4,2.25,2.25,2.25,2.25,0


### Technical indicators for XGBoost

In [7]:
def add_technical_indicators(df):
    df['MA5'] = df['close'].rolling(window=5).mean()
    df['MA10'] = df['close'].rolling(window=10).mean()
    df['MA20'] = df['close'].rolling(window=20).mean()
    df['Return_5'] = df['close'].pct_change(periods=5)
    df['Volatility_20'] = df['close'].rolling(window=20).std()
    df['RSI'] = compute_rsi(df['close'], 14)
    df['MACD'] = compute_macd(df['close'])
    df = df.dropna()
    return df

def compute_rsi(series, period=14):
    delta = series.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def compute_macd(series, slow=26, fast=12):
    ema_fast = series.ewm(span=fast, adjust=False).mean()
    ema_slow = series.ewm(span=slow, adjust=False).mean()
    return ema_fast - ema_slow

### Visualize which features were selected and how important they are

In [8]:
def plot_xgboost_feature_importance(model, feature_names, symbol):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(8, 5))
    plt.bar(range(len(feature_names)), importances[indices], align='center')
    plt.xticks(range(len(feature_names)), [feature_names[i] for i in indices], rotation=45)
    plt.title(f"{symbol} - XGBoost Feature Importances")
    plt.xlabel("Features")
    plt.ylabel("Importance Score")
    plt.tight_layout()
    plt.show()

### XGBoost for Feature Selection

In [9]:
def select_features(df, target_col='close', symbol=""):
    df = df.select_dtypes(include=[np.number])

    # target: future return (use percent change)
    y = df[target_col].pct_change().shift(-1).dropna()
    df = df.iloc[:-1].reset_index(drop=True)  # Align features to y

    X = df.drop(columns=[target_col])
    model = XGBRegressor()
    model.fit(X, y)

    plot_xgboost_feature_importance(model, X.columns, symbol)

    importances = model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importances
    }).sort_values(by='Feature').reset_index(drop=True)

    print("\nFull Feature Importance Ranking:")
    print(feature_importance_df)

    top_features = X.columns[np.argsort(model.feature_importances_)][-5:]
    X_selected = X[top_features].copy().reset_index(drop=True)
    y_target = df[target_col].iloc[1:].reset_index(drop=True)  # align with prediction target

    y_target.name = target_col  # ✅ set the name to 'Close' (important for merging)

    return X_selected, y_target


### LSTM Forecasting (for multivariate time series)

In [11]:
def lstm_forecast_multivariate(data, target='close'):
    assert isinstance(target, str), "Target must be a string"
    assert target in data.columns, "Target column not found in input data"

    # Separate features and target
    feature_cols = [col for col in data.columns if col != target]
    full_data = data[feature_cols + [target]]  # Ensures correct column order

    # Scale data
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(full_data)
    target_index = full_data.columns.get_loc(target)

    # Create LSTM input windows
    X_all, y_all = [], []
    for i in range(60, len(scaled)):
        X_all.append(scaled[i-60:i, :-1])  # input: all features except target
        y_all.append(scaled[i, target_index])  # output: target

    X_all, y_all = np.array(X_all), np.array(y_all)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, shuffle=False)

    # LSTM model
    model = Sequential([
        LSTM(64, return_sequences=True, input_shape=(X_all.shape[1], X_all.shape[2])),
        LSTM(32),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    # Predict
    pred_scaled = model.predict(X_test)
    y_test = y_test.reshape(-1, 1)

    # Only reverse scale the target column
    pred_full = np.zeros((len(pred_scaled), full_data.shape[1]))
    actual_full = np.zeros_like(pred_full)
    pred_full[:, target_index] = pred_scaled[:, 0]
    actual_full[:, target_index] = y_test[:, 0]

    pred = scaler.inverse_transform(pred_full)[:, target_index]
    actual = scaler.inverse_transform(actual_full)[:, target_index]

    return actual, pred


### Single Symbol Processing - Test

In [15]:
# symbol = symbols[4]  # Use the first symbol for initial processing
# print(f"\n=== Processing: {symbol} ===")
# # data, _ = ts.get_daily(symbol=symbol, outputsize='full')
# # data = prepare_data(data)
# data = load_or_fetch_data(symbol, api_key)


# Choose asset by symbol or index
symbol = assets[6]['symbol']  # Pick from structured list
print(f"\n=== Processing: {symbol} ===")

# Get preloaded data from dictionary (not calling API again)
data = data_dict[symbol]
print(data.head())


print(data.head())
data = add_technical_indicators(data)
X_selected, y_target = select_features(data)


print("X_selected\n",X_selected)
print("y_target\n",y_target)

merged_data = pd.concat([X_selected, y_target], axis=1).dropna().reset_index(drop=True)

actual, pred = lstm_forecast_multivariate(merged_data, target='close')

print("actual\n", actual)
print("pred\n", pred)

print("NaN count per column in merged_data:")
print(merged_data.isna().sum())

# Storing the volatility and cumulative return
initial_price = actual[0]
final_price = actual[-1]
cumulative_return = (final_price - initial_price) / initial_price
volatility = np.std(np.diff(actual) / actual[:-1])
stock_stats.append({
    'symbol': symbol,
    'cumulative_return': cumulative_return,
    'volatility': volatility
})

rmse = math.sqrt(mean_squared_error(actual, pred))
mae = mean_absolute_error(actual, pred)
results[symbol] = {'RMSE': rmse, 'MAE': mae}

# Plot Results
plt.figure()
plt.plot(actual, label='Actual')
plt.plot(pred, label='Predicted')
plt.title(f"{symbol} - LSTM Forecast")
plt.xlabel("Test Day")
plt.ylabel("Price (USD)")
plt.legend()
plt.tight_layout()
plt.show()


=== Processing: COPPER ===
  open high low close  volume
0    .    .   .     .       0
1    .    .   .     .       0
2    .    .   .     .       0
3    .    .   .     .       0
4    .    .   .     .       0
  open high low close  volume
0    .    .   .     .       0
1    .    .   .     .       0
2    .    .   .     .       0
3    .    .   .     .       0
4    .    .   .     .       0


DataError: No numeric types to aggregate

---

### Iterating over all the Symbols

In [None]:
for symbol in symbols:
    print(f"\n=== Processing: {symbol} ===")

    # data, _ = ts.get_daily(symbol=symbol, outputsize='full')
    # data = prepare_data(data)
    data = load_or_fetch_data(symbol, api_key)
    print(data.head())
    data = add_technical_indicators(data)
    X_selected, y_target = select_features(data)


    print("X_selected\n",X_selected)
    print("y_target\n",y_target)

    merged_data = pd.concat([X_selected, y_target], axis=1).dropna().reset_index(drop=True)

    actual, pred = lstm_forecast_multivariate(merged_data, target='close')

    print("actual\n", actual)
    print("pred\n", pred)

    print("NaN count per column in merged_data:")
    print(merged_data.isna().sum())

    # Storing the volatility and cumulative return
    initial_price = actual[0]
    final_price = actual[-1]
    cumulative_return = (final_price - initial_price) / initial_price
    volatility = np.std(np.diff(actual) / actual[:-1])
    stock_stats.append({
        'symbol': symbol,
        'cumulative_return': cumulative_return,
        'volatility': volatility
    })

    rmse = math.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    results[symbol] = {'RMSE': rmse, 'MAE': mae}

    # Plot Results
    plt.figure()
    plt.plot(actual, label='Actual')
    plt.plot(pred, label='Predicted')
    plt.title(f"{symbol} - LSTM Forecast")
    plt.xlabel("Test Day")
    plt.ylabel("Price (USD)")
    plt.legend()
    plt.tight_layout()
    plt.show()

### Calssifying Stocks using ABC Analysis

In [12]:
def classify_stocks_by_return(stock_stats_df):
    # Sort by cumulative return descending
    sorted_df = stock_stats_df.sort_values(by='cumulative_return', ascending=False).reset_index(drop=True)
    n = len(sorted_df)
    a_cutoff = int(0.2 * n)
    b_cutoff = int(0.5 * n)

    categories = ['A' if i < a_cutoff else 'B' if i < b_cutoff else 'C' for i in range(n)]
    sorted_df['ABC'] = categories
    return sorted_df

In [18]:
stock_stats_df = pd.DataFrame(stock_stats)
print("\nStock Statistics DataFrame:")
print(stock_stats_df)
classified_df = classify_stocks_by_return(stock_stats_df)
print("\nClassified Stocks by Return:")
print(classified_df)
abc_map = dict(zip(classified_df['symbol'], classified_df['ABC']))
print("\nABC Classification Map:")
print(abc_map)


Stock Statistics DataFrame:
  symbol  cumulative_return  volatility
0   AAPL          -0.375998    0.028050
1   MSFT           1.579285    0.017014
2  GOOGL          -0.924096    0.035665
3   TSLA          -0.565026    0.046570
4   AMZN          -0.914634    0.034790

Classified Stocks by Return:
  symbol  cumulative_return  volatility ABC
0   MSFT           1.579285    0.017014   A
1   AAPL          -0.375998    0.028050   B
2   TSLA          -0.565026    0.046570   C
3   AMZN          -0.914634    0.034790   C
4  GOOGL          -0.924096    0.035665   C

ABC Classification Map:
{'MSFT': 'A', 'AAPL': 'B', 'TSLA': 'C', 'AMZN': 'C', 'GOOGL': 'C'}


### Printing the Summary Report

In [36]:
print("\n==== Forecasting Performance Summary ====")
for sym, res in results.items():
    print(f"{sym}: RMSE = {res['RMSE']:.4f}, MAE = {res['MAE']:.4f}")


==== Forecasting Performance Summary ====
EURUSD: RMSE = 0.0102, MAE = 0.0076
