In [None]:
import alpaca_trade_api as alpaca
from alpaca.trading.client import TradingClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.historical.stock import StockHistoricalDataClient
from alpaca.data.timeframe import TimeFrame
from dotenv import load_dotenv
import yfinance as yf
import os
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
import joblib

In [None]:
paper = True

# Pull the API credentials defined in .env into the process environment.
load_dotenv()

# Choose the credential variable names and the REST endpoint that match the
# selected mode (paper trading when `paper` is true, live trading otherwise).
_key_var, _secret_var, _base_url = (
    ('APCA-API-PAPER-KEY-ID', 'APCA-API-PAPER-SECRET-KEY', 'https://paper-api.alpaca.markets')
    if paper
    else ('APCA-API-KEY-ID', 'APCA-API-SECRET-KEY', 'https://api.alpaca.markets')
)
api_key = os.environ[_key_var]            # raises KeyError when the variable is missing
api_secret_key = os.environ[_secret_var]
api_base_url = _base_url

# REST client from alpaca_trade_api, plus the trading and historical-data
# clients from the alpaca package; all three share the same key pair.
api = alpaca.REST(api_key, api_secret_key, api_base_url)
account = api.get_account()
trading_client = TradingClient(api_key, api_secret_key, paper=paper)
data_client = StockHistoricalDataClient(api_key, api_secret_key)

In [None]:
def _fetch_market_cap(symbol, max_attempts=3, retry_delay=3):
    """Look up ``marketCap`` in ``yf.Ticker(symbol).info``, retrying on errors.

    Args:
        symbol: Ticker symbol passed to ``yf.Ticker``.
        max_attempts: Total number of lookups to try before giving up.
        retry_delay: Seconds to sleep between two consecutive attempts.

    Returns:
        The value stored under ``"marketCap"``, or ``None`` when the info
        mapping has no such key.

    Raises:
        Exception: ``'query failed'`` once every attempt has raised; the last
            underlying error is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            return yf.Ticker(symbol).info.get("marketCap")
        except Exception as err:
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit
            # still interrupt a long download instead of being retried.
            last_error = err
            if attempt < max_attempts:
                # Only wait when another attempt will actually follow.
                time.sleep(retry_delay)
    raise Exception('query failed') from last_error


def get_SP500(max_attempts=3, retry_delay=3):
    """Build the S&P 500 table with a ``"Market Cap"`` column added.

    Reads the first table of the Wikipedia "List of S&P 500 companies" page
    and, for each row's ``Symbol``, looks up the market cap through yfinance.
    Progress (row index, symbol, market cap) is printed as it goes, and the
    loop pauses one second between symbols.

    Args:
        max_attempts: Lookups to try per symbol before aborting.
        retry_delay: Seconds to wait between failed attempts for one symbol.

    Returns:
        The DataFrame read from Wikipedia with the extra ``"Market Cap"``
        column; entries may be ``None`` when yfinance reports no market cap.

    Raises:
        Exception: ``'query failed'`` if a symbol cannot be fetched within
            ``max_attempts`` tries.
    """
    SP500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
    market_caps = []
    for index, symbol in SP500['Symbol'].items():
        print(f"{index}: {symbol}")
        market_cap = _fetch_market_cap(symbol, max_attempts, retry_delay)
        market_caps.append(market_cap)
        print(market_cap)
        print()
        # Fixed pause between symbols (kept from the original loop).
        time.sleep(1)
    SP500["Market Cap"] = market_caps
    return SP500

def get_SP(n=500):
    """Return the first ``n`` rows of ``SP500.csv``, sorted by ``Symbol``.

    The slice is taken in file order first and sorted afterwards, so the
    result is "the first ``n`` rows as stored, alphabetised", not "the first
    ``n`` symbols alphabetically". The original row index is kept.
    """
    table = pd.read_csv('SP500.csv')
    leading_rows = table[0:n]
    return leading_rows.sort_values(by='Symbol')

def get_weekly_stock_bars(symbols, start=dt.datetime(2019, 1, 1), filename='output.csv'):
    """Fetch daily bars, reduce them to Friday-to-Friday log returns, write a CSV.

    Steps:
      1. Request adjusted daily bars for ``symbols`` from ``start`` onwards.
      2. Keep only rows whose timestamp falls on a Friday.
      3. Per symbol, compute ``log(close / previous kept close)``.
      4. Drop each symbol's first row (no previous close) and keep only the
         ``close`` and ``log return`` columns.
      5. Re-index as (timestamp, symbol), sorted ascending.
      6. Keep only symbols that have a row for every remaining timestamp.
      7. Write the result to ``filename``.

    Because step 2 keeps Fridays only, a week without a Friday bar is skipped
    and the next return is measured against the last Friday that was kept.

    Args:
        symbols: A single ticker string or an iterable of tickers (list,
            pandas Series, ...).
        start: Earliest bar timestamp to request.
        filename: Path of the CSV file to write.

    Returns:
        None. The result is written to ``filename``.
    """
    # Hand the request a plain list whatever iterable the caller supplied.
    if not isinstance(symbols, str):
        symbols = list(symbols)

    request_params = StockBarsRequest(
        symbol_or_symbols=symbols,
        timeframe=TimeFrame.Day,
        start=start,
        adjustment='all'
    )
    bars = repeated_get_stock_bars(request_params)

    daily = bars.df
    is_friday = daily.index.get_level_values('timestamp').day_name() == 'Friday'
    # .copy() gives an independent frame, so the column assignment below is
    # not a chained assignment on a slice (avoids SettingWithCopyWarning).
    df = daily[is_friday].copy()

    # Shift within each symbol so a return never mixes two different tickers.
    previous_close = df['close'].groupby(level='symbol').shift(1)
    df['log return'] = np.log(df['close'] / previous_close)
    df = df.dropna(subset=['log return'])
    df = df[['close', 'log return']]
    df = df.swaplevel('timestamp', 'symbol')
    df = df.sort_index(level=['timestamp', 'symbol'], ascending=[True, True])

    # A symbol is valid only if it has exactly one row per distinct timestamp.
    rows_per_symbol = df.groupby('symbol').size()
    n_timestamps = df.index.get_level_values('timestamp').nunique()
    valid_symbols = rows_per_symbol[rows_per_symbol == n_timestamps].index
    df = df[df.index.get_level_values('symbol').isin(valid_symbols)]
    df.to_csv(filename)

def repeated_get_stock_bars(request_params, max_attempts=3, retry_delay=3):
    """Call ``data_client.get_stock_bars`` with retries.

    Args:
        request_params: Request object forwarded unchanged to
            ``data_client.get_stock_bars``.
        max_attempts: Total number of calls to try before giving up.
        retry_delay: Seconds to sleep between two consecutive attempts.

    Returns:
        Whatever ``data_client.get_stock_bars(request_params)`` returns on the
        first successful attempt.

    Raises:
        Exception: ``'query failed'`` once every attempt has raised; the last
            underlying error is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            return data_client.get_stock_bars(request_params)
        except Exception as err:
            # `except Exception` (not a bare except) so KeyboardInterrupt and
            # SystemExit propagate immediately instead of being retried.
            last_error = err
            if attempt < max_attempts:
                # No point sleeping after the final failed attempt.
                time.sleep(retry_delay)
    raise Exception('query failed') from last_error

In [None]:
# Take the first 100 rows of SP500.csv (the variable keeps its original name,
# SP50) and write their weekly bars to the default output file, 'output.csv'.
SP50 = get_SP(n=100)
get_weekly_stock_bars(symbols=SP50['Symbol'])

In [None]:
# Reload the weekly bars written by get_weekly_stock_bars and preview them.
df = pd.read_csv('output.csv')
display(df)

# Weekly log returns of AAPL over time. The CSV stores timestamps as text, so
# they are parsed to datetimes for the plot only (`df` itself is unchanged);
# otherwise matplotlib would treat every date string as a separate category.
aapl_rows = df[df['symbol'] == 'AAPL']
aapl_dates = pd.to_datetime(aapl_rows['timestamp'], utc=True).dt.tz_convert(None)

fig, ax = plt.subplots()
ax.scatter(x=aapl_dates, y=aapl_rows['log return'])
ax.set(title='AAPL weekly log return', xlabel='timestamp', ylabel='log return')
plt.show()

In [None]:
# Reshape to a wide table: one row per timestamp, one column per symbol,
# each cell holding that symbol's log return.
df_pivot = df.pivot(index='timestamp', columns='symbol', values='log return')

# Target: NVDA's return. Features: the returns of every other symbol.
y = df_pivot['NVDA'].values
X = df_pivot.drop(columns='NVDA').to_numpy()

# Standardise each feature column.
# NOTE(review): the scaler is fit on every row here, i.e. before the
# train/test split performed in the modelling cell.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Train/test split: hold out 30% of the rows, with a fixed seed for repeatability.
# NOTE(review): X_scaled was standardised on all rows before this split, so the
# held-out rows contributed to the scaling statistics - confirm that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Adjusted range for alpha and l1_ratio
param_grid = {
    'alpha': [0.001, 0.005, 0.01, 0.05, 0.1],  # Try smaller alpha values
    'l1_ratio': [0.05, 0.1, 0.2, 0.35, 0.5, 0.65, 0.8, 0.9, 0.95]     # Try a mix between Ridge and Lasso
}

# Set up the ElasticNet model
elasticnet = ElasticNet()

# Perform grid search with 5-fold cross-validation, scored by negative MSE
grid_search = GridSearchCV(elasticnet, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best parameters found
best_alpha = grid_search.best_params_['alpha']
best_l1_ratio = grid_search.best_params_['l1_ratio']
print(f"Best alpha: {best_alpha}, Best l1_ratio: {best_l1_ratio}")

# GridSearchCV refits the winning configuration on the full training set
# (refit=True is the default), so reuse that estimator instead of building
# and fitting an identical ElasticNet a second time.
model = grid_search.best_estimator_

# Predictions on the test set
y_pred = model.predict(X_test)

# Evaluate with Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error with best parameters: {mse:.4f}')

# Save the model to a file
joblib.dump(model, 'model.pkl')

# Save the predictions to a CSV file
results = pd.DataFrame({'True Values': y_test, 'Predictions': y_pred})
results.to_csv('predictions.csv', index=False)

print("Predictions saved!")

In [None]:
# Reload the saved test-set predictions and preview them.
preds = pd.read_csv('predictions.csv')
display(preds)

true_vals = preds['True Values']
pred_vals = preds['Predictions']

# Predicted vs. true values; a perfect model would lie on the diagonal.
plt.scatter(x=true_vals, y=pred_vals)
plt.xlabel('True Values')
plt.ylabel('Predictions')

r2 = r2_score(true_vals, pred_vals)
print(r2)

# MAPE = mean(|true - pred| / |true|) * 100. The absolute value has to cover
# the whole ratio: the true values can be negative, and dividing by a signed
# value would let those rows subtract from the average. Rows whose true value
# is exactly 0 are left out because their percentage error is undefined.
nonzero = true_vals != 0
mape = np.mean(np.abs((true_vals[nonzero] - pred_vals[nonzero]) / true_vals[nonzero])) * 100
print(f"Mean Absolute Percentage Error: {mape:.4f}%")