In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge  # Imported Ridge
import plotly.graph_objs as go
from plotly.offline import plot, init_notebook_mode
import matplotlib.pyplot as plt
init_notebook_mode(connected=True)

In [3]:
# Create the feature matrix X
def create_feature_matrix(data, lags=[1, 2]):
    X = pd.DataFrame()
    for lag in lags:
        for ticker in data.columns:
            X[f'{ticker}_lag_{lag}'] = data[ticker].shift(lag)
    X = X.dropna()
    return X

In [4]:
# Split data into training and testing sets
def split_data(X, y):
    mid_point = len(X) // 2
    X_train, X_test = X[:mid_point], X[mid_point:]
    y_train, y_test = y[:mid_point], y[mid_point:]
    return X_train, X_test, y_train, y_test

In [5]:
# Function to evaluate model for a given number of tickers
def evaluate_model(d, all_data):
    try:
        tickers_used = ticker_list[:d]
        print(f"Evaluating {d} tickers: {tickers_used}")
        data_used = all_data[tickers_used]
        
        # Create the feature matrix using all tickers up to d
        X = create_feature_matrix(data_used)
        y = all_data['GOOGL'][X.index]
        
        print(f"Data used shape: {data_used.shape}")
        print(f"Feature matrix shape: {X.shape}")

        if X.empty or y.empty:
            print(f"Skipping {d} tickers: Empty DataFrame")
            return None  # Skip iteration if the resulting DataFrame is empty
        
        # Adding additional features to X
        new_features = {}
        for ticker in tickers_used:
            if f'{ticker}_lag_1' in X.columns and f'{ticker}_lag_2' in X.columns:
                diff_squared = (X[f'{ticker}_lag_1'] - X[f'{ticker}_lag_2']) ** 2
                volatility = (X[f'{ticker}_lag_1'] - X[f'{ticker}_lag_2']) / X[f'{ticker}_lag_2']
                new_features[f'{ticker}_diff_squared'] = diff_squared
                new_features[f'{ticker}_volatility'] = volatility
        
        if new_features:
            new_features_df = pd.DataFrame(new_features, index=X.index)
            X = pd.concat([X, new_features_df], axis=1)
            X = X.copy()  # Create a de-fragmented copy of the DataFrame

        print(f"Feature matrix after adding new features shape: {X.shape}")

        # Splitting the data
        X_train, X_test, y_train, y_test = split_data(X, y)
        
        print(f"Train/Test split shapes: {X_train.shape}, {X_test.shape}")

        if X_train.empty or y_train.empty or X_test.empty or y_test.empty:
            print(f"Skipping {d} tickers: Empty train/test split")
            return None  # Skip iteration if the split results in empty DataFrames

        # Feature scaling
        scaler_X = StandardScaler().fit(X_train)
        X_train_scaled = scaler_X.transform(X_train)
        X_test_scaled = scaler_X.transform(X_test)

        scaler_Y = StandardScaler().fit(y_train.values.reshape(-1, 1))
        y_train_scaled = scaler_Y.transform(y_train.values.reshape(-1, 1))
        y_test_scaled = scaler_Y.transform(y_test.values.reshape(-1, 1))

        # Machine learning model with regularization
        ridge = Ridge(alpha=1.0)  # Changed to Ridge regression
        ridge.fit(X_train_scaled, y_train_scaled)

        # Predict using the model
        predicted_train_scaled = ridge.predict(X_train_scaled)
        predicted_test_scaled = ridge.predict(X_test_scaled)

        # Compute Mean Squared Error (MSE)
        mse_train = np.mean(np.power(y_train_scaled - predicted_train_scaled, 2))
        mse_test = np.mean(np.power(y_test_scaled - predicted_test_scaled, 2))

        # Log MSE values
        print(f"MSE Train: {mse_train}, MSE Test: {mse_test}")

        # Ensure arrays are 2D before inverse transforming
        predicted_train_scaled = predicted_train_scaled.reshape(-1, 1)
        predicted_test_scaled = predicted_test_scaled.reshape(-1, 1)

        # Evaluate direction accuracy
        direction_accuracy_train = np.mean((np.sign(np.diff(y_train.values)) == np.sign(np.diff(scaler_Y.inverse_transform(predicted_train_scaled).flatten()))).astype(int))
        direction_accuracy_test = np.mean((np.sign(np.diff(y_test.values)) == np.sign(np.diff(scaler_Y.inverse_transform(predicted_test_scaled).flatten()))).astype(int))

        print(f"Processed {d} tickers")
        return {
            'Num_Tickers': d,
            'MSE_Train': mse_train,
            'MSE_Test': mse_test,
            'Direction_Accuracy_Train': direction_accuracy_train,
            'Direction_Accuracy_Test': direction_accuracy_test
        }
    except Exception as e:
        print(f"Error processing {d} tickers: {e}")
        return None

In [6]:
# List of ~100 tickers
ticker_list = ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX', 'ETN', 'FDX', 'VRTX', 'MPC', 'TGT', 'BMY', 'NSC', 'CI', 'REGN', 'DD', 'TRV', 'CMG', 'AON', 'F', 'GM', 'APTV', 'APH', 'CDW', 'KMB', 'BSX']

In [7]:
# Download data for all tickers
all_data = yf.download(ticker_list, start='2010-06-29', end='2014-06-24')['Adj Close']

# Fill missing values and forward-fill any remaining missing data
all_data.fillna(method='ffill', inplace=True)
all_data.fillna(method='bfill', inplace=True)

[*********************100%%**********************]  94 of 94 completed

DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



In [8]:
# Evaluate models for different number of tickers in the feature matrix
results = []
for d in range(1, 101):
    result = evaluate_model(d, all_data)
    if result:
        results.append(result)

# Check the structure of results
print("Results Structure:", results)

Evaluating 1 tickers: ['GOOGL']
Data used shape: (1003, 1)
Feature matrix shape: (1001, 2)
Feature matrix after adding new features shape: (1001, 4)
Train/Test split shapes: (500, 4), (501, 4)
MSE Train: 0.03630975081205804, MSE Test: 0.09830208772218157
Processed 1 tickers
Evaluating 2 tickers: ['GOOGL', 'MSFT']
Data used shape: (1003, 2)
Feature matrix shape: (1001, 4)
Feature matrix after adding new features shape: (1001, 8)
Train/Test split shapes: (500, 8), (501, 8)
MSE Train: 0.03607311263309831, MSE Test: 0.1051828156788179
Processed 2 tickers
Evaluating 3 tickers: ['GOOGL', 'MSFT', 'AAPL']
Data used shape: (1003, 3)
Feature matrix shape: (1001, 6)
Feature matrix after adding new features shape: (1001, 12)
Train/Test split shapes: (500, 12), (501, 12)
MSE Train: 0.035956107536636764, MSE Test: 0.09195414348778262
Processed 3 tickers
Evaluating 4 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN']
Data used shape: (1003, 4)
Feature matrix shape: (1001, 8)
Feature matrix after adding new f


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Data used shape: (1003, 55)
Feature matrix shape: (1001, 110)
Feature matrix after adding new features shape: (1001, 220)
Train/Test split shapes: (500, 220), (501, 220)
MSE Train: 0.02283754603476751, MSE Test: 0.6930905573748085
Processed 55 tickers
Evaluating 56 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE']
Data used shape: (1003, 56)
Feature matrix shape: (1001, 112)
Feature matrix after adding new features shape: (1001, 224)
Train/Test split shapes: (500, 224), (501, 224)
MSE Train: 0.022698093011676485, MSE Test: 0.5473984788631409
Processed 56 tickers
Evaluating 57 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA'


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Feature matrix after adding new features shape: (1001, 236)
Train/Test split shapes: (500, 236), (501, 236)
MSE Train: 0.02175874644912176, MSE Test: 0.37325906839680656
Processed 59 tickers
Evaluating 60 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB']
Data used shape: (1003, 60)
Feature matrix shape: (1001, 120)
Feature matrix after adding new features shape: (1001, 240)
Train/Test split shapes: (500, 240), (501, 240)
MSE Train: 0.02137369660813102, MSE Test: 0.3287621712904219
Processed 60 tickers
Evaluating 61 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Data used shape: (1003, 63)
Feature matrix shape: (1001, 126)
Feature matrix after adding new features shape: (1001, 252)
Train/Test split shapes: (500, 252), (501, 252)
MSE Train: 0.0204731539885603, MSE Test: 0.4078514700592454
Processed 63 tickers
Evaluating 64 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS']
Data used shape: (1003, 64)
Feature matrix shape: (1001, 128)
Feature matrix after adding new features shape: (1001, 256)
Train/Test split shapes: (500, 256), (501, 256)
MSE Train: 0.020473153988560353, MSE Test: 0.4078514700592287
Processed 64 tickers
Evaluating 65 t


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Data used shape: (1003, 67)
Feature matrix shape: (1001, 134)
Feature matrix after adding new features shape: (1001, 268)
Train/Test split shapes: (500, 268), (501, 268)
MSE Train: 0.019949681394140127, MSE Test: 0.6838688913777929
Processed 67 tickers
Evaluating 68 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL']
Data used shape: (1003, 68)
Feature matrix shape: (1001, 136)
Feature matrix after adding new features shape: (1001, 272)
Train/Test split shapes: (500, 272), (501, 272)
MSE Train: 0.019857395965691204, MSE Test: 0.8488724176989968
Processed


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

MSE Train: 0.019189323336554657, MSE Test: 0.7486817776584218
Processed 70 tickers
Evaluating 71 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW']
Data used shape: (1003, 71)
Feature matrix shape: (1001, 142)
Feature matrix after adding new features shape: (1001, 284)
Train/Test split shapes: (500, 284), (501, 284)
MSE Train: 0.018834339069731375, MSE Test: 1.1514917984452784
Processed 71 tickers
Evaluating 72 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UN


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Feature matrix after adding new features shape: (1001, 292)
Train/Test split shapes: (500, 292), (501, 292)
MSE Train: 0.018568382573060583, MSE Test: 1.7625806149756602
Processed 73 tickers
Evaluating 74 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX']
Data used shape: (1003, 74)
Feature matrix shape: (1001, 148)
Feature matrix after adding new features shape: (1001, 296)
Train/Test split shapes: (500, 296), (501, 296)
MSE Train: 0.018528302372298897, MSE Test: 2.283313492264963
Processed 74 tickers
Evaluati


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

MSE Train: 0.017456469794194615, MSE Test: 2.185749708050126
Processed 76 tickers
Evaluating 77 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX', 'ETN', 'FDX', 'VRTX']
Data used shape: (1003, 77)
Feature matrix shape: (1001, 154)
Feature matrix after adding new features shape: (1001, 308)
Train/Test split shapes: (500, 308), (501, 308)
MSE Train: 0.01731904223951099, MSE Test: 2.2772518438404488
Processed 77 tickers
Evaluating 78 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', '


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Data used shape: (1003, 80)
Feature matrix shape: (1001, 160)
Feature matrix after adding new features shape: (1001, 320)
Train/Test split shapes: (500, 320), (501, 320)
MSE Train: 0.01654130948576748, MSE Test: 2.142055667728994
Processed 80 tickers
Evaluating 81 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX', 'ETN', 'FDX', 'VRTX', 'MPC', 'TGT', 'BMY', 'NSC']
Data used shape: (1003, 81)
Feature matrix shape: (1001, 162)
Feature matrix after adding new features shape: (1001, 324)
Train/Test split shapes: (50


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Data used shape: (1003, 83)
Feature matrix shape: (1001, 166)
Feature matrix after adding new features shape: (1001, 332)
Train/Test split shapes: (500, 332), (501, 332)
MSE Train: 0.01594479998663639, MSE Test: 2.662584668006045
Processed 83 tickers
Evaluating 84 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX', 'ETN', 'FDX', 'VRTX', 'MPC', 'TGT', 'BMY', 'NSC', 'CI', 'REGN', 'DD']
Data used shape: (1003, 84)
Feature matrix shape: (1001, 168)
Feature matrix after adding new features shape: (1001, 336)
Train/Te


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Data used shape: (1003, 86)
Feature matrix shape: (1001, 172)
Feature matrix after adding new features shape: (1001, 344)
Train/Test split shapes: (500, 344), (501, 344)
MSE Train: 0.015478220914140021, MSE Test: 2.734943321607572
Processed 86 tickers
Evaluating 87 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX', 'ETN', 'FDX', 'VRTX', 'MPC', 'TGT', 'BMY', 'NSC', 'CI', 'REGN', 'DD', 'TRV', 'CMG', 'AON']
Data used shape: (1003, 87)
Feature matrix shape: (1001, 174)
Feature matrix after adding new features shape


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Feature matrix after adding new features shape: (1001, 364)
Train/Test split shapes: (500, 364), (501, 364)
MSE Train: 0.014074417556279358, MSE Test: 9.812909251251446
Processed 91 tickers
Evaluating 92 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX', 'ETN', 'FDX', 'VRTX', 'MPC', 'TGT', 'BMY', 'NSC', 'CI', 'REGN', 'DD', 'TRV', 'CMG', 'AON', 'F', 'GM', 'APTV', 'APH', 'CDW']
Data used shape: (1003, 92)
Feature matrix shape: (1001, 184)
Feature matrix after adding new features shape: (1001, 368)
Train/Test spli


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Feature matrix after adding new features shape: (1001, 376)
Train/Test split shapes: (500, 376), (501, 376)
MSE Train: 0.013876853238505876, MSE Test: 12.891664745253575
Processed 94 tickers
Evaluating 95 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX', 'ETN', 'FDX', 'VRTX', 'MPC', 'TGT', 'BMY', 'NSC', 'CI', 'REGN', 'DD', 'TRV', 'CMG', 'AON', 'F', 'GM', 'APTV', 'APH', 'CDW', 'KMB', 'BSX']
Data used shape: (1003, 94)
Feature matrix shape: (1001, 188)
Feature matrix after adding new features shape: (1001, 376)



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Feature matrix after adding new features shape: (1001, 376)
Train/Test split shapes: (500, 376), (501, 376)
MSE Train: 0.013876853238505876, MSE Test: 12.891664745253575
Processed 97 tickers
Evaluating 98 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX', 'ETN', 'FDX', 'VRTX', 'MPC', 'TGT', 'BMY', 'NSC', 'CI', 'REGN', 'DD', 'TRV', 'CMG', 'AON', 'F', 'GM', 'APTV', 'APH', 'CDW', 'KMB', 'BSX']
Data used shape: (1003, 94)
Feature matrix shape: (1001, 188)
Feature matrix after adding new features shape: (1001, 376)



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Feature matrix after adding new features shape: (1001, 376)
Train/Test split shapes: (500, 376), (501, 376)
MSE Train: 0.013876853238505876, MSE Test: 12.891664745253575
Processed 99 tickers
Evaluating 100 tickers: ['GOOGL', 'MSFT', 'AAPL', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'V', 'JPM', 'JNJ', 'WMT', 'PG', 'DIS', 'MA', 'UNH', 'HD', 'INTC', 'CSCO', 'PEP', 'VZ', 'KO', 'MRK', 'T', 'ABT', 'ABBV', 'PFE', 'NKE', 'ORCL', 'CRM', 'MCD', 'MS', 'COST', 'WFC', 'CVX', 'BA', 'RTX', 'IBM', 'GS', 'QCOM', 'HON', 'AMD', 'SBUX', 'ISRG', 'NOW', 'SPGI', 'BKNG', 'DHR', 'BLK', 'MDT', 'AMAT', 'CAT', 'DE', 'GE', 'GILD', 'ADBE', 'TMO', 'AXP', 'DUK', 'USB', 'AMGN', 'SYK', 'ADI', 'ZTS', 'CVS', 'C', 'COP', 'CL', 'ECL', 'APD', 'SHW', 'ICE', 'LMT', 'LRCX', 'ETN', 'FDX', 'VRTX', 'MPC', 'TGT', 'BMY', 'NSC', 'CI', 'REGN', 'DD', 'TRV', 'CMG', 'AON', 'F', 'GM', 'APTV', 'APH', 'CDW', 'KMB', 'BSX']
Data used shape: (1003, 94)
Feature matrix shape: (1001, 188)
Feature matrix after adding new features shape: (1001, 376)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [None]:
# Convert results to DataFrame
if results:
    results_df = pd.DataFrame(results)
    print("Results DataFrame Head:", results_df.head())

    # Save final results
    results_df.to_csv('final_results.csv', index=False)

    # Display the DataFrame in a Jupyter notebook environment
    display(results_df)

    # Plot Mean Squared Errors (MSE)
    plt.figure(figsize=(12, 8))
    plt.plot(results_df['Num_Tickers'], results_df['MSE_Train'], label='MSE Train', color='tab:blue', linestyle='dashed')
    plt.plot(results_df['Num_Tickers'], results_df['MSE_Test'], label='MSE Test', color='tab:blue', linestyle='solid')
    plt.xlabel('Number of Tickers in Feature Matrix')
    plt.ylabel('Mean Squared Error')
    plt.title('Mean Squared Error as a Function of Number of Tickers in Feature Matrix')
    plt.legend()
    plt.grid(True)
    plt.yscale('log')
    plt.show()

    # Plot Direction Accuracy
    plt.figure(figsize=(12, 8))
    plt.plot(results_df['Num_Tickers'], results_df['Direction_Accuracy_Train'], label='Direction Accuracy Train', color='tab:green', linestyle='dashed')
    plt.plot(results_df['Num_Tickers'], results_df['Direction_Accuracy_Test'], label='Direction Accuracy Test', color='tab:green', linestyle='solid')
    plt.xlabel('Number of Tickers in Feature Matrix')
    plt.ylabel('Direction Accuracy')
    plt.title('Direction Accuracy as a Function of Number of Tickers in Feature Matrix')
    plt.legend()
    plt.grid(True)
    plt.show()
    
    # Plot only MSE Train
    plt.figure(figsize=(12, 8))
    plt.plot(results_df['Num_Tickers'], results_df['MSE_Train'], label='MSE Train', color='tab:blue')
    plt.xlabel('Number of Tickers in Feature Matrix')
    plt.ylabel('Mean Squared Error (Train)')
    plt.title('Mean Squared Error (Train) as a Function of Number of Tickers in Feature Matrix')
    plt.legend()
    plt.grid(True)
    plt.yscale('log')
    plt.show()
else:
    print("No results to display.")
