In [2]:
import os 

In [3]:
# Display the kernel's current working directory before it is changed below.
os.getcwd()

'/atlas/data19/guhitj/Erdos_DL'

In [4]:
# Switch the working directory to the project's `models` folder.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs on this host; presumably the chdir is what makes the local modules
# imported in the next cell (`data_engineering_orig`, `simulation`) resolvable
# — confirm, and consider a configurable project root instead.
os.chdir('/atlas/data19/guhitj/Erdos_DL/Erdos_v2/Erdos-2024-DL-Newsworthy/models')

In [5]:
import pandas as pd
import data_engineering_orig as de_orig
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np
from joblib import Parallel, delayed
import simulation
from math import sqrt
pd.options.mode.chained_assignment = None
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings(action='ignore', category=np.VisibleDeprecationWarning)

In [6]:
# Build the {ticker: DataFrame} mapping and pass it straight through the
# project's NaN-filling helper; only the filled result is kept.
df_dict = de_orig.fillna(de_orig.separate_by_stock())

In [14]:
# Number of entries in the per-ticker mapping (one DataFrame per stock symbol).
len(df_dict)

15

In [15]:
# Inspect one ticker's frame (index type, columns, dtypes, null counts) as a sanity check.
df_dict['AAPL'].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1261 entries, 2019-03-15 to 2024-03-18
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   finvader_neg    1261 non-null   float64
 1   finvader_neu    1261 non-null   float64
 2   finvader_pos    1261 non-null   float64
 3   finvader_tot    1261 non-null   float64
 4   Open            1261 non-null   float64
 5   pos_art_count   1261 non-null   int64  
 6   neg_art_count   1261 non-null   int64  
 7   neu_art_count   1261 non-null   int64  
 8   total_articles  1261 non-null   int64  
 9   Open_Diff       1261 non-null   float64
 10  y               1261 non-null   float64
dtypes: float64(7), int64(4)
memory usage: 118.2 KB


In [7]:
def train_and_evaluate_trees_debug(model, param_grid, ticker, df, features):
    """Grid-search a regressor on every CV split of one ticker and score it.

    For each (train, validation) index pair returned by
    ``de_orig.get_cv_splits`` the features and the target ``'y'`` are min-max
    scaled to [-1, 1], ``GridSearchCV`` picks the best hyper-parameters on the
    split's training rows, and the best estimator is scored on the split's
    validation rows.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor handed to ``GridSearchCV``.
    param_grid : dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    ticker : str
        Stock symbol; used for log messages and returned with the results.
    df : pandas.DataFrame
        Data for this ticker. If its index is not datetime-like, a
        ``'date_column'`` column is parsed and used as the index (on a new
        frame; the caller's object is left untouched).
    features : list of str
        Columns to keep. Must contain the target ``'y'`` and ``'Open'``
        (``'Open'`` is read back for the validation rows of each split).

    Returns
    -------
    tuple or None
        ``(ticker, performances, rmses, accuracies)`` where ``performances``
        is a list of ``(trading_signals, test_opens)`` pairs (one per split),
        ``rmses`` the per-split RMSE in original target units and
        ``accuracies`` the per-split accuracy of the predicted sign.
        ``None`` is returned if any exception occurs; the traceback is printed.
    """
    # Local import: only needed on the failure path, keeps the block self-contained.
    import traceback

    print(f"Processing {ticker}")
    try:
        if not pd.api.types.is_datetime64_any_dtype(df.index):
            # Build a new frame instead of mutating the caller's DataFrame.
            df = df.assign(date_column=pd.to_datetime(df['date_column'])).set_index('date_column')

        # The hold-out part of the split is not used by this function.
        train, _holdout = de_orig.train_test_split(df)
        train = train[features]

        cv_splits = de_orig.get_cv_splits(train)
        if not cv_splits:
            print(f"No CV splits available for {ticker}.")

        performances = []
        rmses = []
        accuracies = []

        for i, (train_idx, test_idx) in enumerate(cv_splits):
            fold_train = train.loc[train_idx]
            fold_valid = train.loc[test_idx]

            X_train, y_train = fold_train.drop(columns=['y']), fold_train['y']
            X_test, y_test = fold_valid.drop(columns=['y']), fold_valid['y']

            # Scalers are fitted on the split's training rows only, then
            # applied to the validation rows.
            scaler_X = MinMaxScaler(feature_range=(-1, 1))
            scaler_y = MinMaxScaler(feature_range=(-1, 1))

            X_train_scaled = scaler_X.fit_transform(X_train)
            X_test_scaled = scaler_X.transform(X_test)
            y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1,)

            # NOTE(review): no `cv` is passed, so GridSearchCV falls back to
            # scikit-learn's default splitter on these time-ordered rows;
            # a TimeSeriesSplit may be more appropriate — confirm.
            grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error')
            grid_search.fit(X_train_scaled, y_train_scaled)

            # Predict in scaled space, then map back to the original target units.
            predictions = grid_search.best_estimator_.predict(X_test_scaled)
            predictions = scaler_y.inverse_transform(predictions.reshape(-1, 1)).reshape(-1,)

            rmse = sqrt(mean_squared_error(y_test, predictions))
            # Directional accuracy: compare the sign of the prediction with
            # the sign of the actual target.
            accuracy = accuracy_score(np.sign(y_test), np.sign(predictions))

            print(f'Ticker: {ticker}, Split {i}, RMSE: {rmse:.3f}, Accuracy: {accuracy:.3f}')

            # The sign of each prediction is used as the trading signal,
            # paired with the open prices of the same validation rows.
            trading_signals = np.sign(predictions)
            test_opens = fold_valid["Open"].values

            rmses.append(rmse)
            accuracies.append(accuracy)
            performances.append((trading_signals, test_opens))

        return ticker, performances, rmses, accuracies

    except Exception:
        # Best-effort: one failing ticker must not abort the whole parallel
        # run, but say which ticker failed and where.
        print(f"Error encountered while processing {ticker}:")
        print(traceback.format_exc())
        return None

In [8]:
def run_parallel_pipeline(ticker_frames, model, features, param_grid):
    """Run ``train_and_evaluate_trees_debug`` for every ticker in parallel.

    Parameters
    ----------
    ticker_frames : dict
        Mapping of ticker symbol to that ticker's DataFrame.
    model : estimator
        Regressor forwarded unchanged to every job.
    features : list of str
        Column names forwarded unchanged to every job.
    param_grid : dict
        Hyper-parameter grid forwarded unchanged to every job.

    Returns
    -------
    list
        One entry per ticker, each being whatever
        ``train_and_evaluate_trees_debug`` returned for it.
    """
    job = delayed(train_and_evaluate_trees_debug)
    # One lazily-built task per (ticker, frame) pair.
    tasks = (
        job(model, param_grid, symbol, frame, features)
        for symbol, frame in ticker_frames.items()
    )
    # n_jobs=-1 asks joblib to use all available workers.
    executor = Parallel(n_jobs=-1)
    return executor(tasks)

In [9]:
# Columns handed to the training pipeline. "y" is the regression target that
# the training function splits off, and "Open" is read back there to pair
# trading signals with open prices, so both must stay in this list.
features = [
    "finvader_tot",
    "pos_art_count",
    "total_articles",
    "Open_Diff",
    "y",
    "Open",
]

# Hyper-parameter grid searched for each CV split.
# Alternative value lists noted previously:
#   n_estimators: [100, 250, 300, 400, 600, 750, 900, 1000, 1200]
#   max_depth:    [2, 8]
param_grid = dict(
    n_estimators=[100, 500],
    learning_rate=[0.001, 0.01],
    max_depth=[2, 6],
)

In [10]:
# Gradient-boosted tree regressor with default settings; the hyper-parameters
# in `param_grid` are chosen by the grid search inside the pipeline.
# NOTE(review): no random_state is set — confirm whether run-to-run
# reproducibility matters here.
model_gbt = GradientBoostingRegressor()
# Train and evaluate every ticker in `df_dict` in parallel; one result entry per ticker.
results_gbt = run_parallel_pipeline(df_dict, model_gbt, features, param_grid)

Processing MSFT
Processing NVDA
Processing ABBV
Processing JPM
Processing BAC
Processing LLY
Processing AAPL
Processing MRK
Processing WFC
Processing MA
Processing AMZN
Processing UNH
Processing JNJ
Processing GOOGL
Processing V
Ticker: UNH, Split 0, RMSE: 8.365, Accuracy: 0.578
Ticker: LLY, Split 0, RMSE: 4.691, Accuracy: 0.562
Ticker: BAC, Split 0, RMSE: 0.726, Accuracy: 0.469
Ticker: ABBV, Split 0, RMSE: 2.370, Accuracy: 0.594
Ticker: MRK, Split 0, RMSE: 0.983, Accuracy: 0.641
Ticker: WFC, Split 0, RMSE: 0.955, Accuracy: 0.484
Ticker: JNJ, Split 0, RMSE: 1.724, Accuracy: 0.609
Ticker: MA, Split 0, RMSE: 6.901, Accuracy: 0.453
Ticker: V, Split 0, RMSE: 4.222, Accuracy: 0.578
Ticker: JPM, Split 0, RMSE: 2.177, Accuracy: 0.516
Ticker: NVDA, Split 0, RMSE: 9.067, Accuracy: 0.469
Ticker: MSFT, Split 0, RMSE: 5.471, Accuracy: 0.531
Ticker: GOOGL, Split 0, RMSE: 2.853, Accuracy: 0.500
Ticker: AMZN, Split 0, RMSE: 4.260, Accuracy: 0.469
Ticker: AAPL, Split 0, RMSE: 3.618, Accuracy: 0.500
Ti

In [13]:
# The training function returns None for a ticker whose run raised, so drop
# those entries before unpacking instead of failing on the first one.
successful_results = [result for result in results_gbt if result is not None]
n_failed = len(results_gbt) - len(successful_results)
if n_failed:
    print(f'Skipping {n_failed} ticker(s) whose training run failed')

# Derive the number of CV folds from the results rather than hard-coding it,
# so a different split count neither raises IndexError nor leaves empty folds.
n_folds = max((len(performances) for _, performances, _, _ in successful_results), default=0)

# One {ticker: values} dict per fold.
cv_trades = [{} for _ in range(n_folds)]
cv_opens = [{} for _ in range(n_folds)]

# Regroup the per-ticker results by fold; RMSEs and accuracies are not needed here.
for ticker, performances, _, _ in successful_results:
    for i, (trading_signals, test_opens) in enumerate(performances):
        cv_trades[i][ticker] = trading_signals
        cv_opens[i][ticker] = test_opens
        # Sanity check: signals and open prices must have the same length.
        print(ticker, len(cv_trades[i][ticker]), len(cv_opens[i][ticker]))

# Simulate the portfolio separately on each fold.
performance_porfolio = []
for i in range(n_folds):
    performance_fold = simulation.get_performance(cv_trades[i], cv_opens[i])
    print(f'Fold {i+1} Porfolio Value: {performance_fold:.4f}')
    performance_porfolio.append(performance_fold)

average_portfolio_value = np.mean(performance_porfolio)
# Growth relative to 1 — presumably get_performance reports the final value of
# a portfolio that starts at 1.0; verify against simulation.get_performance.
percent_growth = (average_portfolio_value - 1) * 100
print(f'Average Final Portfolio Value: {average_portfolio_value:.4f}')
print(f'Average Percent Growth: {percent_growth:.4f}%')

AAPL 64 64
AAPL 64 64
AAPL 63 63
AAPL 60 60
ABBV 64 64
ABBV 64 64
ABBV 63 63
ABBV 60 60
AMZN 64 64
AMZN 64 64
AMZN 63 63
AMZN 60 60
BAC 64 64
BAC 64 64
BAC 63 63
BAC 60 60
GOOGL 64 64
GOOGL 64 64
GOOGL 63 63
GOOGL 60 60
JNJ 64 64
JNJ 64 64
JNJ 63 63
JNJ 60 60
JPM 64 64
JPM 64 64
JPM 63 63
JPM 60 60
LLY 64 64
LLY 64 64
LLY 63 63
LLY 60 60
MA 64 64
MA 64 64
MA 63 63
MA 60 60
MRK 64 64
MRK 64 64
MRK 63 63
MRK 60 60
MSFT 64 64
MSFT 64 64
MSFT 63 63
MSFT 60 60
NVDA 64 64
NVDA 64 64
NVDA 63 63
NVDA 60 60
UNH 64 64
UNH 64 64
UNH 63 63
UNH 60 60
V 64 64
V 64 64
V 63 63
V 60 60
WFC 64 64
WFC 64 64
WFC 63 63
WFC 60 60
Fold 1 Porfolio Value: 0.9746
Fold 2 Porfolio Value: 1.0023
Fold 3 Porfolio Value: 1.0187
Fold 4 Porfolio Value: 1.0053
Average Final Portfolio Value: 1.0002
Average Percent Growth: 0.0205%
