In [1]:
import os 

In [2]:
# Display the kernel's current working directory before it is changed below.
os.getcwd()

'/atlas/data19/guhitj/Erdos_DL'

In [3]:
# Switch to the project's models directory (presumably so that the local
# `data_engineering` and `simulation` modules imported next can be found — confirm).
# Set the ERDOS_MODELS_DIR environment variable to run the notebook from another
# checkout; the original location remains the default.
os.chdir(os.environ.get(
    'ERDOS_MODELS_DIR',
    '/atlas/data19/guhitj/Erdos_DL/Erdos_v2/Erdos-2024-DL-Newsworthy/models',
))

In [4]:
import pandas as pd
import data_engineering as de
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np
from joblib import Parallel, delayed
import simulation
from math import sqrt
pd.options.mode.chained_assignment = None
from sklearn.preprocessing import MinMaxScaler
import warnings
# NumPy 2.0 removed the top-level `np.VisibleDeprecationWarning` alias; the class
# lives in `np.exceptions` on newer releases.  Fall back to the top-level
# namespace for older NumPy versions that have no `np.exceptions` module.
visible_deprecation_warning = getattr(np, 'exceptions', np).VisibleDeprecationWarning
warnings.filterwarnings(action='ignore', category=visible_deprecation_warning)

In [5]:
# Build the ticker-keyed collection of frames and pass it straight through the
# project's fillna step (presumably missing-value imputation — see data_engineering).
df_dict = de.fillna(de.separate_by_stock())

In [6]:
# Number of entries in df_dict (it is indexed by ticker symbol, e.g. 'AAPL').
len(df_dict)

15

In [7]:
# Inspect one ticker's frame: index range, column names, dtypes and non-null counts.
df_dict['AAPL'].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1261 entries, 2019-03-15 to 2024-03-18
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   frob_neg        1261 non-null   float64
 1   frob_neu        1261 non-null   float64
 2   frob_pos        1261 non-null   float64
 3   frob_comp       1261 non-null   float64
 4   Open            1261 non-null   float64
 5   pos_art_count   1261 non-null   int64  
 6   neg_art_count   1261 non-null   int64  
 7   neu_art_count   1261 non-null   int64  
 8   total_articles  1261 non-null   int64  
 9   Open_Diff       1261 non-null   float64
 10  y               1261 non-null   float64
dtypes: float64(7), int64(4)
memory usage: 118.2 KB


In [8]:
def train_and_evaluate_trees_debug(model, param_grid, ticker, df, features):
    """Tune and score a regressor on every cross-validation fold of one ticker.

    For each fold the predictors and the target are min-max scaled to [-1, 1]
    (both scalers are fitted on the fold's training rows only), ``param_grid``
    is searched with ``GridSearchCV``, and the best estimator's predictions are
    mapped back to the original target scale before being scored.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style regressor handed to ``GridSearchCV``.
    param_grid : dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    ticker : str
        Label used in the log lines and returned with the results.
    df : pandas.DataFrame
        Data for one ticker.  If its index is not datetime-like it must have a
        ``'date_column'`` column, which becomes the index of an internal copy;
        the caller's frame is never modified.
    features : list of str
        Columns to keep.  Must contain the target ``'y'`` and ``'Open'``; every
        kept column except ``'y'`` is used as a predictor.

    Returns
    -------
    tuple or None
        ``(ticker, performances, rmses, accuracies)`` with one entry per fold:
        ``performances`` holds ``(trading_signals, test_opens)`` pairs (the sign
        of each prediction and the fold's ``'Open'`` values), ``rmses`` the RMSE
        on the original target scale, and ``accuracies`` the share of rows whose
        predicted sign matches the sign of ``y``.  ``None`` is returned when any
        step raises; the error is printed rather than propagated.
    """
    print(f"Processing {ticker}")
    try:
        if not pd.api.types.is_datetime64_any_dtype(df.index):
            # Build the datetime index on a new frame so the caller's frame
            # is left exactly as it was passed in.
            df = df.assign(date_column=pd.to_datetime(df['date_column'])).set_index('date_column')

        # The second part of the split is not used by this function.
        train, _holdout = de.train_test_split(df)
        train = train[features]

        cv_splits = de.get_cv_splits(train)
        if not cv_splits:
            print("No CV splits available.")

        performances = []
        rmses = []
        accuracies = []

        for i, (train_idx, test_idx) in enumerate(cv_splits):
            # The split entries are index labels into `train` (they are used with .loc).
            fold_train = train.loc[train_idx]
            fold_test = train.loc[test_idx]
            X_train, y_train = fold_train.drop(columns=['y']), fold_train['y']
            X_test, y_test = fold_test.drop(columns=['y']), fold_test['y']

            # Fit the scalers on the training rows only; the test rows are
            # transformed with the training-range parameters.
            scaler_X = MinMaxScaler(feature_range=(-1, 1))
            scaler_y = MinMaxScaler(feature_range=(-1, 1))

            X_train_scaled = scaler_X.fit_transform(X_train)
            X_test_scaled = scaler_X.transform(X_test)
            y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1,)

            # NOTE(review): no `cv` is passed, so GridSearchCV falls back to its
            # default splitter, which presumably ignores the time order of the
            # rows — consider TimeSeriesSplit; confirm before changing.
            grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error')
            grid_search.fit(X_train_scaled, y_train_scaled)

            # Predictions come out on the scaled target; undo the scaling so the
            # RMSE is expressed in the units of `y`.
            predictions = grid_search.best_estimator_.predict(X_test_scaled)
            predictions = scaler_y.inverse_transform(predictions.reshape(-1, 1)).reshape(-1,)

            rmse = sqrt(mean_squared_error(y_test, predictions))
            # Directional accuracy: compare the signs of actual and predicted y.
            accuracy = accuracy_score(np.sign(y_test), np.sign(predictions))

            print(f'Ticker: {ticker}, Split {i}, RMSE: {rmse:.3f}, Accuracy: {accuracy:.3f}')

            trading_signals = np.sign(predictions)
            test_opens = fold_test["Open"].values

            rmses.append(rmse)
            accuracies.append(accuracy)
            performances.append((trading_signals, test_opens))

        return ticker, performances, rmses, accuracies

    except Exception as e:
        # Best-effort by design: one failing ticker must not abort the whole
        # parallel run.  Name the ticker so the failure can be traced.
        print(f"Error encountered for {ticker}:", e)
        return None

In [9]:
def run_parallel_pipeline(ticker_frames, model, features, param_grid):
    """Run ``train_and_evaluate_trees_debug`` for every ticker via joblib.

    ``ticker_frames`` maps each ticker to its frame; one job is queued per
    item and the jobs are dispatched with ``n_jobs=-1``.  Returns whatever
    ``Parallel`` yields for the queued jobs: one entry per ticker.
    """
    evaluate = delayed(train_and_evaluate_trees_debug)
    jobs = (
        evaluate(model, param_grid, symbol, frame, features)
        for symbol, frame in ticker_frames.items()
    )
    return Parallel(n_jobs=-1)(jobs)

In [10]:
# Column subsets handed to the pipeline.  Each list carries the target 'y' next
# to the predictors because the training function splits 'y' off itself; 'Open'
# is used both as a predictor and as the price series read for the simulation.
features = [
    "frob_comp",
    "pos_art_count",
    "total_articles",
    "Open_Diff",
    "y",
    "Open",
]
features_v2 = [
    "frob_comp",
    "frob_neg",
    "frob_neu",
    "frob_pos",
    "pos_art_count",
    "neg_art_count",
    "neu_art_count",
    "total_articles",
    "Open_Diff",
    "y",
    "Open",
]

# Hyper-parameter grid searched on every CV fold (2 x 2 x 2 = 8 candidates).
# Wider alternatives kept for reference:
#   n_estimators: 100, 250, 300, 400, 600, 750, 900, 1000, 1200
#   max_depth:    2, 8
param_grid = dict(
    n_estimators=[100, 500],
    learning_rate=[0.001, 0.01],
    max_depth=[2, 6],
)

In [11]:
# Gradient boosting on the full `features_v2` column set.  `random_state` pins the
# estimator's internal randomness (see the scikit-learn documentation) so that
# re-running the notebook reproduces the same scores.
model_gbt_v2 = GradientBoostingRegressor(random_state=42)
results_gbt_v2 = run_parallel_pipeline(df_dict, model_gbt_v2, features_v2, param_grid)

Processing WFC
Processing LLY
Processing JPM
Processing NVDA
Processing ABBV
Processing AMZN
Processing BAC
Processing MSFT
Processing MA
Processing UNH
Processing GOOGL
Processing AAPL
Processing JNJ
Processing V
Processing MRK
Ticker: UNH, Split 0, RMSE: 8.361, Accuracy: 0.594
Ticker: ABBV, Split 0, RMSE: 2.362, Accuracy: 0.594
Ticker: LLY, Split 0, RMSE: 4.731, Accuracy: 0.516
Ticker: MA, Split 0, RMSE: 6.906, Accuracy: 0.547
Ticker: MRK, Split 0, RMSE: 0.983, Accuracy: 0.641
Ticker: BAC, Split 0, RMSE: 0.725, Accuracy: 0.469
Ticker: WFC, Split 0, RMSE: 0.957, Accuracy: 0.500
Ticker: V, Split 0, RMSE: 4.221, Accuracy: 0.578
Ticker: JNJ, Split 0, RMSE: 1.724, Accuracy: 0.609
Ticker: JPM, Split 0, RMSE: 2.176, Accuracy: 0.516
Ticker: NVDA, Split 0, RMSE: 9.067, Accuracy: 0.469
Ticker: MSFT, Split 0, RMSE: 5.472, Accuracy: 0.547
Ticker: GOOGL, Split 0, RMSE: 2.858, Accuracy: 0.500
Ticker: AAPL, Split 0, RMSE: 3.604, Accuracy: 0.531
Ticker: AMZN, Split 0, RMSE: 4.505, Accuracy: 0.453
Ti

In [12]:
# Gradient boosting on the reduced `features` column set.  `random_state` pins the
# estimator's internal randomness (see the scikit-learn documentation) so that
# re-running the notebook reproduces the same scores.
model_gbt = GradientBoostingRegressor(random_state=42)
results_gbt = run_parallel_pipeline(df_dict, model_gbt, features, param_grid)

Processing NVDA
Processing BAC
Processing MRK
Processing JPM
Processing UNH
Processing GOOGL
Processing WFC
Processing V
Processing LLY
Processing MA
Processing AMZN
Processing MSFT
Processing ABBV
Processing JNJ
Processing AAPL
Ticker: UNH, Split 0, RMSE: 8.361, Accuracy: 0.594
Ticker: ABBV, Split 0, RMSE: 2.361, Accuracy: 0.594
Ticker: LLY, Split 0, RMSE: 4.733, Accuracy: 0.516
Ticker: BAC, Split 0, RMSE: 0.726, Accuracy: 0.469
Ticker: V, Split 0, RMSE: 4.219, Accuracy: 0.609
Ticker: MRK, Split 0, RMSE: 0.983, Accuracy: 0.641
Ticker: MA, Split 0, RMSE: 6.901, Accuracy: 0.453
Ticker: JPM, Split 0, RMSE: 2.173, Accuracy: 0.516
Ticker: WFC, Split 0, RMSE: 0.956, Accuracy: 0.500
Ticker: NVDA, Split 0, RMSE: 9.067, Accuracy: 0.469
Ticker: JNJ, Split 0, RMSE: 1.724, Accuracy: 0.609
Ticker: MSFT, Split 0, RMSE: 5.472, Accuracy: 0.547
Ticker: AMZN, Split 0, RMSE: 4.361, Accuracy: 0.531
Ticker: GOOGL, Split 0, RMSE: 2.815, Accuracy: 0.469
Ticker: AAPL, Split 0, RMSE: 3.610, Accuracy: 0.516
Ti

In [13]:
# Regroup the per-ticker pipeline output by CV fold, then simulate the portfolio
# fold by fold.  A ticker whose run failed comes back as None and is skipped
# instead of crashing the tuple unpacking below.
completed_gbt = [result for result in results_gbt if result is not None]

# Size the per-fold containers from the data rather than assuming four folds.
n_folds = max((len(result[1]) for result in completed_gbt), default=0)
cv_trades = [{} for _ in range(n_folds)]
cv_opens = [{} for _ in range(n_folds)]

# Each result is (ticker, performances, rmses, accuracies); only the per-fold
# (trading_signals, test_opens) pairs are needed here.
for ticker, performances, _, _ in completed_gbt:
    for i, (trading_signals, test_opens) in enumerate(performances):
        cv_trades[i][ticker] = trading_signals
        cv_opens[i][ticker] = test_opens

performance_porfolio = []
for i in range(n_folds):
    performance_fold = simulation.get_performance(cv_trades[i], cv_opens[i])
    print(f'Fold {i+1} Porfolio Value: {performance_fold:.4f}')
    performance_porfolio.append(performance_fold)

# With no completed fold there is nothing to average; report NaN explicitly
# instead of triggering NumPy's empty-slice warning.
average_portfolio_value = np.mean(performance_porfolio) if performance_porfolio else np.nan
# Growth relative to a portfolio value of 1 (presumably the starting value — confirm
# against simulation.get_performance).
percent_growth = (average_portfolio_value - 1) * 100
print(f'Average Final Portfolio Value: {average_portfolio_value:.4f}')
print(f'Average Percent Growth: {percent_growth:.4f}%')


Fold 1 Porfolio Value: 0.9887
Fold 2 Porfolio Value: 0.9978
Fold 3 Porfolio Value: 1.0291
Fold 4 Porfolio Value: 1.0073
Average Final Portfolio Value: 1.0057
Average Percent Growth: 0.5745%


In [14]:
def calculate_performance(results):
    """Simulate the portfolio on every CV fold and summarise the outcome.

    Parameters
    ----------
    results : iterable
        Items shaped like ``(ticker, performances, rmses, accuracies)`` where
        ``performances`` is a per-fold sequence of
        ``(trading_signals, test_opens)`` pairs.  ``None`` items are skipped.

    Returns
    -------
    tuple
        ``(fold_values, average_portfolio_value, percent_growth)``:
        the value returned by ``simulation.get_performance`` for each fold, their
        mean, and ``(mean - 1) * 100``.  With no fold to simulate the list is
        empty and both summary numbers are NaN.

    The number of folds is taken from ``results`` (the longest ``performances``
    sequence), so any fold count is supported.
    """
    completed = [result for result in results if result is not None]
    n_folds = max((len(performances) for _, performances, _, _ in completed), default=0)

    # One {ticker: values} mapping per fold, for signals and for open prices.
    cv_trades = [{} for _ in range(n_folds)]
    cv_opens = [{} for _ in range(n_folds)]

    for ticker, performances, _, _ in completed:
        for fold, (trading_signals, test_opens) in enumerate(performances):
            cv_trades[fold][ticker] = trading_signals
            cv_opens[fold][ticker] = test_opens

    fold_values = []
    for fold in range(n_folds):
        fold_value = simulation.get_performance(cv_trades[fold], cv_opens[fold])
        print(f'Fold {fold+1} Porfolio Value: {fold_value:.4f}')
        fold_values.append(fold_value)

    # Avoid NumPy's empty-slice warning when nothing could be simulated.
    average_portfolio_value = np.mean(fold_values) if fold_values else np.nan
    # Growth relative to a portfolio value of 1 (presumably the starting value — confirm).
    percent_growth = (average_portfolio_value - 1) * 100
    print(f'Average Final Portfolio Value: {average_portfolio_value:.4f}')
    print(f'Average Percent Growth: {percent_growth:.4f}%')

    return fold_values, average_portfolio_value, percent_growth

In [15]:
# Portfolio simulation for the run trained on the `features_v2` column set.
calculate_performance(results_gbt_v2)

Fold 1 Porfolio Value: 0.9872
Fold 2 Porfolio Value: 0.9934
Fold 3 Porfolio Value: 1.0141
Fold 4 Porfolio Value: 1.0038
Average Final Portfolio Value: 0.9996
Average Percent Growth: -0.0382%


([0.9872164344156628,
  0.993397073024697,
  1.0140628133422187,
  1.0037975748500154],
 0.9996184739081484,
 -0.03815260918516028)