In [142]:
import pandas as pd
import data_engineering as de
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np
from joblib import Parallel, delayed
import simulation
from math import sqrt
pd.options.mode.chained_assignment = None
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings(action='ignore', category=np.VisibleDeprecationWarning)

In [3]:
# Load one DataFrame per ticker via the project helper, then pass the
# resulting mapping straight through the project's NaN-filling step.
df_dict = de.fillna(de.separate_by_stock())

In [173]:
def train_and_evaluate_trees_debug(model, param_grid, ticker, df, features):
    """Grid-search a regressor on every CV split of one ticker and score it.

    For each split the features and the target are min-max scaled to
    [-1, 1] (scalers are fitted on the training part of the split only), a
    ``GridSearchCV`` is fitted on the scaled training data, and the best
    estimator's predictions are mapped back to the original target scale
    before scoring.

    Parameters
    ----------
    model : estimator
        Unfitted regressor handed to ``GridSearchCV``.
    param_grid : dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    ticker : str
        Label used in log lines and returned with the results.
    df : pandas.DataFrame
        Data for this ticker. If its index is not datetime-like, a
        ``'date_column'`` column is parsed and used as the index. The frame
        passed in by the caller is never modified.
    features : list of str
        Columns to keep. Must include the target ``'y'`` and ``'Open'``
        (the open prices are returned alongside the trading signals).

    Returns
    -------
    tuple or None
        ``(ticker, performances, rmses, accuracies)`` where ``performances``
        holds one ``(trading_signals, test_opens)`` pair per split, ``rmses``
        one RMSE per split and ``accuracies`` one sign-agreement accuracy per
        split. ``None`` is returned if any exception is raised; the error is
        printed together with the ticker.
    """
    print(f"Processing {ticker}")
    try:
        if not pd.api.types.is_datetime64_any_dtype(df.index):
            # Build a re-indexed frame rather than editing `df` in place, so
            # the caller's DataFrame (e.g. an entry of a shared dict) is left
            # exactly as it was passed in.
            df = df.assign(
                date_column=pd.to_datetime(df['date_column'])
            ).set_index('date_column')

        # `test` is the hold-out part; it is only column-filtered here and
        # not used for scoring in this function.
        train, test = de.train_test_split(df)
        train, test = train[features], test[features]

        # presumably an iterable of (train labels, test labels) pairs usable
        # with `.loc` on `train` -- verify against data_engineering.
        cv_splits = de.get_cv_splits(train)
        if not cv_splits:
            print("No CV splits available.")

        performances = []
        rmses = []
        accuracies = []

        for i, (train_idx, test_idx) in enumerate(cv_splits):
            fold_train = train.loc[train_idx]
            fold_test = train.loc[test_idx]

            X_train, y_train = fold_train.drop(columns=['y']), fold_train['y']
            X_test, y_test = fold_test.drop(columns=['y']), fold_test['y']

            scaler_X = MinMaxScaler(feature_range=(-1, 1))
            scaler_y = MinMaxScaler(feature_range=(-1, 1))

            # Fit the scalers on the training part only; the validation part
            # is transformed with the already-fitted feature scaler.
            X_train_scaled = scaler_X.fit_transform(X_train)
            X_test_scaled = scaler_X.transform(X_test)
            y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1,)

            # No `cv` argument is given, so GridSearchCV applies its own
            # default internal cross-validation to the training part.
            grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error')
            grid_search.fit(X_train_scaled, y_train_scaled)

            # Predict in scaled space, then undo the target scaling so RMSE
            # is expressed in the original units of `y`.
            predictions = grid_search.best_estimator_.predict(X_test_scaled)
            predictions = scaler_y.inverse_transform(predictions.reshape(-1, 1)).reshape(-1,)

            rmse = sqrt(mean_squared_error(y_test, predictions))
            # Directional accuracy: how often the predicted sign matches the
            # sign of the actual target.
            accuracy = accuracy_score(np.sign(y_test), np.sign(predictions))

            print(f'Ticker: {ticker}, Split {i}, RMSE: {rmse:.3f}, Accuracy: {accuracy:.3f}')

            # The sign of each prediction is used as the trading signal.
            trading_signals = np.sign(predictions)
            test_opens = fold_test['Open'].values

            rmses.append(rmse)
            accuracies.append(accuracy)
            performances.append((trading_signals, test_opens))

        return ticker, performances, rmses, accuracies

    except Exception as e:
        # Workers run in parallel and their output interleaves, so the ticker
        # is included to make the failure attributable.
        print(f"Error encountered while processing {ticker}:", e)
        return None


In [174]:
def run_parallel_pipeline(ticker_frames, model, features, param_grid):
    """Run ``train_and_evaluate_trees_debug`` for every ticker in parallel.

    ``ticker_frames`` maps a ticker label to its DataFrame. One joblib task
    is created per entry and executed with ``n_jobs=-1``. The list returned
    by ``Parallel`` holds one worker result per ticker.
    """
    worker = delayed(train_and_evaluate_trees_debug)
    tasks = (
        worker(model, param_grid, symbol, frame, features)
        for symbol, frame in ticker_frames.items()
    )
    executor = Parallel(n_jobs=-1)
    return executor(tasks)

In [175]:
# Columns kept for modelling. 'y' is the regression target and 'Open' is
# also read back later as the price series paired with the trading signals.
features = [
    'finvader_neg', 'finvader_neu', 'finvader_pos', 'finvader_tot',
    'Open', 'High', 'Low', 'Close', 'Volume',
    'pos_art_count', 'neg_art_count', 'neu_art_count', 'total_articles',
    'Open_Diff',
    'y',
]

# Hyper-parameter grid searched for every model.
# Alternative value lists left in the notebook by the author:
#   n_estimators:  [100, 250, 300, 400, 600, 750, 900, 1000, 1200]
#   learning_rate: [0.001, 0.01]
#   max_depth:     [2, 8]
param_grid = dict(
    n_estimators=[100, 500],
    learning_rate=[0.001, 0.01],
    max_depth=[2, 6],
)

In [176]:
# Gradient-boosted trees (scikit-learn) with default constructor settings;
# the grid search inside the pipeline supplies the tuned hyper-parameters.
model_gbt = GradientBoostingRegressor()
results_gbt = run_parallel_pipeline(
    ticker_frames=df_dict,
    model=model_gbt,
    features=features,
    param_grid=param_grid,
)

Processing ABBV
Processing AAPL
Processing GOOGL
Processing JNJ
Processing BAC
Processing AMZN
Processing MA
Processing MSFT
Processing MRK
Processing LLY
Processing JPM
Processing NVDA
Ticker: LLY, Split 0, RMSE: 4.693, Accuracy: 0.547
Ticker: ABBV, Split 0, RMSE: 2.368, Accuracy: 0.594
Ticker: BAC, Split 0, RMSE: 0.727, Accuracy: 0.469
Ticker: MA, Split 0, RMSE: 5.428, Accuracy: 0.688
Ticker: JPM, Split 0, RMSE: 2.177, Accuracy: 0.562
Ticker: MRK, Split 0, RMSE: 2.797, Accuracy: 0.531
Ticker: JNJ, Split 0, RMSE: 1.653, Accuracy: 0.641
Ticker: NVDA, Split 0, RMSE: 9.070, Accuracy: 0.469
Ticker: MSFT, Split 0, RMSE: 5.464, Accuracy: 0.562
Ticker: GOOGL, Split 0, RMSE: 2.853, Accuracy: 0.500
Ticker: AAPL, Split 0, RMSE: 3.563, Accuracy: 0.562
Ticker: AMZN, Split 0, RMSE: 3.998, Accuracy: 0.703
Ticker: LLY, Split 1, RMSE: 5.718, Accuracy: 0.469
Ticker: ABBV, Split 1, RMSE: 1.981, Accuracy: 0.516
Ticker: BAC, Split 1, RMSE: 0.682, Accuracy: 0.406
Ticker: MA, Split 1, RMSE: 5.850, Accuracy

In [185]:
# Same pipeline and grid as the GBT run, with an XGBoost regressor
# constructed with default settings.
model_xgb = XGBRegressor()
results_xgb = run_parallel_pipeline(
    ticker_frames=df_dict,
    model=model_xgb,
    features=features,
    param_grid=param_grid,
)

Processing AAPL
Processing AMZN
Processing BAC
Processing ABBV
Processing GOOGL
Processing JNJ
Processing MSFT
Processing MA
Processing MRK
Processing LLY
Processing NVDA
Processing JPM
Ticker: ABBV, Split 0, RMSE: 2.364, Accuracy: 0.609
Ticker: LLY, Split 0, RMSE: 4.688, Accuracy: 0.562
Ticker: BAC, Split 0, RMSE: 0.729, Accuracy: 0.516
Ticker: GOOGL, Split 0, RMSE: 2.843, Accuracy: 0.500
Ticker: NVDA, Split 0, RMSE: 9.059, Accuracy: 0.469
Ticker: AAPL, Split 0, RMSE: 3.607, Accuracy: 0.516
Ticker: MA, Split 0, RMSE: 5.558, Accuracy: 0.734
Ticker: JPM, Split 0, RMSE: 2.173, Accuracy: 0.547
Ticker: MRK, Split 0, RMSE: 1.440, Accuracy: 0.562
Ticker: AMZN, Split 0, RMSE: 3.811, Accuracy: 0.688
Ticker: MSFT, Split 0, RMSE: 5.483, Accuracy: 0.547
Ticker: JNJ, Split 0, RMSE: 1.839, Accuracy: 0.516
Ticker: ABBV, Split 1, RMSE: 1.981, Accuracy: 0.516
Ticker: LLY, Split 1, RMSE: 5.739, Accuracy: 0.453
Ticker: BAC, Split 1, RMSE: 0.671, Accuracy: 0.453
Ticker: NVDA, Split 1, RMSE: 6.232, Accura

In [183]:
def calculate_performance(results):
    """Simulate the portfolio for every CV fold and summarise the outcome.

    Parameters
    ----------
    results : iterable
        Per-ticker results shaped like
        ``(ticker, performances, rmses, accuracies)``, where ``performances``
        is a list with one ``(trading_signals, test_opens)`` pair per fold.
        ``None`` entries (a ticker whose worker failed) are skipped.

    Returns
    -------
    tuple
        ``(fold_values, average_value, percent_growth)``: the value returned
        by ``simulation.get_performance`` for each fold, their mean, and
        ``(mean - 1) * 100``. When no fold data is available the result is
        ``([], nan, nan)``.
    """
    # A failed worker yields None instead of a result tuple; unpacking it
    # would raise, so such entries are left out of the evaluation.
    completed = [result for result in results if result is not None]

    # Derive the number of folds from the data instead of assuming four, so
    # that more folds do not overflow the buckets and fewer folds do not
    # trigger simulations on empty inputs.
    n_folds = max((len(result[1]) for result in completed), default=0)
    if n_folds == 0:
        print('No fold results available.')
        return [], float('nan'), float('nan')

    # One {ticker: array} mapping per fold, for signals and for open prices.
    cv_trades = [{} for _ in range(n_folds)]
    cv_opens = [{} for _ in range(n_folds)]

    for ticker, performances, _, _ in completed:
        for i, (trading_signals, test_opens) in enumerate(performances):
            cv_trades[i][ticker] = trading_signals
            cv_opens[i][ticker] = test_opens

    performance_porfolio = []
    for i in range(n_folds):
        performance_fold = simulation.get_performance(cv_trades[i], cv_opens[i])
        print(f'Fold {i+1} Porfolio Value: {performance_fold:.4f}')
        performance_porfolio.append(performance_fold)

    average_portfolio_value = np.mean(performance_porfolio)
    percent_growth = (average_portfolio_value - 1) * 100
    print(f'Average Final Portfolio Value: {average_portfolio_value:.4f}')
    print(f'Average Percent Growth: {percent_growth:.4f}%')

    return performance_porfolio, average_portfolio_value, percent_growth
        


In [184]:
# Portfolio summary for the gradient-boosting run; the returned tuple is
# shown as the cell output.
calculate_performance(results=results_gbt)

Fold 1 Porfolio Value: 1.0498
Fold 2 Porfolio Value: 1.0560
Fold 3 Porfolio Value: 1.0577
Fold 4 Porfolio Value: 1.0638
Average Final Portfolio Value: 1.0568
Average Percent Growth: 5.6842%


([1.049842012027107,
  1.0560350443894293,
  1.0577068867162116,
  1.0637836134556287],
 1.056841889147094,
 5.684188914709409)

In [186]:
# Portfolio summary for the XGBoost run; the returned tuple is shown as the
# cell output.
calculate_performance(results=results_xgb)

Fold 1 Porfolio Value: 1.0346
Fold 2 Porfolio Value: 1.0467
Fold 3 Porfolio Value: 1.0523
Fold 4 Porfolio Value: 1.0840
Average Final Portfolio Value: 1.0544
Average Percent Growth: 5.4390%


([1.034605425226804,
  1.0466591182993161,
  1.0523039416307731,
  1.0839918275149545],
 1.054390078167962,
 5.439007816796204)

In [182]:
# Cell-level portfolio evaluation of the GBT results: the same steps as
# calculate_performance, but leaving the intermediates in the notebook
# namespace. Four folds are assumed.
cv_trades = [{} for _ in range(4)]
cv_opens = [{} for _ in range(4)]

# Regroup the per-ticker results into one {ticker: array} mapping per fold.
for result in results_gbt:
    ticker, performances, _, _ = result
    for i, fold_pair in enumerate(performances):
        trading_signals, test_opens = fold_pair
        cv_trades[i][ticker] = trading_signals
        cv_opens[i][ticker] = test_opens

# Simulate each fold and collect the value returned by the simulation.
performance_porfolio = []
for i, (fold_trades, fold_opens) in enumerate(zip(cv_trades, cv_opens)):
    performance_fold = simulation.get_performance(fold_trades, fold_opens)
    print(f'Fold {i+1} Porfolio Value: {performance_fold:.4f}')
    performance_porfolio.append(performance_fold)

average_portfolio_value = np.mean(performance_porfolio)
percent_growth = 100 * (average_portfolio_value - 1)
print(f'Average Final Portfolio Value: {average_portfolio_value:.4f}')
print(f'Average Percent Growth: {percent_growth:.4f}%')

Fold 1 Porfolio Value: 1.0498
Fold 2 Porfolio Value: 1.0560
Fold 3 Porfolio Value: 1.0577
Fold 4 Porfolio Value: 1.0638
Average Final Portfolio Value: 1.0568
Average Percent Growth: 5.6842%
