In [1]:

import sys
import os
import yaml

import warnings

# Put the directories named by CODE_PATH and FIN_DATABASE_PATH on the import path;
# presumably these hold the project-local packages imported further down in this cell.
# os.getenv returns None for an unset variable, so skip it with a warning instead of
# appending None to sys.path. Entries already present are skipped too, so re-running
# this cell does not keep growing sys.path.
# NOTE(review): load_dotenv is imported below but not called in the visible cells, so
# these variables presumably have to be exported before the kernel starts — confirm.
for env_var in ("CODE_PATH", "FIN_DATABASE_PATH"):
    extra_path = os.getenv(env_var)
    if not extra_path:
        warnings.warn(f"Environment variable {env_var} is not set; project imports may fail.")
        continue
    if extra_path not in sys.path:
        sys.path.append(extra_path)


import plotly.graph_objects as go
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import ta
from log_config import setup_logging
from Data.connect import engine, DailyStockData, HourlyStockData, OneMinuteStockData, FiveMinuteStockData,FifteenMinuteStockData, StockSplits, StockNews, CompanyFinancials
from Pre_Processing.pre_processing import PreProcessing
from Feature_Engineering.feature_engineering import TechnicalIndicators
from pipeline import Pipeline

from sklearn.metrics import roc_auc_score, roc_curve, auc

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc, log_loss, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Index constituent lists scraped from Wikipedia (requires network access).
wiki = 'http://en.wikipedia.org/wiki'
djia_ticker_list = wiki + '/Dow_Jones_Industrial_Average'
sp500_tickers_list = wiki + '/List_of_S%26P_500_companies'

# pd.read_html returns a list with one DataFrame per HTML table on the page.
# The positional indices below (0 for the S&P 500 page, 1 for the DJIA page)
# depend on the current page layout.
sp500_tables = pd.read_html(sp500_tickers_list)
tickersSP500 = sp500_tables[0]['Symbol'].tolist()

djia_tables = pd.read_html(djia_ticker_list)
djia_tickers = djia_tables[1]['Symbol'].tolist()

In [3]:
# tickers = ['AAPL', 'MSFT', 'DIS', 'V', 'JPM']

In [4]:
# Tickers used for this run; a longer five-ticker list is kept commented out in the
# previous cell.
tickers = ['AAPL', 'MSFT']

In [5]:
# Instantiate the project's Pipeline (imported from the local `pipeline` module) for
# the selected tickers. Its internals are not visible in this notebook.
pipe = Pipeline(tickers)

In [6]:
# Run the pipeline. Later cells iterate `data.items()` as (ticker, DataFrame) pairs and
# read columns such as 'log_ret', so `data` is used as a mapping of per-ticker frames.
data = pipe.pipeline()

In [7]:
# Binary label per row: 1.0 when the NEXT row's log return is positive, else 0.0.
for df in data.values():
    up_move = (df['log_ret'] > 0).astype(int)
    # shift(-1) moves the following row's direction onto the current row, which
    # leaves the final row without a label (NaN).
    df['target'] = up_move.shift(-1)
    # Removes every row that contains a NaN in any column, not only the unlabeled
    # last row. The frames inside `data` are modified in place.
    # NOTE(review): re-running this cell drops one more trailing row each time.
    df.dropna(inplace=True)

In [8]:
# #Combining the data into a single dataframe

# combined_df = pd.concat(
#     [df.assign(ticker=ticker).set_index('ticker', append=True) for ticker, df in data.items()]
# )

# combined_df = combined_df.reorder_levels(['date', 'ticker'])


# combined_df.index = combined_df.index.set_levels(pd.to_datetime(combined_df.index.levels[0]), level=0)

# combined_df.sort_index(inplace=True)

# combined_df.columns = combined_df.columns.str.lower()
# combined_df.dropna(subset='target', inplace=True)
# combined_df

In [None]:
# combined_df['target'] = np.where(combined_df['log_ret'] > 0,1,
#                                 0)
# combined_df['target'] = combined_df['target'].shift(-1)

## Models
Applying decision-tree-based ensemble models (XGBoost, CatBoost and Random Forest).

In [10]:
# Columns fed to the classifiers. The cross-validation cell selects df[features],
# so every name listed here must exist in each per-ticker frame.
features = [
    'open',
    'high',
    'low',
    'close',
    'volume',
    'vwap',
    'RSI_14',
    'RSI_2',
    'MACD',
    'log_ret',
    'ROC',
    'Stoch',
    'ADX',
    'ADX_pos',
    'ADX_neg',
    'rolling_H-L_25',
    'lower_band',
    'ATR',
    'IBS',
    'OBV',
    # 'ema_<n>' columns, in the same order as before
    *(f'ema_{window}' for window in (5, 10, 30, 50, 100, 200, 300)),
]

# Name of the label column.
target = 'target'

In [11]:
# features = [feature.lower() for feature in features]

## Cross Validation

In [12]:
# Candidate classifiers to compare, as (display name, estimator) pairs.
# Each estimator gets an explicit seed so that a Restart & Run All reproduces the
# same scores; RandomForestClassifier in particular is not reproducible when
# random_state is left unset.
RANDOM_SEED = 42

models = [
    ("XGBoost", XGBClassifier(random_state=RANDOM_SEED)),
    ("CatBoost", CatBoostClassifier(verbose=False, random_seed=RANDOM_SEED)),
    ("RandomForest", RandomForestClassifier(random_state=RANDOM_SEED)),
]

In [13]:
# scikit-learn time-series cross-validator with 5 splits; the cross-validation cell
# calls tcsv.split(df) on each ticker's frame to obtain train/test row positions.
tcsv= TimeSeriesSplit(n_splits=5)

In [14]:
# One independent, initially empty dict per model name for each metric.
roc_auc_scores = dict((label, {}) for label, _estimator in models)
log_loss_scores = dict((label, {}) for label, _estimator in models)

In [17]:
# Per-fold evaluation of every model on every ticker.
# Layout: results[model_name][ticker] = {'roc_auc': [(fold, score), ...],
#                                        'log_loss': [(fold, score), ...]}
results = {model_name: {} for model_name, _ in models}

for ticker, df in data.items():
    # Fresh score lists for this ticker under each model. The second tuple element
    # is bound to `model`, not `_`, so IPython's last-output variable `_` is not
    # overwritten at notebook scope.
    for model_name, model in models:
        results[model_name][ticker] = {
            'roc_auc': [],
            'log_loss': []
        }

    # Folds are numbered from 1 in the order tcsv.split yields them.
    for fold_number, (train_index, test_index) in enumerate(tcsv.split(df), start=1):
        X_train, X_test = df[features].iloc[train_index], df[features].iloc[test_index]
        y_train, y_test = df['target'].iloc[train_index], df['target'].iloc[test_index]

        # The same estimator objects are refit on every fold and ticker.
        for model_name, model in models:
            model.fit(X_train, y_train)

            # Predict once and reuse: the full probability matrix feeds log_loss,
            # and its second column (index 1) feeds ROC AUC.
            proba = model.predict_proba(X_test)
            y_proba = proba[:, 1]

            roc_auc = roc_auc_score(y_test, y_proba)
            log_loss_value = log_loss(y_test, proba)

            # Store the results for each fold
            results[model_name][ticker]['roc_auc'].append((fold_number, roc_auc))
            results[model_name][ticker]['log_loss'].append((fold_number, log_loss_value))


In [18]:
# Print the per-fold ROC AUC and log loss for every model and ticker.
# The inner mapping is bound to `ticker_results` rather than `tickers`, so the
# global `tickers` list passed to Pipeline is not overwritten by this cell.
for model_name, ticker_results in results.items():
    print(f"Results for Model: {model_name}")
    for ticker, metrics in ticker_results.items():
        print(f"  Ticker: {ticker}")
        # Index log-loss values by fold number once instead of rescanning the
        # list for every fold.
        log_loss_by_fold = dict(metrics['log_loss'])
        for fold, roc_auc in metrics['roc_auc']:
            log_loss_value = log_loss_by_fold[fold]
            print(f"    Fold {fold}: ROC AUC = {roc_auc:.4f}, Log Loss = {log_loss_value:.4f}")
        print()


Results for Model: XGBoost
  Ticker: AAPL
    Fold 1: ROC AUC = 0.5029, Log Loss = 1.2370
    Fold 2: ROC AUC = 0.4882, Log Loss = 1.0915
    Fold 3: ROC AUC = 0.5365, Log Loss = 0.8905
    Fold 4: ROC AUC = 0.5352, Log Loss = 1.0653
    Fold 5: ROC AUC = 0.5025, Log Loss = 1.0064

  Ticker: MSFT
    Fold 1: ROC AUC = 0.5659, Log Loss = 0.9808
    Fold 2: ROC AUC = 0.5214, Log Loss = 1.0000
    Fold 3: ROC AUC = 0.5379, Log Loss = 1.0222
    Fold 4: ROC AUC = 0.5359, Log Loss = 0.9920
    Fold 5: ROC AUC = 0.5336, Log Loss = 0.9623

Results for Model: CatBoost
  Ticker: AAPL
    Fold 1: ROC AUC = 0.4992, Log Loss = 0.7738
    Fold 2: ROC AUC = 0.4907, Log Loss = 0.8128
    Fold 3: ROC AUC = 0.5243, Log Loss = 0.7314
    Fold 4: ROC AUC = 0.5246, Log Loss = 0.7711
    Fold 5: ROC AUC = 0.4837, Log Loss = 0.7594

  Ticker: MSFT
    Fold 1: ROC AUC = 0.5628, Log Loss = 0.7121
    Fold 2: ROC AUC = 0.5312, Log Loss = 0.7259
    Fold 3: ROC AUC = 0.5313, Log Loss = 0.7330
    Fold 4: ROC AU