In [1]:
import sys
import os
import yaml

sys.path.append(os.getenv("CODE_PATH"))
sys.path.append(os.getenv("FIN_DATABASE_PATH"))


import plotly.graph_objects as go
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import ta
from log_config import setup_logging
from Data.connect import engine, DailyStockData, HourlyStockData, OneMinuteStockData, FiveMinuteStockData,FifteenMinuteStockData, StockSplits, StockNews, CompanyFinancials
from Pre_Processing.pre_processing import PreProcessing
from Feature_Engineering.feature_engineering import TechnicalIndicators
from pipeline import Pipeline

from sklearn.metrics import roc_auc_score, roc_curve, auc

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc, log_loss, confusion_matrix, ConfusionMatrixDisplay

In [2]:
wiki = 'http://en.wikipedia.org/wiki'
djia_ticker_list = wiki + '/Dow_Jones_Industrial_Average'
sp500_tickers_list = wiki + '/List_of_S%26P_500_companies'
tickersSP500 = pd.read_html(sp500_tickers_list)[0].Symbol.to_list()
djia_tickers = pd.read_html(djia_ticker_list)[1].Symbol.to_list()

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

In [3]:
tickers = ['AAPL', 'MSFT', 'DIS', 'V', 'JPM']

In [4]:
pipe = Pipeline(tickers)

In [None]:
data = pipe.pipeline(timespan= 'hour')

In [None]:
combined_df = pd.concat(
    [df.assign(ticker=ticker).set_index('ticker', append=True) for ticker, df in data.items()]
)

combined_df = combined_df.reorder_levels(['date', 'ticker'])
combined_df.columns = combined_df.columns.str.lower()
combined_df.sort_index(inplace=True)
combined_df.index = pd.to_datetime(combined_df.index.get_level_values('date'))
combined_df


In [None]:
combined_df.info()

In [None]:
import seaborn as sns
sns.histplot(combined_df['rsi_14'], kde= True)

In [None]:
combined_df.columns

In [None]:
cols = ['rsi_14', 'rsi_2', 'macd', 'log_ret', 'return',
       'roc', 'stoch', 'adx', 'adx_pos', 'adx_neg', 'rolling_h-l_25']

In [None]:
#Examining aplle data further

apple_data = data['AAPL']

fig = go.Figure(data = go.Candlestick(x = apple_data.index,
                                      open = apple_data['open'],
                                      high = apple_data['high'],
                                      low = apple_data['low'],
                                      close = apple_data['close'],
                                      increasing=dict(line=dict(color='black')),
                                      decreasing=dict(line=dict(color='red')),
                                      showlegend=False))

layout = go.Layout(
    title=' Apple Stock Price - 2003 to 2024',
    yaxis=dict(title='Price (USD)'),
    xaxis=dict(title='Date'),
    template = 'ggplot2',
    xaxis_rangeslider_visible=False,
    yaxis_gridcolor='white',
    xaxis_gridcolor='white',
    yaxis_tickfont=dict(color='black'),
    xaxis_tickfont=dict(color='black'),
    margin=dict(t=50,l=50,r=50,b=50)
)

fig.update_layout(layout)


fig.show()

In [None]:
#Checking apple's returns

fig = go.Figure()

fig.add_trace(go.Scatter(x = apple_data.index,
                         y = apple_data['log_ret'].dropna(),
                         mode = 'lines',
                         line = dict(color = 'blue'),
                         name = 'Returns'))


layout = go.Layout(
    title=' Apple Stock Returns - 2003 to 2024',
    yaxis=dict(title='Returns (%)'),
    xaxis=dict(title='Date'),
    template='ggplot2',
    xaxis_rangeslider_visible=False,
    yaxis_gridcolor='white',
    xaxis_gridcolor='white',
    yaxis_tickfont=dict(color='black'),
    xaxis_tickfont=dict(color='black'),
    margin=dict(t=50, l=50, r=50, b=50)
)

fig.update_layout(layout)

fig.show()

In [None]:
import plotly.figure_factory as ff

fig = ff.create_distplot([apple_data['log_ret'].dropna()], ['Returns'], bin_size=0.001, show_rug=False)


fig.update_layout(
    title='Distribution of Apple Stock Returns',
    xaxis_title='Returns',
    yaxis_title='Density',
    template='ggplot2',
    margin=dict(t=50, l=50, r=50, b=50)
)

fig.show()



In [None]:
import scipy.stats as stats

# QQ plot
plt.figure(figsize=(8, 6))
stats.probplot(apple_data['log_ret'].dropna(), dist="norm", plot=plt)
plt.title('QQ Plot of Apple Stock Returns')
plt.show()


Returns exhibits fat tails 

## Models
Applying Decision Tree models

In [None]:
apple_data['target'] = np.where(apple_data['log_ret'] > 0,1,
                                0)

In [None]:
apple_data['target'] = apple_data['target'].shift(-1)
apple_data.dropna(subset= 'target', inplace= True)

In [None]:
value_counts = apple_data['target'].value_counts()
value_counts

In [None]:
sns.barplot(x=value_counts.index, y=value_counts.values)
plt.title('Distribution of Target Values')
plt.xlabel('Target')
plt.ylabel('Count')
plt.show()

In [None]:
features = [ 'open', 'high', 'low', 'close',
       'volume', 'vwap', 'RSI_14', 'RSI_2', 'MACD', 'log_ret',
       'ROC', 'Stoch', 'ADX', 'ADX_pos', 'ADX_neg', 'rolling_H-L_25',
       'lower_band', 'ATR', 'IBS', 'OBV', 'ema_5', 'ema_10',
        'ema_30', 'ema_50', 'ema_100',
       'ema_200', 'ema_300']

## Cross Validation
Starting with only one 70/30 split for the sake of simplicity

In [None]:
train_test_split = 0.7
train_idx = int(len(apple_data) * train_test_split)

train_data = apple_data.iloc[:train_idx]
test_data = apple_data.iloc[train_idx:]

In [None]:
train_data[features]

In [None]:
X_train = train_data[features]
y_train = train_data['target']
X_test = test_data[features]
y_test = test_data['target']

In [None]:
#Testing different classifiers

models = [
    ("XGBoost", XGBClassifier()),
    ("CatBoost", CatBoostClassifier(verbose=False)),
    ("RandomForest", RandomForestClassifier())
]

In [None]:
plt.figure(figsize=(10, 10))
for name, model in models:
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1] 
    y_pred = model.predict(X_test) 
    
    # Calculating some Metrics
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    log_loss_value = log_loss(y_test, y_pred_proba)

    print(f'{name} ROC AUC: {roc_auc}')
    print(f'{name} Log Loss: {log_loss_value}')
    
    # ROC curve
    # fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    
    #CM
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'{name} Confusion Matrix')
    plt.show()