In [None]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Data Preprocessing


In [None]:
# Analysis window: fixed start date, end date is "today" formatted as YYYY-MM-DD
setDate = "2023-01-01"
endDate = datetime.now().strftime('%Y-%m-%d')

# Original Data From Yahoo Finance


In [None]:
# Download daily QQQ bars and keep only the four price columns, in this order
# (later cells select columns by position, so the order matters)
ticker = "QQQ"
price_columns = ['Open', 'Close', 'High', 'Low']
data = yf.download(ticker, start=setDate)[price_columns]

# Flatten the (Price, Ticker) column index down to the price names
data.columns = data.columns.droplevel('Ticker')

[*********************100%***********************]  1 of 1 completed


# Other Indicators

In [None]:
close = data['Close']
prev_close = close.shift(1)

# Indicator1: Daily return (close-to-close percentage change)
daily_return = close.pct_change()
data['Daily Return'] = daily_return

# Indicator2: Volatility - 5-day rolling std of daily returns, scaled by sqrt(5)
data['Volatility'] = daily_return.rolling(window=5).std() * np.sqrt(5)

# Indicator3: Simple Moving Average (SMA) over 5 and 21 days
for span in (5, 21):
    data[f'SMA_{span}'] = close.rolling(window=span).mean()

# Indicator4: Exponential Moving Average (EMA) over 5 and 21 days
for span in (5, 21):
    data[f'EMA_{span}'] = close.ewm(span=span, adjust=False).mean()

# True Range (TR): the largest of the three candidate ranges below
data['TR1'] = data['High'] - data['Low']
data['TR2'] = (data['High'] - prev_close).abs()
data['TR3'] = (data['Low'] - prev_close).abs()
data['True Range'] = data[['TR1', 'TR2', 'TR3']].max(axis=1)

# Indicator5: Average True Range (ATR) - rolling mean of the True Range
atr_period = 5
data['ATR'] = data['True Range'].rolling(window=atr_period).mean()

In [None]:
# EPS Growth of Nasdaq-100 based on previous quarter:
# load the quarterly file, discard its last two rows, forward-fill to a daily
# grid and restrict it to the analysis window
eps_quarterly = (
    pd.read_csv('/content/NasdaqEPS.csv', parse_dates=['Date'], index_col='Date')
    .iloc[:-2]
    .resample('D')
    .ffill()
    .loc[setDate:endDate]
)

# Merge EPS into the price data on the 'Date' column, then restore the date index
eps_quarterly_reset = eps_quarterly.reset_index()
data = (
    data.reset_index()
    .merge(eps_quarterly_reset, how='left', on='Date')
    .set_index('Date')
)

# Indicator6: Nasdaq-100 P/E Ratio (index close divided by the EPS 'Value' column)
ndx = yf.download('^NDX', start=setDate)
data['Nasdaq-100'] = ndx['Close']
data['P/E'] = data['Nasdaq-100'].div(data['Value'])

[*********************100%***********************]  1 of 1 completed


In [None]:
# Download the closing series of each market-wide indicator
sp500 = yf.download('^GSPC', start=setDate)[['Close']]
vix = yf.download('^VIX', start=setDate)[['Close']]
tnx = yf.download('^TNX', start=setDate)[['Close']]
oil = yf.download('CL=F', start=setDate)[['Close']]

# Indicator7: S&P 500 (close and daily return)
sp500_close = sp500['Close']
data['SP500 Close'] = sp500_close
data['SP500 Daily Return'] = sp500_close.pct_change()

# Indicator8: Volatility of the market VIX
data['VIX'] = vix['Close']

# Indicator9: Interest Rate
data['Yield'] = tnx['Close']

# Indicator10: Crude Oil
data['Oil'] = oil['Close']

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [None]:
# Remove helper columns that are not used as features and give EPS a readable name
data.drop(columns=['Nasdaq-100', 'YOY (%)', 'TR1', 'TR2', 'TR3', 'True Range'], inplace=True)
data.rename(columns={'Value': 'EPS'}, inplace=True)

# Lag every column after the first four by one row, so each row only carries
# the previous row's indicator values, and mark the lag in the column name
indicator_cols = list(data.columns[4:])
data[indicator_cols] = data[indicator_cols].shift(1)
data.rename(columns={name: f'Prev {name}' for name in indicator_cols}, inplace=True)

# Add lagged copies of the columns at positions 1-3 while keeping the originals
for name in list(data.columns[1:4]):
    data[f'Prev {name}'] = data[name].shift(1)

# Rows with any missing value (e.g. indicator warm-up periods) are discarded
data = data.dropna()
display(data)

Unnamed: 0_level_0,Open,Close,High,Low,Prev Daily Return,Prev Volatility,Prev SMA_5,Prev SMA_21,Prev EMA_5,Prev EMA_21,...,Prev EPS,Prev P/E,Prev SP500 Close,Prev SP500 Daily Return,Prev VIX,Prev Yield,Prev Oil,Prev Close,Prev High,Prev Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-02-02,307.570007,311.720001,313.679993,306.730011,0.021384,0.037980,295.082001,280.917140,294.946578,283.585539,...,459.515,26.904670,4119.209961,0.010452,17.870001,3.397,76.410004,300.920013,303.429993,292.299988
2023-02-03,304.940002,306.179993,312.390015,304.540009,0.035890,0.046221,298.758002,283.166664,300.537719,286.143218,...,459.515,27.862289,4179.759766,0.014699,18.730000,3.396,75.879997,311.720001,313.679993,306.730011
2023-02-06,303.510010,303.589996,305.910004,302.220001,-0.017772,0.055449,300.741998,285.092378,302.418477,287.964743,...,459.515,27.362241,4136.479980,-0.010355,18.330000,3.532,73.389999,306.179993,312.390015,304.540009
2023-02-07,303.459991,309.880005,311.029999,302.320007,-0.008459,0.049163,303.406000,287.092855,302.808983,289.385220,...,459.515,27.125360,4111.080078,-0.006140,19.430000,3.634,74.110001,303.589996,305.910004,302.220001
2023-02-08,308.540009,304.369995,309.500000,303.670013,0.020719,0.050326,306.458002,289.049046,305.165991,291.248382,...,459.515,27.699356,4164.000000,0.012873,18.660000,3.674,77.139999,309.880005,311.029999,302.320007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-03,513.950012,516.869995,517.150024,513.369995,0.010888,0.016536,509.245990,505.086664,510.002469,504.625032,...,572.833,36.947242,6047.149902,0.002448,13.340000,4.196,68.099998,515.289978,516.260010,510.619995
2024-12-04,520.320007,523.260010,523.520020,519.599976,0.003066,0.016358,511.301990,506.488569,512.291644,505.738211,...,572.833,37.060226,6049.879883,0.000451,13.300000,4.223,69.940002,516.869995,517.150024,513.369995
2024-12-05,523.309998,521.809998,524.039978,521.419983,0.012363,0.018429,514.091992,508.262379,515.947766,507.331101,...,572.833,37.519416,6086.490234,0.006051,13.450000,4.180,68.540001,523.260010,523.520020,519.599976
2024-12-06,522.479980,526.479980,526.719971,522.349976,-0.002771,0.013991,517.393994,509.671903,517.901843,508.647365,...,572.833,37.402211,6075.109863,-0.001870,13.540000,4.180,68.300003,521.809998,524.039978,521.419983


# Trading Strategy:
We make the following assumption about our trading:

1. The trading horizon is 1 day.

We use the following strategy for trading:

1. Buy the ETF at open*(1 - 0.1%). We do not trade if the price never falls to this level during the day.
2. Sell the ETF when the price reaches the stop-profit limit, which is (open + 0.5*previous ATR). Otherwise, we sell at the close price of the day.





In [None]:
# Define function to find enter price and exit price
def calculate_enter_price(group, threshold=0.001):
    """Select the bar at which the discounted entry price is first reached.

    The entry price is the ``Open`` of the first row in ``group`` reduced by
    ``threshold`` (a fraction, 0.001 = 0.1%). A bar qualifies when its ``Low``
    is at or below that entry price.

    Args:
        group: DataFrame of bars for a single day with at least the columns
            'Open' and 'Low'. Rows are taken in the order given; the first row
            supplies the day's open.
        threshold: Fractional discount applied to the open to get the entry
            price. Defaults to 0.001, the value previously hard-coded.

    Returns:
        A one-row DataFrame: the first qualifying bar, or the first bar of the
        group when no bar qualifies. The row carries an extra boolean column
        'Enter Market' telling which of the two cases applies.
    """
    daily_open = group['Open'].iloc[0]
    entry_price = daily_open * (1 - threshold)

    # assign() returns a new frame, so the object handed in by groupby.apply
    # is left untouched (pandas does not support mutating it).
    flagged = group.assign(**{'Enter Market': group['Low'] <= entry_price})

    hits = flagged[flagged['Enter Market']]
    if hits.empty:
        # Entry price never reached: report the first bar with the flag False
        return flagged.iloc[[0]]
    return hits.head(1)  # first matching row

def find_last_profit(group):
    """Select the last bar whose high exceeds the stop-profit level.

    Args:
        group: DataFrame of bars for a single day with at least the columns
            'High' and 'Stop Profit'. Rows are taken in the order given.

    Returns:
        A one-row DataFrame: the last bar with ``High > Stop Profit``, or the
        last bar of the group when no bar exceeds the level. The row carries
        an extra boolean column 'Exit Market' telling which case applies.
    """
    # assign() returns a new frame, so the object handed in by groupby.apply
    # is left untouched (pandas does not support mutating it).
    flagged = group.assign(**{'Exit Market': group['High'] > group['Stop Profit']})

    hits = flagged[flagged['Exit Market']]
    if hits.empty:
        # Level never exceeded: report the final bar with the flag False
        return flagged.iloc[[-1]]
    return hits.iloc[[-1]]  # last matching row

In [None]:
# Fetch hourly data for QQQ
# NOTE(review): Yahoo Finance is understood to limit 1h history to roughly the
# most recent 730 days - confirm; with a fixed setDate this request may
# eventually fall outside that window.
df = yf.download(ticker, start=setDate, interval="1h")

# Step 1: Merge hourly data with stop profit
# Stop-profit level per day: that day's open plus half of the lagged ATR column
data['Stop Profit'] = data['Open'] + 0.5*data['Prev ATR']
# Tag each hourly bar with its calendar date so it can be joined to the daily rows
df['Date'] = df.index.date
# Drop the 'Ticker' level from the column index
df.columns = df.columns.droplevel('Ticker')
# Move both indexes into ordinary columns so the merge below can key on 'Date'
df = df.reset_index()
data = data.reset_index()
# Convert the date objects to datetimes before merging
# (assumes data['Date'] is datetime64 - TODO confirm)
df['Date'] = pd.to_datetime(df['Date'])
# Right join keeps every daily row of `data`. Both frames have a 'Close'
# column, so the merged frame holds 'Close_x' (hourly) and 'Close_y' (daily).
df = pd.merge(df, data[['Date', 'Close', 'Stop Profit']], on='Date', how='right')

[*********************100%***********************]  1 of 1 completed


In [None]:
# Step 2: For each day, locate the bar at which the entry price is reached
result1 = (
    df.groupby('Date')
    .apply(calculate_enter_price)
    .rename(columns={'Datetime': 'Enter Time'})
)

# Step 3: For each day, locate the last bar that exceeds the stop-profit level
# ('Exit Market' is True when such a bar exists, otherwise False)
result2 = (
    df.groupby('Date')
    .apply(find_last_profit)
    .rename(columns={'Datetime': 'Exit Time', 'Close_y': 'Close'})
)

# Step 4: Flatten both per-day results and join them on the trading date
result1 = result1.reset_index(drop=True)
result2 = result2.reset_index(drop=True)
res = result1[['Date', 'Enter Time', 'Low', 'Enter Market']].merge(
    result2[['Date', 'Exit Time', 'Close', 'Stop Profit', 'Exit Market']],
    on='Date',
    how='left',
)

# Step 5: We sell at the profit limit only when the limit is hit strictly
# after the entry bar; otherwise the position is sold at the close
res['Sell at Profit'] = (res['Exit Market'] == True) & (res['Enter Time'] < res['Exit Time'])
res = res.drop(columns=['Enter Time', 'Exit Time'])
display(res)

  result1 = df.groupby('Date').apply(calculate_enter_price)
  result2 = df.groupby('Date').apply(find_last_profit)


Unnamed: 0,Date,Low,Enter Market,Close,Stop Profit,Exit Market,Sell at Profit
0,2023-02-02,306.730011,True,311.720001,310.989005,True,True
1,2023-02-03,304.920013,True,306.179993,309.056000,True,True
2,2023-02-06,302.670013,True,303.589996,307.773007,False,False
3,2023-02-07,302.769989,True,309.880005,307.481989,True,True
4,2023-02-08,307.119995,True,304.369995,312.981006,False,False
...,...,...,...,...,...,...,...
461,2024-12-03,513.369995,False,516.869995,516.918018,True,True
462,2024-12-04,519.619995,True,523.260010,522.947015,True,True
463,2024-12-05,522.359985,True,521.809998,526.247006,False,False
464,2024-12-06,522.349976,False,526.479980,524.940988,True,True


# Trading Prediction (Binary Classification)

# Define labeling logic
1. If enter market is false, then the true label is 0 -> not enter
2. If enter market is true, the true label depends on how the trade ends:\
(1). If sell at profit is true, then the true label is 1\
(2). If sell at profit is false, then the true label is 1 if close - low > 0 (daily close minus the low of the entry bar), else 0


In [None]:
# Set label according to the profit limits
def data_label(row):
    """Return the binary target (1 or 0) for one day of trading results.

    ``row`` must provide 'Enter Market', 'Sell at Profit', 'Close' and 'Low'.
    """
    # The market was never entered: label 0
    if not row['Enter Market']:
        return 0

    # Entered and sold at the profit limit: label 1
    if row['Sell at Profit']:
        return 1

    # Entered and sold at the close: label 1 only if close is above low
    closed_above_low = (row['Close'] - row['Low']) > 0
    return int(closed_above_low)

In [None]:
# Compute the label row by row, then attach it to the feature table by date
res['Target'] = res.apply(data_label, axis=1)
data = (
    data.merge(res[['Date', 'Target']], on='Date', how='left')
    .set_index('Date')
)
display(data)

Unnamed: 0_level_0,Open,Close,High,Low,Prev Daily Return,Prev Volatility,Prev SMA_5,Prev SMA_21,Prev EMA_5,Prev EMA_21,...,Prev SP500 Close,Prev SP500 Daily Return,Prev VIX,Prev Yield,Prev Oil,Prev Close,Prev High,Prev Low,Stop Profit,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-02-02,307.570007,311.720001,313.679993,306.730011,0.021384,0.037980,295.082001,280.917140,294.946578,283.585539,...,4119.209961,0.010452,17.870001,3.397,76.410004,300.920013,303.429993,292.299988,310.989005,1
2023-02-03,304.940002,306.179993,312.390015,304.540009,0.035890,0.046221,298.758002,283.166664,300.537719,286.143218,...,4179.759766,0.014699,18.730000,3.396,75.879997,311.720001,313.679993,306.730011,309.056000,1
2023-02-06,303.510010,303.589996,305.910004,302.220001,-0.017772,0.055449,300.741998,285.092378,302.418477,287.964743,...,4136.479980,-0.010355,18.330000,3.532,73.389999,306.179993,312.390015,304.540009,307.773007,1
2023-02-07,303.459991,309.880005,311.029999,302.320007,-0.008459,0.049163,303.406000,287.092855,302.808983,289.385220,...,4111.080078,-0.006140,19.430000,3.634,74.110001,303.589996,305.910004,302.220001,307.481989,1
2023-02-08,308.540009,304.369995,309.500000,303.670013,0.020719,0.050326,306.458002,289.049046,305.165991,291.248382,...,4164.000000,0.012873,18.660000,3.674,77.139999,309.880005,311.029999,302.320007,312.981006,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-03,513.950012,516.869995,517.150024,513.369995,0.010888,0.016536,509.245990,505.086664,510.002469,504.625032,...,6047.149902,0.002448,13.340000,4.196,68.099998,515.289978,516.260010,510.619995,516.918018,0
2024-12-04,520.320007,523.260010,523.520020,519.599976,0.003066,0.016358,511.301990,506.488569,512.291644,505.738211,...,6049.879883,0.000451,13.300000,4.223,69.940002,516.869995,517.150024,513.369995,522.947015,1
2024-12-05,523.309998,521.809998,524.039978,521.419983,0.012363,0.018429,514.091992,508.262379,515.947766,507.331101,...,6086.490234,0.006051,13.450000,4.180,68.540001,523.260010,523.520020,519.599976,526.247006,0
2024-12-06,522.479980,526.479980,526.719971,522.349976,-0.002771,0.013991,517.393994,509.671903,517.901843,508.647365,...,6075.109863,-0.001870,13.540000,4.180,68.300003,521.809998,524.039978,521.419983,524.940988,0


In [None]:
# Feature matrix: drop the label and the unlagged Close/Low/High columns.
# NOTE(review): 'Prev ATR' is dropped as well while 'Stop Profit'
# (Open + 0.5 * Prev ATR) is kept - confirm this is intended.
X = data.drop(columns=['Target', 'Close', 'Prev ATR', 'Low', 'High'])
y = data['Target']

# Step1: Train-test split 7:3, by row order (no shuffling): the first 70% of
# rows train the model and the remaining 30% are held out
split = int(len(X)*0.7)
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

# Step2: Standardize features (the scaler is fitted on the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step3: XGBoost Model
# Class counts come from the training labels only, so nothing about the
# held-out period's class balance leaks into the model configuration.
num_pos = int((y_train == 1).sum())
num_neg = int((y_train == 0).sum())

# Compute scale_pos_weight (negatives per positive); fall back to 1.0 when the
# training window has no positive samples, instead of dividing by zero
scale_pos_weight = num_neg / num_pos if num_pos else 1.0

# Create the XGBoost classifier. subsample/colsample_bytree make training
# stochastic, so the seed is fixed to keep results reproducible across runs.
xgb = XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=50,
    subsample=0.8,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42)

xgb.fit(X_train, y_train)

In [None]:
# Out-of-sample Performance: predictions on the held-out rows
y_pred = xgb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Report overall accuracy followed by the per-class metrics
print(f"Accuracy: {test_accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.6428571428571429
              precision    recall  f1-score   support

           0       0.59      0.24      0.34        54
           1       0.65      0.90      0.75        86

    accuracy                           0.64       140
   macro avg       0.62      0.57      0.55       140
weighted avg       0.63      0.64      0.60       140



In [None]:
# In-sample Performance: predictions on the rows the model was trained on
y_insample = xgb.predict(X_train)
train_accuracy = accuracy_score(y_train, y_insample)

print(f"Accuracy: {train_accuracy}")
print(classification_report(y_train, y_insample))

Accuracy: 0.9754601226993865
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       116
           1       0.97      0.99      0.98       210

    accuracy                           0.98       326
   macro avg       0.98      0.97      0.97       326
weighted avg       0.98      0.98      0.98       326

