In [1]:
import s3fs
import pandas as pd

s3_path = "s3://bucket2-curated-stock/curated/stock_market/parquet/stock_clean.parquet"

df = pd.read_parquet(s3_path, engine="pyarrow")
df.head()

Unnamed: 0,date,symbol,open,high,low,close,volume,range,daily_return_pct
0,2024-01-01,AAPL,154.72,161.22,151.89,159.55,3494467,9.33,3.121768
1,2024-01-03,AAPL,142.91,144.99,138.4,141.21,10960425,6.59,-1.18956
2,2024-01-04,AAPL,183.98,189.54,182.29,186.8,6915432,7.25,1.532775
3,2024-01-14,AAPL,357.31,361.14,349.48,350.06,7535315,11.66,-2.02905
4,2024-01-15,AAPL,102.33,108.73,98.64,107.0,11891970,10.09,4.563667


In [2]:
print(df.shape)
print(df.columns)
print(df.dtypes)
df.describe()

(1000, 9)
Index(['date', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'range',
       'daily_return_pct'],
      dtype='object')
date                datetime64[ns]
symbol                      object
open                       float64
high                       float64
low                        float64
close                      float64
volume                       int64
range                      float64
daily_return_pct           float64
dtype: object


Unnamed: 0,date,open,high,low,close,volume,range,daily_return_pct
count,1000,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2025-05-14 12:00:00,247.90785,251.89323,243.76714,247.76473,6494124.0,8.12609,-0.031238
min,2024-01-01 00:00:00,100.12,100.93,90.02,94.01,1013200.0,0.64,-7.122146
25%,2024-09-06 18:00:00,173.7275,176.3925,168.85,172.075,3728256.0,5.9475,-1.661451
50%,2025-05-14 12:00:00,246.75,249.52,242.49,245.55,6557693.0,8.065,-0.075454
75%,2026-01-19 06:00:00,323.47,326.925,318.6,322.35,9222488.0,10.4225,1.519265
max,2026-09-26 00:00:00,399.71,410.98,399.47,407.11,11997040.0,15.33,7.751938
std,,86.487015,86.422846,86.557661,86.444501,3185517.0,2.909954,2.410493


In [5]:
import pandas as pd
import numpy as np

# Sort properly
df = df.sort_values(['symbol', 'date'])

# Create next-day close per symbol
df['next_close'] = df.groupby('symbol')['close'].shift(-1)

# Create target (1 = up, 0 = down)
df['target'] = (df['next_close'] > df['close']).astype(int)

df.dropna(inplace=True)

In [6]:
for lag in [1, 2, 3, 5]:
    df[f'close_lag_{lag}'] = df.groupby('symbol')['close'].shift(lag)
    df[f'return_lag_{lag}'] = df.groupby('symbol')['daily_return_pct'].shift(lag)

In [7]:
for window in [5, 10, 20]:
    df[f'ma_{window}'] = df.groupby('symbol')['close'].transform(
        lambda x: x.rolling(window).mean()
    )

In [8]:
df.dropna(inplace=True)

In [9]:
features = [
    'open', 'high', 'low', 'close', 'volume',
    'range', 'daily_return_pct',
    'close_lag_1', 'close_lag_2', 'close_lag_3', 'close_lag_5',
    'return_lag_1', 'return_lag_2', 'return_lag_3', 'return_lag_5',
    'ma_5', 'ma_10', 'ma_20'
]

X = df[features]
y = df['target']

In [10]:
train_size = int(len(df) * 0.8)

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]

X_test = X.iloc[train_size:]
y_test = y.iloc[train_size:]

In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

preds = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

Accuracy: 0.717391304347826
              precision    recall  f1-score   support

           0       0.74      0.71      0.72        95
           1       0.70      0.73      0.71        89

    accuracy                           0.72       184
   macro avg       0.72      0.72      0.72       184
weighted avg       0.72      0.72      0.72       184



In [12]:
test_df = df.iloc[train_size:].copy()
test_df['prediction'] = preds

# Strategy return
test_df['strategy_return'] = test_df['daily_return_pct'] * test_df['prediction']

print("Total Strategy Return:",
      test_df['strategy_return'].sum())

Total Strategy Return: -42.158894794149816


In [13]:
import joblib
import os

model_dir = "model"
os.makedirs(model_dir, exist_ok=True)

model_path = os.path.join(model_dir, "xgb_stock_model.joblib")

joblib.dump(model, model_path)

print("Model saved at:", model_path)

Model saved at: model/xgb_stock_model.joblib


In [14]:
import boto3

s3 = boto3.client("s3")

bucket = "bucket2-curated-stock"
s3_key = "models/xgb_stock_model.joblib"

s3.upload_file(model_path, bucket, s3_key)

print("Uploaded to:", f"s3://{bucket}/{s3_key}")

Uploaded to: s3://bucket2-curated-stock/models/xgb_stock_model.joblib


In [15]:
model.save_model("xgb_model.json")

In [16]:
import boto3

s3 = boto3.client("s3")
bucket = "bucket2-curated-stock"
s3.upload_file("xgb_model.json", bucket, "models/xgb_model.json")