In [8]:
import joblib
import matplotlib.pyplot as plt
import numpy as np 
import os 
import pandas as pd

from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree


## Load data and engineer features

In [3]:
# Columns removed from the frame before modelling (potential confounds).
# Names that are absent from the CSV are skipped silently (errors='ignore').
confounds = [
    'overnight_delta',
    'overnight_delta_pct',
    'date',
    'z_score',
    'intraday_return_strong_positive',
    'intraday_return_strong_negative',
]

# Load the processed table, discard rows with an unknown overnight delta,
# label the rest (1 when overnight_delta is strictly positive, else 0),
# and only then drop the confound columns.
data = (
    pd.read_csv('~/projects/QUSA/data/processed/AMZN_processed.csv')
    .dropna(subset=['overnight_delta'])
    .assign(
        overnight_delta_positive=lambda frame: (frame['overnight_delta'] > 0).astype(int)
    )
    .drop(columns=confounds, errors='ignore')
)

# Further leakage protection: the model only ever sees this explicit whitelist.
# Order matters because it fixes the column order of X.
safe_features = [
    "rsi",
    "volume_ratio",
    "atr_pct",
    "52_week_high_proximity",
    "52_week_low_proximity",
    "day_of_week",
    "first_5d_month",
    "final_5d_month",
    "is_monday",
    "is_tuesday",
    "is_wednesday",
    "is_thursday",
    "is_friday",
    "month_of_year",
    "is_jan",
    "is_feb",
    "is_mar",
    "is_apr",
    "is_may",
    "is_jun",
    "is_jul",
    "is_aug",
    "is_sep",
    "is_oct",
    "is_nov",
    "is_dec",
    "day_of_month",
]

# Feature matrix and target. Missing feature values are replaced with 0.
# NOTE(review): confirm 0 is a sensible fill value for every whitelisted column.
X = data.loc[:, safe_features].fillna(0)
y = data["overnight_delta_positive"]


# Train / Test split: the last 25% of rows are held out, and shuffle=False
# keeps the rows in their existing order.
# NOTE(review): this is only a chronological split if the rows are already sorted by date - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

# Class balance check: share of each label within each split.
for split_name, labels in (("train", y_train), ("test", y_test)):
    print(f"\nClass distribution ({split_name}):")
    print(labels.value_counts(normalize=True))



Class distribution (train):
overnight_delta_positive
1    0.57027
0    0.42973
Name: proportion, dtype: float64

Class distribution (test):
overnight_delta_positive
1    0.612903
0    0.387097
Name: proportion, dtype: float64


## Train model

In [20]:
# Decision tree with a fixed seed, depth/leaf-size limits and balanced class weights.
model = DecisionTreeClassifier(
    random_state=42,
    class_weight="balanced",
    max_depth=5,
    min_samples_split=40,
    min_samples_leaf=20,
)

# Platt-scaling (sigmoid) wrapper around the tree, 5-fold.
# NOTE(review): this wrapper is configured but never fitted or used in this cell;
# every prediction and probability below comes from the uncalibrated tree.
calibrated_model = CalibratedClassifierCV(model, method="sigmoid", cv=5)

# Fit the plain tree on the training split.
model.fit(X_train, y_train)

# Hard labels and class probabilities on the held-out split.
preds = model.predict(X_test)
probs = model.predict_proba(X_test)
# Second column of predict_proba: probability of class 1 for the 0/1 target.
p_up = probs[:, 1]

# Per-row results table: test features plus true label, prediction and p_up.
results = X_test.assign(y_true=y_test.values, pred=preds, p_up=p_up)

## Model metrics

In [22]:
# Headline test-set metrics computed from the hard predictions.
print("\nAccuracy:", accuracy_score(y_test, preds))
for heading, metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, preds))

# Feature importances of the fitted tree, labelled by column name, largest first.
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(
    ascending=False
)
nonzero = importances > 0

# Only features with a non-zero importance are listed and counted.
print("\nFeature Importance (used by tree):")
print(importances[nonzero])
print(f"Total features used: {nonzero.sum()}")


Accuracy: 0.5403225806451613

Confusion Matrix:
[[14 34]
 [23 53]]

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.29      0.33        48
           1       0.61      0.70      0.65        76

    accuracy                           0.54       124
   macro avg       0.49      0.49      0.49       124
weighted avg       0.52      0.54      0.53       124


Feature Importance (used by tree):
rsi                       0.267653
day_of_week               0.204145
atr_pct                   0.180091
52_week_high_proximity    0.123850
volume_ratio              0.104323
day_of_month              0.096232
52_week_low_proximity     0.023705
dtype: float64
Total features used: 7


## Analyze probabilities 

In [24]:
# Per-row results table for the test split: features, true label,
# hard prediction and predicted probability of class 1.
results = X_test.copy()
results["y_true"] = y_test.values
results["pred"] = preds
results["p_up"] = p_up

# Minimum p_up for a prediction to count as "confident".
THRESHOLD = 0.60

confident = results["p_up"] >= THRESHOLD

print("\n--- Probability Filter ---")
print("Coverage:", confident.mean())
if confident.any():
    print(
        "Accuracy (confident only):",
        accuracy_score(
            results.loc[confident, "y_true"],
            (results.loc[confident, "p_up"] > 0.5).astype(int)
        )
    )
else:
    # Nothing clears the threshold: report that instead of scoring an empty subset.
    print("Accuracy (confident only): n/a (no predictions at or above THRESHOLD)")

# ---- Probability bins ----
# include_lowest=True keeps rows with p_up == 0.0 (e.g. from a pure class-0 leaf);
# without it the left-open first bin excludes them and they vanish from the summary.
# pandas widens the first interval's lower edge slightly, so its label starts just below 0.
results["prob_bin"] = pd.cut(
    results["p_up"],
    bins=[0.0, 0.4, 0.5, 0.6, 0.7, 1.0],
    include_lowest=True
)

# observed=False is stated explicitly: it keeps empty bins in the table (as NaN rows)
# and avoids the pandas FutureWarning about the changing default for categorical keys.
prob_summary = (
    results
    .groupby("prob_bin", observed=False)
    .agg(
        count=("y_true", "size"),
        win_rate=("y_true", "mean"),
        avg_prob=("p_up", "mean")
    )
)

print("\n--- Probability Calibration ---")
print(prob_summary)


--- Probability Filter ---
Coverage: 0.0967741935483871
Accuracy (confident only): 0.5833333333333334

--- Probability Calibration ---
            count  win_rate  avg_prob
prob_bin                             
(0.0, 0.4]     28  0.535714  0.280398
(0.4, 0.5]      9  0.888889  0.451170
(0.5, 0.6]     75  0.613333  0.545639
(0.6, 0.7]      0       NaN       NaN
(0.7, 1.0]     12  0.583333  0.779298


  .groupby("prob_bin")


## Save model 

In [7]:
# Save the fitted tree together with the metadata needed to reuse it:
# the feature list (column order), the confidence threshold and the target name.
path = os.path.expanduser("~/projects/QUSA/models/logic_model.pkl")

# joblib.dump does not create missing directories, so make sure the folder exists.
os.makedirs(os.path.dirname(path), exist_ok=True)

joblib.dump(
    {
        "model": model,
        "features": safe_features,
        "threshold": THRESHOLD,
        "target": "overnight_delta_positive"
    },
    path
)

# Load model example: read the bundle back and rebind the notebook names to it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code -
# only load bundles from a trusted source.
bundle = joblib.load(path)
model = bundle["model"]
safe_features = bundle["features"]

print(f"\nModel saved to: {path}")


Model saved to: /Users/sidrana/projects/QUSA/models/logic_model.pkl


## Visualize trained model 

In [None]:
# Draw the fitted tree on a wide canvas; nodes are colour-filled and
# labelled with the feature column names and the Down/Up class names.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    model,
    feature_names=X.columns,
    class_names=["Down", "Up"],
    filled=True,
    ax=ax,
)
plt.show()