In [None]:
import joblib
import matplotlib.pyplot as plt
import os 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Load data and engineer features

In [None]:
# Load the processed AMZN dataset.
data = pd.read_csv('~/projects/QUSA/data/processed/AMZN_processed.csv')

# Target variable: 1 when the overnight price change is positive, else 0.
# Rows with a missing overnight delta evaluate to 0 here, but they are removed
# by the dropna() immediately below, so they never reach the model.
data['overnight_delta_positive'] = (data['overnight_delta'] > 0).astype(int)

# Drop rows whose overnight delta is unknown.
data = data.dropna(subset=['overnight_delta'])

# Drop columns flagged as potential confounds. `errors='ignore'` means a
# column that is absent from the CSV is skipped instead of raising.
confounds = [
    'overnight_delta',
    'overnight_delta_pct',
    'date',
    'z_score',
    'intraday_return_strong_positive',
    'intraday_return_strong_negative',
]
data = data.drop(columns=confounds, errors='ignore')

# Further leakage protection: an explicit allow-list of model inputs.
# Every feature is listed exactly once. Repeating a name would create
# duplicate columns in X, feeding the tree redundant inputs and making the
# feature-importance index ambiguous.
safe_features = [
    # technical indicators
    "rsi",
    "volume_ratio",
    "atr_pct",
    "52_week_high_proximity",
    "52_week_low_proximity",
    # day-of-week calendar features
    "day_of_week",
    "first_5d_month",
    "final_5d_month",
    "is_monday",
    "is_tuesday",
    "is_wednesday",
    "is_thursday",
    "is_friday",
    # month-of-year calendar features
    "month_of_year",
    "is_jan",
    "is_feb",
    "is_mar",
    "is_apr",
    "is_may",
    "is_jun",
    "is_jul",
    "is_aug",
    "is_sep",
    "is_oct",
    "is_nov",
    "is_dec",
    "day_of_month",
]
assert len(safe_features) == len(set(safe_features)), "duplicate feature names"

# Missing feature values are replaced with 0 before modelling.
X = data[safe_features].fillna(0)
y = data["overnight_delta_positive"]


# Train / test split. shuffle=False keeps the original row order: the first
# 75% of rows form the training set and the last 25% the test set.
# NOTE(review): this presumably yields a chronological split -- confirm the
# CSV rows are sorted by date.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=False
)

# Class balance check (important): compare label proportions in both splits.
print("\nClass distribution (train):")
print(y_train.value_counts(normalize=True))

print("\nClass distribution (test):")
print(y_test.value_counts(normalize=True))


## Train model and present metrics 

In [None]:
# Hyperparameters for the decision tree: limited depth and minimum leaf/split
# sizes, class weights set to "balanced", and a fixed seed.
tree_params = {
    "max_depth": 5,
    "min_samples_leaf": 20,
    "min_samples_split": 40,
    "class_weight": "balanced",
    "random_state": 42,
}

# Build the classifier and fit it on the training split.
model = DecisionTreeClassifier(**tree_params)
model.fit(X_train, y_train)

# Predict labels for the held-out test split.
preds = model.predict(X_test)

# Report accuracy, the confusion matrix and the per-class report.
test_accuracy = accuracy_score(y_test, preds)
print("Accuracy:", test_accuracy)
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))

# Feature importances, labelled by column and sorted from largest to smallest.
importances = pd.Series(
    model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# Keep only features with a strictly positive importance.
used_importances = importances[importances.to_numpy() > 0.0]

print("\nFeature Importance:")
print(used_importances)
print(f"Total features used: {len(used_importances)}")

## Save model 

In [None]:
# Save the model together with the feature list it was trained on, so that
# whoever loads the bundle can select the same columns.
path = os.path.expanduser("~/projects/QUSA/models/logic_model.pkl")

# Create the target directory if needed; joblib.dump does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(path), exist_ok=True)

joblib.dump(
    {
        "model": model,
        "features": safe_features
    },
    path
)

# Load model example.
# NOTE: joblib.load is pickle-based and can execute arbitrary code; only load
# bundles from a trusted source.
bundle = joblib.load(path)
model = bundle["model"]
safe_features = bundle["features"]


## Visualize trained model 

In [None]:
# Draw the fitted decision tree.
plt.figure(figsize=(20, 10))
plot_tree(
    model,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(X.columns),
    # Class names are matched to classes in ascending order: label 0
    # (overnight delta not positive) -> "Down", label 1 -> "Up".
    class_names=["Down", "Up"],
    filled=True,
)
plt.show()