In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.functions import vector_to_array

import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# --- Experiment configuration ------------------------------------------------
cutoff_date = "2020-01-01"   # rows dated before this train the model; the rest are held out
long_threshold = 0.55        # go long when P(up) is above this
short_threshold = 0.45       # go short when P(up) is below this

feature_cols = ["sma_20", "std_20", "daily_return"]

# --- Data ----------------------------------------------------------------------
# Labeled feature table written by Notebook 1
df = spark.table("market.features_labeled")

# Keep only rows where every model input and the label are present
required_cols = [*feature_cols, "label"]
df_clean = df.na.drop(subset=required_cols)

# Preview the columns the rest of the notebook relies on
preview_cols = ["Date", "symbol", "Close", "sma_20", "std_20", "daily_return", "label"]
display(df_clean.select(*preview_cols).limit(10))

# --- Chronological train/test split --------------------------------------------
train = df_clean.where(F.col("Date") < cutoff_date)
test = df_clean.where(F.col("Date") >= cutoff_date)

# Each count() triggers a Spark job; the numbers are only reported, not reused
n_train = train.count()
n_test = test.count()
print(f"Train rows: {n_train:,}")
print(f"Test rows:  {n_test:,}")



In [0]:
def compute_sharpe(pdf: pd.DataFrame, ret_col: str = "portfolio_return") -> float | None:
    """
    Compute annualized Sharpe ratio from a pandas DataFrame with a return column.
    Assumes daily returns.
    """
    mean_ret = pdf[ret_col].mean()
    std_ret = pdf[ret_col].std()

    if std_ret and std_ret != 0:
        return (mean_ret / std_ret) * (252 ** 0.5)
    return None


In [0]:
# Pack the individual feature columns into the single vector column Spark ML expects
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Logistic regression with no regularization (regParam=0.0), at most 50 iterations
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=50,
    regParam=0.0,
    elasticNetParam=0.0,
)

# Assemble features once on the cleaned data, then split chronologically
dataset = assembler.transform(df_clean)

train = dataset.where(F.col("Date") < cutoff_date)
test = dataset.where(F.col("Date") >= cutoff_date)

# Fit on the pre-cutoff rows only; score the held-out rows
model = lr.fit(train)
preds = model.transform(test)

# Keep just the columns the strategy cell needs
output_cols = ["Date", "symbol", "label", "prediction", "probability", "daily_return"]
preds_sel = preds.select(*output_cols)

display(preds_sel.limit(10))



In [0]:
# Turn the model's probability of "up" (class 1) into a long / flat / short position
p_up = F.col("p_up")
position_expr = (
    F.when(p_up > long_threshold, 1)      # confident up   -> long
    .when(p_up < short_threshold, -1)     # confident down -> short
    .otherwise(0)                         # in between     -> flat
)

# NOTE(review): `position` and `daily_return` come from the same row, and
# `daily_return` is also one of the model inputs (feature_cols). If `label`
# describes the *next* period's move, the realized return should presumably be
# the following row's return per symbol -- confirm against Notebook 1.
strategy_df = (
    preds_sel
    # the probability column is an ML vector; convert to an array so it can be indexed
    .withColumn("prob_array", vector_to_array("probability"))
    .withColumn("p_up", F.col("prob_array").getItem(1))  # element 1 = P(class = 1)
    .withColumn("position", position_expr)
    .withColumn("strategy_return", F.col("position") * F.col("daily_return"))
)

display(strategy_df.limit(10))

# Daily strategy return: plain average of strategy_return over all rows of a date
# (flat positions contribute 0, so this is equal-weight across the scored symbols)
portfolio_df = (
    strategy_df
    .groupBy("Date")
    .agg(F.avg("strategy_return").alias("portfolio_return"))
    .orderBy("Date")
)

# Equal-weight buy-and-hold benchmark: average daily_return per date.
# Built from all of df_clean; the inner join below keeps only the dates that
# also appear in portfolio_df.
benchmark_df = (
    df_clean
    .groupBy("Date")
    .agg(F.avg("daily_return").alias("benchmark_return"))
    .orderBy("Date")
)

# Combine strategy and benchmark on Date
comparison_df = (
    portfolio_df
    .join(benchmark_df, "Date", "inner")
    .select("Date", "portfolio_return", "benchmark_return")
)

# Running totals in date order. These are arithmetic sums of daily returns
# (no compounding). The window has no partitionBy, so Spark evaluates it on a
# single partition -- acceptable here because there is one row per date.
w_date = Window.orderBy("Date").rowsBetween(Window.unboundedPreceding, 0)

comparison_df = (
    comparison_df
    .withColumn("cml_return", F.sum("portfolio_return").over(w_date))
    .withColumn("benchmark_cml_return", F.sum("benchmark_return").over(w_date))
)

# The orderBy calls above do not guarantee the row order of the joined result.
# The line plots and the positional rolling window further down both need
# chronological rows, so sort explicitly right before collecting.
pdf_compare = comparison_df.orderBy("Date").toPandas()

# Compute Sharpe ratios
ml_sharpe = compute_sharpe(pdf_compare, ret_col="portfolio_return")
benchmark_sharpe = compute_sharpe(pdf_compare, ret_col="benchmark_return")

print("Strategy Sharpe:", ml_sharpe)
print("Benchmark Sharpe:", benchmark_sharpe)

# Plot cumulative returns of the strategy against the benchmark.
# Axis labels are set so the figure can be read on its own; the y-label spells
# out that the curves are running sums of daily returns, not compounded growth.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(pdf_compare["Date"], pdf_compare["cml_return"], label="ML strategy")
ax.plot(pdf_compare["Date"], pdf_compare["benchmark_cml_return"], label="Benchmark")
ax.set_title("Cumulative returns: ML strategy vs. benchmark")
ax.set_xlabel("Date")
ax.set_ylabel("Cumulative return (running sum of daily returns)")
ax.legend()
fig.tight_layout()
plt.show()

# Plot rolling 3-month Sharpe for the strategy
window = 63  # ~3 months of trading days


def _window_sharpe(window_returns: pd.Series) -> float:
    """Annualized Sharpe of one rolling window.

    Falls back to 0.0 when compute_sharpe returns None (or exactly 0), so the
    rolling apply always receives a float.
    """
    return compute_sharpe(pd.DataFrame({"r": window_returns}), ret_col="r") or 0.0


# rolling() is positional, so this relies on pdf_compare being in date order.
# The first `window - 1` rows have no full window and stay NaN (a gap in the plot).
pdf_compare["ml_rolling_sharpe"] = (
    pdf_compare["portfolio_return"]
    .rolling(window)
    .apply(_window_sharpe)
)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(pdf_compare["Date"], pdf_compare["ml_rolling_sharpe"])
ax.set_title("Rolling 3-month Sharpe (ML strategy)")
ax.set_xlabel("Date")
ax.set_ylabel(f"Annualized Sharpe ({window}-day window)")
fig.tight_layout()
plt.show()
