In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.functions import vector_to_array

import pandas as pd
import mlflow
import mlflow.spark 


In [0]:
# Experiment knobs: everything a reader might tune lives in this cell
cutoff_date = "2020-01-01"  # rows dated before this are training data, the rest are held out
lookahead_days = 5  # label horizon; matches the 5-day lookahead in Notebook 1
# P(up) cut-offs: long above the first value, short below the second
long_threshold, short_threshold = 0.55, 0.45


In [0]:
# Read the labeled feature table and show a few rows as a sanity check
df = spark.table("market.features_labeled")
df_preview = df.limit(5)
display(df_preview)

In [0]:
# Columns fed to the model, in the order they will appear in the feature vector
feature_cols = ["sma_20", "std_20", "daily_return"]

# A row is only usable when every feature and the label are present
required_cols = [*feature_cols, "label"]
df_clean = df.na.drop(subset=required_cols)

preview_cols = ["Date", "symbol", "Close", "sma_20", "std_20", "daily_return", "label"]
display(df_clean.select(*preview_cols).limit(10))

In [0]:
# Pack the individual feature columns into the single vector column Spark ML expects
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled = assembler.transform(df_clean)

# Keep only what the split, the model and the backtest read downstream;
# daily_return is carried along as a plain column for the backtest.
dataset = assembled.select("Date", "symbol", "features", "label", "daily_return")

display(dataset.limit(10))


In [0]:
# Chronological split: everything strictly before the cutoff trains the model,
# everything on or after it is held out for the backtest.
# NOTE(review): the label looks ahead `lookahead_days`, so the last training
# rows before the cutoff may carry labels computed from post-cutoff prices.
# Confirm against Notebook 1 whether an embargo gap is needed.
before_cutoff = F.col("Date") < cutoff_date
on_or_after_cutoff = F.col("Date") >= cutoff_date

train = dataset.where(before_cutoff)
test = dataset.where(on_or_after_cutoff)

n_train = train.count()
n_test = test.count()
print("Train rows:", n_train)
print("Test rows:", n_test)


In [0]:
# Logistic regression with library defaults, fitted on the training split only
lr = LogisticRegression(featuresCol="features", labelCol="label")
model = lr.fit(train)

# Coefficients follow the order of feature_cols used by the assembler
for caption, fitted_value in (
    ("Intercept:", model.intercept),
    ("Coefficients:", model.coefficients),
):
    print(caption, fitted_value)

In [0]:
# Score the held-out rows; transform() appends the prediction and probability columns
preds = model.transform(test)

# Narrow the scored frame to the columns the backtest consumes
scored_cols = ["Date", "symbol", "label", "prediction", "probability", "daily_return"]
preds_sel = preds.select(*scored_cols)

display(preds_sel.limit(10))

In [0]:
# Turn predicted probabilities into positions and per-row strategy returns.
#
# Look-ahead guard: the signal on a given Date is computed from features that
# include that same Date's daily_return, so it cannot be used to earn that
# return. Each symbol's signal is therefore lagged by one row and applied to
# the following row's daily_return.
# NOTE(review): assumes daily_return is the return realized on `Date` and that
# there is one row per (symbol, Date) -- confirm against Notebook 1.
w_symbol = Window.partitionBy("symbol").orderBy("Date")

strategy_df = (
    preds_sel
    # convert vector -> array so we can index it
    .withColumn("prob_array", vector_to_array("probability"))
    .withColumn("p_up", F.col("prob_array")[1])  # P(class = 1, i.e., up)
    # "position" is the signal produced on this Date: +1 long, -1 short, 0 flat
    .withColumn(
        "position",
        F.when(F.col("p_up") > long_threshold, 1)
         .when(F.col("p_up") < short_threshold, -1)
         .otherwise(0)
    )
    # "held_position" is the previous row's signal for the same symbol; the
    # first row of each symbol has no prior signal and is treated as flat.
    .withColumn(
        "held_position",
        F.coalesce(F.lag("position", 1).over(w_symbol), F.lit(0))
    )
    .withColumn(
        "strategy_return",
        F.col("held_position") * F.col("daily_return")
    )
)

display(strategy_df.limit(10))



In [0]:
# Collapse the per-symbol rows into one equal-weighted portfolio row per Date
daily_aggregates = [
    F.avg("strategy_return").alias("portfolio_return"),
    F.avg("p_up").alias("avg_p_up"),
    F.avg("position").alias("avg_position"),
]

portfolio_by_date = strategy_df.groupBy("Date").agg(*daily_aggregates)
portfolio_df = portfolio_by_date.orderBy("Date")

display(portfolio_df.limit(10))

In [0]:
# Running window from the first Date up to the current row. It has no
# partitionBy, so it spans the whole frame; portfolio_df holds one row per Date.
w_date = Window.orderBy("Date").rowsBetween(Window.unboundedPreceding, 0)

# Compound daily returns by summing log-returns and converting back.
# log1p / expm1 are used instead of log(1 + x) / exp(y) - 1 because they keep
# full precision when daily returns are close to zero.
portfolio_with_cum = (
    portfolio_df
    .withColumn(
        "cum_log_return",
        F.sum(F.log1p(F.col("portfolio_return"))).over(w_date)
    )
    .withColumn("cum_return", F.expm1(F.col("cum_log_return")))
)

display(portfolio_with_cum.limit(10))

In [0]:
# Bring the per-date portfolio series to the driver for summary statistics
pdf = portfolio_with_cum.select(
    "Date", "portfolio_return", "cum_return"
).toPandas()

mean_ret = pdf["portfolio_return"].mean()
std_ret = pdf["portfolio_return"].std()

# Annualized Sharpe ratio (no risk-free rate subtracted), scaled by sqrt(252).
# pandas yields NaN for the std of fewer than two observations, and NaN is
# truthy, so it must be tested explicitly; otherwise sharpe would silently
# become NaN instead of None.
if pd.notna(std_ret) and std_ret > 0:
    sharpe = (mean_ret / std_ret) * (252 ** 0.5)
else:
    sharpe = None

print("Mean daily return:", mean_ret)
print("Std daily return:", std_ret)
print("Annualized Sharpe ratio:", sharpe)

In [0]:
# Record the experiment configuration and backtest summary in one MLflow run
with mlflow.start_run():
    mlflow.log_param("lookahead_days", lookahead_days)
    mlflow.log_param("cutoff_date", cutoff_date)
    mlflow.log_param("long_threshold", long_threshold)
    mlflow.log_param("short_threshold", short_threshold)
    mlflow.log_param("features", ",".join(feature_cols))

    # Log each metric on its own merits: an undefined Sharpe ratio must not
    # suppress the mean and std, and None/NaN values are skipped rather than
    # sent to the tracking server.
    run_metrics = {
        "sharpe_ratio": sharpe,
        "mean_daily_return": mean_ret,
        "std_daily_return": std_ret,
    }
    for metric_name, metric_value in run_metrics.items():
        if pd.notna(metric_value):
            mlflow.log_metric(metric_name, float(metric_value))

    # NOTE: On free/serverless we skip logging the Spark model artifact:
    # mlflow.spark.log_model(model, "model")



In [0]:
# Buy-and-hold benchmark (equal-weighted): for each Date on or after the
# cutoff, the plain average of daily_return across all symbols.
in_test_period = F.col("Date") >= cutoff_date
benchmark_daily = F.avg("daily_return").alias("benchmark_return")

benchmark_by_date = df_clean.where(in_test_period).groupBy("Date").agg(benchmark_daily)
benchmark_df = benchmark_by_date.orderBy("Date")

# Cumulative benchmark returns.
# Running window from the first Date up to the current row; benchmark_df holds
# one row per Date, so the window has no partitionBy.
w_date = Window.orderBy("Date").rowsBetween(Window.unboundedPreceding, 0)

# Compound via summed log-returns. log1p / expm1 replace log(1 + x) and
# exp(y) - 1 so precision is kept when daily returns are close to zero.
benchmark_with_cum = (
    benchmark_df
    .withColumn(
        "cum_log_return",
        F.sum(F.log1p(F.col("benchmark_return"))).over(w_date)
    )
    .withColumn(
        "benchmark_cum_return",
        F.expm1(F.col("cum_log_return"))
    )
)

# Put the ML strategy and the benchmark side by side on the dates both share
strategy_side = portfolio_with_cum.select("Date", "cum_return", "portfolio_return")
benchmark_side = benchmark_with_cum.select(
    "Date", "benchmark_cum_return", "benchmark_return"
)

joined = strategy_side.join(benchmark_side, on="Date", how="inner")
comparison_df = joined.orderBy("Date")

display(comparison_df.limit(10))


In [0]:
# Collect the joined strategy/benchmark frame to the driver as a pandas
# DataFrame for plotting; comparison_df was built with a final orderBy("Date").
pdf_compare = comparison_df.toPandas()


In [0]:
import matplotlib.pyplot as plt

# Cumulative return curves: ML strategy against the benchmark
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(pdf_compare["Date"], pdf_compare["cum_return"], label="ML Strategy")
ax.plot(pdf_compare["Date"], pdf_compare["benchmark_cum_return"], label="Buy & Hold")
ax.legend()
ax.set_title("ML Strategy vs Buy-and-Hold Benchmark")
ax.set_xlabel("Date")
ax.set_ylabel("Cumulative Return")
ax.grid(True)
plt.show()


In [0]:
# Daily returns of both series overlaid; partial transparency keeps both visible
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(pdf_compare["Date"], pdf_compare["portfolio_return"], alpha=0.6, label="ML Strategy")
ax.plot(pdf_compare["Date"], pdf_compare["benchmark_return"], alpha=0.6, label="Buy & Hold")
ax.legend()
ax.set_title("Daily Returns Comparison")
ax.set_xlabel("Date")
ax.set_ylabel("Daily Return")
ax.grid(True)
plt.show()


In [0]:
window = 63  # ~3 months
annualization = 252 ** 0.5

# Rolling annualized Sharpe ratio for both series: rolling mean over rolling
# std, scaled by sqrt(252). The first window - 1 rows are NaN by construction.
for return_col, sharpe_col in (
    ("portfolio_return", "ml_rolling_sharpe"),
    ("benchmark_return", "benchmark_rolling_sharpe"),
):
    rolling_returns = pdf_compare[return_col].rolling(window)
    pdf_compare[sharpe_col] = (
        rolling_returns.mean() / rolling_returns.std()
    ) * annualization

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(pdf_compare["Date"], pdf_compare["ml_rolling_sharpe"], label="ML Strategy")
ax.plot(pdf_compare["Date"], pdf_compare["benchmark_rolling_sharpe"], label="Buy & Hold")
ax.legend()
ax.set_title("Rolling Sharpe Ratio (63-day window)")
ax.set_xlabel("Date")
ax.set_ylabel("Sharpe Ratio")
ax.grid(True)
plt.show()
