In [0]:
%pip install yfinance



In [0]:
import yfinance as yf
import pandas as pd
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Ticker universe pulled from Yahoo Finance.
symbols = ["SPY", "AAPL", "MSFT"]

# Date range passed as start/end to the download call.
start_date, end_date = "2015-01-01", "2023-01-01"


In [0]:
# Download daily bars for every symbol in a single call.
# group_by="ticker" puts the ticker on the outer column level, so raw[sym]
# later selects one symbol's price frame.
# auto_adjust is set explicitly instead of relying on the library default:
# the install above is unpinned, and with adjustment enabled yfinance drops
# the "Adj Close" column and reports adjusted values under "Close", which
# would silently change every downstream feature.
raw = yf.download(
    symbols,
    start=start_date,
    end=end_date,
    group_by="ticker",
    auto_adjust=False,
    threads=False,
    progress=False,
)

# Stop here with a clear message rather than failing later on an empty frame.
if raw.empty:
    raise RuntimeError(
        f"yfinance returned no rows for {symbols} between {start_date} and {end_date}"
    )

display(raw.head())


In [0]:
# Reshape the wide download (outer column level = ticker) into one long frame:
# one row per symbol per date, with the ticker stored in a "symbol" column.
tickers = raw.columns.get_level_values(0).unique()

# raw shares one date index across all tickers, so a symbol with no data on a
# given date (e.g. a failed or shorter download) shows up as a row of NaNs.
# Drop those rows per symbol so they are not stored as null prices and fed
# into the rolling-window features.
prices = [
    raw[sym].dropna(how="all").reset_index().assign(symbol=sym)
    for sym in tickers
]

# "Adj Close" is renamed to remove the space from the column name
# (presumably so the frame can be saved as a Delta table — TODO confirm).
pdf = pd.concat(prices, ignore_index=True).rename(columns={"Adj Close": "Adj_Close"})

display(pdf.head())


In [0]:
# Convert the long pandas frame to Spark and persist it as the Delta table
# market.prices, creating the schema on first run.
spark_df = spark.createDataFrame(pdf)
spark.sql("CREATE SCHEMA IF NOT EXISTS market")

# The table is rebuilt in full on every run. overwriteSchema lets the rebuild
# also replace the column set; without it, a re-run whose columns differ from
# the stored table fails on a schema mismatch.
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("market.prices")
)

display(spark.table("market.prices").limit(5))


In [0]:
# Build per-symbol technical features from the stored prices:
#   sma_20 / std_20     - 20-row rolling mean and sample std of Close
#   bb_upper / bb_lower - sma_20 +/- 2 * std_20 (Bollinger-style bands)
#   daily_return        - log(Close / previous Close)
prices_sdf = spark.table("market.prices")

# Rows ordered by Date within each symbol; w20 is the trailing 20-row frame
# (current row plus the 19 before it) over that same ordering.
w_by_symbol = Window.partitionBy("symbol").orderBy("Date")
w20 = w_by_symbol.rowsBetween(-19, 0)

# During the first 19 rows of each symbol the frame holds fewer than 20
# closes. Emit null there instead of statistics computed from a partial
# window, so every non-null sma_20/std_20 is based on exactly 20 observations.
# F.when without otherwise() yields null when the condition is false, and the
# bands inherit that null from sma_20/std_20.
has_full_window = F.count("Close").over(w20) >= 20

features_df = (
    prices_sdf
    .withColumn("sma_20", F.when(has_full_window, F.avg("Close").over(w20)))
    .withColumn("std_20", F.when(has_full_window, F.stddev("Close").over(w20)))
    .withColumn("bb_upper", F.col("sma_20") + 2 * F.col("std_20"))
    .withColumn("bb_lower", F.col("sma_20") - 2 * F.col("std_20"))
    # lag() has no previous row on each symbol's first date, so the first
    # daily_return per symbol is null.
    .withColumn("daily_return", F.log(F.col("Close") / F.lag("Close").over(w_by_symbol)))
)

display(features_df.limit(10))


In [0]:
# Attach the supervised target: did Close rise over the next 5 rows?
# Each expression is named first and then applied in dependency order.

# Close 5 rows ahead within the same symbol (null where no such row exists).
price_in_5d = F.lead("Close", 5).over(w_by_symbol)

# Simple return from the current Close to that future Close.
return_over_5d = (F.col("future_price_5d") / F.col("Close")) - 1

# 1 when the forward return is positive, 0 otherwise; stays null where the
# forward return is null.
is_up_in_5d = (F.col("future_return_5d") > 0).cast("int")

features_labeled_df = (
    features_df
    .withColumn("future_price_5d", price_in_5d)
    .withColumn("future_return_5d", return_over_5d)
    .withColumn("label", is_up_in_5d)
)

display(features_labeled_df.limit(10))


In [0]:
# Persist the labeled feature set as the Delta table market.features_labeled.
# The table is rebuilt in full on every run. overwriteSchema lets the rebuild
# also replace the column set, so adding or removing a feature column and
# re-running does not fail on a schema mismatch with the stored table.
(
    features_labeled_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("market.features_labeled")
)

display(spark.table("market.features_labeled").limit(10))
