1. Downloads data from yfinance

2. Performs basic QC

3. Creates features

4. Creates a forward return label

5. Builds one final panel table

6. Prints the output as a table

In [None]:
"""
Quant Data Pipeline Example
------------------------------------
Goal:
1. Download NVDA daily data
2. Perform basic data quality checks
3. Create features (moving averages)
4. Create forward return label
5. Build one final modeling table
"""



import pandas as pd
import numpy as np
import pandas_datareader.data as web
from datetime import datetime

# ======================================
# Step 1: Download Data
# ======================================

import pandas as pd
import numpy as np
import pandas_datareader.data as web
from datetime import datetime

ticker = "NVDA.US"  # Stooq format
start = datetime(2022, 1, 1)
end = datetime.today()

# Download historical daily price data from Stooq
# Arguments:
#   ticker  -> which asset to download
#   "stooq" -> data source
#   start   -> beginning date
#   end     -> ending date
df = web.DataReader(ticker, "stooq", start, end)

# Stooq returns data in reverse chronological order (newest first)
# For time-series analysis (returns, rolling mean, etc.),
# data must be sorted from oldest to newest.
# Convert the date index into a normal column
# This makes the dataset easier to merge, store, and manipulate.
df = df.sort_index().reset_index()

# Standardize column names:
# 1. Convert everything to lowercase
# 2. Replace spaces with underscores
# This avoids future bugs when referencing column names.
df.columns = [str(c).lower().replace(" ", "_") for c in df.columns]

print(df.head())




        date     open     high      low    close     volume
0 2022-01-03  29.7655  30.6604  29.7357  30.0711  393055570
1 2022-01-04  30.2269  30.4177  28.3019  29.2413  528031140
2 2022-01-05  28.9008  29.3673  27.4869  27.5577  498892235
3 2022-01-06  27.5937  28.3908  27.0197  28.1310  454941757
4 2022-01-07  28.0941  28.3748  27.0117  27.2014  410620306


In [16]:

# ======================================
# Step 2: Basic Data Quality Checks
# ======================================

# Check duplicates
duplicates = df.duplicated(subset=["date"]).sum()

# Check invalid prices
invalid_prices = (df[["open","high","low","close"]] <= 0).any(axis=1).sum()

print("Duplicate rows:", duplicates)
print("Invalid price rows:", invalid_prices)

Duplicate rows: 0
Invalid price rows: 0


In [17]:

# ======================================
# Step 3: Feature Engineering (X)
# ======================================

# Daily return
df["ret_1d"] = df["close"].pct_change()

# Moving averages
df["ma_5"] = df["close"].rolling(5).mean()
df["ma_20"] = df["close"].rolling(20).mean()

# Simple momentum signal
df["signal"] = np.where(df["ma_5"] > df["ma_20"], 1, -1)


In [19]:

# ======================================
# Step 4: Create Label (Y)
# ======================================

# Forward 5-day return
df["ret_fwd_5d"] = df["close"].shift(-5) / df["close"] - 1


In [20]:

# ======================================
# Step 5: Define as-of time
# ======================================

# We assume signal is known only after market close,
# so it can be used next trading day
df["asof_time"] = df["date"] + pd.Timedelta(days=1)
df["label_time"] = df["asof_time"] + pd.Timedelta(days=5)

In [21]:

# ======================================
# Step 6: Final Modeling Table
# ======================================

panel = df[[
    "asof_time",
    "label_time",
    "date",
    "close",
    "ret_1d",
    "ma_5",
    "ma_20",
    "signal",
    "ret_fwd_5d"
]].dropna()

# Print first rows
print("\nFinal Modeling Table (Head):")
print(panel.head())


Final Modeling Table (Head):
    asof_time label_time       date    close    ret_1d      ma_5      ma_20  \
19 2022-02-01 2022-02-06 2022-01-31  24.4455  0.072086  22.83510  25.727325   
20 2022-02-02 2022-02-07 2022-02-01  24.5973  0.006210  23.29718  25.453635   
21 2022-02-03 2022-02-08 2022-02-02  25.2003  0.024515  23.79046  25.251585   
22 2022-02-04 2022-02-09 2022-02-03  23.9083 -0.051269  24.19064  25.069115   
23 2022-02-05 2022-02-10 2022-02-04  24.2789  0.015501  24.48606  24.876510   

    signal  ret_fwd_5d  
19      -1    0.009887  
20      -1    0.019075  
21      -1    0.057936  
22      -1    0.078345  
23      -1   -0.015223  


In [22]:

# ======================================
# Step 7: Simple Strategy Evaluation
# ======================================

panel["strategy_ret"] = panel["signal"] * panel["ret_fwd_5d"]
panel["cum_ret"] = (1 + panel["strategy_ret"]).cumprod()

print("\nFinal Cumulative Return:", round(panel["cum_ret"].iloc[-1], 4))


Final Cumulative Return: 0.6712
