In [1]:
import pandas as pd
from pathlib import Path

# Project layout. All locations are derived from PROJECT_ROOT so they can be
# re-pointed in one place.
# NOTE(review): Path("..") is resolved against the kernel's working directory,
# so this assumes the kernel is started from the notebook's own folder - confirm.
PROJECT_ROOT = Path("..")
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
ARTIFACTS = PROJECT_ROOT / "artifacts"

# Create the output folder up front: the CSV / PNG / HTML writers used in the
# following cells do not create missing parent directories, so a fresh checkout
# without an artifacts/ folder would otherwise fail on the first write.
ARTIFACTS.mkdir(parents=True, exist_ok=True)

df = pd.read_parquet(DATA_PROCESSED / "vinv_inputs_raw.parquet")
# NOTE(review): the preview lists an "Unnamed: 0" column, which looks like an
# index that was serialized by an earlier CSV round-trip - confirm whether it
# should be dropped upstream, since it is numeric and will show up in the
# numeric diagnostics below.
df.head()

Unnamed: 0,date,asset_id


In [2]:
# Per-column share of missing values, expressed in percent and ordered from the
# most incomplete column to the least, saved as a one-column table.
miss = (
    df.isna()
    .mean()
    .sort_values(ascending=False)
    .mul(100)
    .to_frame("pct_missing")
)
miss.to_csv(ARTIFACTS / "missingness_summary.csv")

# describe() for the numeric columns only, transposed so that every input
# column becomes one row of the saved table.
num = df.select_dtypes(include="number")
stats = num.describe().transpose()
stats.to_csv(ARTIFACTS / "basic_stats.csv")

print("Wrote diagnostics CSVs.")

Wrote diagnostics CSVs.


In [3]:
# Optional: lightweight HTML report (no seaborn)
import html
import re

import matplotlib.pyplot as plt

report_path = ARTIFACTS / "feature_diagnostics.html"


def _image_stem(column, used_stems):
    """Return a filesystem-safe, unique file stem for ``column``.

    Every run of characters outside ``[A-Za-z0-9._-]`` is replaced by a single
    ``_`` so that names such as ``"Unnamed: 0"`` (the ``:`` is not allowed in
    Windows file names) still yield a valid PNG path. Names that are already
    safe are returned unchanged. If two columns map to the same stem, a numeric
    suffix is appended so one image never overwrites another.

    Parameters
    ----------
    column : object
        Column label; converted with ``str()``.
    used_stems : set of str
        Lower-cased stems handed out so far. Updated in place. Comparison is
        case-insensitive because some filesystems treat ``A.png`` and ``a.png``
        as the same file.

    Returns
    -------
    str
        The stem to use between ``hist_`` and ``.png``.
    """
    stem = re.sub(r"[^A-Za-z0-9._-]+", "_", str(column))
    candidate = stem
    suffix = 1
    while candidate.lower() in used_stems:
        suffix += 1
        candidate = f"{stem}_{suffix}"
    used_stems.add(candidate.lower())
    return candidate


# Simple example: distribution plots for first few numeric columns
cols = list(num.columns[:6])
# Declare the charset so the escaped labels render the same in every browser;
# the file itself is written as UTF-8 below.
html_parts = [
    "<html><head><meta charset='utf-8'></head><body><h1>Pre-Trade Analytics</h1>"
]
used_stems = set()

for c in cols:
    label = str(c)
    img_path = ARTIFACTS / f"hist_{_image_stem(c, used_stems)}.png"

    # Explicit figure/axes objects instead of the pyplot state machine, and
    # close each figure after saving so figures do not pile up in memory.
    fig, ax = plt.subplots()
    ax.hist(num[c].dropna().values, bins=50)
    ax.set_title(f"Distribution: {label}")
    ax.set_xlabel(label)
    ax.set_ylabel("Count")
    fig.savefig(img_path, bbox_inches="tight")
    plt.close(fig)

    # Column labels are data, not markup: escape them before embedding.
    # The image file name is already restricted to URL-safe characters.
    safe_label = html.escape(label)
    html_parts.append(
        f"<h3>{safe_label}</h3>"
        f"<img src='{img_path.name}' alt='{safe_label}' width='700'/>"
    )

html_parts.append("</body></html>")
# Explicit encoding: the platform default (e.g. cp1252 on Windows) can fail on,
# or silently mis-encode, non-ASCII column names.
report_path.write_text("\n".join(html_parts), encoding="utf-8")

print("Wrote:", report_path)

Wrote: ..\artifacts\feature_diagnostics.html
