# 01 - Data Engineering and Validation
This notebook uses the project code in `src/` to download real market data, engineer features, and validate data quality.

In [1]:
# Project entry point; every later cell goes through this single `pipeline` object.
from mlforecast_realworld.ml.pipeline import ForecastPipeline
# Constructed with no arguments, so whatever defaults ForecastPipeline defines apply.
pipeline = ForecastPipeline()

In [2]:
# Build the training frame. NOTE(review): download=True presumably re-fetches the
# raw market data on every run (network access, slow) — confirm against
# ForecastPipeline.prepare_training_data and consider a cached path for re-runs.
training_df = pipeline.prepare_training_data(download=True)
# Preview the first rows; the displayed frame carries unique_id/ds/y plus OHLCV,
# static (sector, asset_class) and calendar feature columns.
training_df.head()

Unnamed: 0,unique_id,ds,open,high,low,close,volume,y,sector,asset_class,sector_code,asset_class_code,sample_weight,is_weekend,is_month_start,is_month_end,week_of_year,month_sin,month_cos
0,AAPL.US,2015-01-02,24.6921,24.7037,23.8002,24.2374,239959065,24.2374,Technology,equity,3,1,0.332211,0,0,0,1,0.5,0.866025
1,AAPL.US,2015-01-05,24.0036,24.0802,23.37,23.5555,289848502,23.5555,Technology,equity,3,1,0.40128,0,0,0,2,0.5,0.866025
2,AAPL.US,2015-01-06,23.6461,23.816,23.1942,23.5575,296598610,23.5575,Technology,equity,3,1,0.410625,0,0,0,2,0.5,0.866025
3,AAPL.US,2015-01-07,23.7688,23.982,23.651,23.8917,180725843,23.8917,Technology,equity,3,1,0.250205,0,0,0,2,0.5,0.866025
4,AAPL.US,2015-01-08,24.2155,24.8669,24.0929,24.8079,267640572,24.8079,Technology,equity,3,1,0.370534,0,0,0,2,0.5,0.866025


In [3]:
# Summarize data quality for the training frame. The displayed DataQualityReport
# exposes rows, series, start, end and missing_rate.
report = pipeline.data_engineer.quality_report(training_df)
# Bare expression so the notebook renders the report's repr.
report

DataQualityReport(rows=13980, series=5, start=Timestamp('2015-01-02 00:00:00'), end=Timestamp('2026-02-13 00:00:00'), missing_rate=0.0)

In [4]:
# Derive the per-series static features from the training frame. In the displayed
# run this yields one row per unique_id with sector/asset_class and their codes.
static_df = pipeline.data_engineer.build_static_features(training_df)
# Small table (one row per series in the displayed run), so it is shown in full.
static_df

Unnamed: 0,unique_id,sector,asset_class,sector_code,asset_class_code
0,AAPL.US,Technology,equity,3,1
1,AMZN.US,Consumer Discretionary,equity,2,1
2,GOOG.US,Communication Services,equity,1,1
3,META.US,Communication Services,equity,1,1
4,MSFT.US,Technology,equity,3,1


In [5]:
# Inputs for the future exogenous frame: the distinct series ids in sorted order
# and the most recent timestamp present in the training data.
series_ids = training_df['unique_id'].unique().tolist()
series_ids.sort()
last_observed = training_df['ds'].max()

# Generate exogenous features for the 7 steps after `last_observed`, using the
# frequency configured on the pipeline settings.
future_x = pipeline.data_engineer.build_future_exogenous(
    ids=series_ids,
    last_timestamp=last_observed,
    horizon=7,
    freq=pipeline.settings.forecast.freq,
)
future_x.head()

Unnamed: 0,unique_id,ds,is_weekend,is_month_start,is_month_end,week_of_year,month_sin,month_cos
0,AAPL.US,2026-02-16,0,0,0,8,0.866025,0.5
1,AAPL.US,2026-02-17,0,0,0,8,0.866025,0.5
2,AAPL.US,2026-02-18,0,0,0,8,0.866025,0.5
3,AAPL.US,2026-02-19,0,0,0,8,0.866025,0.5
4,AAPL.US,2026-02-20,0,0,0,8,0.866025,0.5
