In [48]:
import yfinance as yf
import pandas as pd

In [49]:
# Download daily AAPL price bars from Yahoo Finance, starting 2025-12-01.
# auto_adjust is passed explicitly: recent yfinance releases changed its default
# to True and emit a FutureWarning when it is omitted. Pinning it keeps the
# column set shown below (no separate "Adj Close") stable and silences the warning.
df = yf.download(tickers="AAPL", start="2025-12-01", auto_adjust=True)
# The download comes back with multi-level column labels; keep only level 0
# (the price field names) so columns can be addressed as df["Close"], etc.
df.columns = df.columns.get_level_values(0)
df.head(10)

  df = yf.download(tickers="AAPL", start="2025-12-01")
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-12-01,283.100006,283.420013,276.140015,278.01001,46587700
2025-12-02,286.190002,287.399994,282.630005,283.0,53669500
2025-12-03,284.149994,288.619995,283.299988,286.200012,43538700
2025-12-04,280.700012,284.730011,278.589996,284.100006,43989100
2025-12-05,278.779999,281.140015,278.049988,280.540009,47265800
2025-12-08,277.890015,279.670013,276.149994,278.130005,38211800
2025-12-09,277.179993,280.029999,276.920013,278.160004,32193300
2025-12-10,278.779999,279.75,276.440002,277.75,33038300
2025-12-11,278.029999,279.589996,273.809998,279.100006,33248000
2025-12-12,278.279999,279.220001,276.820007,277.899994,39532900


In [50]:
# Ticker symbol used to name the CSV files written by this notebook.
tickers='AAPL'
# Persist the raw downloaded frame (index included) under ../data/.
# NOTE(review): assumes the ../data directory already exists — confirm.
df.to_csv(f'../data/{tickers}.csv')

In [51]:
# Count missing values per column in the downloaded frame.
df.isna().sum()

Price
Close     0
High      0
Low       0
Open      0
Volume    0
dtype: int64

In [52]:
# Show the name of the frame's index.
df.index.name

'Date'

In [53]:
# Show the column labels after the multi-level header was reduced to level 0.
df.columns

Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')

In [54]:
import pandas as pd
import numpy as np


def add_price_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with simple and log returns of ``Close``.

    Columns added:
        return: row-over-row percentage change of ``Close``.
        log_return: natural log of ``Close`` divided by the previous row's ``Close``.

    The first row of both columns is NaN because it has no predecessor.
    The input frame is left unmodified.
    """
    close = df["Close"]
    previous_close = close.shift(1)

    # "return" is a Python keyword, so the new columns are passed as a dict.
    return df.assign(
        **{
            "return": close.pct_change(),
            "log_return": np.log(close / previous_close),
        }
    )


def add_lag_features(df: pd.DataFrame, lags=(1, 3)) -> pd.DataFrame:
    """Return a copy of ``df`` with lagged log-return and lagged close columns.

    For every ``lag`` in ``lags`` two columns are added:
        return_lag_{lag}: ``log_return`` shifted down by ``lag`` rows
            (note: this is the *log* return, despite the column name).
        Close_lag_{lag}: ``Close`` shifted down by ``lag`` rows.

    Requires ``log_return`` and ``Close`` columns. The first ``lag`` rows of
    each new column are NaN. The input frame is left unmodified.
    """
    lagged = {}
    for k in lags:
        lagged[f"return_lag_{k}"] = df["log_return"].shift(k)
        lagged[f"Close_lag_{k}"] = df["Close"].shift(k)

    return df.assign(**lagged)


def add_rolling_features(df: pd.DataFrame, windows=(2, 3)) -> pd.DataFrame:
    """Return a copy of ``df`` with rolling mean/std of ``log_return``.

    For every ``window`` in ``windows`` two columns are added:
        rolling_mean_{window}: rolling mean of ``log_return``.
        rolling_std_{window}: rolling standard deviation of ``log_return``.

    Requires a ``log_return`` column. Rows without a full window are NaN.
    The input frame is left unmodified.
    """
    rolled = {}
    for w in windows:
        rolled[f"rolling_mean_{w}"] = df["log_return"].rolling(w).mean()
        rolled[f"rolling_std_{w}"] = df["log_return"].rolling(w).std()

    return df.assign(**rolled)


def add_moving_averages(df: pd.DataFrame, windows=(3, 10)) -> pd.DataFrame:
    """Return a copy of ``df`` with simple and exponential moving averages of ``Close``.

    For every ``window`` in ``windows`` two columns are added:
        sma_{window}: rolling mean of ``Close`` (NaN until a full window exists).
        ema_{window}: exponentially weighted mean of ``Close`` with
            ``span=window`` and ``adjust=False``.

    The input frame is left unmodified.
    """
    averages = {}
    for w in windows:
        averages[f"sma_{w}"] = df["Close"].rolling(w).mean()
        averages[f"ema_{w}"] = df["Close"].ewm(span=w, adjust=False).mean()

    return df.assign(**averages)


def add_volume_features(df: pd.DataFrame, window=10) -> pd.DataFrame:
    """Return a copy of ``df`` with volume-based features.

    Columns added:
        volume_change: row-over-row percentage change of ``Volume``.
        volume_rolling_mean_{window}: rolling mean of ``Volume`` over
            ``window`` rows (NaN until a full window exists).

    The input frame is left unmodified.
    """
    volume = df["Volume"]

    return df.assign(
        **{
            "volume_change": volume.pct_change(),
            f"volume_rolling_mean_{window}": volume.rolling(window).mean(),
        }
    )


def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with calendar features derived from its index.

    Columns added (all read from ``df.index``):
        day_of_week: ``index.dayofweek``.
        week_of_year: ISO calendar week number, cast to int.
        month: ``index.month``.
        is_month_end: 1 where ``index.is_month_end`` is true, else 0.

    The input frame is left unmodified.
    """
    idx = df.index
    iso_week = idx.isocalendar().week.astype(int)
    month_end_flag = idx.is_month_end.astype(int)

    return df.assign(
        day_of_week=idx.dayofweek,
        week_of_year=iso_week,
        month=idx.month,
        is_month_end=month_end_flag,
    )


#def add_target(df: pd.DataFrame, horizon=1) -> pd.DataFrame:
#    """
#    Regression target: next-day log return
#    """
#    df = df.copy()
#    df["target_return"] = df["log_return"].shift(-horizon)
#    return df


def build_feature_table(df: pd.DataFrame) -> pd.DataFrame:
    """Build the feature table from a price frame.

    Steps, in order:
        1. Sort rows by index.
        2. Apply each feature builder with its default parameters:
           price, lag, rolling, moving-average, volume and time features.
        3. Drop the ``Open``, ``Volume``, ``High`` and ``Low`` columns.
        4. Drop every row that contains a NaN (e.g. the warm-up rows
           produced by lags and rolling windows).

    Raises:
        ValueError: if ``df.index`` is not a ``pd.DatetimeIndex``.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("DataFrame index must be a DatetimeIndex")

    # Order matters: lag and rolling features read the "log_return" column
    # created by add_price_features. The add_target step is currently disabled.
    feature_steps = (
        add_price_features,
        add_lag_features,
        add_rolling_features,
        add_moving_averages,
        add_volume_features,
        add_time_features,
    )

    table = df.sort_index()
    for step in feature_steps:
        table = step(table)

    raw_columns = ["Open", "Volume", "High", "Low"]
    return table.drop(columns=raw_columns).dropna()



In [55]:
# Build the feature table from the downloaded price frame.
df_features = build_feature_table(df)
# A bare last expression gives the notebook's rich table display; print() on a
# DataFrame produces wrapped plain text that is hard to read for wide frames.
df_features.head()


Price            Close    return  log_return  return_lag_1  Close_lag_1  \
Date                                                                      
2025-12-12  278.279999  0.000899    0.000899     -0.002694   278.029999   
2025-12-15  274.109985 -0.014985   -0.015098      0.000899   278.279999   
2025-12-16  274.609985  0.001824    0.001822     -0.015098   274.109985   
2025-12-17  271.839996 -0.010087   -0.010138      0.001822   274.609985   
2025-12-18  272.190002  0.001288    0.001287     -0.010138   271.839996   

Price       return_lag_3  Close_lag_3  rolling_mean_2  rolling_std_2  \
Date                                                                   
2025-12-12     -0.002558   277.179993       -0.000898       0.002540   
2025-12-15      0.005756   278.779999       -0.007100       0.011312   
2025-12-16     -0.002694   278.029999       -0.006638       0.011965   
2025-12-17      0.000899   278.279999       -0.004158       0.008457   
2025-12-18     -0.015098   274.109985     

In [56]:
# Lift pandas' display limits on columns and rows (None = no limit).
# These are global options and remain in effect for the cells that follow.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)
# Show the most recent feature row.
df_features.tail(1)

Price,Close,return,log_return,return_lag_1,Close_lag_1,return_lag_3,Close_lag_3,rolling_mean_2,rolling_std_2,rolling_mean_3,rolling_std_3,sma_3,ema_3,sma_10,ema_10,volume_change,volume_rolling_mean_10,day_of_week,week_of_year,month,is_month_end
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2026-01-22,249.779999,0.008601,0.008564,0.003843,247.649994,-0.010433,255.529999,0.006204,0.003338,-0.007586,0.024002,248.04333,249.814134,255.753996,255.24226,-0.697737,48413234.9,3,4,1,0


In [57]:
# Persist the engineered feature table as the "gold" dataset.
# Fix: this previously wrote the raw `df`, which only duplicated
# ../data/{tickers}.csv and never saved the computed features.
df_features.to_csv(f'../data/{tickers}_gold.csv')

In [58]:
# Number of rows left in the feature table after NaN rows were dropped.
len(df_features)

27

In [59]:
# Drop any row that still contains a NaN and report how many rows remain.
# build_feature_table already ends with dropna(), so the count is expected to
# match len(df_features).
dropped_df = df_features.dropna(axis=0, how="any")
dropped_df.shape[0]

27

In [60]:
# Preview up to the first 50 rows of the NaN-free feature table.
dropped_df.head(50)

Price,Close,return,log_return,return_lag_1,Close_lag_1,return_lag_3,Close_lag_3,rolling_mean_2,rolling_std_2,rolling_mean_3,rolling_std_3,sma_3,ema_3,sma_10,ema_10,volume_change,volume_rolling_mean_10,day_of_week,week_of_year,month,is_month_end
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-12-12,278.279999,0.000899,0.000899,-0.002694,278.029999,-0.002558,277.179993,-0.000898,0.00254,0.00132,0.004241,278.363332,278.273573,280.308002,279.670384,0.189031,41127510.0,4,50,12,0
2025-12-15,274.109985,-0.014985,-0.015098,0.000899,278.279999,0.005756,278.779999,-0.0071,0.011312,-0.005631,0.008393,276.806661,276.191779,279.409,278.659403,0.275118,41509650.0,0,51,12,0
2025-12-16,274.609985,0.001824,0.001822,-0.015098,274.109985,-0.002694,278.029999,-0.006638,0.011965,-0.004126,0.009514,275.666656,275.400882,278.250998,277.923145,-0.253139,39907560.0,1,51,12,0
2025-12-17,271.839996,-0.010087,-0.010138,0.001822,274.609985,0.000899,278.279999,-0.004158,0.008457,-0.007805,0.008698,273.519989,273.620439,277.019998,276.817118,0.331755,40567560.0,2,51,12,0
2025-12-18,272.190002,0.001288,0.001287,-0.010138,271.839996,-0.015098,274.109985,-0.004426,0.008079,-0.002343,0.006756,272.879995,272.905221,276.168997,275.975824,0.029757,41331720.0,3,51,12,0
2025-12-19,273.670013,0.005437,0.005423,0.001287,272.190002,0.001822,274.609985,0.003355,0.002925,-0.001143,0.00806,272.566671,273.287617,275.657999,275.556586,1.801279,51068340.0,4,51,12,0
2025-12-22,270.970001,-0.009866,-0.009915,0.005423,273.670013,-0.010138,271.839996,-0.002246,0.010845,-0.001069,0.007935,272.276672,272.128809,274.965997,274.722661,-0.747139,50904340.0,0,52,12,0
2025-12-23,272.359985,0.00513,0.005117,-0.009915,270.970001,0.001287,272.190002,-0.002399,0.010629,0.000208,0.008768,272.333333,272.244397,274.483997,274.293084,-0.189485,50649210.0,1,52,12,0
2025-12-24,273.809998,0.005324,0.00531,0.005117,272.359985,0.005423,273.670013,0.005213,0.000137,0.00017,0.008735,272.379995,273.027197,273.986996,274.20525,-0.39577,49136440.0,2,52,12,0
2025-12-26,273.399994,-0.001497,-0.001499,0.00531,273.809998,-0.009915,270.970001,0.001906,0.004814,0.002976,0.003876,273.189992,273.213596,273.523996,274.05884,0.201624,47963820.0,4,52,12,0


In [61]:
# Number of columns in the feature table that contain at least one NaN.
df_features.isna().any().sum()

0