In [2]:
import yfinance as yf
import pandas as pd
import pandas_ta as ta
import numpy as np

ticker_symbol = "AAPL"
nan_threshold = 0.9
lags = [1, 3, 5]
include_fundamentals = False 
important_metrics = ['P/E', 'EPS (ttm)', 'ROA', 'ROE', 'Debt/Eq', 'Gross Margin']

ticker = yf.Ticker(ticker_symbol)
price_df = ticker.history(period="5y", interval="1d", actions=True)
price_df = price_df[["Open", "High", "Low", "Close", "Volume"]]
price_df.dropna(inplace=True)

price_df.ta.rsi(append=True)
price_df.ta.macd(append=True)
price_df.ta.ema(length=20, append=True)
price_df.ta.ema(length=50, append=True)
price_df.ta.ema(length=100, append=True)
price_df.ta.sma(length=20, append=True)
price_df.ta.sma(length=50, append=True)
price_df.ta.sma(length=100, append=True)
price_df.ta.bbands(append=True)
price_df.ta.adx(append=True)
price_df.ta.obv(append=True)
price_df.ta.stoch(append=True)
price_df.ta.mom(append=True)
price_df.ta.cmo(append=True)
price_df.ta.roc(append=True)
price_df.ta.willr(append=True)
price_df.ta.trix(append=True)
price_df.ta.efi(append=True)
price_df.ta.ao(append=True)
price_df.ta.kc(append=True)
price_df.ta.vortex(append=True)
price_df.ta.pvi(append=True)
price_df.ta.nvi(append=True)

for col in price_df.columns[:40]:  
    if pd.api.types.is_numeric_dtype(price_df[col]):
        for lag in lags:
            price_df[f"{col}_lag{lag}"] = price_df[col].shift(lag)

price_df["Target"] = price_df["Close"].shift(-1) / price_df["Close"] - 1
price_df.dropna(inplace=True)

if include_fundamentals:
    from finvizfinance.quote import finvizfinance
    stock = finvizfinance(ticker_symbol)
    fundamentals_raw = stock.ticker_fundament()
    fundamentals_df = pd.DataFrame(fundamentals_raw.items(), columns=["Metric", "Value"]).set_index("Metric").T

    def clean_value(val):
        if isinstance(val, str):
            val = val.replace('%', '').replace(',', '').replace('B', 'e9').replace('M', 'e6').replace('K', 'e3').replace('T', 'e12')
            if val in ['N/A', '-', '']:
                return np.nan
        try:
            return eval(val)
        except:
            return np.nan

    for col in fundamentals_df.columns:
        fundamentals_df[col] = clean_value(fundamentals_df[col].values[0])

    fundamentals_df = fundamentals_df.dropna(axis=1, thresh=(1 - nan_threshold) * len(fundamentals_df))
    filtered_cols = [col for col in important_metrics if col in fundamentals_df.columns]
    fundamentals_df = fundamentals_df[filtered_cols]

    for col in fundamentals_df.columns:
        price_df[col] = fundamentals_df[col].values[0]

print(f"Dataset ready with shape: {price_df.shape}")
price_df.head()


Dataset ready with shape: (1151, 162)


Unnamed: 0_level_0,Open,High,Low,Close,Volume,RSI_14,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,EMA_20,...,VTXP_14_lag1,VTXP_14_lag3,VTXP_14_lag5,VTXM_14_lag1,VTXM_14_lag3,VTXM_14_lag5,PVI_1_lag1,PVI_1_lag3,PVI_1_lag5,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-12-15 00:00:00-05:00,121.293892,124.766684,121.089038,124.747169,157243700,65.958409,2.009719,0.374756,1.634963,118.315644,...,1.154414,1.048711,1.078362,0.798156,0.83866,0.831579,995.458805,996.132277,998.222649,-0.000547
2020-12-16 00:00:00-05:00,124.288684,125.225157,123.459502,124.678879,98208600,65.760916,2.26617,0.504966,1.761204,118.921667,...,1.177898,1.033242,1.041521,0.638079,0.852037,0.796947,1000.467845,995.458805,996.132277,0.006963
2020-12-17 00:00:00-05:00,125.742163,126.405512,124.903231,125.547066,94359800,67.109232,2.510525,0.599456,1.911068,119.552657,...,1.173246,1.154414,1.048711,0.650586,0.798156,0.83866,1000.467845,995.458805,996.132277,-0.015851
2020-12-18 00:00:00-05:00,125.80071,125.937279,123.030281,123.557053,192541500,61.163701,2.514613,0.482836,2.031777,119.934028,...,1.181379,1.177898,1.033242,0.640872,0.638079,0.852037,1000.467845,1000.467845,995.458805,0.012395
2020-12-21 00:00:00-05:00,121.957233,125.166634,120.425695,125.088593,121251600,63.820325,2.611334,0.463645,2.147689,120.424939,...,1.130414,1.173246,1.154414,0.732917,0.650586,0.798156,998.882771,1000.467845,995.458805,0.028464


In [7]:
price_df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,RSI_14,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,EMA_20,...,VTXP_14_lag1,VTXP_14_lag3,VTXP_14_lag5,VTXM_14_lag1,VTXM_14_lag3,VTXM_14_lag5,PVI_1_lag1,PVI_1_lag3,PVI_1_lag5,Target
count,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0,...,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0,1151.0
mean,171.452247,173.346932,169.725467,171.630542,72510560.0,53.090943,0.539249,-7e-06,0.539256,170.892845,...,0.970818,0.970631,0.970433,0.901172,0.901394,0.901712,975.789012,975.834573,975.876836,0.000651
std,34.121348,34.370911,33.903627,34.184787,30393890.0,12.052505,3.035808,0.990647,2.826554,33.824241,...,0.180699,0.18051,0.18042,0.168701,0.168517,0.168557,12.867334,12.894235,12.918768,0.017948
min,116.287499,117.625931,113.532471,113.679016,23234700.0,20.057863,-10.616152,-4.337874,-8.325011,115.88993,...,0.50695,0.50695,0.50695,0.39935,0.39935,0.39935,939.872743,939.872743,939.872743,-0.092456
25%,144.087916,145.73955,142.720128,144.12532,50753400.0,43.613826,-1.627311,-0.594448,-1.396381,144.012233,...,0.831203,0.831203,0.831203,0.774333,0.774754,0.774754,970.394993,970.394993,970.394993,-0.008423
50%,168.339683,170.212035,167.230036,168.70163,66015800.0,53.053445,0.664313,-0.003899,0.498451,167.139021,...,0.968045,0.968045,0.968045,0.899012,0.899012,0.899189,977.882484,977.976036,977.976036,0.00114
75%,192.734442,194.035856,190.801264,192.350739,87711750.0,62.621432,2.709555,0.677119,2.618841,189.478137,...,1.104902,1.104476,1.104211,1.021541,1.021541,1.021886,984.281524,984.356849,984.392474,0.010226
max,257.568678,259.474086,257.010028,258.396667,318679900.0,81.320686,8.913714,2.903533,8.281467,248.469799,...,1.431602,1.431602,1.431602,1.348395,1.348395,1.348395,1005.070226,1005.070226,1005.070226,0.153288


In [10]:
price_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,RSI_14,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,EMA_20,...,VTXP_14_lag1,VTXP_14_lag3,VTXP_14_lag5,VTXM_14_lag1,VTXM_14_lag3,VTXM_14_lag5,PVI_1_lag1,PVI_1_lag3,PVI_1_lag5,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-12-04 00:00:00-05:00,119.596528,119.850161,118.542984,119.255104,78260400,59.773385,1.497177,0.562860,0.934316,115.889930,...,1.065005,1.110715,0.984632,0.867697,0.788434,1.028309,996.534312,996.534312,994.424367,0.012270
2020-12-07 00:00:00-05:00,119.313628,121.518264,119.255101,120.718353,86712000,62.503466,1.691734,0.605934,1.085800,116.349780,...,1.063694,1.087227,0.996623,0.865662,0.855281,1.003941,996.534312,996.534312,996.534312,0.005091
2020-12-08 00:00:00-05:00,121.323159,121.918216,120.074510,121.332909,82225500,63.620200,1.873910,0.630488,1.243422,116.824363,...,1.077895,1.065005,1.110715,0.852145,0.867697,0.788434,997.761303,996.534312,996.534312,-0.020904
2020-12-09 00:00:00-05:00,121.479254,122.864464,118.035733,118.796623,115089200,56.183365,1.792961,0.439631,1.353330,117.012198,...,1.078362,1.063694,1.087227,0.831579,0.865662,0.855281,997.761303,996.534312,996.534312,0.011989
2020-12-10 00:00:00-05:00,117.547990,120.835434,117.206566,120.220863,81312200,59.076295,1.822721,0.375513,1.447208,117.317785,...,1.041522,1.077895,1.065005,0.796946,0.852145,0.867697,995.670950,997.761303,996.534312,-0.006735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-07-01 00:00:00-04:00,206.669998,210.190002,206.139999,207.820007,78788900,60.683982,0.262378,1.006341,-0.743963,201.744386,...,0.972432,0.947330,0.953819,0.960853,0.999076,0.973179,971.156821,969.083004,969.360836,0.022231
2025-07-02 00:00:00-04:00,208.910004,213.339996,208.139999,212.440002,67941800,66.312941,1.065211,1.447339,-0.382128,202.763016,...,1.069362,0.946613,0.934450,0.864331,1.033295,1.007384,971.156821,969.122806,969.360836,0.005225
2025-07-03 00:00:00-04:00,212.149994,214.649994,211.809998,213.550003,34955800,67.516287,1.770619,1.722198,0.048421,203.790348,...,1.139797,0.972432,0.947330,0.802651,0.960853,0.999076,971.156821,971.156821,969.083004,-0.016858
2025-07-07 00:00:00-04:00,212.679993,216.229996,208.800003,209.949997,50229000,60.027025,2.015931,1.574008,0.441923,204.376981,...,1.223382,1.069362,0.946613,0.694245,0.864331,1.033295,971.156821,971.156821,969.122806,0.000286


In [8]:
df = price_df.copy()

In [9]:
df.index = pd.to_datetime(df.index)
df = df.sort_index()

In [12]:
df.to_csv(f"{ticker_symbol}_dataset.csv", index=True)