In [1]:
!pip install yfinance pandas numpy scikit-learn joblib

Collecting yfinance
  Downloading yfinance-0.2.65-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting pandas
  Downloading pandas-2.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy
  Downloading numpy-2.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting joblib
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... done
Collecting pytz>=2022.5 (from yfinance)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py312-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.2.tar.gz (949 kB)
[2K     [9

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split # <-- New import
import os
import joblib

In [16]:
# Symbol and date range handed to yf.download in the download cell.
TICKER = 'AAPL'
START_DATE = '2015-01-01'
END_DATE = '2025-08-21'  # passed as `end=`; the last row in the recorded output is 2025-08-20

In [17]:
FEATURE = 'Close'  # column selected from the downloaded frame and then scaled
TIME_STEPS = 60  # not referenced in the cells shown here; presumably a look-back window length — TODO confirm
TEST_SPLIT = 0.3  # fraction of rows left out of the scaler fit (training length = len * (1 - TEST_SPLIT))

In [18]:
# All artifacts written by this notebook live under a single directory,
# created up front (a no-op when it already exists).
OUTPUT_DIR = 'processed_data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Destination paths: the processed series (named after the ticker) and the fitted scaler.
DATA_FILE = os.path.join(OUTPUT_DIR, f'{TICKER}_processed.npz')
SCALER_FILE = os.path.join(OUTPUT_DIR, 'scaler.joblib')


In [19]:
# Download the price history for TICKER over [START_DATE, END_DATE).
# `auto_adjust` is stated explicitly: recent yfinance releases switched its
# default to True and warn when the argument is omitted. Passing True keeps the
# adjusted prices this notebook already works with and removes the warning.
data = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)
if data.empty:
    # Fail loudly here instead of letting an empty frame reach the scaler.
    raise ValueError(f"No data fetched for ticker {TICKER}.")
data.tail()

  data = yf.download(TICKER, start=START_DATE, end=END_DATE)
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-08-14,232.779999,235.119995,230.850006,234.059998,51916300
2025-08-15,231.589996,234.279999,229.339996,234.0,56038700
2025-08-18,230.889999,233.119995,230.110001,231.699997,37476200
2025-08-19,230.559998,232.869995,229.350006,231.279999,39320800
2025-08-20,226.770004,230.469894,226.110001,229.970001,16749328


In [21]:
# Extract the FEATURE column as a 2-D array with a single column, the layout the
# scaler is fitted on below. Note: despite its name, `df` is a NumPy array, not a DataFrame.
df = data[FEATURE].to_numpy().reshape(-1, 1)

In [23]:
# Fit the scaler on the leading (training) rows only, so the held-out tail does
# not influence the min/max used for scaling.
# The row count is taken from `df`, the array built in the previous cell; the
# `dataset` name used here before is not defined in any visible cell and raised
# NameError on a fresh kernel.
training_data_len = int(len(df) * (1 - TEST_SPLIT))
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df[:training_data_len])

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [24]:
# Apply the train-fitted scaler to the whole series (`df`, the array the scaler
# was fitted on a slice of; `dataset` is not defined in any visible cell).
# Rows outside the fitted slice may map outside [0, 1], since the scaler was
# created with the default clip=False.
scaled_data = scaler.transform(df)

In [26]:
# Tabular view of the scaled series, kept for any later cells that use it.
scaled_df = pd.DataFrame(scaled_data, columns=[FEATURE])

# DATA_FILE ends in `.npz`, so store a real NumPy archive there. Writing CSV text
# to that path produced a file that `np.load` cannot open.
# Read it back with: np.load(DATA_FILE)['scaled_data']
# NOTE(review): if a downstream consumer reads this path with pd.read_csv,
# switch it to np.load.
np.savez(DATA_FILE, scaled_data=scaled_data)

# Save the scaler object itself so predictions can be inverse-transformed later.
joblib.dump(scaler, SCALER_FILE)

['processed_data/scaler.joblib']