In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import joblib

### Data import

In [2]:
# Fetch AAPL price history for the requested date window via yfinance.
df = yf.download(
    'AAPL',
    start='2010-01-01',
    end='2024-6-30',
)

[*********************100%%**********************]  1 of 1 completed


In [3]:
# Preview the last rows to confirm the download reaches the end of the window.
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-06-24,207.720001,212.699997,206.589996,208.139999,208.139999,80727000
2024-06-25,209.149994,211.380005,208.610001,209.070007,209.070007,56713900
2024-06-26,211.5,214.860001,210.639999,213.25,213.25,66213200
2024-06-27,214.690002,215.740005,212.350006,214.100006,214.100006,49772700
2024-06-28,215.770004,216.070007,210.300003,210.619995,210.619995,82542700


### Scaling

In [4]:
# Scale close price feature
#
# The min-max scaled values are written back over the raw 'Close' column and
# the fitted scaler is kept in `scaler` for export at the end of the notebook.
# NOTE(review): because 'Close' is overwritten in place, re-running this cell
# on its own refits the scaler on already-scaled values; re-run from the
# download cell instead.
# NOTE(review): the scaler is fitted on every row in the frame, including any
# rows later held out for evaluation - confirm that is intended.

close_column = df['Close'].values.reshape(-1, 1)

scaler = MinMaxScaler()
df['Close'] = scaler.fit_transform(close_column)

In [5]:
# Preview the first rows: 'Close' now holds scaled values, the other price
# columns are still in their original units.
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,7.6225,7.660714,7.585,0.003738,6.461977,493729600
2010-01-05,7.664286,7.699643,7.616071,0.003801,6.47315,601904800
2010-01-06,7.656429,7.686786,7.526786,0.003221,6.370185,552160000
2010-01-07,7.5625,7.571429,7.466071,0.003154,6.358409,477131200
2010-01-08,7.510714,7.571429,7.466429,0.003393,6.400682,447610800


### Feature engineering

In [7]:
# Columns that are selected into the X_static matrix further down.
static_features = [
    'Rolling_Mean_5',
    'Rolling_Mean_10',
    'Rolling_Mean_30',
    'Day',
    'Month',
    'Year',
]

In [8]:
# Add rolling mean features
#
# The series is shifted by one row before averaging, so each mean covers the
# preceding closes only and a row's own close never enters its feature.
# As a result the first `window` rows of each new column are NaN.

previous_close = df['Close'].shift(1)

df['Rolling_Mean_5'] = previous_close.rolling(5).mean()
df['Rolling_Mean_10'] = previous_close.rolling(10).mean()
df['Rolling_Mean_30'] = previous_close.rolling(30).mean()

In [9]:
# Add date features
#
# Break the date index into its calendar components, one column each.

trading_days = df.index

df['Day'] = trading_days.day
df['Month'] = trading_days.month
df['Year'] = trading_days.year

In [10]:
# Scale date features
#
# Only Day/Month/Year are rescaled here. The rolling-mean columns are not
# touched by this scaler; they were computed from the already-scaled 'Close'.

date_columns = ['Day', 'Month', 'Year']

date_scaler = MinMaxScaler()
df[date_columns] = date_scaler.fit_transform(df[date_columns].values)

In [11]:
# Define function to create supervised learning dataset

def create_dataset(df, lag, target_col='Close'):
    """Build sliding-window samples for one-step-ahead prediction.

    Sample ``i`` pairs the ``lag`` consecutive values
    ``df[target_col][i : i + lag]`` with the value that immediately follows
    them, ``df[target_col][i + lag]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series; rows are used in their existing order.
    lag : int
        Number of consecutive values in each input window. Must be >= 0.
    target_col : str, default 'Close'
        Label of the column to window. The column is looked up by label
        rather than by position, so the result does not depend on the order
        in which the frame's columns happen to be laid out.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, lag, 1)
        Input windows.
    y : numpy.ndarray of shape (n_samples,)
        Value following each window.

    ``n_samples`` is ``max(len(df) - lag, 0)``; when the frame is too short
    to form a single sample, correctly shaped empty arrays are returned.

    Raises
    ------
    ValueError
        If ``lag`` is negative, or ``target_col`` does not select a single
        one-dimensional column.
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    if lag < 0:
        raise ValueError(f'lag must be non-negative, got {lag}')

    series = df[target_col].to_numpy()
    if series.ndim != 1:
        raise ValueError(f'{target_col!r} must select exactly one column')

    n_samples = max(len(series) - lag, 0)

    # Row i of `window_idx` is [i, i + 1, ..., i + lag - 1]; indexing the
    # series with it gathers every window in one vectorised step.
    window_idx = np.arange(n_samples)[:, None] + np.arange(lag)[None, :]

    # Trailing axis of length 1 keeps the (samples, timesteps, features) layout.
    X = series[window_idx][:, :, None]
    # Copy so the returned targets do not alias the frame's own buffer.
    y = series[lag:lag + n_samples].copy()

    return X, y

In [12]:
# Create feature and target matrices
# Every sample is a 30-row window; its target is the value in the row that follows.

X, y = create_dataset(df, lag=30)

In [13]:
# Create static features matrix
# Rows start at position 30, the same offset as the lag passed to
# create_dataset, so row i lines up with target y[i].

X_static = df[static_features].iloc[30:].values

In [14]:
# Create date vector
# Taken from row 30 onward - the same offset used for X_static and the lag
# passed to create_dataset - so dates[i] is the index label of target y[i].

dates = df.index[30:]

In [15]:
# Display shapes

print(f'X: {X.shape}')
print(f'y: {y.shape}')
print(f'X_static: {X_static.shape}')
print(f'dates: {dates.shape}')

# X/y, X_static and dates are each built in a separate cell with its own
# hard-coded offset; fail loudly if they ever stop matching row for row.
assert len(X) == len(y) == len(X_static) == len(dates), 'sample arrays are misaligned'

# X_static is sliced from row 30 so that the longest (30-row) rolling mean is
# already defined; a NaN here means the offset and the windows disagree.
assert not np.isnan(X_static).any(), 'X_static contains NaN values'

X: (3616, 30, 1)
y: (3616,)
X_static: (3616, 6)
dates: (3616,)


### Data export

In [16]:
# Write each array to its own .npy file under ../data, in a fixed order.
for target_path, array in (
    ('../data/X.npy', X),
    ('../data/y.npy', y),
    ('../data/dates.npy', dates),
    ('../data/X_static.npy', X_static),
):
    np.save(target_path, array)

In [17]:
# Save the full engineered frame: scaled 'Close', rolling means and scaled
# date columns alongside the remaining downloaded columns.
df.to_csv('../data/stock_data.csv')

### Scaler export

In [18]:
# Persist both fitted scalers to disk.
# The second call is the cell's last expression, so its return value (the
# list of written file names) is what the notebook displays.
joblib.dump(scaler, '../models/price_scaler.pkl')
joblib.dump(date_scaler, '../models/date_scaler.pkl')

['../models/date_scaler.pkl']