In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [2]:
import yfinance as yf
stock = 'AAPL'
data = yf.download(stock, start='2015-01-01', end='2025-01-01')

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [3]:
data.to_csv('AAPL_data.csv')

In [4]:
stock_data = pd.read_csv('AAPL_data.csv')

In [5]:
stock_data.head()
stock_data.drop([0,1])


Unnamed: 0,Price,Close,High,Low,Open,Volume
0,Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
1,Date,,,,,
2,2015-01-02,24.3204345703125,24.789803736396227,23.879983264314035,24.77868056866345,212818400
3,2015-01-05,23.635292053222656,24.169171930797507,23.448435029507998,24.08908986430341,257142000
4,2015-01-06,23.63751220703125,23.89777802149258,23.274918008457785,23.699797868903328,263188400


### Preprocess the Data
Now i will clean and prepare the date for modeling, now we will focus on the 'Close' price for simplicity

The function `.pct_change()` implement the change in percentage by taking $$ Percent = \frac{Current - Previous}{Previous}$$

In [6]:
data['Return'] = data['Close'].pct_change()

Create the target variable (label) to indicate if the next day's closing price is higher:

- If `1` = Up (tomorrow's close > today's close)
- If `0` = Down (tomorrow's close <= today close)

In [5]:
data['Direction'] = (data['Close'].shift(-1) > data['Close']).astype(int)


Date
2015-01-02    0
2015-01-05    1
2015-01-06    1
2015-01-07    1
2015-01-08    1
             ..
2025-04-28    1
2025-04-29    1
2025-04-30    1
2025-05-01    0
2025-05-02    0
Name: Direction, Length: 2599, dtype: int64


Implement the Engineer Features


In [7]:
data['Lag1'] = data['Return'].shift(1) #Previous day's return
data['MA5'] = data['Close'].rolling(window=5).mean().shift(1)
data = data.dropna()

In [8]:
# Split the data
train_size = int(len(data) * 0.7)
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

In [9]:
# Prepare the Features and Labels
features = ['Lag1', 'MA5']
X_train = train_data[features]
y_train = train_data['Direction']
X_test = test_data[features]
y_test = test_data['Direction']

# Implement losgistic regression model

In [11]:
# Define sigmoid function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [12]:
# Loss value calculation
def compute_loss(x_train, y_train, w, b):
    err = 0
    epsilon = 1e-15
    m,n = x_train.shape
    for i in range(m):
        g = np.dot(w,x_train[i]) + b
        h = sigmoid(g)
        err = err + y_train[i]*np.log(h+epsilon) + (1 - y_train[i])*np.log(1-h+epsilon)

    err = err/(-m)
    return err

In [13]:
# Implement gradient descent
def compute_gradient(x_train, y_train, w, b):
    m,n = x_train.shape
    dw_dj = np.zeros(n,)
    db_dj = 0
    sum = 0
    for i in range(m):
        sum = sigmoid(np.dot(w, x_train[i]) + b) - y_train[i] 
        for j in range(n):
            dw_dj[j] = dw_dj[j] + sum * x_train[i][j]
        db_dj = db_dj + sum

    dw_dj = dw_dj/m
    db_dj = db_dj/m
    return dw_dj, db_dj    

def gradient_descent(x_train, y_train, w, b, iters, lr):
    losses = []
    for i in range(iters):
        dw_dj, db_dj = compute_gradient(x_train, y_train, w, b)
        w = w - lr*dw_dj
        b = b - lr*db_dj
        loss = compute_loss(x_train, y_train, w, b)
        losses.append(loss)
        if i % math.ceil(iters / 18) == 0:
            print(f"Iteration: {i}, Cost value: {compute_loss(x_train, y_train, w, b)}")

    print("Complete gradient descent!")
    print(f"Weight: {w}, Bias: {b}\n")

    return w, b, losses

Scaling
`x_train = (x_train - np.mean(x_train,axis=0)) / np.std(x_train, axis=0)`

In [14]:
X_train = (X_train - np.mean(X_train,axis=0)) / np.std(X_train, axis=0)

In [16]:
X_test = (X_test - np.mean(X_test,axis=0)) / np.std(X_train, axis=0)