Source: @DeepCharts Youtube Channel (https://www.youtube.com/@DeepCharts)

# Tutorial: Predict Stock Prices with Technical Indicators and Machine Learning

## 1. Data Import and Technical Indicator Feature Engineering

### Import Libraries

In [1]:
import yfinance as yf
import pandas_ta as ta
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
from plotly.subplots import make_subplots
import plotly.graph_objects as go

### Load in Stock Ticker Price with yfinance Library

In [2]:
# Load Ticker data
df = yf.download('NVDA', start="2022-10-25", end="2024-10-25")
df = df[['Open', 'High', 'Low', 'Close', 'Volume']]

[*********************100%%**********************]  1 of 1 completed


### Create 1-Trading Day Lag Structure

In [3]:
# Shift data backward by one day to ensure no data leakage
df['Previous_Close'] = df['Close'].shift(1)  # Add previous day's close as a feature
df['Close_shifted'] = df['Close'].shift(1)
df['Open_shifted'] = df['Open'].shift(1)
df['High_shifted'] = df['High'].shift(1)
df['Low_shifted'] = df['Low'].shift(1)

df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Previous_Close,Close_shifted,Open_shifted,High_shifted,Low_shifted
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-10-25,12.694000,13.300000,12.664000,13.261000,505482000,,,,,
2022-10-26,12.869000,13.388000,12.708000,12.896000,532953000,13.261000,13.261000,12.694000,13.300000,12.664000
2022-10-27,13.630000,13.838000,13.122000,13.176000,583113000,12.896000,12.896000,12.869000,13.388000,12.708000
2022-10-28,13.104000,13.850000,13.061000,13.834000,521040000,13.176000,13.176000,13.630000,13.838000,13.122000
2022-10-31,13.778000,13.838000,13.297000,13.497000,486341000,13.834000,13.834000,13.104000,13.850000,13.061000
...,...,...,...,...,...,...,...,...,...,...
2024-10-18,138.669998,138.899994,137.279999,138.000000,176090200,136.929993,136.929993,139.339996,140.889999,136.869995
2024-10-21,138.130005,143.710007,138.000000,143.710007,264554500,138.000000,138.000000,138.669998,138.899994,137.279999
2024-10-22,142.910004,144.419998,141.779999,143.589996,226311600,143.710007,143.710007,138.130005,143.710007,138.000000
2024-10-23,142.029999,142.429993,137.460007,139.559998,285930000,143.589996,143.589996,142.910004,144.419998,141.779999


### Calculate Techinical Indicators based on the lagged/shifted columns

In [4]:
# Calculate technical indicators on the shifted data

# Simple Moving Average (SMA): Average price over the last 50 periods
df['SMA_50'] = ta.sma(df['Close_shifted'], length=50)

# Exponential Moving Average (EMA): Weighted average that reacts faster to recent price changes, using 50 periods
df['EMA_50'] = ta.ema(df['Close_shifted'], length=50)

# Relative Strength Index (RSI): Momentum indicator that measures the magnitude of recent price changes to evaluate overbought/oversold conditions, using a 14-period lookback
df['RSI'] = ta.rsi(df['Close_shifted'], length=14)

# Moving Average Convergence Divergence (MACD): Trend-following momentum indicator, using 12 and 26 periods for the fast and slow EMAs and a 9-period signal line
macd = ta.macd(df['Close_shifted'], fast=12, slow=26, signal=9)
df['MACD'] = macd['MACD_12_26_9']        # MACD line
df['Signal_Line'] = macd['MACDs_12_26_9'] # Signal line

# Bollinger Bands: Volatility indicator using a 20-period moving average and 2 standard deviations
bollinger = ta.bbands(df['Close_shifted'], length=20, std=2)
df['BB_Upper'] = bollinger['BBU_20_2.0']  # Upper Bollinger Band
df['BB_Middle'] = bollinger['BBM_20_2.0'] # Middle Band (20-period SMA)
df['BB_Lower'] = bollinger['BBL_20_2.0']  # Lower Bollinger Band

# Stochastic Oscillator: Momentum indicator comparing closing prices to price ranges over 14 periods with a 3-period %D moving average
stoch = ta.stoch(df['High_shifted'], df['Low_shifted'], df['Close_shifted'], k=14, d=3)
df['%K'] = stoch['STOCHk_14_3_3'] # %K line (main line)
df['%D'] = stoch['STOCHd_14_3_3'] # %D line (3-period moving average of %K)

# Average True Range (ATR): Volatility indicator measuring the average range of price movement over the last 14 periods
df['ATR'] = ta.atr(df['High_shifted'], df['Low_shifted'], df['Close_shifted'], length=14)

In [5]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Previous_Close,Close_shifted,Open_shifted,High_shifted,Low_shifted,...,EMA_50,RSI,MACD,Signal_Line,BB_Upper,BB_Middle,BB_Lower,%K,%D,ATR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-25,12.694000,13.300000,12.664000,13.261000,505482000,,,,,,...,,,,,,,,,,
2022-10-26,12.869000,13.388000,12.708000,12.896000,532953000,13.261000,13.261000,12.694000,13.300000,12.664000,...,,,,,,,,,,
2022-10-27,13.630000,13.838000,13.122000,13.176000,583113000,12.896000,12.896000,12.869000,13.388000,12.708000,...,,,,,,,,,,
2022-10-28,13.104000,13.850000,13.061000,13.834000,521040000,13.176000,13.176000,13.630000,13.838000,13.122000,...,,,,,,,,,,
2022-10-31,13.778000,13.838000,13.297000,13.497000,486341000,13.834000,13.834000,13.104000,13.850000,13.061000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-18,138.669998,138.899994,137.279999,138.000000,176090200,136.929993,136.929993,139.339996,140.889999,136.869995,...,122.538644,64.339027,4.968295,3.960876,141.104337,126.616500,112.128664,78.684078,81.946493,5.161076
2024-10-21,138.130005,143.710007,138.000000,143.710007,264554500,138.000000,138.000000,138.669998,138.899994,137.279999,...,123.144972,65.338174,5.119328,4.192566,142.153681,127.716500,113.279320,85.845129,82.084824,4.933142
2024-10-22,142.910004,144.419998,141.779999,143.589996,226311600,143.710007,143.710007,138.130005,143.710007,138.000000,...,123.951444,70.145283,5.634818,4.481016,144.115822,129.089001,114.062180,91.132678,85.220628,4.988633
2024-10-23,142.029999,142.429993,137.460007,139.559998,285930000,143.589996,143.589996,142.910004,144.419998,141.779999,...,124.721583,69.925782,5.964904,4.777794,146.010714,130.225000,114.439287,95.109950,90.695919,4.820873


### Drop Trading Days With Missing Data (beginning of time series days)

In [6]:
# Drop rows with missing values due to shifting and indicator calculation
df.dropna(inplace=True)

In [7]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Previous_Close,Close_shifted,Open_shifted,High_shifted,Low_shifted,...,EMA_50,RSI,MACD,Signal_Line,BB_Upper,BB_Middle,BB_Lower,%K,%D,ATR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-06,14.474000,15.010000,14.034000,14.859000,405044000,14.265000,14.265000,14.491000,14.564000,14.148000,...,15.417725,39.620701,-0.400730,-0.140791,18.417812,15.859100,13.300388,13.225583,13.496130,0.721511
2023-01-09,15.284000,16.056000,15.141000,15.628000,504231000,14.859000,14.859000,14.474000,15.010000,14.034000,...,15.395815,45.403066,-0.381580,-0.188949,18.387870,15.796050,13.204230,20.591784,16.102234,0.740113
2023-01-10,15.507000,15.962000,15.472000,15.909000,384101000,15.628000,15.628000,15.284000,16.056000,15.141000,...,15.404920,51.834114,-0.300884,-0.211336,18.233445,15.719000,13.204555,35.184088,23.000485,0.773455
2023-01-11,15.840000,16.028000,15.563000,16.000999,353285000,15.909000,15.909000,15.507000,15.962000,15.472000,...,15.424688,53.967845,-0.211815,-0.211432,18.111648,15.664400,13.217152,56.096015,37.290629,0.752801
2023-01-12,16.100000,16.636999,15.492000,16.511000,551409000,16.000999,16.000999,15.840000,16.028000,15.563000,...,15.447288,54.675778,-0.132279,-0.195601,17.887335,15.587700,13.288065,71.527507,54.269203,0.731861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-18,138.669998,138.899994,137.279999,138.000000,176090200,136.929993,136.929993,139.339996,140.889999,136.869995,...,122.538644,64.339027,4.968295,3.960876,141.104337,126.616500,112.128664,78.684078,81.946493,5.161076
2024-10-21,138.130005,143.710007,138.000000,143.710007,264554500,138.000000,138.000000,138.669998,138.899994,137.279999,...,123.144972,65.338174,5.119328,4.192566,142.153681,127.716500,113.279320,85.845129,82.084824,4.933142
2024-10-22,142.910004,144.419998,141.779999,143.589996,226311600,143.710007,143.710007,138.130005,143.710007,138.000000,...,123.951444,70.145283,5.634818,4.481016,144.115822,129.089001,114.062180,91.132678,85.220628,4.988633
2024-10-23,142.029999,142.429993,137.460007,139.559998,285930000,143.589996,143.589996,142.910004,144.419998,141.779999,...,124.721583,69.925782,5.964904,4.777794,146.010714,130.225000,114.439287,95.109950,90.695919,4.820873


## 2. Modeling: Backtest

### Choose # of days for rolling training data window and Choose Technical Indicators 

In [8]:
# Parameters
window_size = 20  # 4 weeks of trading days (5 days per week * 4)

# List of indicators to test, including Previous_Close
indicators = ['SMA_50', 'EMA_50', 'RSI', 'MACD', 'Signal_Line', 'BB_Upper', 'BB_Middle', 'BB_Lower', '%K', '%D', 'ATR', 'Close_shifted', 'Previous_Close']

# Initialize a dictionary to store predictions, actuals, daily MAE for each indicator
results = {indicator: {'predictions': [], 'actual': [], 'daily_mae': []} for indicator in indicators}

### Loop over multiple 20-Day Train Datasets for Model Building and Next Day Test Datasets for Model Evaluation

In [9]:
# Sequentially predict the actual close price using a rolling 4 weeks window, set by window_size
for i in range(window_size, len(df) - 1):
    train_df = df.iloc[i - window_size:i]  # Training window 
    test_index = i + 1  # Index of next day's prediction
    actual_close_price = df['Close'].iloc[test_index]  # Next day's actual closing price

    # Individual indicators as predictors (plus Previous_Close)
    for indicator in indicators[:-1]:  # Exclude Previous_Close from standalone tests
        X_train = train_df[[indicator, 'Previous_Close']]
        y_train = train_df['Close']
        X_train = sm.add_constant(X_train)  # Add constant for intercept

        model = sm.OLS(y_train, X_train).fit()
        X_test = pd.DataFrame({indicator: [df[indicator].iloc[test_index]], 'Previous_Close': [df['Previous_Close'].iloc[test_index]]})
        X_test = sm.add_constant(X_test, has_constant='add')  # Add constant for prediction

        prediction = model.predict(X_test)[0]
        results[indicator]['predictions'].append(prediction)
        results[indicator]['actual'].append(actual_close_price)
        
        daily_mae = mean_absolute_error([actual_close_price], [prediction])
        results[indicator]['daily_mae'].append(daily_mae)



## 3. Prediction Evaluation

In [10]:
# Calculate accuracy metrics (MAE, MSE) for each individual indicator and the combined model
accuracy_data = {
    'Indicator': [],
    'MAE': [],
    'MSE': []
}

for indicator in indicators[:-1]:  # Exclude Previous_Close from standalone tests in accuracy table
    if results[indicator]['actual']:  # Check if there are results for this indicator
        mae = mean_absolute_error(results[indicator]['actual'], results[indicator]['predictions'])
        mse = mean_squared_error(results[indicator]['actual'], results[indicator]['predictions'])
        accuracy_data['Indicator'].append(indicator)
        accuracy_data['MAE'].append(mae)
        accuracy_data['MSE'].append(mse)


# Create accuracy DataFrame
accuracy_df = pd.DataFrame(accuracy_data).sort_values(by='MAE').reset_index(drop=True)
accuracy_df

Unnamed: 0,Indicator,MAE,MSE
0,Close_shifted,1.825278,7.765217
1,MACD,1.95396,9.289746
2,EMA_50,1.976608,8.993002
3,%D,2.000034,9.490673
4,Signal_Line,2.006564,10.237693
5,BB_Upper,2.021081,9.107488
6,%K,2.034116,9.888641
7,RSI,2.0926,10.429998
8,ATR,2.119318,10.869468
9,BB_Middle,2.128578,10.78885


## 4. Plotting the Results

In [11]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create faceted plot with each indicator's daily MAE
fig = make_subplots(rows=len(indicators), cols=1, shared_xaxes=True, vertical_spacing=0.02,
                    subplot_titles=[f"{indicator} Daily MAE" for indicator in indicators[:-1]])

# Find the global y-axis range across all indicators
y_values = [results[indicator]['daily_mae'] for indicator in indicators[:-1]]
y_min = min(min(y) for y in y_values)
y_max = max(max(y) for y in y_values)

# Add each individual indicator's daily MAE
for idx, indicator in enumerate(indicators[:-1]):
    fig.add_trace(
        go.Scatter(
            x=df.index[window_size + 1:],  # Start date after the initial window
            y=results[indicator]['daily_mae'],
            mode='lines',
            name=f'{indicator} Daily MAE'
        ),
        row=idx + 1, col=1
    )

# Update layout with shared y-axis range and individual x-axis labels
fig.update_yaxes(range=[y_min, y_max])  # Apply the common y-axis range across all subplots
fig.update_xaxes(title_text="Date", row=len(indicators), col=1)  # Add x-axis label for the last row

# Final layout adjustments
fig.update_layout(
    height=150 * (len(indicators)),  # Adjust height for the combined model
    title="Daily MAE of Each Technical Indicator on NVDA Closing Price",
    yaxis_title="Daily MAE",
    showlegend=False,
    template="plotly_white"
)

fig.show()


In [12]:
# Create the figure
fig = go.Figure()

# Add Close price
fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines', name='Close Price', line=dict(color='white', width=1)))

# Add SMA, EMA
fig.add_trace(go.Scatter(x=df.index, y=df['SMA_50'], mode='lines', name='SMA 50', line=dict(color='yellow', width=1)))
fig.add_trace(go.Scatter(x=df.index, y=df['EMA_50'], mode='lines', name='EMA 50', line=dict(color='orange', width=1)))

# Add Bollinger Bands
fig.add_trace(go.Scatter(x=df.index, y=df['BB_Upper'], mode='lines', name='BB Upper', line=dict(color='blue', width=1, dash='dot')))
fig.add_trace(go.Scatter(x=df.index, y=df['BB_Lower'], mode='lines', name='BB Lower', line=dict(color='blue', width=1, dash='dot')))
fig.add_trace(go.Scatter(x=df.index, y=df['BB_Middle'], mode='lines', name='BB Middle', line=dict(color='blue', width=1)))

# Add MACD and Signal Line
fig.add_trace(go.Scatter(x=df.index, y=df['MACD'], mode='lines', name='MACD', line=dict(color='cyan', width=1)))
fig.add_trace(go.Scatter(x=df.index, y=df['Signal_Line'], mode='lines', name='Signal Line', line=dict(color='purple', width=1)))

# Configure layout
fig.update_layout(
    title="Overlay of Technical Indicators on NVDA Close Price",
    xaxis_title="Date",
    yaxis_title="Price",
    template="plotly_dark",
    plot_bgcolor='black',
    paper_bgcolor='black',
    font=dict(color="white"),
    width=800,  # Width of the slide, adjust as needed
    height=600   # Height of the slide, adjust as needed
)

fig.show()