In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Load data
ticker = "AAPL"
data = yf.download(ticker, start="2010-01-01", end="2022-01-01")
data.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,7.6225,7.660714,7.585,7.643214,6.50528,493729600
2010-01-05,7.664286,7.699643,7.616071,7.656429,6.516527,601904800
2010-01-06,7.656429,7.686786,7.526786,7.534643,6.412872,552160000
2010-01-07,7.5625,7.571429,7.466071,7.520714,6.401017,477131200
2010-01-08,7.510714,7.571429,7.466429,7.570714,6.443572,447610800


In [7]:
# Calculate daily returns
data['Return'] = data['Adj Close'].pct_change()
data = data.dropna()


In [8]:
def generate_signals(data, short_window=50, long_window=200):
    # Create short-term and long-term moving averages
    data['Short_MA'] = data['Adj Close'].rolling(window=short_window).mean()
    data['Long_MA'] = data['Adj Close'].rolling(window=long_window).mean()

    # Generate signals
    data['Signal'] = np.where(data['Short_MA'] > data['Long_MA'], 1, 0)
    data['Signal'] = np.where(data['Short_MA'] < data['Long_MA'], -1, data['Signal'])
    
    # Shift signal to the next day (to avoid lookahead bias)
    data['Signal'] = data['Signal'].shift(1)

    return data


In [9]:
# Generate signals
data = generate_signals(data)

# Create target variable
data['Target'] = np.where(data['Signal'] == 1, 'Buy', np.where(data['Signal'] == -1, 'Sell', 'Hold'))

# Drop unnecessary columns
data = data.drop(['Open', 'High', 'Low', 'Close', 'Volume', 'Signal', 'Short_MA', 'Long_MA'], axis=1)

# Drop rows with missing values
data = data.dropna()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Short_MA'] = data['Adj Close'].rolling(window=short_window).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Long_MA'] = data['Adj Close'].rolling(window=long_window).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Signal'] = np.where(data['Short_MA'] > data['Long_MA'],

In [10]:
# Split data into training and testing sets
X = data.drop('Target', axis=1)
y = data['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Random Forest classifier to training data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [11]:
# Predict on testing data
y_pred = rf.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

         Buy       0.89      0.92      0.91       456
        Hold       1.00      0.98      0.99        47
        Sell       0.58      0.50      0.53       101

    accuracy                           0.85       604
   macro avg       0.82      0.80      0.81       604
weighted avg       0.85      0.85      0.85       604

[[420   0  36]
 [  1  46   0]
 [ 51   0  50]]
