In [None]:
import yfinance as yf
import pandas as pd

In [None]:
# Fetch the full available price history for the S&P 500 index (ticker ^GSPC).
# Chained so that `sp500` is only ever bound to the resulting history frame.
sp500 = yf.Ticker("^GSPC").history(period="max")

In [None]:
# Show the downloaded history frame as the cell output.
sp500

In [None]:
# Line chart: closing price on the y axis, the frame's index (dates) on the x axis.
sp500.plot(kind="line", y="Close", use_index=True)

In [None]:
# Remove the two columns that are never referenced again in this notebook.
# drop(..., errors="ignore") returns a new frame and tolerates the columns
# already being gone, so re-running this cell no longer raises KeyError the
# way a second `del sp500[...]` would.
sp500 = sp500.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

In [None]:
# "Tomorrow" holds the next row's close, aligned to the current row.
# The final row has no following row, so its "Tomorrow" is NaN.
sp500["Tomorrow"] = sp500["Close"].shift(periods=-1)
# "Target" is 1 when the next close is above the current close, otherwise 0.
# NOTE: a comparison against NaN is False, so the final row's Target is 0 even
# though its real outcome is not known.
sp500["Target"] = (sp500["Close"] < sp500["Tomorrow"]).astype(int)

In [None]:
# Keep only rows dated 1990-01-01 or later; .copy() detaches the result from
# the original frame so later column assignments act on an independent object.
sp500 = sp500.loc[slice("1990-01-01", None)].copy()


In [None]:
# A random forest classifier is used because Target is a 0/1 class label.
from sklearn.ensemble import RandomForestClassifier

# Columns fed to the model as inputs.
predictors = ["Close", "Volume", "Open", "High", "Low"]

# Hold out the last 100 rows for testing; every earlier row is training data.
train = sp500.iloc[:-100]
test = sp500.iloc[-100:]

# n_estimators: number of trees in the forest
# min_samples_split: minimum number of samples required to split an internal node
# random_state: seed for the forest's random number generator
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=100,
    random_state=1,
)
model.fit(X=train[predictors], y=train["Target"])

In [None]:
from sklearn.metrics import precision_score

# Predict the held-out rows and index the result like the test frame so the
# predictions line up with test["Target"].
preds = pd.Series(model.predict(test[predictors]), index=test.index)
# Precision of the predictions against the actual targets on the hold-out rows.
precision_score(y_true=test["Target"], y_pred=preds)

In [None]:
# `combined` holds the actual Target and the model's predictions side by side.
# `preds` is an unnamed Series, so it is given the name "Predictions" here;
# without a name its column (and its legend entry in the plot) would be 0.
combined = pd.concat([test["Target"], preds.rename("Predictions")], axis=1)
combined.plot()
# Observation from the original run: the predictions do not track the target closely.


In [None]:
def predict(train, test, predictors, model):
    """Fit ``model`` on ``train`` and return its predictions for ``test``.

    Args:
        train: frame containing the ``predictors`` columns and a "Target" column.
        test: frame with the same columns; its rows are the ones predicted.
        predictors: list of column names used as model inputs.
        model: object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        A frame indexed like ``test`` with two columns: the actual "Target"
        and the model output under "Predictions".
    """
    model.fit(train[predictors], train["Target"])
    predicted = pd.Series(
        model.predict(test[predictors]),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], predicted], axis=1)

In [None]:
def backtest(data, model, predictors, start=2500, step=250):
    """Walk forward through ``data``, refitting and predicting one fold at a time.

    The first fold trains on rows ``[0, start)`` and predicts rows
    ``[start, start + step)``; each later fold extends the training window by
    ``step`` rows and predicts the next ``step`` rows. With daily rows the
    defaults correspond to roughly ten years of training data followed by
    one-year test folds.

    Args:
        data: frame containing the ``predictors`` columns and "Target".
        model: estimator handed to ``predict`` for every fold.
        predictors: list of column names used as model inputs.
        start: number of leading rows used to train the first fold.
        step: number of rows predicted per fold.

    Returns:
        The per-fold frames returned by ``predict``, concatenated in order.

    Raises:
        ValueError: if the settings produce no fold at all (``step`` is not
            positive or ``start`` is not smaller than the number of rows).
    """
    n_rows = data.shape[0]
    # Without this guard an empty fold list reaches pd.concat, which fails
    # with the much less helpful "No objects to concatenate".
    if step <= 0 or start >= n_rows:
        raise ValueError(
            "backtest produced no folds: need step > 0 and start < len(data) "
            f"(got start={start}, step={step}, len(data)={n_rows})"
        )

    folds = []
    for split in range(start, n_rows, step):
        # Train on everything before the split point ...
        train = data.iloc[:split].copy()
        # ... and predict the next `step` rows (fewer in the final fold).
        test = data.iloc[split:split + step].copy()
        folds.append(predict(train, test, predictors, model))
    return pd.concat(folds)

In [None]:
# Walk-forward backtest over the whole frame using the `predictors` columns.
predictions = backtest(data=sp500, model=model, predictors=predictors)

In [None]:
# Bar chart of how many times each class was predicted across the backtest.
predictions["Predictions"].value_counts().plot(kind="bar")

In [None]:
# Precision of the backtest predictions against the actual targets.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])

In [None]:
# Rolling window lengths, in rows, used to build the engineered features.
horizons = [2, 5, 60, 250, 1000]
new_predictors = []

for horizon in horizons:
    # Ratio of the current close to its rolling mean over the last `horizon`
    # rows (current row included). Only "Close" is rolled: rolling the whole
    # frame computed, and then discarded, a mean for every other column.
    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / sp500["Close"].rolling(horizon).mean()

    # Sum of Target over the `horizon` rows before the current one, i.e. the
    # number of those rows labelled 1. shift(1) keeps the current row's own
    # Target out of its feature.
    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

In [None]:
# Drop every row that has a missing value in any column except "Tomorrow";
# a missing "Tomorrow" alone does not cause a row to be removed.
sp500 = sp500.dropna(
    subset=[column for column in sp500.columns if column != "Tomorrow"]
)
sp500

In [None]:
# Second model: 200 trees and a minimum of 50 samples to split a node,
# with the same fixed seed as before.
model = RandomForestClassifier(
    random_state=1,
    n_estimators=200,
    min_samples_split=50,
)

In [None]:
def predict(train, test, predictors, model, threshold=0.7):
    """Fit ``model`` and label a test row 1 only when its probability is high enough.

    This replaces the earlier ``predict`` defined in this notebook: instead of
    the model's own class decision it uses column 1 of ``predict_proba`` and
    a configurable cutoff.

    Args:
        train: frame containing the ``predictors`` columns and a "Target" column.
        test: frame with the same columns; its rows are the ones predicted.
        predictors: list of column names used as model inputs.
        model: object exposing ``fit(X, y)`` and ``predict_proba(X)``.
        threshold: minimum value in column 1 of ``predict_proba`` for a row to
            be labelled 1. Defaults to 0.7, the cutoff previously hard-coded.

    Returns:
        A frame indexed like ``test`` with the actual "Target" and a float
        "Predictions" column holding 1.0 or 0.0.
    """
    model.fit(train[predictors], train["Target"])
    # Column 1 is the probability of the second class the model was fitted on
    # (class 1 when Target contains both 0 and 1).
    up_probability = model.predict_proba(test[predictors])[:, 1]
    # One vectorized comparison replaces the two in-place overwrites; the
    # result is kept as float so the output dtype is unchanged.
    labels = (up_probability >= threshold).astype(float)
    preds = pd.Series(labels, index=test.index, name="Predictions")
    return pd.concat([test["Target"], preds], axis=1)

In [None]:
# Re-run the backtest using only the engineered ratio/trend columns as inputs.
predictions = backtest(data=sp500, model=model, predictors=new_predictors)

In [None]:
# Count how many times each value appears in the "Predictions" column.
predictions["Predictions"].value_counts()

In [None]:
# Precision of the latest backtest predictions against the actual targets.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])