In [1]:
# ! Followed tutorial("https://www.youtube.com/watch?v=TF2Nx_ifmrU")

import yfinance as yf
import os
import pandas as pd

In [2]:
# * Configuration
# Primary ticker, plus the related tickers whose price/volume fields are
# added to the predictor list as "<TICKER>_<field>" column names.
ticker_name = "TSLA"
related_ticker_names = [
    "SPY", "QQQ", "ITOT", "VTI",
    "AMZN", "AAPL", "AMC", "NVDA", "MSFT", "ROKU", "GOOGL", "NFLX",
    "DOGE-USD", "BTC-USD", "ETH-USD",
]
related_ticker_predictor_fields = ["open", "high", "low", "close", "volume"]
# Slimmer alternative: related_ticker_predictor_fields = ["close", "volume"]

# Columns of the primary ticker used as predictors.
predictors = ["close", "volume", "open", "high", "low", "edit_count", "sentiment", "neg_sentiment"]
# Slimmer alternative: predictors = ["close", "volume", "edit_count", "sentiment", "neg_sentiment"]

# Append one predictor per (related ticker, field) pair, ticker-major order.
predictors += [
    f"{related_name}_{field_name}"
    for related_name in related_ticker_names
    for field_name in related_ticker_predictor_fields
]

period = '2y'
backtest_start = 7  # first row index at which the backtest starts predicting
backtest_step = 1   # number of rows predicted per backtest iteration
# Number of trailing rows held out as the test set (aiming for an 80/20 split).
# NOTE(review): 73 is 20% of 365; a 2y frame likely holds more rows than that,
# so the held-out share may be below 20% - TODO confirm.
data_split_point = 73

In [3]:
# * Load data
# The first CSV column becomes the date-parsed index; any row containing a
# missing value is dropped.
historical_data = (
    pd.read_csv("historical_data.csv", index_col=0, parse_dates=True)
    .dropna()
)

In [4]:
# * Split data into train/test sets
# The last `data_split_point` rows are held out for testing; everything before
# them is used for training. The split row is computed from the front of the
# frame (and clamped at 0) so that a split point of 0 yields an empty test set
# instead of an empty training set, which negative slicing (`[:-0]`) would give.
split_row = max(len(historical_data) - data_split_point, 0)
train_set = historical_data.iloc[:split_row]  # all rows before the hold-out window
test_set = historical_data.iloc[split_row:]   # last `data_split_point` rows

In [5]:
# Strip any timezone from the index so it holds naive timestamps.
historical_data.index = pd.to_datetime(
    historical_data.index.tz_localize(None)
)
# historical_data.columns = [c.lower() for c in historical_data.columns]

In [6]:
# * Build prediction model
# Baseline model - Random Forest
from sklearn.ensemble import RandomForestClassifier

# n_estimators: the number of individual estimators (trees) to train.
# min_samples_split: the minimum number of samples a node must hold before an
# individual decision tree will split it.
#     - a higher number lowers performance on the training set but overfits less
#     - a lower number raises performance on the training set but overfits more
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=50,
    random_state=1,
)

model.fit(train_set[predictors], train_set["target"])

In [7]:
from sklearn.metrics import precision_score

# Predict the held-out rows with the fitted baseline, keep the test index so the
# predictions line up with the targets, and show the precision as cell output.
predictions = pd.Series(model.predict(test_set[predictors]), index=test_set.index)
precision_score(test_set["target"], predictions)

0.5

In [8]:
# * Backtest historical dataframe
# A single train/test split gives a rough error estimate; this helper performs
# one fit-and-predict step so it can be repeated over many windows.
def predict(train_set, test_set, predictors, model):
    """Fit `model` on `train_set` and predict `test_set`.

    Args:
        train_set: frame holding the predictor columns and a "target" column.
        test_set: frame with the same columns, to be predicted.
        predictors: names of the columns fed to the model.
        model: object exposing `fit(X, y)` and `predict(X)`; it is refit in place.

    Returns:
        A frame indexed like `test_set` with two columns: the true "target"
        and the model output under "predictions".
    """
    feature_cols = predictors
    model.fit(train_set[feature_cols], train_set["target"])
    predicted = pd.Series(
        model.predict(test_set[feature_cols]),
        index=test_set.index,
        name="predictions",
    )
    return pd.concat([test_set["target"], predicted], axis=1)

In [9]:
# Walk-forward backtest: the first `start` rows are only ever used for training,
# and new predictions are generated every `step` rows after that.
def backtest(data, model, predictors, start=backtest_start, step=backtest_step):
    """Run an expanding-window backtest and collect every out-of-sample prediction.

    For each position `i` in `range(start, len(data), step)` the model is refit
    (via `predict`) on rows `[0, i)` and used to predict rows `[i, i + step)`.

    Args:
        data: frame holding the predictor columns and a "target" column.
        model: estimator passed through to `predict`; it is refit on every window.
        predictors: names of the columns fed to the model.
        start: number of leading rows reserved for the first training window.
        step: number of rows predicted per iteration.

    Returns:
        The per-window frames returned by `predict`, concatenated in order
        (columns "target" and "predictions").

    Raises:
        ValueError: if `start` or `step` is smaller than 1, or if `data` has no
            rows beyond `start` (nothing would be predicted).
        Exception: any error raised while fitting or predicting propagates
            unchanged, so the failing window shows up in the traceback instead
            of the function silently returning None.
    """
    n_rows = data.shape[0]
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step}")
    if start < 1:
        raise ValueError(f"start must leave at least one training row, got {start}")
    if start >= n_rows:
        raise ValueError(
            f"backtest needs more than start={start} rows, but data has {n_rows}"
        )

    all_predictions = []
    for i in range(start, n_rows, step):
        # Train strictly on rows before i; predict the next `step` rows.
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        all_predictions.append(predict(train, test, predictors, model))

    return pd.concat(all_predictions)


In [10]:
# Upgrade the model to an XGBoost classifier
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=200,
    learning_rate=.1,
    random_state=1,
)
# Backtest the boosted model on the base predictors only.
predictions = backtest(historical_data, model, predictors)

# Precision over all backtested rows, shown as the cell output.
precision_score(predictions["target"], predictions["predictions"])

0.5444839857651246

In [11]:
# * Compute calculated predictors
def compute_rolling(historical_data, horizons=(2, 7, 60, 365), base_predictors=None):
    """Add rolling-window features to `historical_data` and list all predictors.

    For every horizon `h` (a window length in rows) three columns are added:

    * ``close_ratio_h`` - "close" divided by its rolling mean over the window.
    * ``edit_h``        - rolling mean of "edit_count" over the window.
    * ``trend_h``       - rolling mean of "target" over the *preceding* rows
      only (``closed="left"``), so the current row's own target is excluded.

    ``min_periods=1`` lets windows at the start of the frame be computed from
    however many rows exist, which avoids NaNs there; the only exception is
    the first ``trend_h`` value, whose left-closed window is empty.

    Only the three source columns are rolled, so other columns in the frame
    (including non-numeric ones) are neither aggregated nor required.

    Args:
        historical_data: frame with "close", "edit_count" and "target" columns.
            It is modified in place; pass a copy to keep the original intact.
        horizons: window lengths, in rows, to compute features for.
        base_predictors: predictor names the new columns are appended to.
            Defaults to the notebook-level `predictors` list. The list passed
            in (or the global) is copied, never mutated.

    Returns:
        A tuple ``(historical_data, new_predictors)``: the same frame object
        with the added columns, and the extended list of predictor names.
    """
    if base_predictors is None:
        base_predictors = predictors  # notebook-level configuration list
    new_predictors = list(base_predictors)

    # Grab the source columns once; they do not change while features are added.
    close = historical_data["close"]
    edit_count = historical_data["edit_count"]
    target = historical_data["target"]

    for horizon in horizons:
        ratio_column = f"close_ratio_{horizon}"
        edit_column = f"edit_{horizon}"
        trend_column = f"trend_{horizon}"

        historical_data[ratio_column] = close / close.rolling(horizon, min_periods=1).mean()
        historical_data[edit_column] = edit_count.rolling(horizon, min_periods=1).mean()
        # closed="left" drops the current row from the window.
        historical_data[trend_column] = target.rolling(
            horizon, closed="left", min_periods=1
        ).mean()

        new_predictors += [ratio_column, trend_column, edit_column]
    return historical_data, new_predictors

In [12]:
# Add the rolling features to a copy of the frame, then rebind `historical_data`
# to the enriched copy; `new_predictors` lists the base plus the new columns.
historical_data, new_predictors = compute_rolling(historical_data.copy())

In [13]:
# Inspect the frame returned by compute_rolling (rendered as the cell output).
historical_data

Unnamed: 0,open,high,low,close,volume,edit_count,sentiment,neg_sentiment,SPY_open,SPY_high,...,trend_2,close_ratio_7,edit_7,trend_7,close_ratio_60,edit_60,trend_60,close_ratio_365,edit_365,trend_365
2021-08-02,233.333328,242.313339,232.800003,236.556671,100847400,4.366667,-0.010681,0.044865,427.191076,427.763455,...,,1.000000,4.366667,,1.000000,4.366667,,1.000000,4.366667,
2021-08-03,239.666672,240.883331,233.669998,236.580002,64860900,4.333333,-0.010681,0.044865,425.347829,428.103021,...,1.0,1.000049,4.350000,1.000000,1.000049,4.350000,1.000000,1.000049,4.350000,1.000000
2021-08-04,237.000000,241.633331,236.309998,236.973328,51007800,4.333333,-0.010681,0.044865,426.647804,427.947787,...,1.0,1.001141,4.344444,1.000000,1.001141,4.344444,1.000000,1.001141,4.344444,1.000000
2021-08-05,238.666672,240.316666,237.136673,238.210007,38758800,4.300000,-0.010681,0.044865,427.074672,428.656004,...,1.0,1.004766,4.333333,1.000000,1.004766,4.333333,1.000000,1.004766,4.333333,1.000000
2021-08-06,237.300003,238.776672,232.543335,233.033340,46869000,4.300000,-0.010681,0.044865,428.898517,429.713430,...,0.5,0.986298,4.326667,0.750000,0.986298,4.326667,0.750000,0.986298,4.326667,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-25,272.380005,272.899994,265.000000,265.279999,112757300,0.933333,-0.006656,0.050000,453.920013,456.739990,...,0.5,0.961040,0.985714,0.428571,1.168980,1.239444,0.666667,1.137745,2.120822,0.531507
2023-07-26,263.250000,268.040009,261.750000,264.350006,95856200,0.966667,-0.006656,0.050000,454.470001,456.989990,...,0.0,0.970748,0.961905,0.285714,1.156386,1.243333,0.650000,1.134374,2.116530,0.528767
2023-07-27,268.309998,269.130005,255.300003,255.710007,103697300,1.133333,0.004056,0.050000,459.019989,459.440002,...,0.0,0.957931,0.976190,0.142857,1.110986,1.243889,0.650000,1.097889,2.112603,0.528767
2023-07-28,259.859985,267.250000,258.230011,266.440002,111446000,1.166667,0.004056,0.050000,455.880005,457.779999,...,0.5,1.011563,0.995238,0.285714,1.148777,1.245000,0.666667,1.144231,2.108584,0.531507


In [14]:
# * Backtest & Evaluate
# Re-run the backtest with the current `model`, this time feeding it the
# extended predictor list that includes the rolling features.
predictions = backtest(historical_data, model, new_predictors)

In [15]:
# Precision of the backtested predictions against the true targets (cell output).
precision_score(predictions["target"], predictions["predictions"])

0.5457627118644067

In [16]:
# Inspect the backtest result: true "target" next to the model's "predictions".
predictions

Unnamed: 0,target,predictions
2021-08-11,1,1
2021-08-12,0,1
2021-08-13,0,1
2021-08-16,0,1
2021-08-17,1,0
...,...,...
2023-07-25,0,0
2023-07-26,0,0
2023-07-27,1,0
2023-07-28,1,0


In [17]:
# # * Update model to transformer
# from transformers import TimeSeriesTransformerForPrediction

# transformer_model = TimeSeriesTransformerForPrediction.from_pretrained("huggingface/time-series-transformer-tourism-monthly")

# # Train model
# outputs = transformer_model(
#     past_values=historical_data["close"],
#     past_time_features=batch["past_time_features"],
#     past_observed_mask=batch["past_observed_mask"],
#     static_categorical_features=batch["static_categorical_features"],
#     static_real_features=batch["static_real_features"],
#     future_values=batch["future_values"],
#     future_time_features=batch["future_time_features"],
# )

# loss = outputs.loss
# loss.backward()

# # during inference, one only provides past values
# # as well as possible additional features
# # the model autoregressively generates future values
# outputs = model.generate(
#     past_values=batch["past_values"],
#     past_time_features=batch["past_time_features"],
#     past_observed_mask=batch["past_observed_mask"],
#     static_categorical_features=batch["static_categorical_features"],
#     static_real_features=batch["static_real_features"],
#     future_time_features=batch["future_time_features"],
# )

# mean_prediction = outputs.sequences.mean(dim=1)

# model = XGBClassifier(random_state=1, learning_rate=.1, n_estimators=200)
# predictions = backtest(historical_data, model, predictors)

# precision_score(predictions["target"], predictions["predictions"])

In [18]:
# TODO: [ ] add predictor/s for sentiment from tweets, google trends, stock news
# TODO: [x] add predictor/s for related stock tickers (e.g. SPY, TSLA, QQQ, AAPL)
# TODO: [x] add predictor/s for cryptocurrency tickers (e.g. DOGE-USD, BTC-USD, ETH-USD)
# TODO: [ ] add transformer model/s
# TODO: [ ] add prediction for valuation of stock options during intra-period contracts