In [122]:
# Necessary imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

In [164]:
class DataProcessor():
    def __init__(self,
                 data_path='data-SPY-20240617/yfinance_SPY.csv',
                 freq='D',
                 percent_train=0.7,
                 ):
        # Load data from data_path
        data = pd.read_csv(data_path, index_col=0, parse_dates=True)

        # Set the symbol for the data, default is SPY
        self.symbol = 'SPY'
        if 'symbol' in data.columns:
            self.symbol = data['symbol'].iloc[0]
            data.drop('symbol', axis=1, inplace=True)

        # Resample at the desired frequency, default is daily (only support daily, weekly, and monthly data for now)
        self.freq = freq
        if self.freq == 'D':
            self.data = data.copy()
        elif self.freq == 'M':
            self.data = data.resample('ME').last().copy()
        elif self.freq == 'W':
            self.data = data.resample('W').last().copy()

        self.cutoff = percent_train
    
    def generate_targets(self):
        m, _ = self.data.shape
        for i in [5, 10, 15]:
            self.data[f"target_{i}"] = self.data.close.shift(-i)
            self.data[f"target_{i}_label"] = self.data[f"target_{i}"] > self.data.close
            self.data[f"target_{i}_label"] = self.data[f"target_{i}_label"].astype(int)
            self.data[f"target_{i}_label"] = self.data[f"target_{i}_label"].replace({0: -1})
        if not os.path.exists('train'):
            os.makedirs('train')
        if not os.path.exists('test'):
            os.makedirs('test')
        train_test_cutoff = int(self.cutoff * m)
        train = self.data.iloc[:train_test_cutoff, :]
        gap = 15
        test = self.data.iloc[train_test_cutoff+gap:, :]

        train.to_csv(f"train/{self.freq}_train.csv")
        test.to_csv(f"test/{self.freq}_test.csv")


In [165]:
ohlc = DataProcessor()
ohlc.generate_targets()

In [166]:
df_train =pd.read_csv("train/D_train.csv")
df_test =pd.read_csv("test/D_test.csv")

columns  =["Date", "close", "target_5_label", "target_10_label", "target_15_label"]

df_train = df_train[columns]
df_test = df_test[columns]
df_train["Date"] = pd.to_datetime(df_train["Date"])
df_test["Date"] = pd.to_datetime(df_test["Date"])
df_train.sort_values("Date", inplace=True)
df_test.sort_values("Date", inplace=True)
df_train.set_index("Date", inplace=True)
df_test.set_index("Date", inplace=True)

In [167]:
df_train.dtypes

close              float64
target_5_label       int64
target_10_label      int64
target_15_label      int64
dtype: object

In [168]:
df_train.head()

Unnamed: 0_level_0,close,target_5_label,target_10_label,target_15_label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1993-04-26 00:00:00-04:00,24.579847,1,1,1
1993-04-27 00:00:00-04:00,24.845304,1,1,1
1993-04-28 00:00:00-04:00,24.792213,1,1,1
1993-04-29 00:00:00-04:00,24.898392,1,1,1
1993-04-30 00:00:00-04:00,24.93379,1,-1,1


In [195]:
# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support

x_cols = ["close"]
y_cols = ["target_5_label"]
ndecimals = 5

def calculate_metrics(y_true, y_pred):
    # Calculate the accuracy
    accuracy = accuracy_score(y_true, y_pred)
    # Calculate precision, recall, and f1-score
    p_r_f1 = precision_recall_fscore_support(y_true, y_pred)
    # Store precision, recall, and f1-score in a dictionary
    metrics_dict = {
        'precision': np.around(p_r_f1[0], decimals=ndecimals),
        'recall': np.around(p_r_f1[1], decimals=ndecimals),
        'f1-score': np.around(p_r_f1[2], decimals=ndecimals),
        'accuracy': np.round(accuracy, decimals=ndecimals)
    }

    return metrics_dict

def fit_infer_logistic_regression(df_train, df_test, x_cols, y_cols):
    # Split the data into X and y
    X_train, y_train = df_train[x_cols], df_train[y_cols]
    X_test, y_test = df_test[x_cols], df_test[y_cols]
    # Initialize the Logistic Regression model
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)

    return calculate_metrics(y_test, y_pred)

def baseline_majority_model(df_train, df_test, x_cols, y_cols):
    # Get the majority class
    majority_class = df_train[y_cols].value_counts().idxmax()
    # Predict the majority class
    y_pred = np.full(df_test[y_cols].shape, majority_class)

    return calculate_metrics(df_test[y_cols], y_pred)

def baseline_model(df_train, df_test, x_cols, y_cols):
   
    y_pred = np.full(df_test[y_cols].shape, 1)

    return calculate_metrics(df_test[y_cols], y_pred)


In [196]:
def fit_infer_sequence_logistic_regression(df_train, df_test, x_col , y_cols, n=5):

    for i in range(1, n+1):
        df_train[f"close_-{i}"] = df_train[x_col].shift(i)
        df_test[f"close_-{i}"] = df_test[x_col].shift(i)
    
    x_cols = [f"close_-{i}" for i in range(1,n+1)] + ["close"]

    df_test.dropna(inplace=True)
    df_train.dropna(inplace=True)

    X_train, y_train = df_train[x_cols], df_train[y_cols]
    X_test, y_test = df_test[x_cols], df_test[y_cols]
    # Initialize the Logistic Regression model
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)

    return calculate_metrics(y_test, y_pred)


def format_df(df):
    df[['Precision_-1', 'Precision_+1']] = df['precision'].apply(pd.Series)
    df[['Recall_-1', 'Recall_+1']] = df['recall'].apply(pd.Series)
    df[['F1-Score_-1', 'F1-Score_+1']] = df['f1-score'].apply(pd.Series)
    df.drop(['precision', 'recall', 'f1-score'], axis=1, inplace=True)
    return df


In [None]:
lr_metrics = {}

for y_col in ["target_5_label", "target_10_label", "target_15_label"]:
    lr_metrics[y_col] = fit_infer_logistic_regression(df_train, df_test, x_cols, [y_col])

lr_metrics = pd.DataFrame(lr_metrics)
format_df(lr_metrics.T)
# fit_infer_logistic_regression(df_train, df_test, x_cols, ["target_5_label"])


Unnamed: 0,precision,recall,f1-score,accuracy
target_5_label,"[0.38434, 0.61614]","[0.4755, 0.525]","[0.42509, 0.56693]",0.50599
target_10_label,"[0.37625, 0.66542]","[0.45752, 0.58719]","[0.41292, 0.62386]",0.54149
target_15_label,"[0.33679, 0.69009]","[0.67755, 0.34987]","[0.44993, 0.46433]",0.45723


In [202]:
majority_class_metrics = {}

for y_col in ["target_5_label", "target_10_label", "target_15_label"]:
    majority_class_metrics[y_col] = baseline_majority_model(df_train, df_test, x_cols, [y_col])

majority_class_metrics = pd.DataFrame(majority_class_metrics)
format_df(majority_class_metrics.T)

Unnamed: 0,accuracy,Precision_-1,Precision_+1,Recall_-1,Recall_+1,F1-Score_-1,F1-Score_+1
target_5_label,0.61633,0.0,0.61633,0.0,1.0,0.0,0.76263
target_10_label,0.65013,0.0,0.65013,0.0,1.0,0.0,0.78798
target_15_label,0.6734,0.0,0.6734,0.0,1.0,0.0,0.80483


In [203]:
baseline_metrics = {}

for y_col in ["target_5_label", "target_10_label", "target_15_label"]:
    baseline_metrics[y_col] = baseline_model(df_train, df_test, x_cols, [y_col])

baseline_metrics = pd.DataFrame(majority_class_metrics)
format_df(baseline_metrics.T)

Unnamed: 0,accuracy,Precision_-1,Precision_+1,Recall_-1,Recall_+1,F1-Score_-1,F1-Score_+1
target_5_label,0.61633,0.0,0.61633,0.0,1.0,0.0,0.76263
target_10_label,0.65013,0.0,0.65013,0.0,1.0,0.0,0.78798
target_15_label,0.6734,0.0,0.6734,0.0,1.0,0.0,0.80483


In [204]:
lr_metrics = {}

for y_col in ["target_5_label", "target_10_label", "target_15_label"]:
    lr_metrics[y_col] = fit_infer_sequence_logistic_regression(df_train, df_test, "close", [y_col], n=10)

lr_metrics = pd.DataFrame(lr_metrics)
format_df(lr_metrics.T)

Unnamed: 0,accuracy,Precision_-1,Precision_+1,Recall_-1,Recall_+1,F1-Score_-1,F1-Score_+1
target_5_label,0.5097,0.379,0.61278,0.43563,0.55579,0.40535,0.5829
target_10_label,0.562,0.36962,0.66553,0.37292,0.66237,0.37127,0.66395
target_15_label,0.45907,0.32789,0.67938,0.63201,0.37574,0.43178,0.48387


In [192]:

format_df(lr_metrics.T)

Unnamed: 0,accuracy,Precision_-1,Precision_+1,Recall_-1,Recall_+1,F1-Score_-1,F1-Score_+1
target_5_label,0.50945,0.37359,0.6077,0.40782,0.57292,0.38996,0.5898
target_10_label,0.55565,0.36658,0.65567,0.36029,0.66178,0.36341,0.65871
target_15_label,0.45147,0.32558,0.66903,0.62963,0.36469,0.42922,0.47206
