In [59]:
# Import
import pandas as pd
import numpy as np
import hvplot.pandas
import lib
from pathlib import Path

from pandas.tseries.offsets import DateOffset


# Import LogisticRegression from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Raise pandas' display limits so large DataFrames can be reviewed in full
for option_name, option_value in (
    ("display.max_rows", 2000),
    ("display.max_columns", 2000),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

In [60]:
# Load the S&P 500 dataset into a DataFrame indexed by the parsed "Date" column.
# `infer_datetime_format` is intentionally not passed: pandas 2.0 deprecated it
# (a strict version of that inference is now the default behaviour), so
# supplying it only produces a deprecation warning.
trading_df = pd.read_csv(
    Path("../data/SP500_Data.csv"),
    index_col="Date",
    parse_dates=True,
)
# Review the DataFrame
trading_df.head()

Unnamed: 0_level_0,Price,Open,High,Low,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2002-07-03,954.0,948.1,954.3,934.9,0.62%
2002-07-05,989.0,965.8,989.1,954.0,3.67%
2002-07-08,977.0,989.0,993.6,972.9,-1.21%
2002-07-09,952.8,977.0,979.6,951.7,-2.48%
2002-07-10,920.5,952.8,956.3,920.3,-3.39%


In [61]:
# Calculate the daily returns from the closing prices ("Price") with pct_change,
# then drop every row that contains a NaN (at minimum the first row, which has
# no previous close to compare against).
# The guard keeps this cell idempotent: without it, each re-run recomputes
# pct_change() on the already-trimmed frame, produces a fresh leading NaN and
# silently discards one more trading day.
if "actual_returns" not in trading_df.columns:
    trading_df["actual_returns"] = trading_df["Price"].pct_change()
    trading_df = trading_df.dropna()

In [62]:
# Read the long-term training features from the data folder into a DataFrame
# and use the parsed "Date" column as the DatetimeIndex.
# `infer_datetime_format` is omitted because pandas 2.0 deprecated it; passing
# it only raises a deprecation warning.
X_train = pd.read_csv(
    Path("../data/X_long_train.csv"),
    index_col="Date",
    parse_dates=True,
)

# Review the DataFrame
X_train.head()

Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-08-09,878.241667,890.088462
2002-08-12,883.666667,886.811538
2002-08-13,886.283333,883.242308
2002-08-14,888.0,881.965385
2002-08-15,890.283333,882.338462


In [63]:
# Read the long-term testing features from the data folder into a DataFrame
# and use the parsed "Date" column as the DatetimeIndex.
# `infer_datetime_format` is omitted because pandas 2.0 deprecated it; passing
# it only raises a deprecation warning.
X_test = pd.read_csv(
    Path("../data/X_long_test.csv"),
    index_col="Date",
    parse_dates=True,
)

# Review the DataFrame
X_test.head()


Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-08-09,1273.070833,1263.294615
2006-08-10,1273.315,1263.329231
2006-08-11,1273.176667,1263.046923
2006-08-14,1273.594167,1263.151923
2006-08-15,1274.18,1263.853462


In [64]:
# Read the long-term training signals from the data folder into a DataFrame
# and use the parsed "Date" column as the DatetimeIndex.
# `infer_datetime_format` is omitted because pandas 2.0 deprecated it; passing
# it only raises a deprecation warning.
y_train = pd.read_csv(
    Path("../data/y_long_train.csv"),
    index_col="Date",
    parse_dates=True,
)

# Review the DataFrame
y_train.head()

Unnamed: 0_level_0,signal
Date,Unnamed: 1_level_1
2002-08-09,1.0
2002-08-12,-1.0
2002-08-13,-1.0
2002-08-14,1.0
2002-08-15,1.0


In [65]:
# Read the long-term testing signals from the data folder into a DataFrame
# and use the parsed "Date" column as the DatetimeIndex.
# `infer_datetime_format` is omitted because pandas 2.0 deprecated it; passing
# it only raises a deprecation warning.
y_test = pd.read_csv(
    Path("../data/y_long_test.csv"),
    index_col="Date",
    parse_dates=True,
)

# Review the DataFrame
y_test.head()

Unnamed: 0_level_0,signal
Date,Unnamed: 1_level_1
2006-08-09,-1.0
2006-08-10,1.0
2006-08-11,-1.0
2006-08-14,1.0
2006-08-15,1.0


In [66]:
# Instantiate the LogisticRegression classifier, capping the solver at 500 iterations
lr_settings = {"max_iter": 500}
logistic_regression_model = LogisticRegression(**lr_settings)

In [67]:
# Standardize the features: the scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [68]:
# Train the classifier on the scaled training features; the label frame is
# flattened to a 1-D array before being handed to fit()
training_labels = y_train.values.ravel()
logistic_regression_model.fit(X_train_scaled, training_labels)

LogisticRegression(max_iter=500)

In [69]:
# Predict the trading signals for the scaled training features
lr_training_signal_predictions = logistic_regression_model.predict(
    X=X_train_scaled
)

# Show the predicted signals
lr_training_signal_predictions

array([1., 1., 1., ..., 1., 1., 1.])

In [70]:
# Generate a classification report comparing the training labels with the
# logistic regression model's predictions.
# zero_division=0 reports precision/F1 as 0.0 for a class the model never
# predicts, instead of emitting an UndefinedMetricWarning for each average.
lr_training_report = classification_report(
    y_train.values.ravel(),
    lr_training_signal_predictions,
    zero_division=0,
)

# Review the classification report
print(lr_training_report)

              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00       465
         1.0       0.54      1.00      0.70       543

    accuracy                           0.54      1008
   macro avg       0.27      0.50      0.35      1008
weighted avg       0.29      0.54      0.38      1008



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [71]:
# Predict the trading signals for the scaled testing features
lr_testing_signal_predictions = logistic_regression_model.predict(
    X=X_test_scaled
)

In [72]:
# Build the classification report for the testing split: actual signals vs.
# the signals predicted by the logistic regression model
lr_testing_report = classification_report(
    y_true=y_test.values.ravel(),
    y_pred=lr_testing_signal_predictions,
)

# Print the testing classification report
print(lr_testing_report)

              precision    recall  f1-score   support

        -1.0       0.48      0.01      0.02      1790
         1.0       0.55      0.99      0.71      2177

    accuracy                           0.55      3967
   macro avg       0.52      0.50      0.36      3967
weighted avg       0.52      0.55      0.40      3967



In [73]:
# Start from the predicted signals, indexed by the testing dates
predictions_df = pd.DataFrame(
    {"predicted_signal": lr_testing_signal_predictions},
    index=X_test.index,
)
# Pull in the realised daily returns (aligned on the date index)
predictions_df["actual_returns"] = trading_df["actual_returns"]
# Strategy return = realised return scaled by the predicted signal
predictions_df["strategy_returns"] = predictions_df["actual_returns"].mul(
    predictions_df["predicted_signal"]
)
# Compound the strategy returns into a cumulative return series
predictions_df["cumulative_Returns"] = (
    predictions_df["strategy_returns"].add(1).cumprod().sub(1)
)

predictions_df.head()

Unnamed: 0_level_0,predicted_signal,actual_returns,strategy_returns,cumulative_Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-08-09,1.0,-0.004349,-0.004349,-0.004349
2006-08-10,1.0,0.004629,0.004629,0.00026
2006-08-11,1.0,-0.003986,-0.003986,-0.003728
2006-08-14,1.0,0.00116,0.00116,-0.002572
2006-08-15,1.0,0.013696,0.013696,0.011089


In [74]:
# Plot the compounded growth of the actual returns against the strategy returns
growth_factors = predictions_df[["actual_returns", "strategy_returns"]].add(1)
growth_factors.cumprod().hvplot()

In [75]:
# Compute the metrics table via lib.performance_metrics, then write both the
# metrics and the per-day predictions/returns to CSV in the data folder
evaluation_df = lib.performance_metrics(predictions_df, "Strategy_LR_L_Term")
for frame, csv_path in (
    (evaluation_df, "../data/Strategy_LR_L_Term_Metrics.csv"),
    (predictions_df, "../data/Strategy_LR_L_Term_Returns.csv"),
):
    frame.to_csv(csv_path)
evaluation_df.head()

Unnamed: 0,Strategy_LR_L_Term
Annualized Return,0.069898
Cumulative Returns,1.170824
Annual Volatility,8.007493
Sharpe Ratio,0.34548
Sortino Ratio,0.462093


In [76]:
# Read the short-term training signals from the data folder into a DataFrame
# and use the parsed "Date" column as the DatetimeIndex.
# `infer_datetime_format` is omitted because pandas 2.0 deprecated it; passing
# it only raises a deprecation warning.
y_train = pd.read_csv(
    Path("../data/y_short_train.csv"),
    index_col="Date",
    parse_dates=True,
)

# Review the DataFrame
y_train.head()

Unnamed: 0_level_0,signal
Date,Unnamed: 1_level_1
2014-05-12,1.0
2014-05-13,1.0
2014-05-14,-1.0
2014-05-15,-1.0
2014-05-16,1.0


In [77]:
# Read the short-term training features from the data folder into a DataFrame
# and use the parsed "Date" column as the DatetimeIndex.
# `infer_datetime_format` is omitted because pandas 2.0 deprecated it; passing
# it only raises a deprecation warning.
X_train = pd.read_csv(
    Path("../data/X_short_train.csv"),
    index_col="Date",
    parse_dates=True,
)

# Review the DataFrame
X_train.head()

Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-05-12,1878.44,1866.558077
2014-05-13,1881.2775,1867.802692
2014-05-14,1882.869167,1869.475385
2014-05-15,1882.245833,1870.201923
2014-05-16,1881.738333,1870.420385


In [78]:
# Read the short-term testing features from the data folder into a DataFrame
# and use the parsed "Date" column as the DatetimeIndex.
# `infer_datetime_format` is omitted because pandas 2.0 deprecated it; passing
# it only raises a deprecation warning.
X_test = pd.read_csv(
    Path("../data/X_short_test.csv"),
    index_col="Date",
    parse_dates=True,
)

# Review the DataFrame
X_test.head()

Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-11,2392.269167,2371.334231
2017-05-12,2392.556667,2372.793846
2017-05-15,2393.685833,2374.518077
2017-05-16,2395.058333,2376.253846
2017-05-17,2392.45,2376.248846


In [79]:
# Read the short-term testing signals from the data folder into a DataFrame
# and use the parsed "Date" column as the DatetimeIndex.
# `infer_datetime_format` is omitted because pandas 2.0 deprecated it; passing
# it only raises a deprecation warning.
y_test = pd.read_csv(
    Path("../data/y_short_test.csv"),
    index_col="Date",
    parse_dates=True,
)

# Review the DataFrame
y_test.head()

Unnamed: 0_level_0,signal
Date,Unnamed: 1_level_1
2017-05-11,-1.0
2017-05-12,-1.0
2017-05-15,1.0
2017-05-16,-1.0
2017-05-17,-1.0


In [80]:

# Re-create the scaler for the short-term data: fit it on the training
# features only, then reuse the fitted scaler for both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = [
    X_scaler.transform(feature_frame) for feature_frame in (X_train, X_test)
]

In [81]:
# Build a fresh LogisticRegression (at most 500 solver iterations) and train it
# on the scaled short-term training features and flattened labels
logistic_regression_model = LogisticRegression(max_iter=500)
short_term_labels = y_train.values.ravel()
logistic_regression_model.fit(X_train_scaled, short_term_labels)

LogisticRegression(max_iter=500)

In [82]:
# Predict the trading signals for the scaled short-term testing features
lr_testing_signal_predictions = logistic_regression_model.predict(
    X=X_test_scaled
)

In [83]:
# Generate a classification report comparing the testing labels with the
# logistic regression model's predictions.
# zero_division=0 reports precision/F1 as 0.0 for a class the model never
# predicts, instead of emitting an UndefinedMetricWarning for each average.
lr_testing_report = classification_report(
    y_test.values.ravel(),
    lr_testing_signal_predictions,
    zero_division=0,
)

# Review the testing classification report
print(lr_testing_report)

              precision    recall  f1-score   support

        -1.0       0.44      1.00      0.61       556
         1.0       0.00      0.00      0.00       704

    accuracy                           0.44      1260
   macro avg       0.22      0.50      0.31      1260
weighted avg       0.19      0.44      0.27      1260



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [84]:
# Start from the predicted signals, indexed by the testing dates
predictions_df = pd.DataFrame(
    {"predicted_signal": lr_testing_signal_predictions},
    index=X_test.index,
)
# Pull in the realised daily returns (aligned on the date index)
predictions_df["actual_returns"] = trading_df["actual_returns"]
# Strategy return = realised return scaled by the predicted signal
predictions_df["strategy_returns"] = predictions_df["actual_returns"].mul(
    predictions_df["predicted_signal"]
)
# Compound the strategy returns into a cumulative return series
predictions_df["cumulative_Returns"] = (
    predictions_df["strategy_returns"].add(1).cumprod().sub(1)
)

predictions_df.head()

Unnamed: 0_level_0,predicted_signal,actual_returns,strategy_returns,cumulative_Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-11,-1.0,-0.002163,0.002163,0.002163
2017-05-12,-1.0,-0.001478,0.001478,0.003644
2017-05-15,-1.0,0.004776,-0.004776,-0.001149
2017-05-16,-1.0,-0.000687,0.000687,-0.000463
2017-05-17,-1.0,-0.018178,0.018178,0.017706


In [85]:
# Plot the compounded growth of the actual returns against the strategy returns
growth_factors = predictions_df[["actual_returns", "strategy_returns"]].add(1)
growth_factors.cumprod().hvplot()

In [86]:
# Compute the metrics table via lib.performance_metrics and write it to CSV
# in the data folder
short_term_metrics_csv = "../data/Strategy_LR_S_Term_Metrics.csv"
evaluation_df = lib.performance_metrics(predictions_df, "Strategy_LR_S_Term")
evaluation_df.to_csv(short_term_metrics_csv)
evaluation_df.head()

Unnamed: 0,Strategy_LR_S_Term
Annualized Return,-0.119385
Cumulative Returns,-0.502544
Annual Volatility,2.867556
Sharpe Ratio,-0.592263
Sortino Ratio,-0.867985


In [87]:
# Write the short-term predictions and returns frame to CSV in the data folder
short_term_returns_csv = "../data/Strategy_LR_S_Term_Returns.csv"
predictions_df.to_csv(short_term_returns_csv)