# S&P500 historical 20year data processing 
### In this file we are performing data preparation for training and test datasets for algo trading models.

In [116]:
# Imports
import pandas as pd
from pathlib import Path

from pandas.tseries.offsets import DateOffset

In [117]:
# Import the sp500 dataset into a Pandas Dataframe
trading_df = pd.read_csv(
    Path("../data/SP500_Data.csv"), 
    index_col="Date",
    infer_datetime_format=True, 
    parse_dates=True
    
)
# Review the DataFrame
trading_df.head()

Unnamed: 0_level_0,Price,Open,High,Low,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2002-07-03,954.0,948.1,954.3,934.9,0.62%
2002-07-05,989.0,965.8,989.1,954.0,3.67%
2002-07-08,977.0,989.0,993.6,972.9,-1.21%
2002-07-09,952.8,977.0,979.6,951.7,-2.48%
2002-07-10,920.5,952.8,956.3,920.3,-3.39%


In [118]:
trading_df.shape

(5000, 5)

In [119]:
# Calculate the daily returns using the closing prices and the pct_change function
trading_df["actual_returns"] = trading_df["Price"].pct_change()

# Drop all NaN values from the DataFrame
trading_df = trading_df.dropna()

# Review the DataFrame
display(trading_df.head())
display(trading_df.tail())

Unnamed: 0_level_0,Price,Open,High,Low,Change %,actual_returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2002-07-05,989.0,965.8,989.1,954.0,3.67%,0.036688
2002-07-08,977.0,989.0,993.6,972.9,-1.21%,-0.012133
2002-07-09,952.8,977.0,979.6,951.7,-2.48%,-0.02477
2002-07-10,920.5,952.8,956.3,920.3,-3.39%,-0.0339
2002-07-11,927.4,920.5,929.2,900.9,0.75%,0.007496


Unnamed: 0_level_0,Price,Open,High,Low,Change %,actual_returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-05,4152.38,4255.5,4256.39,4105.51,-3.44%,-0.034368
2022-05-06,4123.34,4128.17,4157.69,4067.91,-0.70%,-0.006994
2022-05-09,3991.24,4081.27,4081.27,3975.48,-3.20%,-0.032037
2022-05-10,4001.05,4035.18,4068.82,3958.17,0.25%,0.002458
2022-05-11,3935.18,3990.08,4049.09,3928.82,-1.65%,-0.016463


In [120]:
# Define a window size of 4
short_window = 4

# Create a simple moving average (SMA) using the short_window and assign this to a new columns called sma_fast
trading_df["sma_fast"] = trading_df["Price"].rolling(window=short_window).mean()

In [121]:
# Define a window size of 50
long_window = 50

# Create a simple moving average (SMA) using the long_window and assign this to a new columns called sma_slow
trading_df["sma_slow"] = trading_df["Price"].rolling(window=long_window).mean()

In [122]:
trading_df.head()

Unnamed: 0_level_0,Price,Open,High,Low,Change %,actual_returns,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2002-07-05,989.0,965.8,989.1,954.0,3.67%,0.036688,,
2002-07-08,977.0,989.0,993.6,972.9,-1.21%,-0.012133,,
2002-07-09,952.8,977.0,979.6,951.7,-2.48%,-0.02477,,
2002-07-10,920.5,952.8,956.3,920.3,-3.39%,-0.0339,959.825,
2002-07-11,927.4,920.5,929.2,900.9,0.75%,0.007496,944.425,


In [123]:
# Drop the NaNs using dropna()
trading_df = trading_df.dropna()

In [124]:
# Assign a copy of the sma_fast and sma_slow columns to a new DataFrame called X
X = trading_df[["sma_fast", "sma_slow"]].copy()

# Display sample data
display(X.head())
display(X.tail())

Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-09-13,898.95,902.556
2002-09-16,894.325,900.598
2002-09-17,885.325,898.528
2002-09-18,880.975,896.862
2002-09-19,869.35,895.318


Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-05-05,4195.8525,4373.1098
2022-05-06,4187.8425,4369.8026
2022-05-09,4141.7825,4361.935
2022-05-10,4067.0025,4354.4802
2022-05-11,4012.7025,4347.0586


In [125]:
## Create the target set

# Create a new column in the trading_df called signal setting its value to zero.
trading_df["signal"] = 0.0

# Create the signal to buy
trading_df.loc[(trading_df["actual_returns"] >= 0), "signal"] = 1

# Create the signal to sell
trading_df.loc[(trading_df["actual_returns"] < 0), "signal"] = -1

# Copy the new signal column to a new Series called y.
y = trading_df["signal"].copy()

## Creating the Training Datasets

In [126]:
# Select the start of the training period
training_begin = X.index.min()

# Display the training begin date
print(training_begin)

# Select the ending period for the training data with an offset of 4 years
training_end = X.index.min() + DateOffset(years=4)

# Display the training end date
print(training_end)

2002-09-13 00:00:00
2006-09-13 00:00:00


In [127]:
# Generate the X_train and y_train DataFrames
X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# Display sample data
display(X_train.head())
display(X_train.tail())

Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-09-13,898.95,902.556
2002-09-16,894.325,900.598
2002-09-17,885.325,898.528
2002-09-18,880.975,896.862
2002-09-19,869.35,895.318


Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-09-07,1304.635,1276.3348
2006-09-08,1301.6125,1277.3932
2006-09-11,1298.185,1277.9266
2006-09-12,1301.37,1278.7826
2006-09-13,1307.3825,1279.5402


In [128]:
display(y_train.head())
display(y_train.tail())

Date
2002-09-13    1.0
2002-09-16    1.0
2002-09-17   -1.0
2002-09-18   -1.0
2002-09-19   -1.0
Name: signal, dtype: float64

Date
2006-09-07   -1.0
2006-09-08    1.0
2006-09-11    1.0
2006-09-12    1.0
2006-09-13    1.0
Name: signal, dtype: float64

## Creating the long term testing Datasets

In [129]:
# Generate the X_long_test and y_long_test DataFrames

X_long_test = X.loc[training_end:]
y_long_test = y.loc[training_end:]

# Display sample data
# Display sample data
display(X_long_test.head())
display(X_long_test.tail())

Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-09-13,1307.3825,1279.5402
2006-09-14,1311.7225,1280.4476
2006-09-15,1316.7525,1281.3592
2006-09-18,1318.7975,1282.4732
2006-09-19,1318.69,1283.4792


Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-05-05,4195.8525,4373.1098
2022-05-06,4187.8425,4369.8026
2022-05-09,4141.7825,4361.935
2022-05-10,4067.0025,4354.4802
2022-05-11,4012.7025,4347.0586


In [130]:
display(y_long_test.head())
display(y_long_test.tail())

Date
2006-09-13    1.0
2006-09-14   -1.0
2006-09-15    1.0
2006-09-18    1.0
2006-09-19   -1.0
Name: signal, dtype: float64

Date
2022-05-05   -1.0
2022-05-06   -1.0
2022-05-09   -1.0
2022-05-10    1.0
2022-05-11   -1.0
Name: signal, dtype: float64

### Generate the 5 years data X_short_test and y_short_test DataFrames

In [131]:
short_testing_begin = X.index.max() - DateOffset(years=5)
short_testing_begin

Timestamp('2017-05-11 00:00:00')

In [132]:
X_short_test = X.loc[short_testing_begin:]
y_short_test = y.loc[short_testing_begin:]

# Display sample data
display(X_short_test.head())
display(X_short_test.tail())
display(y_short_test.head())
display(y_short_test.tail())

Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-11,2397.5925,2368.2928
2017-05-12,2395.4725,2368.4724
2017-05-15,2396.8225,2368.8564
2017-05-16,2397.0825,2369.3636
2017-05-17,2387.73,2369.1364


Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-05-05,4195.8525,4373.1098
2022-05-06,4187.8425,4369.8026
2022-05-09,4141.7825,4361.935
2022-05-10,4067.0025,4354.4802
2022-05-11,4012.7025,4347.0586


Date
2017-05-11   -1.0
2017-05-12   -1.0
2017-05-15    1.0
2017-05-16   -1.0
2017-05-17   -1.0
Name: signal, dtype: float64

Date
2022-05-05   -1.0
2022-05-06   -1.0
2022-05-09   -1.0
2022-05-10    1.0
2022-05-11   -1.0
Name: signal, dtype: float64

## Create train and test data files

In [133]:
X_train.to_csv("../data/X_train.csv")
y_train.to_csv("../data/y_train.csv")

X_long_test.to_csv("../data/X_long_test.csv")
y_long_test.to_csv("../data/y_long_test.csv")

X_short_test.to_csv("../data/X_short_test.csv")
y_short_test.to_csv("../data/y_short_test.csv")