# Data Preparation for a Machine Learning Trading Strategy

## Loading and Preparing the Data

In [1]:
# Import required libraries
import pandas as pd
from pathlib import Path

In [5]:
# Import the OHLCV dataset into a Pandas DataFrame.
# The "date" column becomes the index and is parsed into datetimes through
# parse_dates=True. The former infer_datetime_format=True argument is omitted:
# it is deprecated since pandas 2.0, where format inference is the default
# behavior and passing the flag only emits a warning.
trading_df = pd.read_csv(
    Path("Resources/Week15-Day3-Activity4-ohlcv.csv"),
    index_col="date",
    parse_dates=True,
)

# Display sample data (first and last rows)
display(trading_df.head())
display(trading_df.tail())

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-10-19 09:30:00,16.9,17.18,16.9,17.095,11522
2018-10-19 09:45:00,17.11,17.44,17.11,17.4,70593
2018-10-19 10:00:00,17.4,17.4,17.25,17.28,38885
2018-10-19 10:15:00,17.27,17.27,17.18,17.2,37046
2018-10-19 10:30:00,17.21,17.37,17.19,17.2,46874


Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-09-04 14:45:00,6.225,6.26,6.22,6.25,55512
2020-09-04 15:00:00,6.255,6.27,6.245,6.25,65810
2020-09-04 15:15:00,6.25,6.29,6.25,6.275,202630
2020-09-04 15:30:00,6.27,6.28,6.25,6.255,130140
2020-09-04 15:45:00,6.25,6.28,6.25,6.25,190278


In [29]:
# Compute the row-over-row percentage change of the closing price and store it
# in the "actual_returns" column. Each row is one bar of the dataset (the
# displayed samples are spaced 15 minutes apart), so this is a per-bar return.
closing_prices = trading_df["close"]
trading_df["actual_returns"] = closing_prices.pct_change()

# Preview the first rows
trading_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,actual_returns,sma_fast,sma_slow,signal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-10-24 15:00:00,15.66,15.66,15.6,15.615,80027,,15.6525,16.3403,-1.0
2018-10-24 15:15:00,15.63,15.66,15.52,15.53,76449,-0.005443,15.61875,16.3216,-1.0
2018-10-24 15:30:00,15.54,15.54,15.18,15.41,137468,-0.007727,15.55375,16.3029,-1.0
2018-10-24 15:45:00,15.41,15.42,15.35,15.35,688995,-0.003894,15.47625,16.2844,-1.0
2018-10-25 09:30:00,15.55,15.55,15.205,15.32,44387,-0.001954,15.4025,16.2656,-1.0


In [30]:
# Remove every row that contains at least one NaN value
trading_df = trading_df.dropna(axis="index", how="any")

# Show the first and the last rows of the cleaned DataFrame
display(trading_df.head(), trading_df.tail())

Unnamed: 0_level_0,open,high,low,close,volume,actual_returns,sma_fast,sma_slow,signal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-10-24 15:15:00,15.63,15.66,15.52,15.53,76449,-0.005443,15.61875,16.3216,-1.0
2018-10-24 15:30:00,15.54,15.54,15.18,15.41,137468,-0.007727,15.55375,16.3029,-1.0
2018-10-24 15:45:00,15.41,15.42,15.35,15.35,688995,-0.003894,15.47625,16.2844,-1.0
2018-10-25 09:30:00,15.55,15.55,15.205,15.32,44387,-0.001954,15.4025,16.2656,-1.0
2018-10-25 09:45:00,15.35,15.36,15.21,15.24,67733,-0.005222,15.33,16.2468,-1.0


Unnamed: 0_level_0,open,high,low,close,volume,actual_returns,sma_fast,sma_slow,signal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-09-04 14:45:00,6.225,6.26,6.22,6.25,55512,0.00321,6.22875,6.2703,1.0
2020-09-04 15:00:00,6.255,6.27,6.245,6.25,65810,0.0,6.23875,6.26985,1.0
2020-09-04 15:15:00,6.25,6.29,6.25,6.275,202630,0.004,6.25125,6.2691,1.0
2020-09-04 15:30:00,6.27,6.28,6.25,6.255,130140,-0.003187,6.2575,6.26855,-1.0
2020-09-04 15:45:00,6.25,6.28,6.25,6.25,190278,-0.000799,6.2575,6.26785,-1.0


## Generating the Features and Target Sets

### Creating the Features Set

In [8]:
# Number of rows covered by the fast simple moving average (SMA)
short_window = 4

# Fast SMA: mean of the closing price over a rolling window of
# `short_window` rows, stored in the "sma_fast" column
fast_rolling_close = trading_df["close"].rolling(window=short_window)
trading_df["sma_fast"] = fast_rolling_close.mean()

In [9]:
# Number of rows covered by the slow simple moving average (SMA)
long_window = 100

# Slow SMA: mean of the closing price over a rolling window of
# `long_window` rows, stored in the "sma_slow" column
slow_rolling_close = trading_df["close"].rolling(window=long_window)
trading_df["sma_slow"] = slow_rolling_close.mean()

In [11]:
# Discard every row that still contains a NaN (for example rows where a
# rolling window is not yet full), then preview the result
trading_df = trading_df.dropna(axis="index", how="any")
trading_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,actual_returns,sma_fast,sma_slow
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-10-24 15:00:00,15.66,15.66,15.6,15.615,80027,-0.002874,15.6525,16.3403
2018-10-24 15:15:00,15.63,15.66,15.52,15.53,76449,-0.005443,15.61875,16.3216
2018-10-24 15:30:00,15.54,15.54,15.18,15.41,137468,-0.007727,15.55375,16.3029
2018-10-24 15:45:00,15.41,15.42,15.35,15.35,688995,-0.003894,15.47625,16.2844
2018-10-25 09:30:00,15.55,15.55,15.205,15.32,44387,-0.001954,15.4025,16.2656


In [12]:
# Build the feature set `X` from the `sma_fast` and `sma_slow` columns.
# shift() moves the values down by one row, so each timestamp carries the SMA
# values of the preceding row; the first row becomes NaN and is dropped.
# copy() makes `X` independent of `trading_df`.
sma_features = trading_df[["sma_fast", "sma_slow"]]
X = sma_features.shift(periods=1).dropna().copy()

# Display sample data (first and last rows)
display(X.head(), X.tail())

Unnamed: 0_level_0,sma_fast,sma_slow
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-10-24 15:15:00,15.6525,16.3403
2018-10-24 15:30:00,15.61875,16.3216
2018-10-24 15:45:00,15.55375,16.3029
2018-10-25 09:30:00,15.47625,16.2844
2018-10-25 09:45:00,15.4025,16.2656


Unnamed: 0_level_0,sma_fast,sma_slow
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-09-04 14:45:00,6.22625,6.2721
2020-09-04 15:00:00,6.22875,6.2703
2020-09-04 15:15:00,6.23875,6.26985
2020-09-04 15:30:00,6.25125,6.2691
2020-09-04 15:45:00,6.2575,6.26855


### Creating the Target Set

In [31]:
# Add a "signal" column to `trading_df`, initialised to 0.0 on every row
initial_signal = 0.0
trading_df["signal"] = initial_signal

In [14]:
# Buy signal: rows whose return is zero or positive get a signal of 1
buy_rows = trading_df["actual_returns"].ge(0)
trading_df.loc[buy_rows, "signal"] = 1

In [15]:
# Sell signal: rows whose return is negative get a signal of -1
sell_rows = trading_df["actual_returns"].lt(0)
trading_df.loc[sell_rows, "signal"] = -1

In [16]:
# Target set: an independent copy of the "signal" column as the Series `y`
y = trading_df.loc[:, "signal"].copy()

## Split the Data into Training and Testing Sets

In [18]:
# Import required libraries
from pandas.tseries.offsets import DateOffset

In [21]:
# The training period starts at the earliest timestamp of the feature index
training_begin = X.index.min()

# Print the earliest and the latest available timestamps, one per line
print(training_begin, X.index.max(), sep="\n")

2018-10-24 15:15:00
2020-09-04 15:45:00


In [20]:
# The training period ends three calendar months after the earliest timestamp
# of the feature index
training_span = DateOffset(months=3)
training_end = X.index.min() + training_span

# Show the resulting end timestamp
print(training_end)

2019-01-24 15:15:00


In [25]:
# Training data: the X_train DataFrame and the y_train Series, both selected
# with the same label-based slice (start and end labels are both included)
training_period = slice(training_begin, training_end)
X_train = X.loc[training_period]
y_train = y.loc[training_period]

In [26]:
# Generate the X_test DataFrame and the y_test Series.
# Label-based `.loc` slicing includes both endpoints, so slicing from
# `training_end` onward would put a row stamped exactly at `training_end` into
# the test data as well as into any data selected up to `training_end`.
# Selecting strictly later timestamps keeps that row out of the test set, so
# no observation is shared between the training and testing data.
X_test = X.loc[X.index > training_end]
y_test = y.loc[y.index > training_end]

## Standardize the Data

In [27]:
# Import required libraries
from sklearn.preprocessing import StandardScaler

In [28]:
# Instantiate the scaler
scaler = StandardScaler()

# Fit the scaler on the training features only and keep the result of fit()
# under the name X_scaler
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and then the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(feature_set) for feature_set in (X_train, X_test)
)