In [24]:
import numpy as np
import pandas as pd
from pathlib import Path
%matplotlib inline
import hvplot.pandas as hv
import matplotlib.pyplot as plt

# Regression Analysis: Seasonal Effects with Sklearn Linear Regression
In this notebook, you will build a SKLearn linear regression model to predict Yen futures ("settle") returns with *lagged* Yen futures returns. 

In [2]:
# Futures contract on the Yen-dollar exchange rate:
# This is the continuous chain of the futures contracts that are 1 month to expiration
yen_futures = pd.read_csv(
    Path("yen.csv"), index_col="Date", infer_datetime_format=True, parse_dates=True
)
yen_futures.head()

Unnamed: 0_level_0,Open,High,Low,Last,Change,Settle,Volume,Previous Day Open Interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1976-08-02,3398.0,3401.0,3398.0,3401.0,,3401.0,2.0,1.0
1976-08-03,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0
1976-08-04,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0
1976-08-05,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0
1976-08-06,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0


In [3]:
# Trim the dataset to begin on January 1st, 1990
yen_futures = yen_futures.loc["1990-01-01":, :]
yen_futures.head()

Unnamed: 0_level_0,Open,High,Low,Last,Change,Settle,Volume,Previous Day Open Interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1990-01-02,6954.0,6954.0,6835.0,6847.0,,6847.0,48336.0,51473.0
1990-01-03,6877.0,6910.0,6865.0,6887.0,,6887.0,38206.0,53860.0
1990-01-04,6937.0,7030.0,6924.0,7008.0,,7008.0,49649.0,55699.0
1990-01-05,6952.0,6985.0,6942.0,6950.0,,6950.0,29944.0,53111.0
1990-01-08,6936.0,6972.0,6936.0,6959.0,,6959.0,19763.0,52072.0


# Data Preparation

### Returns

In [4]:
# Create a series using "Settle" price percentage returns, drop any nan"s, and check the results:
# (Make sure to multiply the pct_change() results by 100)
# In this case, you may have to replace inf, -inf values with np.nan"s
# YOUR CODE HERE!
settle_price = (yen_futures["Settle"].pct_change()*100)
settle_price = settle_price.dropna()
settle_price.head()

Date
1990-01-03    0.584197
1990-01-04    1.756933
1990-01-05   -0.827626
1990-01-08    0.129496
1990-01-09   -0.632275
Name: Settle, dtype: float64

### Lagged Returns 

In [5]:
# Create a lagged return using the shift function
# YOUR CODE HERE!
yen_futures["Return"] = (yen_futures["Settle"].pct_change()*100)
yen_futures["Lag"] = yen_futures["Settle"].shift() 
yen_futures = yen_futures.dropna()
yen_futures.head()

Unnamed: 0_level_0,Open,High,Low,Last,Change,Settle,Volume,Previous Day Open Interest,Return,Lag
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-02-18,9831.0,9865.0,9734.0,9775.0,42.0,9775.0,203495.0,196924.0,-0.427829,9817.0
2014-02-19,9768.0,9825.0,9760.0,9773.0,2.0,9773.0,129508.0,197197.0,-0.02046,9775.0
2014-02-20,9774.0,9837.0,9765.0,9775.0,2.0,9775.0,160202.0,198280.0,0.020465,9773.0
2014-02-21,9772.0,9776.0,9725.0,9758.0,20.0,9755.0,103091.0,202990.0,-0.204604,9775.0
2014-02-24,9752.0,9789.0,9740.0,9757.0,2.0,9757.0,90654.0,203114.0,0.020502,9755.0


### Train Test Split

In [6]:
# Create a train/test split for the data using 2018-2019 for testing and the rest for training
train = yen_futures[:'2017']
test = yen_futures['2018':]
test.head()

Unnamed: 0_level_0,Open,High,Low,Last,Change,Settle,Volume,Previous Day Open Interest,Return,Lag
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-02,8909.5,8957.5,8898.5,8938.0,26.5,8940.5,96714.0,227884.0,0.297285,8914.0
2018-01-03,8943.0,8947.5,8913.0,8921.0,21.5,8919.0,93498.0,226582.0,-0.240479,8940.5
2018-01-04,8917.0,8920.5,8891.0,8901.0,19.0,8900.0,115434.0,224918.0,-0.213028,8919.0
2018-01-05,8897.0,8902.0,8854.0,8878.0,31.5,8868.5,133023.0,229326.0,-0.353933,8900.0
2018-01-08,8870.5,8889.0,8849.5,8872.5,5.5,8874.0,81647.0,237100.0,0.062017,8868.5


# --------------------------------------------------

In [7]:
# Create four dataframes:
# X_train (training set using just the independent variables), X_test (test set of of just the independent variables)
# Y_train (training set using just the "y" variable, i.e., "Futures Return"), Y_test (test set of just the "y" variable):
# YOUR CODE HERE!
X_train = train["Lag"].to_frame()
X_test = test["Lag"].to_frame()
y_train = train["Return"].to_frame()
y_test = test["Return"]

# Linear Regression Model

In [8]:
# Create a Linear Regression model and fit it to the training data
from sklearn.linear_model import LinearRegression

# Fit a SKLearn linear regression using just the training set (X_train, Y_train):
# YOUR CODE HERE!
model_regression = LinearRegression()
model_regression.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Make predictions using the Testing Data

Note: We want to evaluate the model using data that it has never seen before, in this case: X_test.

In [9]:
(model_regression.predict(X_test)*100)

array([[-5.68987892e-01],
       [-7.31528395e-01],
       [-5.99655911e-01],
       [-4.83117437e-01],
       [-2.89908915e-01],
       [-3.23643736e-01],
       [-5.35253070e-01],
       [-1.17008107e+00],
       [-1.29888676e+00],
       [-1.30808716e+00],
       [-1.68223700e+00],
       [-1.27435234e+00],
       [-1.32342117e+00],
       [-1.51969650e+00],
       [-1.31728757e+00],
       [-1.65156898e+00],
       [-2.29866419e+00],
       [-2.10238887e+00],
       [-2.48573911e+00],
       [-2.34466622e+00],
       [-2.43053667e+00],
       [-2.25572896e+00],
       [-2.09012166e+00],
       [-1.65770258e+00],
       [-1.94291516e+00],
       [-2.12078968e+00],
       [-2.08398805e+00],
       [-2.35079982e+00],
       [-2.51027353e+00],
       [-2.44893749e+00],
       [-2.95802661e+00],
       [-3.28310762e+00],
       [-3.71552669e+00],
       [-3.68179187e+00],
       [-3.16043554e+00],
       [-2.88749016e+00],
       [-3.48858335e+00],
       [-3.43338091e+00],
       [-3.3

In [13]:
# Make a prediction of "y" values using just the test dataset
# YOUR CODE HERE!
y_predictions = model_regression.predict(X_test)

In [14]:
# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe:
# YOUR CODE HERE!
predictions = y_test.to_frame()
predictions["Prediction"]=y_predictions
predictions.head()

Unnamed: 0_level_0,Return,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,0.297285,-0.00569
2018-01-03,-0.240479,-0.007315
2018-01-04,-0.213028,-0.005997
2018-01-05,-0.353933,-0.004831
2018-01-08,0.062017,-0.002899


In [26]:
# Plot the first 20 predictions vs the true values
# YOUR CODE HERE!
re = predictions["Return"][:20].hvplot(title="Actual Return", shared_axes=False)
pre = predictions["Prediction"][:20].hvplot(title="Predicted Return", shared_axes=False)
re+pre


# Out-of-Sample Performance

Evaluate the model using "out-of-sample" data (X_test and y_test)

In [28]:
from sklearn.metrics import mean_squared_error
# Calculate the mean_squared_error (MSE) on actual versus predicted test "y" 
# YOUR CODE HERE!
mse = mean_squared_error(predictions["Return"],predictions["Prediction"])
# Using that mean-squared-error, calculate the root-mean-squared error (RMSE):
# YOUR CODE HERE!
rmse = np.sqrt(mse)
rmse

0.4150869653618711

# In-Sample Performance

Evaluate the model using in-sample data (X_train and y_train)

In [31]:
# Construct a dataframe using just the "y" training data:
# YOUR CODE HERE!
in_sample_df = y_train
# Add a column of "in-sample" predictions to that dataframe:  
# YOUR CODE HERE!
in_sample_df["In Sample Predictions"] = model_regression.predict(X_train)
# Calculate in-sample mean_squared_error (for comparison to out-of-sample)
# YOUR CODE HERE!
in_sample_mse = mean_squared_error(in_sample_df["Return"],in_sample_df["In Sample Predictions"])
# Calculate in-sample root mean_squared_error (for comparison to out-of-sample)
# YOUR CODE HERE!
in_sample_rmse = np.sqrt(in_sample_mse)

In [32]:
in_sample_rmse

0.5951894086826469

In [33]:
in_sample_df

Unnamed: 0_level_0,Return,In Sample Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-02-18,-0.427829,-0.061076
2014-02-19,-0.020460,-0.058500
2014-02-20,0.020465,-0.058378
2014-02-21,-0.204604,-0.058500
2014-02-24,0.020502,-0.057273
...,...,...
2017-12-22,0.118370,-0.003022
2017-12-26,0.118230,-0.003666
2017-12-27,-0.151830,-0.004310
2017-12-28,0.180221,-0.003482


# Conclusions

YOUR CONCLUSIONS HERE!