In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the yield-curve CSV; the 'Date' column is parsed as datetimes and
# becomes the index of the resulting frame.
daily_treasury_yield_curve_rate_df = pd.read_csv(
    'daily_treasury_yield_curve_rate.csv',
    index_col='Date',
    parse_dates=['Date'],
)

In [2]:
# Column labels used to select series from the yield-curve frame
# (PERIODS[0] is what the regression cell passes as `interest_periods`).
PERIODS = [
    '3 Mo',
    '1 Yr',
    '5 Yr',
    '10 Yr',
    '30 Yr',
]

# Presumably plotting defaults; not referenced in the visible cells.
FONT_SIZE = 10
FIGSIZE = (10, 5)

# Date window as ISO strings; not referenced in the visible cells
# (prepare_data has its own defaults) -- TODO confirm intended use.
START_DATE = '1990-01-01'
END_DATE = '2016-01-01'

In [3]:
def prepare_data(df, interest_periods, data_source,
                 start_date='1990-01-01', end_date = '2020-01-01',
                 fill=False, add_freq=False, freq='3m'):
    """Select, clean and date-window one or more columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is indexed with ``start_date``/``end_date`` labels,
        so a date-like index is expected.
    interest_periods : str or list of str
        Column label(s) to select. A single label yields a Series, a list
        yields a DataFrame.
    data_source : {'yield', 'baa10y'}
        ``'yield'`` uses the selected columns as they are. ``'baa10y'``
        additionally replaces ``'.'`` entries with NaN, drops missing rows
        and casts the remainder to ``float``.
    start_date, end_date : str
        Inclusive bounds of the label-based slice on the index.
    fill : bool
        If true, forward-fill missing values (applied after ``asfreq``).
    add_freq : bool
        If true, conform the index to ``freq`` via ``asfreq``.
    freq : str
        Frequency alias handed to ``asfreq`` when ``add_freq`` is true.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The selected data, sorted by index and restricted to the window.

    Raises
    ------
    ValueError
        If ``data_source`` is not one of the supported values.
    """
    if data_source == 'yield':
        data = df[interest_periods]
    elif data_source == 'baa10y':
        data = df[interest_periods]
        data = data.replace('.', np.nan)
        data = data.dropna()
        data = data.astype(float)
    else:
        # Fail loudly instead of hitting an unbound `data` further down.
        raise ValueError(
            f"unknown data_source {data_source!r}; expected 'yield' or 'baa10y'"
        )

    # Sort before slicing: a label slice on a descending or unsorted index
    # either selects nothing or raises, depending on the pandas version.
    data = data.sort_index(axis=0)
    data = data.loc[start_date:end_date]

    if add_freq:
        data = data.asfreq(freq=freq)

    if fill:
        # .ffill() is the non-deprecated spelling of fillna(method='ffill').
        data = data.ffill()

    return data

# Regression - next day prediction

# Features

In [14]:
def prepare_features(data,prediction_time):
    """Build a feature/target table from a single series.

    Parameters
    ----------
    data : pandas.Series
        Input series (its ``.values`` become the ``current_day`` column, so a
        one-dimensional series is expected).
    prediction_time : int
        Number of rows ahead that the target ``y`` is taken from.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data`` with columns ``current_day``, rolling mean and
        standard deviation over 7, 30 and 365 rows, and the target ``y``.
        Rows with any missing value are dropped, which removes the warm-up
        rows of the longest window and the last ``prediction_time`` rows.

    Notes
    -----
    The windows count rows, not calendar days; the week/month/year names
    only match if the series has one row per calendar day -- TODO confirm.
    """
    weekly = data.rolling(window=7)
    monthly = data.rolling(window=30)
    yearly = data.rolling(window=365)

    features = pd.DataFrame(
        data={
            'current_day': data.values,
            'rolling_avg_week': weekly.mean(),
            'rolling_avg_month': monthly.mean(),
            'rolling_avg_year': yearly.mean(),
            'rolling_std_week': weekly.std(),
            'rolling_std_month': monthly.std(),
            'rolling_std_year': yearly.std(),
            # Target: the value `prediction_time` rows after the current row.
            'y': data.shift(-prediction_time),
        },
        index=data.index,
    )
    return features.dropna()

In [19]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Feature table for the first label in PERIODS; the target is the value
# `prediction_time` rows ahead.
prediction_time = 365
data = prepare_features(
    prepare_data(daily_treasury_yield_curve_rate_df,
                 interest_periods=PERIODS[0],
                 data_source='yield'),
    prediction_time,
)

# Keep the last `prediction_time` rows aside; only `train` is used below.
train = data.iloc[:-prediction_time]
test = data.iloc[-prediction_time:]
X, y = train.drop(columns=['y']), train['y']

# Five-fold time-ordered cross-validation of a plain linear regression.
# NOTE(review): the split uses no gap although targets are shifted by
# `prediction_time` rows, so training targets appear to reach into the
# validation period -- consider TimeSeriesSplit(gap=...) and confirm.
tscv = TimeSeriesSplit(n_splits=5)
for train_index, val_index in tscv.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    # Per fold: validation MSE, then the fitted coefficients.
    print(mean_squared_error(y_val, reg.predict(X_val)))
    print(reg.coef_)

0.4108551115492863
[ 0.02082776 -0.08802922  0.47529901 -0.68749925 -1.55919508 -1.41095708
 -1.30106267]
5.488047796213989
[ 0.06333679 -0.38755276  0.78063619 -0.69850797 -0.81425059  0.19334418
 -1.16573066]
4.025414240400022
[ 0.63534139  0.22120487 -0.60186361 -0.41948835 -0.1999534  -1.07186462
 -2.36968818]
10.050176585696759
[ 1.17922749  1.00733131 -1.6188813  -0.32592975  0.44677573  0.45100547
 -1.62530663]
2.5450651160889457
[ 1.49483511  1.41323531 -2.16791853 -0.24235938 -0.27395183 -0.14350536
 -0.8371152 ]
