In [None]:
# Import libraries
import numpy as np
import pandas as pd
import os as os

# Plot settings.  cufflinks provides the `.iplot()` method used on pandas objects below
# to draw interactive Plotly charts.
import cufflinks as cf
# NOTE(review): offline=True presumably renders charts locally without a Plotly
# account -- confirm against the cufflinks documentation.
cf.set_config_file(offline=True)

# scikit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Raise pandas' display limits (rows, columns, line width) for this notebook.
for option_name, option_value in (
    ('display.max_rows', 5000),
    ('display.max_columns', 100),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# Set working directory.
# Replace the placeholder with the folder the relative data paths below should be
# resolved against.  While the placeholder is left unchanged the notebook stays in the
# directory it was started from instead of failing on a non-existent path.
work_dir = "INSERT PATH TO FILE LOCATION"
if work_dir != "INSERT PATH TO FILE LOCATION":
    os.chdir(work_dir)  # still raises if a configured path does not exist

# Check working directory (kept as the last expression so the cell displays it)
os.getcwd()

In [None]:
# Load the tab-separated file; its first column becomes the row index.
data = pd.read_csv(
    './data/hjm_pca_2002-07.csv',
    sep='\t',
    index_col=0,
)

In [None]:
# Preview the first rows of the loaded data
data.head()

In [None]:
# (number of rows, number of columns) of the loaded data
data.shape

In [None]:
# Draw the first row of `data` as a single interactive line chart
data.iloc[0].iplot(
    title='Representation of a Yield Curve',
)

In [None]:
# Transpose first so that every row of `data` is drawn as its own line
data.transpose().iplot(title='Daily Yield Curves')

In [None]:
# Difference each row against the row that follows it (periods=-1), then drop
# every row that contains a NaN.
diff_ = data.diff(periods=-1).dropna()

In [None]:
# Inspect the last rows of the differenced data
diff_.tail()

In [None]:
# (number of rows, number of columns) remaining after differencing and dropping NaN rows
diff_.shape

In [None]:
# Per-column standard deviation of the differences, scaled by 10,000
# (the chart below labels the result as basis points).
vol = 10000 * np.std(diff_, axis=0)

In [None]:
# Chart the leading `vol[:21]` slice of the volatility series
vol[:21].iplot(
    title='Volatility of daily UK government yields',
    xTitle='Tenor',
    yTitle='Volatility (bps)',
    color='cornflowerblue',
)

In [None]:
# Covariance of the differences with columns treated as variables, rescaled by 252/10000.
# NOTE(review): presumably 252 annualises over trading days and 1/10000 converts units -- confirm.
cov_ = pd.DataFrame(
    np.cov(diff_, rowvar=False) * 252 / 10000,
    index=diff_.columns,
    columns=diff_.columns,
)
cov_.style.format("{:.4%}")

In [None]:
# Perform eigen decomposition.
# A covariance matrix is symmetric, so use `eigh`: it always returns real eigenvalues
# and orthonormal eigenvectors, whereas the general-purpose `eig` can return complex
# values caused by rounding noise when the matrix is close to singular.
# The sign of each eigenvector is arbitrary.
eigenvalues, eigenvectors = np.linalg.eigh(cov_)

# `eigh` returns the eigenvalues in ascending order; reorder so the largest comes first
# and keep every eigenvector column aligned with its eigenvalue.
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]

# Format into a DataFrame
df_eigval = pd.DataFrame({"Eigenvalues": eigenvalues})

eigenvalues

In [None]:
# Work out explained proportion.
# The denominator is the sum of *all* eigenvalues, taken from the full `eigenvalues`
# array, so re-running this cell after the truncation below gives the same proportions.
df_eigval["Explained proportion"] = df_eigval["Eigenvalues"] / eigenvalues.sum()

# Keep the ten largest; the explicit copy avoids assigning into a slice on a re-run.
df_eigval = df_eigval.iloc[:10].copy()

# Format as percentage
df_eigval.style.format({"Explained proportion": "{:.2%}"})

In [None]:
# Bar chart of the first ten explained proportions, expressed in percent
(100 * df_eigval['Explained proportion'][:10]).iplot(
    kind='bar',
    title='Percentage of overall variance explained',
    color='cornflowerblue',
)

In [None]:
# Gather the three leading eigenvector columns into a labelled frame
pcadf = pd.DataFrame(
    eigenvectors[:, :3],
    columns=['PC1', 'PC2', 'PC3'],
)
pcadf.head(10)

In [None]:
# Chart the three components, with PC1 on the secondary y-axis
pcadf.iplot(
    title='First Three Principal Components',
    secondary_y='PC1',
    secondary_y_title='PC1',
    yTitle='change in yield (bps)',
)

In [None]:
# Read the Bank of England spot curve sheet from the Excel workbook
df = pd.read_excel(
    "../data/GLC Nominal month end data_1970 to 2015.xlsx",
    sheet_name="4. spot curve",
    header=3,
    skiprows=[4],
    index_col=0,
)

# Keep the first 20 columns (intended to be all tenors up to 10 years)
df = df.iloc[:, :20]

df.head()

In [None]:
# Discard every row that has at least one missing value, then report the new size
df = df.dropna(axis=0, how="any")
df.shape

In [None]:
# Standardise the data: fit the scaler on df, then transform the same frame.
# The result is wrapped in a new DataFrame with default row and column labels.
scaler = StandardScaler()
df1 = pd.DataFrame(scaler.fit(df).transform(df))
df1.head()

In [None]:
# Covariance matrix of the standardised data, treating each column as a variable
cov_matrix_array = np.cov(df1, rowvar=False)
pd.DataFrame(data=cov_matrix_array)

In [None]:
# Perform eigen decomposition.
# The covariance matrix is symmetric, so use `eigh`: it always returns real eigenvalues
# and orthonormal eigenvectors, whereas the general-purpose `eig` can return complex
# values caused by rounding noise when the columns are highly correlated.
# The sign of each eigenvector is arbitrary.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix_array)

# `eigh` returns the eigenvalues in ascending order; reorder so the largest comes first
# and keep every eigenvector column aligned with its eigenvalue.
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]

# Format into a DataFrame
df_eigval = pd.DataFrame({"Eigenvalues": eigenvalues})

eigenvalues

In [None]:
# Wrap the eigenvector matrix in a DataFrame
df_eigvec = pd.DataFrame(data=eigenvectors)

# Show the first column: the eigenvector paired with the largest eigenvalue
eigenvectors[:, 0]

In [None]:
# Share of the total variance carried by each eigenvalue
df_eigval["Explained proportion"] = (
    df_eigval["Eigenvalues"] / df_eigval["Eigenvalues"].sum()
)

# Render the proportion column as a percentage
df_eigval.style.format({"Explained proportion": "{:.2%}"})

In [None]:
# Bar chart of the first ten explained proportions, expressed in percent
(100 * df_eigval['Explained proportion'][:10]).iplot(
    kind='bar',
    title='Percentage of overall variance explained',
    color='cornflowerblue',
)

In [None]:
# Gather the three leading eigenvector columns into a labelled frame
pcdf = pd.DataFrame(
    eigenvectors[:, :3],
    columns=['PC1', 'PC2', 'PC3'],
)
pcdf.head(10)

In [None]:
# Chart the three components, with PC1 on the secondary y-axis
pcdf.iplot(
    title='First Three Principal Components',
    secondary_y='PC1',
    secondary_y_title='PC1',
)

In [None]:
# Chain standardisation and PCA into one estimator, then fit both steps on df
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
    ]
)
pipe.fit(df)

In [None]:
# First principal axis found by the fitted PCA step
pipe.named_steps['pca'].components_[0]

In [None]:
# Variance explained by each component of the fitted PCA step (the eigenvalues)
pipe.named_steps['pca'].explained_variance_

In [None]:
# Fraction of the total variance explained by each component
pipe.named_steps['pca'].explained_variance_ratio_

In [None]:
# Tabulate the scikit-learn eigenvalues next to their explained proportions
df2 = pd.DataFrame(
    {
        'Eigenvalues': pipe.named_steps['pca'].explained_variance_,
        'Explained proportion': pipe.named_steps['pca'].explained_variance_ratio_,
    }
)
# Render the proportion column as a percentage
df2.style.format({"Explained proportion": "{:.2%}"})

In [None]:
# Project the standardised rows of df1 onto the eigenvectors: column k of the result
# holds each row's coordinate along the k-th eigenvector.  The row labels of df are
# attached to the result.
df1_projections = df1.dot(eigenvectors).set_axis(df.index, axis=0)
df1_projections.head()

In [None]:
# (number of rows, number of columns) of the projected data
df1_projections.shape

In [None]:
# Pair the 10-year yield with the projection on the first principal component.
# Column 10.0 is selected so the data matches the '10Y' label and the chart title
# (column 2.0 would be the 2-year tenor).
level = pd.DataFrame({'10Y': df[10.0],
                      'PC1': df1_projections[0]})
level.head()

In [None]:
# `secondary_y` must name an existing column of `level`; the projection column is 'PC1'.
level.iplot(title='PC1 Projection vs 10Y Yield', secondary_y='PC1')

In [None]:
# Calculate 10Y-2Y, typical measure of slope.
# Take an explicit copy of the two tenor columns so the new columns are added to an
# independent frame rather than to a slice of df (avoids SettingWithCopyWarning).
slope = df[[2, 10]].copy()
slope['slope'] = slope[10] - slope[2]
# Aligned on the row labels, which df1_projections shares with df
slope['pc2_projection'] = df1_projections[1]
slope.head()

In [None]:
# `secondary_y` must name an existing column; the projection column is 'pc2_projection'.
slope[['slope', 'pc2_projection']].iplot(title='PC2 Projection vs 10Y-2Y Slope', secondary_y='pc2_projection')

In [None]:
# Correlation matrix between the second projection and the 10Y-2Y slope
np.corrcoef(
    df1_projections[1],
    slope['slope'],
)