In [None]:
# loading packages

import os

import pandas as pd
import numpy as np

# plotting packages
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as clrs

# Import the PCA class from scikit-learn; it is instantiated and fitted in a later cell.
from sklearn.decomposition import PCA

In [7]:
# Read the interest-rate CSV that lives in the data folder.
DATA_FOLDER = './'
csv_path = os.path.join(DATA_FOLDER, 'interest rates.csv')
raw = pd.read_csv(csv_path)

# Sanity check: report the frame's dimensions and show its first five rows.
print("Size of the dataset (row, col): ", raw.shape)
print("\nFirst 5 rows\n", raw.head(5))

Size of the dataset (row, col):  (10491, 8)

First 5 rows
        dates     2Y     3Y     5Y     7Y    10Y    30Y  US RET
0   1/5/1983 -0.035 -0.025  0.000  0.025  0.030  0.031   0.003
1   1/6/1983 -0.018  0.012 -0.008  0.000  0.010  0.024   0.019
2   1/7/1983 -0.108 -0.115 -0.072 -0.032 -0.031 -0.006  -0.001
3  1/10/1983 -0.035 -0.013 -0.016  0.006  0.005  0.021   0.010
4  1/11/1983 -0.036 -0.064 -0.048 -0.051 -0.046 -0.036  -0.006


In [26]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every numeric column.
summary_stats = raw.describe()
print("\nSummary statistics\n", summary_stats)


Summary statistics
                  2Y            3Y            5Y            7Y           10Y  \
count  10491.000000  10491.000000  10491.000000  10491.000000  10491.000000   
mean      -0.000518     -0.000557     -0.000623     -0.000649     -0.000655   
std        0.062158      0.063386      0.066112      0.064495      0.062570   
min       -0.830000     -0.580000     -0.630000     -0.670000     -0.710000   
25%       -0.025000     -0.030000     -0.034000     -0.034000     -0.035000   
50%        0.000000      0.000000      0.000000      0.000000      0.000000   
75%        0.025000      0.028000      0.032000      0.031000      0.033000   
max        0.690000      0.580000      0.680000      0.577000      0.396000   

                30Y        US RET  
count  10491.000000  10491.000000  
mean      -0.000644      0.000397  
std        0.055848      0.010998  
min       -0.621000     -0.187000  
25%       -0.032000     -0.004000  
50%        0.000000      0.000000  
75%        0.02

In [14]:
# Standardize the six tenor columns (2Y .. 30Y): subtract each column's mean
# and divide by its standard deviation (pandas' default sample std, ddof=1).
rate_columns = ['2Y', '3Y', '5Y', '7Y', '10Y', '30Y']
X = raw[rate_columns]
col_mean, col_std = X.mean(), X.std()
X = (X - col_mean) / col_std
print(X.head(5))

         2Y        3Y        5Y        7Y       10Y       30Y
0 -0.554750 -0.385622  0.009431  0.397687  0.489932  0.566600
1 -0.281252  0.198103 -0.111576  0.010059  0.170291  0.441260
2 -1.729185 -1.805491 -1.079627 -0.486105 -0.484973 -0.095910
3 -0.554750 -0.196306 -0.232582  0.103090  0.090381  0.387543
4 -0.570838 -1.000898 -0.716608 -0.780702 -0.724704 -0.633080


In [25]:
# Pairwise correlations between the standardized tenor columns.
corr_matrix = X.corr()
print("\nCorrelation matrix\n", corr_matrix)


Correlation matrix
            2Y        3Y        5Y        7Y       10Y       30Y
2Y   1.000000  0.906279  0.858600  0.760921  0.773135  0.631038
3Y   0.906279  1.000000  0.913537  0.828964  0.851215  0.704556
5Y   0.858600  0.913537  1.000000  0.873430  0.915338  0.780033
7Y   0.760921  0.828964  0.873430  1.000000  0.891952  0.802932
10Y  0.773135  0.851215  0.915338  0.891952  1.000000  0.881417
30Y  0.631038  0.704556  0.780033  0.802932  0.881417  1.000000


In [15]:
# Build a PCA that keeps all six components, then fit it on the standardized data.
model = PCA(n_components=6)
model.fit(X)  # fits in place; the estimator itself is returned
model

In [16]:
# Display the fitted principal axes: one row per component, one column per
# input feature (same column order as X).
model.components_

array([[-0.39237251, -0.41467664, -0.42566495, -0.4107688 , -0.42306187,
        -0.38103031],
       [-0.5727105 , -0.37556262, -0.11118204,  0.17241876,  0.25871226,
         0.64956423],
       [ 0.33521178,  0.06082541, -0.08413615, -0.77602578, -0.00436208,
         0.5240414 ],
       [ 0.49681289, -0.19266175, -0.55796886,  0.44032675, -0.37575402,
         0.26391366],
       [-0.38416318,  0.80208174, -0.35997547,  0.03721316, -0.2371057 ,
         0.14797727],
       [-0.10618459, -0.05331606,  0.59871314,  0.06365753, -0.74609476,
         0.25829272]])

In [27]:
# Share of total variance captured by each component, rounded to four decimals.
np.round(model.explained_variance_ratio_, 4)

array([0.8554, 0.0778, 0.0273, 0.0182, 0.0121, 0.0092])

In [19]:
# Project the standardized data onto the principal axes of the PCA fitted above.
# `transform` reuses that fit; the previous `fit_transform` call re-ran the whole
# decomposition and silently overwrote the fitted state (components_,
# explained_variance_ratio_) that the earlier cells had already displayed.
principalComponents = model.transform(X)

In [20]:
# Wrap the component scores in a DataFrame and write them to disk
# (default integer column labels and row index are kept in the file).
factors_frame = pd.DataFrame(principalComponents)
factors_frame.to_csv('PCAfactors.csv')

In [22]:
# Standardize the six tenor columns together with 'US RET': subtract each
# column's mean and divide by its standard deviation (sample std, ddof=1).
y_columns = ['2Y', '3Y', '5Y', '7Y', '10Y', '30Y', 'US RET']
Y = raw[y_columns]
y_mean, y_std = Y.mean(), Y.std()
Y = (Y - y_mean) / y_std
print(Y.head(5))

         2Y        3Y        5Y        7Y       10Y       30Y    US RET
0 -0.554750 -0.385622  0.009431  0.397687  0.489932  0.566600  0.236688
1 -0.281252  0.198103 -0.111576  0.010059  0.170291  0.441260  1.691498
2 -1.729185 -1.805491 -1.079627 -0.486105 -0.484973 -0.095910 -0.127015
3 -0.554750 -0.196306 -0.232582  0.103090  0.090381  0.387543  0.873167
4 -0.570838 -1.000898 -0.716608 -0.780702 -0.724704 -0.633080 -0.581643


In [24]:
# Y is already a DataFrame, so it can be written out directly (row index included).
Y.to_csv('Y.csv')