## Import Libraries

In [1]:
import numpy as np
from numpy import where
import pandas as pd
import time

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor

pd.options.display.max_columns = None
pd.options.display.max_rows = None
np.set_printoptions(threshold=np.inf)

import warnings
warnings.simplefilter(action='ignore')

## Predict TV

In [2]:
df = pd.read_csv('/Users/user/Documents/project/trade-price-ir-vegas.csv')

In [3]:
df.head()

Unnamed: 0,Value Date,Trade Name,Trade Currency,Zero Rate Shock,TV,Expiry Bucket,Expiry Date,Tenor Bucket,Vega
0,2022-09-02,dummyTrade1,USD,-100,-227907.098775,1y,2023-09-04,10y,1.962246
1,2022-09-02,dummyTrade1,USD,-50,-222208.400967,1y,2023-09-04,10y,-3.812341
2,2022-09-02,dummyTrade1,USD,-25,-218960.927995,1y,2023-09-04,10y,4.471006
3,2022-09-02,dummyTrade1,USD,-10,-216872.430106,1y,2023-09-04,10y,4.333398
4,2022-09-02,dummyTrade1,USD,-5,-216146.310328,1y,2023-09-04,10y,5.679687


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919104 entries, 0 to 3919103
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Value Date       object 
 1   Trade Name       object 
 2   Trade Currency   object 
 3   Zero Rate Shock  int64  
 4   TV               float64
 5   Expiry Bucket    object 
 6   Expiry Date      object 
 7   Tenor Bucket     object 
 8   Vega             float64
dtypes: float64(2), int64(1), object(6)
memory usage: 269.1+ MB


In [5]:
df['Value Date'] = pd.to_datetime(df['Value Date'])
df['Expiry Date'] = pd.to_datetime(df['Expiry Date'])
df= df.drop(['Trade Name', 'Trade Currency'],axis = 1)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919104 entries, 0 to 3919103
Data columns (total 7 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Value Date       datetime64[ns]
 1   Zero Rate Shock  int64         
 2   TV               float64       
 3   Expiry Bucket    object        
 4   Expiry Date      datetime64[ns]
 5   Tenor Bucket     object        
 6   Vega             float64       
dtypes: datetime64[ns](2), float64(2), int64(1), object(2)
memory usage: 209.3+ MB


Unnamed: 0,Value Date,Zero Rate Shock,TV,Expiry Bucket,Expiry Date,Tenor Bucket,Vega
0,2022-09-02,-100,-227907.098775,1y,2023-09-04,10y,1.962246
1,2022-09-02,-50,-222208.400967,1y,2023-09-04,10y,-3.812341
2,2022-09-02,-25,-218960.927995,1y,2023-09-04,10y,4.471006
3,2022-09-02,-10,-216872.430106,1y,2023-09-04,10y,4.333398
4,2022-09-02,-5,-216146.310328,1y,2023-09-04,10y,5.679687


In [6]:
df['Value Year'] = df['Value Date'].dt.year
df['Value Month'] = df['Value Date'].dt.month
df['Value Day'] = df['Value Date'].dt.day
df['Expiry Year'] = df['Expiry Date'].dt.year
df['Expiry Month'] = df['Expiry Date'].dt.month
df['Expiry Day'] = df['Expiry Date'].dt.day
df = df.drop(['Value Date', 'Expiry Date'],axis = 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919104 entries, 0 to 3919103
Data columns (total 11 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Zero Rate Shock  int64  
 1   TV               float64
 2   Expiry Bucket    object 
 3   Tenor Bucket     object 
 4   Vega             float64
 5   Value Year       int32  
 6   Value Month      int32  
 7   Value Day        int32  
 8   Expiry Year      int32  
 9   Expiry Month     int32  
 10  Expiry Day       int32  
dtypes: float64(2), int32(6), int64(1), object(2)
memory usage: 239.2+ MB


In [7]:
df.describe()

Unnamed: 0,Zero Rate Shock,TV,Vega,Value Year,Value Month,Value Day,Expiry Year,Expiry Month,Expiry Day
count,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0
mean,6.25,-347139.5,17.14511,2022.798,7.473214,15.6131,2028.311,7.314484,16.09689
std,51.36167,185618.0,2850.144,0.4700498,3.474293,8.903535,4.410479,3.474583,8.879236
min,-100.0,-717431.2,-1385180.0,2022.0,1.0,1.0,2023.0,1.0,1.0
25%,-13.75,-529189.0,-1.342167,2023.0,5.0,8.0,2025.0,4.0,8.0
50%,2.5,-224214.2,0.0001336,2023.0,8.0,15.0,2027.0,8.0,16.0
75%,31.25,-185713.7,7.386112,2023.0,10.0,23.0,2031.0,10.0,24.0
max,100.0,-29113.44,1110263.0,2024.0,12.0,31.0,2039.0,12.0,31.0


In [8]:
le = LabelEncoder()
df['Expiry Bucket'] = le.fit_transform(df['Expiry Bucket'])
df['Tenor Bucket'] = le.fit_transform(df['Tenor Bucket'])
X = df.drop('TV',axis=1)
y = df['TV']
df.head()

Unnamed: 0,Zero Rate Shock,TV,Expiry Bucket,Tenor Bucket,Vega,Value Year,Value Month,Value Day,Expiry Year,Expiry Month,Expiry Day
0,-100,-227907.098775,3,0,1.962246,2022,9,2,2023,9,4
1,-50,-222208.400967,3,0,-3.812341,2022,9,2,2023,9,4
2,-25,-218960.927995,3,0,4.471006,2022,9,2,2023,9,4
3,-10,-216872.430106,3,0,4.333398,2022,9,2,2023,9,4
4,-5,-216146.310328,3,0,5.679687,2022,9,2,2023,9,4


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2024)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2743372, 10) (1175732, 10) (2743372,) (1175732,)


## Linear Regression模型及调参

调参参考链接 
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [10]:
lr = LinearRegression()
lr.fit(X_train,y_train)

param_grid = {'n_jobs':[2,10]}    
grid_search = GridSearchCV(lr, param_grid)    
grid_search.fit(X_train,y_train)
best_parameters = grid_search.best_estimator_.get_params()    
for para, val in list(best_parameters.items()):    
    print(para, val)
best_lr_model = LinearRegression(n_jobs=best_parameters['n_jobs']).fit(X_train,y_train)
preds1 = best_lr_model.predict(X_test)

print('LinearRegression Test r2:',best_lr_model.score(X_test,y_test))

copy_X True
fit_intercept True
n_jobs 2
positive False
LinearRegression Test r2: 0.09788330025932113


## Random Forest模型及调参

调参参考链接：
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [11]:
rf = RandomForestRegressor(min_samples_leaf = 0.5, random_state=0)
rf.fit(X_train,y_train)

param_grid = {'n_estimators':[2,5]}    
grid_search = GridSearchCV(rf, param_grid, n_jobs = 1, verbose=10)    
grid_search.fit(X_train,y_train)
best_parameters = grid_search.best_estimator_.get_params()    
for para, val in list(best_parameters.items()):    
    print(para, val)
best_rf_model = RandomForestRegressor( n_estimators=best_parameters['n_estimators'],min_samples_leaf = 0.5, random_state=0).fit(X_train,y_train)
preds2 = best_rf_model.predict(X_test)

print('RandomForestRegressor Test r2:',best_rf_model.score(X_test,y_test))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START n_estimators=2..............................................
[CV 1/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.8s
[CV 2/5; 1/2] START n_estimators=2..............................................
[CV 2/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.8s
[CV 3/5; 1/2] START n_estimators=2..............................................
[CV 3/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.8s
[CV 4/5; 1/2] START n_estimators=2..............................................
[CV 4/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.8s
[CV 5/5; 1/2] START n_estimators=2..............................................
[CV 5/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.7s
[CV 1/5; 2/2] START n_estimators=5..............................................
[CV 1/5; 2/2] END ..............n_estimators=5;, 

## XGBoost模型及调参

调参参考链接：
https://xgboost.readthedocs.io/en/latest/parameter.html

In [12]:
xgb = XGBRegressor(random_state=0)
xgb.fit(X_train,y_train)

param_grid = {'max_depth':[4,6],'min_child_weight':[1,2]}    
grid_search = GridSearchCV(xgb, param_grid, n_jobs = 1, verbose=10)    
grid_search.fit(X_train,y_train)
best_parameters = grid_search.best_estimator_.get_params()    
for para, val in list(best_parameters.items()):    
    print(para, val)
best_xgb_model = XGBRegressor(max_depth=best_parameters['max_depth'], 
                      min_child_weight=best_parameters['min_child_weight'],random_state=0).fit(X_train,y_train)
preds3 = best_xgb_model.predict(X_test)
print('XGB Test r2:',best_xgb_model.score(X_test,y_test))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 1/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.431 total time=  10.2s
[CV 2/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 2/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.431 total time=  10.6s
[CV 3/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 3/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.433 total time=  10.9s
[CV 4/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 4/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.431 total time=  11.3s
[CV 5/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 5/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.434 total time=  11.4s
[CV 1/5; 2/4] START max_depth=4, min_child_weight=2.............................
[CV 1/5; 2/4] END max_depth=4, min_chil

## Predict Vega

In [13]:
df = pd.read_csv('/Users/user/Documents/project/trade-price-ir-vegas.csv')

In [14]:
df.head()

Unnamed: 0,Value Date,Trade Name,Trade Currency,Zero Rate Shock,TV,Expiry Bucket,Expiry Date,Tenor Bucket,Vega
0,2022-09-02,dummyTrade1,USD,-100,-227907.098775,1y,2023-09-04,10y,1.962246
1,2022-09-02,dummyTrade1,USD,-50,-222208.400967,1y,2023-09-04,10y,-3.812341
2,2022-09-02,dummyTrade1,USD,-25,-218960.927995,1y,2023-09-04,10y,4.471006
3,2022-09-02,dummyTrade1,USD,-10,-216872.430106,1y,2023-09-04,10y,4.333398
4,2022-09-02,dummyTrade1,USD,-5,-216146.310328,1y,2023-09-04,10y,5.679687


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919104 entries, 0 to 3919103
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Value Date       object 
 1   Trade Name       object 
 2   Trade Currency   object 
 3   Zero Rate Shock  int64  
 4   TV               float64
 5   Expiry Bucket    object 
 6   Expiry Date      object 
 7   Tenor Bucket     object 
 8   Vega             float64
dtypes: float64(2), int64(1), object(6)
memory usage: 269.1+ MB


In [16]:
df['Value Date'] = pd.to_datetime(df['Value Date'])
df['Expiry Date'] = pd.to_datetime(df['Expiry Date'])
df= df.drop(['Trade Name', 'Trade Currency'],axis = 1)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919104 entries, 0 to 3919103
Data columns (total 7 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Value Date       datetime64[ns]
 1   Zero Rate Shock  int64         
 2   TV               float64       
 3   Expiry Bucket    object        
 4   Expiry Date      datetime64[ns]
 5   Tenor Bucket     object        
 6   Vega             float64       
dtypes: datetime64[ns](2), float64(2), int64(1), object(2)
memory usage: 209.3+ MB


Unnamed: 0,Value Date,Zero Rate Shock,TV,Expiry Bucket,Expiry Date,Tenor Bucket,Vega
0,2022-09-02,-100,-227907.098775,1y,2023-09-04,10y,1.962246
1,2022-09-02,-50,-222208.400967,1y,2023-09-04,10y,-3.812341
2,2022-09-02,-25,-218960.927995,1y,2023-09-04,10y,4.471006
3,2022-09-02,-10,-216872.430106,1y,2023-09-04,10y,4.333398
4,2022-09-02,-5,-216146.310328,1y,2023-09-04,10y,5.679687


In [17]:
df['Value Year'] = df['Value Date'].dt.year
df['Value Month'] = df['Value Date'].dt.month
df['Value Day'] = df['Value Date'].dt.day
df['Expiry Year'] = df['Expiry Date'].dt.year
df['Expiry Month'] = df['Expiry Date'].dt.month
df['Expiry Day'] = df['Expiry Date'].dt.day
df = df.drop(['Value Date', 'Expiry Date'],axis = 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919104 entries, 0 to 3919103
Data columns (total 11 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Zero Rate Shock  int64  
 1   TV               float64
 2   Expiry Bucket    object 
 3   Tenor Bucket     object 
 4   Vega             float64
 5   Value Year       int32  
 6   Value Month      int32  
 7   Value Day        int32  
 8   Expiry Year      int32  
 9   Expiry Month     int32  
 10  Expiry Day       int32  
dtypes: float64(2), int32(6), int64(1), object(2)
memory usage: 239.2+ MB


In [18]:
df.describe()

Unnamed: 0,Zero Rate Shock,TV,Vega,Value Year,Value Month,Value Day,Expiry Year,Expiry Month,Expiry Day
count,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0,3919104.0
mean,6.25,-347139.5,17.14511,2022.798,7.473214,15.6131,2028.311,7.314484,16.09689
std,51.36167,185618.0,2850.144,0.4700498,3.474293,8.903535,4.410479,3.474583,8.879236
min,-100.0,-717431.2,-1385180.0,2022.0,1.0,1.0,2023.0,1.0,1.0
25%,-13.75,-529189.0,-1.342167,2023.0,5.0,8.0,2025.0,4.0,8.0
50%,2.5,-224214.2,0.0001336,2023.0,8.0,15.0,2027.0,8.0,16.0
75%,31.25,-185713.7,7.386112,2023.0,10.0,23.0,2031.0,10.0,24.0
max,100.0,-29113.44,1110263.0,2024.0,12.0,31.0,2039.0,12.0,31.0


In [19]:
le = LabelEncoder()
df['Expiry Bucket'] = le.fit_transform(df['Expiry Bucket'])
df['Tenor Bucket'] = le.fit_transform(df['Tenor Bucket'])
X = df.drop('Vega',axis=1)
y = df['Vega']
df.head()

Unnamed: 0,Zero Rate Shock,TV,Expiry Bucket,Tenor Bucket,Vega,Value Year,Value Month,Value Day,Expiry Year,Expiry Month,Expiry Day
0,-100,-227907.098775,3,0,1.962246,2022,9,2,2023,9,4
1,-50,-222208.400967,3,0,-3.812341,2022,9,2,2023,9,4
2,-25,-218960.927995,3,0,4.471006,2022,9,2,2023,9,4
3,-10,-216872.430106,3,0,4.333398,2022,9,2,2023,9,4
4,-5,-216146.310328,3,0,5.679687,2022,9,2,2023,9,4


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2024)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2743372, 10) (1175732, 10) (2743372,) (1175732,)


## Linear Regression模型及调参

调参参考链接 
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [21]:
lr = LinearRegression()
lr.fit(X_train,y_train)

param_grid = {'n_jobs':[2,10]}    
grid_search = GridSearchCV(lr, param_grid)    
grid_search.fit(X_train,y_train)
best_parameters = grid_search.best_estimator_.get_params()    
for para, val in list(best_parameters.items()):    
    print(para, val)
best_lr_model = LinearRegression(n_jobs=best_parameters['n_jobs']).fit(X_train,y_train)
preds1 = best_lr_model.predict(X_test)

print('LinearRegression Test r2:',best_lr_model.score(X_test,y_test))

copy_X True
fit_intercept True
n_jobs 2
positive False
LinearRegression Test r2: 2.8990077477586418e-05


## Random Forest模型及调参

调参参考链接：
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [22]:
rf = RandomForestRegressor(min_samples_leaf = 0.5,random_state=0)
rf.fit(X_train,y_train)

param_grid = {'n_estimators':[2,5]}    
grid_search = GridSearchCV(rf, param_grid, n_jobs = 1, verbose=10)    
grid_search.fit(X_train,y_train)
best_parameters = grid_search.best_estimator_.get_params()    
for para, val in list(best_parameters.items()):    
    print(para, val)
best_rf_model = RandomForestRegressor( n_estimators=best_parameters['n_estimators'],min_samples_leaf = 0.5,random_state=0).fit(X_train,y_train)
preds2 = best_rf_model.predict(X_test)

print('RandomForestRegressor Test r2:',best_rf_model.score(X_test,y_test))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START n_estimators=2..............................................
[CV 1/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.8s
[CV 2/5; 1/2] START n_estimators=2..............................................
[CV 2/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.7s
[CV 3/5; 1/2] START n_estimators=2..............................................
[CV 3/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.8s
[CV 4/5; 1/2] START n_estimators=2..............................................
[CV 4/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.8s
[CV 5/5; 1/2] START n_estimators=2..............................................
[CV 5/5; 1/2] END ..............n_estimators=2;, score=-0.000 total time=   0.8s
[CV 1/5; 2/2] START n_estimators=5..............................................
[CV 1/5; 2/2] END ..............n_estimators=5;, 

## XGBoost模型及调参

调参参考链接：
https://xgboost.readthedocs.io/en/latest/parameter.html

In [23]:
xgb = XGBRegressor(random_state=0)
xgb.fit(X_train,y_train)

param_grid = {'max_depth':[4,6],'min_child_weight':[1,2]}    
grid_search = GridSearchCV(xgb, param_grid, n_jobs = 1, verbose=10)    
grid_search.fit(X_train,y_train)
best_parameters = grid_search.best_estimator_.get_params()    
for para, val in list(best_parameters.items()):    
    print(para, val)
best_xgb_model = XGBRegressor(max_depth=best_parameters['max_depth'], 
                      min_child_weight=best_parameters['min_child_weight'],random_state=0).fit(X_train,y_train)
preds3 = best_xgb_model.predict(X_test)
print('XGB Test r2:',best_xgb_model.score(X_test,y_test))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 1/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.011 total time=   9.9s
[CV 2/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 2/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.013 total time=  10.4s
[CV 3/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 3/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.014 total time=  10.9s
[CV 4/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 4/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.015 total time=  11.0s
[CV 5/5; 1/4] START max_depth=4, min_child_weight=1.............................
[CV 5/5; 1/4] END max_depth=4, min_child_weight=1;, score=0.012 total time=  11.2s
[CV 1/5; 2/4] START max_depth=4, min_child_weight=2.............................
[CV 1/5; 2/4] END max_depth=4, min_chil

__________________________________