In [5]:
import os, sys; sys.path.insert(0, os.path.join(sys.path[0], '..'))
import pandas as pd

# Load the transformed dataset and derive `data` from it by removing the
# "Mean_Vega" column. The CSV is looked up first under ../assets and, if that
# file does not exist, under ./assets.
print("Read the dataset ... ", end="")
try:
    data_transformed = pd.read_csv(os.path.join("..", "assets", "final_ubs_data.csv"))
except FileNotFoundError:
    # Fall back only when the first path is missing. Any other failure (parse
    # error, permission problem, keyboard interrupt) is a real problem and must
    # surface instead of being masked by a second read attempt.
    data_transformed = pd.read_csv(os.path.join("assets", "final_ubs_data.csv"))
columns = data_transformed.columns
columns_filtered = [column for column in columns if column != "Mean_Vega"]
data = data_transformed[columns_filtered]  # Convert the transformed file to the merged file
print("DONE")

Read the dataset ... DONE


In [4]:
from IPython.display import display
# Show the first five rows of the loaded dataset as a rich table.
display(data.head(n=5))

Unnamed: 0.1,Unnamed: 0,Value Date,Trade Name,Trade Currency,Zero Rate Shock,TV,Expiry Bucket,Expiry Date,Tenor Bucket,Vega,...,atm-0.5%,atm,atm+0.5%,atm+1.0%,Swap Rate,underlying,pay_frequency,maturity,lower_bound,upper_bound
0,1,2022-09-02,dummyTrade1,USD,-100,-227907.098775,1y,2023-09-04,10y,1.962246,...,0.209405,0.177525,0.21682,0.274552,2.737709,USD: CMS:2Y,6M,5Y,0.0042,0.0379
1,2,2022-09-02,dummyTrade1,USD,-50,-222208.400967,1y,2023-09-04,10y,-3.812341,...,0.209405,0.177525,0.21682,0.274552,2.737709,USD: CMS:2Y,6M,5Y,0.0042,0.0379
2,3,2022-09-02,dummyTrade1,USD,-25,-218960.927995,1y,2023-09-04,10y,4.471006,...,0.209405,0.177525,0.21682,0.274552,2.737709,USD: CMS:2Y,6M,5Y,0.0042,0.0379
3,4,2022-09-02,dummyTrade1,USD,-10,-216872.430106,1y,2023-09-04,10y,4.333398,...,0.209405,0.177525,0.21682,0.274552,2.737709,USD: CMS:2Y,6M,5Y,0.0042,0.0379
4,5,2022-09-02,dummyTrade1,USD,-5,-216146.310328,1y,2023-09-04,10y,5.679687,...,0.209405,0.177525,0.21682,0.274552,2.737709,USD: CMS:2Y,6M,5Y,0.0042,0.0379


## Install packages

In [None]:
!pip install pandas --quiet
import pandas

In [None]:
!pip install matplotlib --quiet
import matplotlib

In [9]:
!pip install xgboost --quiet
import xgboost

In [8]:
!pip install scikit-learn --quiet  # sklearn is depreciated
import sklearn

## Total Value

In [6]:
import xgboost as xgb

# The training method. 'hist' is XGBoost's histogram-based tree builder on CPU;
# 'cpu_hist' is not an accepted value for the `tree_method` parameter.
tree_method = 'hist'  # 'gpu_hist'

# The callback for each epoch to show the progress
class InformEpochCallback(xgb.callback.TrainingCallback):
    """Print training progress every ``print_every`` boosting rounds.

    Parameters
    ----------
    print_every : int, default 1
        Report after every ``print_every``-th round (rounds are counted from 1).
    """

    def __init__(self, print_every=1):
        super().__init__()
        self.print_every = print_every

    def after_iteration(self, model, epoch, evals_log):
        """Report the latest 'train' metric, if any, and let training continue.

        ``evals_log`` maps each evaluation-set name to a dict of
        ``{metric_name: [value per round]}``. It only contains a 'train' entry
        when the training call registered an evaluation set under that name,
        so the lookup must tolerate its absence.

        Returns
        -------
        bool
            Always ``False``. XGBoost interprets ``True`` as a request to stop
            training, which would end the run after the first round.
        """
        if (epoch + 1) % self.print_every == 0:
            train_log = evals_log.get('train', {})
            if train_log:
                # Use the first recorded metric and its most recent value.
                history = next(iter(train_log.values()))
                print(f"Epoch: {epoch + 1}, Train loss: {history[-1]}")
            else:
                # No evaluation results available: still show progress.
                print(f"Epoch: {epoch + 1}")
        return False

## Target Value Model: Set 1

Independent columns (every selected column except the target `TV`) include:

- ```Value Date```

- ```Trade Name```

- ```Zero Rate Shock``` (ZRS)

- ```Expiry Bucket```

- ```Tenor Bucket```

- ```Vega```

- ```atm```

In [8]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Build the Set 1 feature matrix from the already-loaded `data` frame, then
# train and evaluate an XGBoost regressor that predicts TV.

print ("Process the data ... ", end = "")

# Select columns
selected_columns = ['Value Date', 'Trade Name', 'Zero Rate Shock', 'TV', 'Expiry Bucket', 'Tenor Bucket', 'Vega', 'atm']
data_selected = data[selected_columns]

# Convert categorical columns to numeric using get_dummies
data_processed = pd.get_dummies(data_selected, columns=['Trade Name', 'Expiry Bucket', 'Tenor Bucket', 'Value Date'])

# Define the feature set (X) and target variable (y)
X = data_processed.drop('TV', axis=1)  # Drop the target column to create the feature set
y = data_processed['TV']               # Target variable

print ("DONE")

# Split data into train and test sets (60% train, 40% test)
print ("Splitting the data ... ", end = "")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)  # random_state for reproducibility
print ("DONE")

# Create DMatrix for XGBoost
print ("Constructing training set ... ", end = "")
dtrain = xgb.DMatrix(X_train, label=y_train)
print ("DONE")
print ("Constructing testing set ... ", end = "")
dtest = xgb.DMatrix(X_test, label=y_test)
print ("DONE")

# Set parameters for XGBoost
params = {
    # NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost build; confirm this is
    # the intended method for the machine running the notebook.
    'tree_method': 'gpu_hist',
    'objective': 'reg:squarederror',
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'rmse'
}

# Train the model
num_rounds = 1  # 100
print ("Training the model ... ", end = "")
inform_epoch_callback = InformEpochCallback(print_every=1)
# Register the training set under the name 'train' so that per-round results
# are recorded in `evals_log['train']`, which the progress callback reads.
# verbose_eval=False leaves the progress reporting to the callback alone.
bst = xgb.train(params, dtrain, num_rounds,
                evals = [(dtrain, 'train')],
                verbose_eval = False,
                callbacks = [inform_epoch_callback])
print ("DONE")

# Predictions and evaluate
predictions = bst.predict(dtest)
rmse = ((predictions - y_test) ** 2).mean() ** 0.5  # Calculating RMSE
print(f'RMSE on test set: {rmse}')

Process the data ... DONE
Splitting the data ... DONE
Constructing training set ... 

: 

### Visualize the Distribution of the Target (TV)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Plot and summarise the distribution of the target column 'TV'.
# Only 'TV' is needed here. One-hot encoding leaves that column untouched, so
# it is read straight from `data` instead of re-encoding the whole frame.
y = data['TV']  # Target variable

# Now, plot the distribution of 'TV'
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y, bins=50, alpha=0.75)
ax.set_title('Distribution of TV')
ax.set_xlabel('TV')
ax.set_ylabel('Frequency')
plt.show()

# Also, print some statistics
print("Minimum TV:", y.min())
print("Maximum TV:", y.max())
print("Mean TV:", y.mean())
print("Standard Deviation of TV:", y.std())

## Target Value Model: Set 2

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score
import matplotlib.pyplot as plt

# Set 2 features: the Set 1 columns plus the atm-offset columns, 'Swap Rate',
# 'pay_frequency', 'maturity' and the lower/upper bounds.
selected_columns = [
    'Value Date', 'Trade Name', 'Zero Rate Shock', 'TV', 'Expiry Bucket',
    'Tenor Bucket', 'Vega', 'atm-1.0%', 'atm-0.5%', 'atm', 'atm+0.5%',
    'atm+1.0%', 'Swap Rate', 'pay_frequency', 'maturity', 'lower_bound',
    'upper_bound',
]
data_selected = data[selected_columns]

# One-hot encode the listed columns; all other columns pass through unchanged.
data_processed = pd.get_dummies(
    data_selected,
    columns=['Trade Name', 'Expiry Bucket', 'Tenor Bucket', 'Value Date', 'pay_frequency', 'maturity'],
)

# 'TV' is the regression target; every remaining column is a feature.
y = data_processed['TV']
X = data_processed.drop(columns='TV')

# Hold out 40% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Wrap both splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster settings
params = dict(
    tree_method='gpu_hist',
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='rmse',
)

# Fit the booster for a fixed number of rounds.
num_rounds = 200
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Score the held-out rows.
predictions = bst.predict(dtest)

# Error metrics on the test split.
squared_error = (predictions - y_test) ** 2
rmse = squared_error.mean() ** 0.5
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
explained_variance = explained_variance_score(y_test, predictions)

# Report the metrics.
print(f"RMSE: {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")
print(f"Explained Variance Score: {explained_variance}")

# Residuals (actual minus predicted) against the actual values, with a zero
# reference line.
residuals = y_test - predictions
plt.figure(figsize=(10, 6))
plt.scatter(y_test, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residuals vs. Actual Values')
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.show()

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score
import matplotlib.pyplot as plt

# Set 2 features: the Set 1 columns plus the atm-offset columns, 'Swap Rate',
# 'pay_frequency', 'maturity' and the lower/upper bounds.
selected_columns = [
    'Value Date', 'Trade Name', 'Zero Rate Shock', 'TV', 'Expiry Bucket',
    'Tenor Bucket', 'Vega', 'atm-1.0%', 'atm-0.5%', 'atm', 'atm+0.5%',
    'atm+1.0%', 'Swap Rate', 'pay_frequency', 'maturity', 'lower_bound',
    'upper_bound',
]
data_selected = data[selected_columns]

# One-hot encode the listed columns; all other columns pass through unchanged.
data_processed = pd.get_dummies(
    data_selected,
    columns=['Trade Name', 'Expiry Bucket', 'Tenor Bucket', 'Value Date', 'pay_frequency', 'maturity'],
)

# 'TV' is the regression target; every remaining column is a feature.
y = data_processed['TV']
X = data_processed.drop(columns='TV')

# Hold out 40% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Wrap both splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster settings
params = dict(
    tree_method='gpu_hist',
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='rmse',
)

# Fit the booster for a fixed number of rounds.
num_rounds = 1  # 450
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Score the held-out rows.
predictions = bst.predict(dtest)

# Error metrics on the test split.
squared_error = (predictions - y_test) ** 2
rmse = squared_error.mean() ** 0.5
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
explained_variance = explained_variance_score(y_test, predictions)

# Report the metrics.
print(f"RMSE: {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")
print(f"Explained Variance Score: {explained_variance}")

# Residuals (actual minus predicted) against the actual values, with a zero
# reference line.
residuals = y_test - predictions
plt.figure(figsize=(10, 6))
plt.scatter(y_test, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residuals vs. Actual Values')
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.show()

## Vega

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score
import matplotlib.pyplot as plt

# Same column selection as the Set 2 TV model; here 'Vega' is the target and
# 'TV' stays in the feature set.
selected_columns = [
    'Value Date', 'Trade Name', 'Zero Rate Shock', 'TV', 'Expiry Bucket',
    'Tenor Bucket', 'Vega', 'atm-1.0%', 'atm-0.5%', 'atm', 'atm+0.5%',
    'atm+1.0%', 'Swap Rate', 'pay_frequency', 'maturity', 'lower_bound',
    'upper_bound',
]
data_selected = data[selected_columns]

# One-hot encode the listed columns; all other columns pass through unchanged.
data_processed = pd.get_dummies(
    data_selected,
    columns=['Trade Name', 'Expiry Bucket', 'Tenor Bucket', 'Value Date', 'pay_frequency', 'maturity'],
)

# 'Vega' is the regression target; every remaining column is a feature.
y = data_processed['Vega']
X = data_processed.drop(columns='Vega')

# Hold out 40% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Wrap both splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster settings
params = dict(
    tree_method='gpu_hist',
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='rmse',
)

# Fit the booster for a fixed number of rounds.
num_rounds = 1  # 200
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Score the held-out rows.
predictions = bst.predict(dtest)

# Error metrics on the test split.
squared_error = (predictions - y_test) ** 2
rmse = squared_error.mean() ** 0.5
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
explained_variance = explained_variance_score(y_test, predictions)

# Report the metrics.
print(f"RMSE: {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")
print(f"Explained Variance Score: {explained_variance}")

# Residuals (actual minus predicted) against the actual values, with a zero
# reference line.
residuals = y_test - predictions
plt.figure(figsize=(10, 6))
plt.scatter(y_test, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residuals vs. Actual Values')
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.show()

### Feature Engineering File