In [None]:
import pandas as pd
import numpy as np

from google.colab import drive

# Mount Google Drive at /content/drive; the CSV loaded later in this notebook
# is read from a path under that mount point.
# This will prompt you to authenticate and grant access to your Drive
drive.mount('/content/drive')

import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning issued
# through the `warnings` module is hidden for the rest of the session (library
# deprecation notices, pandas/scikit-learn warnings, ...). Consider narrowing it
# to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE, mutual_info_classif

import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
# Read insample.csv from the mounted Drive folder and preview the first rows.
INSAMPLE_CSV = '/content/drive/MyDrive/FX/insample.csv'
df1 = pd.read_csv(INSAMPLE_CSV)
df1.head()

Unnamed: 0,Time,Bond,Side,Notional,Counterparty,MidPrice,QuotedPrice,Competitors,Traded,nextMidPrice
0,5,US Treasury 5Y,ASK,500000,SniperFund,98.629,98.717,7,MISSED,98.686
1,14,US Treasury 5Y,BID,2500000,SleepyManager,98.502,98.443,5,DONE,98.419
2,21,US Treasury 3Y,BID,100000,RelativeValueStrategies,102.08,102.027,6,MISSED,101.971
3,26,US Treasury 10Y,BID,5000000,HF-Fortress,97.753,97.651,3,MISSED,97.788
4,31,US Treasury 3Y,ASK,100000,TankerAssetManagement,101.279,101.398,3,MISSED,101.374


In [None]:
# Distinct bond names present in the data, in order of first appearance.
pd.unique(df1['Bond'])

array(['US Treasury 5Y', 'US Treasury 3Y', 'US Treasury 10Y',
       'US Treasury 30Y', 'US Treasury 2Y'], dtype=object)

In [None]:
# Remove the columns that are not used as model inputs.
# errors='ignore' makes this cell safe to re-run: df1 is reassigned in place, so
# on a second execution the columns are already gone and a plain drop() would
# raise KeyError.
df1 = df1.drop(columns=['Time', 'nextMidPrice'], errors='ignore')

In [None]:
# Per-bond workflow: encode the categorical columns, rank the candidate features
# using the training split only, then fit several baseline classifiers on the
# selected features and score them on the held-out split.
for bond in df1['Bond'].unique():
    # Label the output so each bond's results can be told apart.
    print(f"===== {bond} =====")

    # Work on an independent copy: assigning columns into a filtered slice of
    # df1 is chained assignment (SettingWithCopyWarning) and is not guaranteed
    # to behave as intended.
    df = df1[df1['Bond'] == bond].copy()

    # Step 1: Preprocess the data

    # Target encoding: "DONE" -> 1, "MISSED" -> 0.
    # An explicit mapping is used because LabelEncoder assigns codes in sorted
    # label order, which would yield DONE=0 / MISSED=1.
    traded_codes = df['Traded'].map({'MISSED': 0, 'DONE': 1})
    if traded_codes.isna().any():
        unexpected = sorted(df.loc[traded_codes.isna(), 'Traded'].astype(str).unique())
        raise ValueError(f"Unexpected 'Traded' labels for {bond}: {unexpected}")
    df['Traded'] = traded_codes.astype(int)

    # Remaining categorical columns are converted to integer codes.
    df['Side'] = LabelEncoder().fit_transform(df['Side'])
    df['Counterparty'] = LabelEncoder().fit_transform(df['Counterparty'])

    # All candidate features (X) and the target (y)
    X = df.drop(columns=['Traded', 'Bond'])
    y = df['Traded']

    # Split BEFORE feature selection so the held-out rows cannot influence which
    # features are chosen. Same test_size / random_state as the final split used
    # previously, so the same rows end up in the test set.
    X_train_all, X_test_all, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Standardize the training features (used by RFE and mutual information)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_all)

    # Step 2a: Recursive Feature Elimination (RFE) with Logistic Regression
    rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=5, step=1)
    rfe_selector = rfe_selector.fit(X_train_scaled, y_train)
    rfe_support = rfe_selector.get_support()
    rfe_features = X.columns[rfe_support]
    print("Top features selected by RFE:", list(rfe_features))

    # Step 2b: Feature importance from a Random Forest (fitted on unscaled data)
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train_all, y_train)
    importance = rf.feature_importances_
    rf_importances = pd.Series(importance, index=X.columns).sort_values(ascending=False)
    print("Feature importances from Random Forest:\n", rf_importances)

    # Step 2c: Mutual Information (seeded: the estimator is randomized)
    mutual_info = mutual_info_classif(X_train_scaled, y_train, random_state=42)
    mi_importances = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)
    print("Mutual Information scores:\n", mi_importances)

    # Step 3: Combine the three methods
    # Convert each method's scores to ranks (1 = best); the Series are aligned
    # on feature name when they are added together.
    rfe_ranking = pd.Series(rfe_selector.ranking_, index=X.columns).rank().sort_values()
    rf_ranking = rf_importances.rank(ascending=False)
    mi_ranking = mi_importances.rank(ascending=False)

    # Average ranking
    combined_ranking = (rfe_ranking + rf_ranking + mi_ranking) / 3
    combined_ranking = combined_ranking.sort_values()
    print("Combined Feature Ranking:\n", combined_ranking)

    # Keep the five best-ranked features
    top_features = combined_ranking.index[:5]
    print("Top selected features based on combined ranking:", list(top_features))

    # Step 4: Restrict features to the selected ones, reusing the split made above
    X = df[list(top_features)]
    X_train = X_train_all[list(top_features)]
    X_test = X_test_all[list(top_features)]

    # Initialize models with basic configurations (fresh instances for every bond)
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "CatBoost": CatBoostClassifier(silent=True, random_state=42),
    }

    # Train and evaluate each model
    for model_name, model in models.items():
        print(f"Training and evaluating {model_name}...")

        # Scaling lives inside the pipeline so it is fitted on the training rows only
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', model)
        ])

        # Train the model
        pipeline.fit(X_train, y_train)

        # Predict on test data
        y_pred = pipeline.predict(X_test)

        # Evaluate the model
        print(f"Test Evaluation for {model_name}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))
        print("="*60)


Top features selected by RFE: ['Side', 'Counterparty', 'MidPrice', 'QuotedPrice', 'Competitors']
Feature importances from Random Forest:
 MidPrice        0.260084
Competitors     0.238334
QuotedPrice     0.234491
Notional        0.115909
Counterparty    0.109558
Side            0.041624
dtype: float64
Mutual Information scores:
 Competitors     0.073986
Side            0.057513
MidPrice        0.039231
QuotedPrice     0.017111
Notional        0.000000
Counterparty    0.000000
dtype: float64
Combined Feature Ranking:
 Competitors     2.000000
MidPrice        2.333333
QuotedPrice     3.333333
Side            3.666667
Counterparty    4.500000
Notional        5.166667
dtype: float64
Top selected features based on combined ranking: ['Competitors', 'MidPrice', 'QuotedPrice', 'Side', 'Counterparty']
Training and evaluating Logistic Regression...
Test Evaluation for Logistic Regression
Accuracy: 0.7906976744186046
Confusion Matrix:
 [[ 8  4]
 [ 5 26]]
Classification Report:
               prec

In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def _classifier_grid(**options):
    """Return a grid whose keys carry the ``classifier__`` prefix.

    The prefix addresses the pipeline step named 'classifier', which is how the
    grids are consumed when a pipeline is tuned with GridSearchCV.
    """
    return {f"classifier__{option}": candidates for option, candidates in options.items()}


# Hyper-parameter search space per model, keyed by the model's display name.
param_grids = {
    "Logistic Regression": _classifier_grid(
        C=[0.01, 0.1, 1, 10, 100],
        penalty=['l2'],
        solver=['lbfgs', 'liblinear'],
    ),
    "Random Forest": _classifier_grid(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    "Gradient Boosting": _classifier_grid(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
        min_samples_split=[2, 5, 10],
    ),
    "XGBoost": _classifier_grid(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
        subsample=[0.8, 1.0],
    ),
    "CatBoost": _classifier_grid(
        iterations=[100, 200, 300],
        learning_rate=[0.01, 0.1, 0.2],
        depth=[4, 6, 8],
    ),
    "LightGBM": _classifier_grid(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[-1, 10, 20],
        num_leaves=[31, 50, 100],
    ),
}

# Untuned estimators, registered one by one under their display names.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)
models["XGBoost"] = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
models["CatBoost"] = CatBoostClassifier(silent=True, random_state=42)
models["LightGBM"] = LGBMClassifier(random_state=42)

# Define features (X) and target (y).
# 'nextMidPrice' is intentionally NOT a feature:
#   * it is dropped from df1 earlier in the notebook, so selecting it here
#     raises KeyError on a clean top-to-bottom run;
#   * presumably it is the mid price observed after the quote, which would make
#     it look-ahead information unavailable at prediction time -- TODO confirm.
# NOTE(review): `df` is not created in this cell. On a top-to-bottom run it is
# whatever the per-bond loop above left behind (the last bond only); confirm
# that this is the data the tuning is meant to use.
X = df[['Competitors', 'QuotedPrice', 'MidPrice']]
y = df['Traded']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tune every model with an exhaustive grid search and keep the winning pipeline.
best_models = {}
for model_name, model in models.items():
    print(f"Tuning parameters for {model_name}...")

    # Scaler + estimator; the step name 'classifier' matches the prefix used in
    # the parameter grids.
    pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('classifier', model)])

    # 5-fold cross-validated search scored on accuracy, using all available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[model_name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Remember the best pipeline found for this model.
    best_models[model_name] = grid_search.best_estimator_

    # Report the winning configuration and its mean cross-validation score.
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_}")
    print("="*60)

# Score each tuned pipeline on the held-out test split.
for model_name, best_model in best_models.items():
    y_pred = best_model.predict(X_test)
    print(f"Test Evaluation for {model_name}")
    # Each metric is printed as "<label> <value>", in the order listed here.
    for label, metric in (
        ("Accuracy:", accuracy_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    ):
        print(label, metric(y_test, y_pred))
    print("="*60)


Tuning parameters for Logistic Regression...
Best parameters for Logistic Regression: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best cross-validation accuracy for Logistic Regression: 0.7474999999999999
Tuning parameters for Random Forest...
Best parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
Best cross-validation accuracy for Random Forest: 0.7150000000000001
Tuning parameters for Gradient Boosting...
Best parameters for Gradient Boosting: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best cross-validation accuracy for Gradient Boosting: 0.73
Tuning parameters for XGBoost...
Best parameters for XGBoost: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 5, 'classifier__n_estimators': 200, 'classifier__subsample': 1.0}
Best cross-va