In [None]:
!pip install yfinance

In [1]:
import pandas as pd
import yfinance as yf

In [None]:
# Mount Google Drive at /content/drive so the project folder can be reached.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/IEOR4571 Final Project

/content/drive/MyDrive/IEOR4571 Final Project


# Download Data

In [2]:
# Take all previous trading history
start_date = '2006-05-22'
end_date = '2024-12-05'

# Download daily GDX bars from Yahoo Finance.
# auto_adjust is pinned to False so the raw (unadjusted) prices are returned
# regardless of the installed yfinance version: newer releases default to
# auto_adjust=True, which changes the Close values and omits 'Adj Close'.
data = yf.download('GDX', start=start_date, end=end_date, interval='1d',
                   auto_adjust=False, multi_level_index=False, progress=False)

# Move the date index into a column and keep only the price columns, selected
# by name and in a fixed order. Later cells slice columns by position, so the
# order must not depend on what the downloader happens to return; a missing
# column fails loudly here instead of silently shifting the features.
price_columns = ['Date', 'Close', 'High', 'Low', 'Open']
data = data.reset_index()[price_columns].copy()

# Preview the first rows
data.head()

Unnamed: 0,Date,Close,High,Low,Open
0,2006-05-22,37.23,37.290001,35.869999,36.52
1,2006-05-23,37.959999,39.220001,37.75,37.75
2,2006-05-24,36.52,37.57,35.869999,37.130001
3,2006-05-25,38.32,38.32,36.98,37.18
4,2006-05-26,38.549999,38.740002,37.77,38.740002


# Generate Buy Signal and Calculate Daily P&L

In [3]:
def create_conditions(data, alpha, beta):
    """
    Flag, for every row, whether the stop loss or the take profit level was breached.

    The stop loss level is Open * (1 - alpha) and the take profit level is
    Open * (1 + beta). A row's 'stop_loss' is True when its 'Low' is strictly
    below the stop loss level, and its 'take_profit' is True when its 'High'
    is strictly above the take profit level.

    Parameters:
    - data (DataFrame): DataFrame with columns 'Open', 'High', 'Low'.
    - alpha (float): Stop loss percentage.
    - beta (float): Take profit percentage.

    Returns:
    - DataFrame: The same DataFrame object that was passed in; the
      'stop_loss' and 'take_profit' columns are written onto it in place.
    """
    open_price = data['Open']

    # Price levels that trigger each exit
    stop_level = open_price * (1 - alpha)
    target_level = open_price * (1 + beta)

    # Written in this order so 'stop_loss' is inserted before 'take_profit'
    data['stop_loss'] = data['Low'].lt(stop_level)
    data['take_profit'] = data['High'].gt(target_level)

    return data

In [4]:
def calculate_pl(row, alpha, beta):
    """
    Compute one day's profit and loss (P&L) for a position entered at the open.

    The take profit flag is checked first, so a row with both flags set is
    booked as a take profit.

    Parameters:
    - row (Series): A row providing 'Open', 'Close', 'take_profit' and 'stop_loss'.
    - alpha (float): Stop loss percentage.
    - beta (float): Take profit percentage.

    Returns:
    - float: Open * beta when 'take_profit' is set, -Open * alpha when only
      'stop_loss' is set, otherwise Close - Open.
    """
    entry_price = row['Open']

    # Take profit limit reached
    if row['take_profit']:
        return entry_price * beta

    # Stop loss limit reached
    if row['stop_loss']:
        return -entry_price * alpha

    # Neither limit reached: the position is held to the close
    return row['Close'] - entry_price

# Search for Alpha and Beta

## Logistic Regression

### `or` strategy

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name instead of by position (iloc[:, 1:5]) so the
# run does not depend on the column order of the downloaded frame.
# NOTE(review): these are the same-day prices that create_conditions and
# calculate_pl use to build the label, so the classifier is given values that
# are not known at the open -- confirm whether lagged features were intended.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Loop-invariant settings, hoisted out of the alpha/beta loop
param_grid = [
    {'solver': ['lbfgs', 'liblinear'],
     'penalty': ['l2'],
     'C': [0.01, 0.1, 1, 10, 100],
     'class_weight': [None, 'balanced']}
]
thresholds = np.linspace(0.1, 0.9, 9)  # candidate probability cut-offs

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `or` strategy: label a day 1 when either limit was hit (vectorised)
        data['buy_signal'] = (data['take_profit'] | data['stop_loss']).astype(int)

        # Separate the label by name rather than relying on it being the last column
        X = data.drop(columns='buy_signal')
        y = data['buy_signal']

        # Chronological 60/20/20 train/validation/test split. No random_state is
        # passed because it has no effect when shuffle=False.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Grid search with 5-fold cross-validation, optimising AUC. Train scores
        # are not requested because nothing below reads them.
        grid_search = GridSearchCV(
            estimator=LogisticRegression(random_state=42),
            param_grid=param_grid,
            cv=5,
            scoring='roc_auc'
        )
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True, GridSearchCV has already refit the best
        # parameter combination on the full training data, so a second manual
        # fit is not needed.
        logreg_best = grid_search.best_estimator_

        # Pick the probability threshold that maximises total P&L on the validation set
        y_val_prob_best = logreg_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Sum daily_pl over the days the model would have traded; the split
            # frame itself is left unmodified.
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = logreg_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,10.546602
1,0.01,0.03,-47.798318
2,0.01,0.05,-125.0762
3,0.01,0.07,-130.5155
4,0.01,0.09,-136.7512
5,0.03,0.01,79.937301
6,0.03,0.03,1.0836
7,0.03,0.05,-26.826802
8,0.03,0.07,-37.872202
9,0.03,0.09,-45.040402


### `Daily P&L` strategy

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name instead of by position (iloc[:, 1:5]) so the
# run does not depend on the column order of the downloaded frame.
# NOTE(review): these are the same-day prices that create_conditions and
# calculate_pl use to build the label, so the classifier is given values that
# are not known at the open -- confirm whether lagged features were intended.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Loop-invariant settings, hoisted out of the alpha/beta loop
param_grid = [
    {'solver': ['lbfgs', 'liblinear'],
     'penalty': ['l2'],
     'C': [0.01, 0.1, 1, 10, 100],
     'class_weight': [None, 'balanced']}
]
thresholds = np.linspace(0.1, 0.9, 9)  # candidate probability cut-offs

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `Daily P&L` strategy: label a day 1 when its P&L is positive (vectorised)
        data['buy_signal'] = (data['daily_pl'] > 0).astype(int)

        # Separate the label by name rather than relying on it being the last column
        X = data.drop(columns='buy_signal')
        y = data['buy_signal']

        # Chronological 60/20/20 train/validation/test split. No random_state is
        # passed because it has no effect when shuffle=False.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Grid search with 5-fold cross-validation, optimising AUC. Train scores
        # are not requested because nothing below reads them.
        grid_search = GridSearchCV(
            estimator=LogisticRegression(random_state=42),
            param_grid=param_grid,
            cv=5,
            scoring='roc_auc'
        )
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True, GridSearchCV has already refit the best
        # parameter combination on the full training data, so a second manual
        # fit is not needed.
        logreg_best = grid_search.best_estimator_

        # Pick the probability threshold that maximises total P&L on the validation set
        y_val_prob_best = logreg_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Sum daily_pl over the days the model would have traded; the split
            # frame itself is left unmodified.
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = logreg_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,128.202604
1,0.01,0.03,148.656987
2,0.01,0.05,151.030891
3,0.01,0.07,152.34009
4,0.01,0.09,152.340793
5,0.03,0.01,135.150803
6,0.03,0.03,175.072484
7,0.03,0.05,179.070505
8,0.03,0.07,180.086708
9,0.03,0.09,179.600008


### `if not` strategy

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name instead of by position (iloc[:, 1:5]) so the
# run does not depend on the column order of the downloaded frame.
# NOTE(review): these are the same-day prices that create_conditions and
# calculate_pl use to build the label, so the classifier is given values that
# are not known at the open -- confirm whether lagged features were intended.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Loop-invariant settings, hoisted out of the alpha/beta loop
param_grid = [
    {'solver': ['lbfgs', 'liblinear'],
     'penalty': ['l2'],
     'C': [0.01, 0.1, 1, 10, 100],
     'class_weight': [None, 'balanced']}
]
thresholds = np.linspace(0.1, 0.9, 9)  # candidate probability cut-offs

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `if not` strategy: label a day 1 when the stop loss was not hit (vectorised)
        data['buy_signal'] = (~data['stop_loss']).astype(int)

        # Separate the label by name rather than relying on it being the last column
        X = data.drop(columns='buy_signal')
        y = data['buy_signal']

        # Chronological 60/20/20 train/validation/test split. No random_state is
        # passed because it has no effect when shuffle=False.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Grid search with 5-fold cross-validation, optimising AUC. Train scores
        # are not requested because nothing below reads them.
        grid_search = GridSearchCV(
            estimator=LogisticRegression(random_state=42),
            param_grid=param_grid,
            cv=5,
            scoring='roc_auc'
        )
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True, GridSearchCV has already refit the best
        # parameter combination on the full training data, so a second manual
        # fit is not needed.
        logreg_best = grid_search.best_estimator_

        # Pick the probability threshold that maximises total P&L on the validation set
        y_val_prob_best = logreg_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Sum daily_pl over the days the model would have traded; the split
            # frame itself is left unmodified.
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = logreg_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,104.40061
1,0.01,0.03,145.474292
2,0.01,0.05,149.300491
3,0.01,0.07,150.316694
4,0.01,0.09,149.829994
5,0.03,0.01,35.398613
6,0.03,0.03,53.780182
7,0.03,0.05,54.040485
8,0.03,0.07,55.056688
9,0.03,0.09,54.569988


## KNN

### `or` strategy

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name instead of by position (iloc[:, 1:5]) so the
# run does not depend on the column order of the downloaded frame.
# NOTE(review): these are the same-day prices that create_conditions and
# calculate_pl use to build the label, so the classifier is given values that
# are not known at the open -- confirm whether lagged features were intended.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Loop-invariant settings, hoisted out of the alpha/beta loop
param_grid = [
    {'n_neighbors': [3, 5, 7, 10, 15],  # Different values for the number of neighbors
     'weights': ['uniform', 'distance'],  # Uniform weights or distance-based weights
     'metric': ['euclidean', 'manhattan']}  # Distance metrics to use for neighbors
]
thresholds = np.linspace(0.1, 0.9, 9)  # candidate probability cut-offs

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `or` strategy: label a day 1 when either limit was hit (vectorised)
        data['buy_signal'] = (data['take_profit'] | data['stop_loss']).astype(int)

        # Separate the label by name rather than relying on it being the last column
        X = data.drop(columns='buy_signal')
        y = data['buy_signal']

        # Chronological 60/20/20 train/validation/test split. No random_state is
        # passed because it has no effect when shuffle=False.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Grid search with 5-fold cross-validation, optimising AUC. Train scores
        # are not requested because nothing below reads them.
        grid_search = GridSearchCV(
            estimator=KNeighborsClassifier(),
            param_grid=param_grid,
            cv=5,
            scoring='roc_auc'
        )
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True, GridSearchCV has already refit the best
        # parameter combination on the full training data, so a second manual
        # fit is not needed.
        knn_best = grid_search.best_estimator_

        # Pick the probability threshold that maximises total P&L on the validation set
        y_val_prob_best = knn_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Sum daily_pl over the days the model would have traded; the split
            # frame itself is left unmodified.
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = knn_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,9.081602
1,0.01,0.03,-15.344605
2,0.01,0.05,-35.207306
3,0.01,0.07,-36.681104
4,0.01,0.09,-37.167804
5,0.03,0.01,50.488116
6,0.03,0.03,-1.6179
7,0.03,0.05,-2.9604
8,0.03,0.07,-2.9604
9,0.03,0.09,-7.4055


### `Daily P&L` strategy

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name instead of by position (iloc[:, 1:5]) so the
# run does not depend on the column order of the downloaded frame.
# NOTE(review): these are the same-day prices that create_conditions and
# calculate_pl use to build the label, so the classifier is given values that
# are not known at the open -- confirm whether lagged features were intended.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Loop-invariant settings, hoisted out of the alpha/beta loop
param_grid = [
    {'n_neighbors': [3, 5, 7, 10, 15],  # Different values for the number of neighbors
     'weights': ['uniform', 'distance'],  # Uniform weights or distance-based weights
     'metric': ['euclidean', 'manhattan']}  # Distance metrics to use for neighbors
]
thresholds = np.linspace(0.1, 0.9, 9)  # candidate probability cut-offs

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `Daily P&L` strategy: label a day 1 when its P&L is positive (vectorised)
        data['buy_signal'] = (data['daily_pl'] > 0).astype(int)

        # Separate the label by name rather than relying on it being the last column
        X = data.drop(columns='buy_signal')
        y = data['buy_signal']

        # Chronological 60/20/20 train/validation/test split. No random_state is
        # passed because it has no effect when shuffle=False.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Grid search with 5-fold cross-validation, optimising AUC. Train scores
        # are not requested because nothing below reads them.
        grid_search = GridSearchCV(
            estimator=KNeighborsClassifier(),
            param_grid=param_grid,
            cv=5,
            scoring='roc_auc'
        )
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True, GridSearchCV has already refit the best
        # parameter combination on the full training data, so a second manual
        # fit is not needed.
        knn_best = grid_search.best_estimator_

        # Pick the probability threshold that maximises total P&L on the validation set
        y_val_prob_best = knn_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Sum daily_pl over the days the model would have traded; the split
            # frame itself is left unmodified.
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = knn_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,86.656201
1,0.01,0.03,95.257908
2,0.01,0.05,91.231402
3,0.01,0.07,89.172807
4,0.01,0.09,92.004712
5,0.03,0.01,92.821611
6,0.03,0.03,139.495492
7,0.03,0.05,146.240497
8,0.03,0.07,147.653207
9,0.03,0.09,147.166508


### `if not` strategy

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name instead of by position (iloc[:, 1:5]) so the
# run does not depend on the column order of the downloaded frame.
# NOTE(review): these are the same-day prices that create_conditions and
# calculate_pl use to build the label, so the classifier is given values that
# are not known at the open -- confirm whether lagged features were intended.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Loop-invariant settings, hoisted out of the alpha/beta loop
param_grid = [
    {'n_neighbors': [3, 5, 7, 10, 15],  # Different values for the number of neighbors
     'weights': ['uniform', 'distance'],  # Uniform weights or distance-based weights
     'metric': ['euclidean', 'manhattan']}  # Distance metrics to use for neighbors
]
thresholds = np.linspace(0.1, 0.9, 9)  # candidate probability cut-offs

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `if not` strategy: label a day 1 when the stop loss was not hit (vectorised)
        data['buy_signal'] = (~data['stop_loss']).astype(int)

        # Separate the label by name rather than relying on it being the last column
        X = data.drop(columns='buy_signal')
        y = data['buy_signal']

        # Chronological 60/20/20 train/validation/test split. No random_state is
        # passed because it has no effect when shuffle=False.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Grid search with 5-fold cross-validation, optimising AUC. Train scores
        # are not requested because nothing below reads them.
        grid_search = GridSearchCV(
            estimator=KNeighborsClassifier(),
            param_grid=param_grid,
            cv=5,
            scoring='roc_auc'
        )
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True, GridSearchCV has already refit the best
        # parameter combination on the full training data, so a second manual
        # fit is not needed.
        knn_best = grid_search.best_estimator_

        # Pick the probability threshold that maximises total P&L on the validation set
        y_val_prob_best = knn_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Sum daily_pl over the days the model would have traded; the split
            # frame itself is left unmodified.
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = knn_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,62.918114
1,0.01,0.03,79.407706
2,0.01,0.05,81.291908
3,0.01,0.07,81.354811
4,0.01,0.09,81.08291
5,0.03,0.01,53.563524
6,0.03,0.03,80.839498
7,0.03,0.05,83.124201
8,0.03,0.07,83.406104
9,0.03,0.09,83.134204


## Random Forest

### `or` strategy

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Grid search over the stop-loss (alpha) and take-profit (beta) limits for a
# Random Forest classifier trained on the `or` buy-signal definition.
# Relies on `data`, `create_conditions` and `calculate_pl` from earlier cells.

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name so the cell does not depend on the column
# order left in `data` by previously executed cells. For the frame built in
# the download cell these are the columns the former `iloc[:, 1:5]` picked.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Define parameter grid for Random Forest (loop-invariant, so built once)
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
    'max_features': ['sqrt', 'log2'],  # Number of features to consider at each split
    'class_weight': [None, 'balanced']  # Class weighting to deal with imbalance
}

# Candidate probability cut-offs for turning a score into a buy decision
thresholds = np.linspace(0.1, 0.9, 9)

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition.
        # create_conditions writes its columns into `data` itself, so the
        # shared frame is overwritten on every iteration.
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `or` strategy: buy when either the take-profit or the stop-loss
        # condition is flagged for the day
        data['buy_signal'] = (data['take_profit'] | data['stop_loss']).astype(int)

        # Separate target from the remaining columns by name
        X = data.drop(columns=['buy_signal'])
        y = data['buy_signal']

        # Ordered (unshuffled) split: first 60% train, next 20% validation,
        # last 20% test
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Initialize Random Forest Classifier
        rf = RandomForestClassifier(random_state=42)

        # Perform Grid Search with Cross-Validation
        # NOTE(review): an integer `cv` does not respect time order; consider
        # sklearn's TimeSeriesSplit if look-ahead inside the folds matters.
        grid_search = GridSearchCV(
            estimator=rf,
            param_grid=param_grid,
            cv=5,  # 5-fold cross-validation
            scoring='roc_auc'  # Optimize for AUC
        )

        # Fit Grid Search to training data
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True the search has already refitted the best
        # parameter set on the full training data; reuse that model instead of
        # fitting an identical one a second time.
        best_params = grid_search.best_params_
        rf_best = grid_search.best_estimator_

        # Evaluate on validation data to find the best threshold for maximizing P&L
        y_val_prob_best = rf_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Total P&L of the days whose predicted probability clears the
            # threshold; the mask is applied directly so the split frame is
            # never written to
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = rf_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)

        # Calculate Total P&L for the test set using the best threshold
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,9.869008
1,0.01,0.03,-4.667909
2,0.01,0.05,-21.732911
3,0.01,0.07,-21.956709
4,0.01,0.09,-21.063414
5,0.03,0.01,42.589095
6,0.03,0.03,-4.4867
7,0.03,0.05,-2.6082
8,0.03,0.07,-3.0153
9,0.03,0.09,-3.0153


### `Daily P&L` strategy

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Grid search over the stop-loss (alpha) and take-profit (beta) limits for a
# Random Forest classifier trained on the `Daily P&L` buy-signal definition.
# Relies on `data`, `create_conditions` and `calculate_pl` from earlier cells.

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name so the cell does not depend on the column
# order left in `data` by previously executed cells. For the frame built in
# the download cell these are the columns the former `iloc[:, 1:5]` picked.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Define parameter grid for Random Forest (loop-invariant, so built once)
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
    'max_features': ['sqrt', 'log2'],  # Number of features to consider at each split
    'class_weight': [None, 'balanced']  # Class weighting to deal with imbalance
}

# Candidate probability cut-offs for turning a score into a buy decision
thresholds = np.linspace(0.1, 0.9, 9)

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition.
        # create_conditions writes its columns into `data` itself, so the
        # shared frame is overwritten on every iteration.
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # Daily P&L strategy: buy on the days whose computed P&L is positive
        data['buy_signal'] = (data['daily_pl'] > 0).astype(int)

        # Separate target from the remaining columns by name
        X = data.drop(columns=['buy_signal'])
        y = data['buy_signal']

        # Ordered (unshuffled) split: first 60% train, next 20% validation,
        # last 20% test
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Initialize Random Forest Classifier
        rf = RandomForestClassifier(random_state=42)

        # Perform Grid Search with Cross-Validation
        # NOTE(review): an integer `cv` does not respect time order; consider
        # sklearn's TimeSeriesSplit if look-ahead inside the folds matters.
        grid_search = GridSearchCV(
            estimator=rf,
            param_grid=param_grid,
            cv=5,  # 5-fold cross-validation
            scoring='roc_auc'  # Optimize for AUC
        )

        # Fit Grid Search to training data
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True the search has already refitted the best
        # parameter set on the full training data; reuse that model instead of
        # fitting an identical one a second time.
        best_params = grid_search.best_params_
        rf_best = grid_search.best_estimator_

        # Evaluate on validation data to find the best threshold for maximizing P&L
        y_val_prob_best = rf_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Total P&L of the days whose predicted probability clears the
            # threshold; the mask is applied directly so the split frame is
            # never written to
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = rf_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)

        # Calculate Total P&L for the test set using the best threshold
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,82.105896
1,0.01,0.03,103.557702
2,0.01,0.05,93.392299
3,0.01,0.07,89.115904
4,0.01,0.09,88.742809
5,0.03,0.01,92.855305
6,0.03,0.03,146.941991
7,0.03,0.05,147.130504
8,0.03,0.07,147.916696
9,0.03,0.09,147.429996


### `if not` strategy

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Grid search over the stop-loss (alpha) and take-profit (beta) limits for a
# Random Forest classifier trained on the `if not` buy-signal definition.
# Relies on `data`, `create_conditions` and `calculate_pl` from earlier cells.

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name so the cell does not depend on the column
# order left in `data` by previously executed cells. For the frame built in
# the download cell these are the columns the former `iloc[:, 1:5]` picked.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Define parameter grid for Random Forest (loop-invariant, so built once)
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
    'max_features': ['sqrt', 'log2'],  # Number of features to consider at each split
    'class_weight': [None, 'balanced']  # Class weighting to deal with imbalance
}

# Candidate probability cut-offs for turning a score into a buy decision
thresholds = np.linspace(0.1, 0.9, 9)

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition.
        # create_conditions writes its columns into `data` itself, so the
        # shared frame is overwritten on every iteration.
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `if not` strategy: buy on every day the stop-loss condition is not flagged
        data['buy_signal'] = (~data['stop_loss']).astype(int)

        # Separate target from the remaining columns by name
        X = data.drop(columns=['buy_signal'])
        y = data['buy_signal']

        # Ordered (unshuffled) split: first 60% train, next 20% validation,
        # last 20% test
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Initialize Random Forest Classifier
        rf = RandomForestClassifier(random_state=42)

        # Perform Grid Search with Cross-Validation
        # NOTE(review): an integer `cv` does not respect time order; consider
        # sklearn's TimeSeriesSplit if look-ahead inside the folds matters.
        grid_search = GridSearchCV(
            estimator=rf,
            param_grid=param_grid,
            cv=5,  # 5-fold cross-validation
            scoring='roc_auc'  # Optimize for AUC
        )

        # Fit Grid Search to training data
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True the search has already refitted the best
        # parameter set on the full training data; reuse that model instead of
        # fitting an identical one a second time.
        best_params = grid_search.best_params_
        rf_best = grid_search.best_estimator_

        # Evaluate on validation data to find the best threshold for maximizing P&L
        y_val_prob_best = rf_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Total P&L of the days whose predicted probability clears the
            # threshold; the mask is applied directly so the split frame is
            # never written to
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = rf_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)

        # Calculate Total P&L for the test set using the best threshold
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,61.41471
1,0.01,0.03,73.893297
2,0.01,0.05,77.167899
3,0.01,0.07,77.230801
4,0.01,0.09,76.958901
5,0.03,0.01,61.675724
6,0.03,0.03,96.882904
7,0.03,0.05,98.767009
8,0.03,0.07,99.783212
9,0.03,0.09,99.296512


## XGBoost

### `or` strategy

In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Grid search over the stop-loss (alpha) and take-profit (beta) limits for an
# XGBoost classifier trained on the `or` buy-signal definition.
# Relies on `data`, `create_conditions` and `calculate_pl` from earlier cells.

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name so the cell does not depend on the column
# order left in `data` by previously executed cells. For the frame built in
# the download cell these are the columns the former `iloc[:, 1:5]` picked.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Define parameter grid for XGBoost (loop-invariant, so built once)
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate shrinks contribution of each tree
    'subsample': [0.8, 1.0],  # Subsample ratio of the training instance
    'colsample_bytree': [0.8, 1.0],  # Subsample ratio of columns when constructing each tree
    'scale_pos_weight': [1, 5, 10]  # Balance the positive and negative weights
}

# Candidate probability cut-offs for turning a score into a buy decision
thresholds = np.linspace(0.1, 0.9, 9)

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition.
        # create_conditions writes its columns into `data` itself, so the
        # shared frame is overwritten on every iteration.
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `or` strategy: buy when either the take-profit or the stop-loss
        # condition is flagged for the day
        data['buy_signal'] = (data['take_profit'] | data['stop_loss']).astype(int)

        # Separate target from the remaining columns by name
        X = data.drop(columns=['buy_signal'])
        y = data['buy_signal']

        # Ordered (unshuffled) split: first 60% train, next 20% validation,
        # last 20% test
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Initialize XGBoost Classifier
        xgb = XGBClassifier(random_state=42, eval_metric='logloss')

        # Perform Grid Search with Cross-Validation
        # NOTE(review): an integer `cv` does not respect time order; consider
        # sklearn's TimeSeriesSplit if look-ahead inside the folds matters.
        grid_search = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=5,  # 5-fold cross-validation
            scoring='roc_auc'  # Optimize for AUC
        )

        # Fit Grid Search to training data
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True the search has already refitted the best
        # parameter set on the full training data; reuse that model instead of
        # fitting an identical one a second time.
        best_params = grid_search.best_params_
        xgb_best = grid_search.best_estimator_

        # Evaluate on validation data to find the best threshold for maximizing P&L
        y_val_prob_best = xgb_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Total P&L of the days whose predicted probability clears the
            # threshold; the mask is applied directly so the split frame is
            # never written to
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = xgb_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)

        # Calculate Total P&L for the test set using the best threshold
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,10.335105
1,0.01,0.03,-3.56791
2,0.01,0.05,-21.271711
3,0.01,0.07,-23.426705
4,0.01,0.09,-28.73791
5,0.03,0.01,41.000493
6,0.03,0.03,4.913305
7,0.03,0.05,-17.966402
8,0.03,0.07,-19.728
9,0.03,0.09,-8.8746


### `Daily P&L` strategy

In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Grid search over the stop-loss (alpha) and take-profit (beta) limits for an
# XGBoost classifier trained on the `Daily P&L` buy-signal definition.
# Relies on `data`, `create_conditions` and `calculate_pl` from earlier cells.

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name so the cell does not depend on the column
# order left in `data` by previously executed cells. For the frame built in
# the download cell these are the columns the former `iloc[:, 1:5]` picked.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Define parameter grid for XGBoost (loop-invariant, so built once)
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate shrinks contribution of each tree
    'subsample': [0.8, 1.0],  # Subsample ratio of the training instance
    'colsample_bytree': [0.8, 1.0],  # Subsample ratio of columns when constructing each tree
    'scale_pos_weight': [1, 5, 10]  # Balance the positive and negative weights
}

# Candidate probability cut-offs for turning a score into a buy decision
thresholds = np.linspace(0.1, 0.9, 9)

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition.
        # create_conditions writes its columns into `data` itself, so the
        # shared frame is overwritten on every iteration.
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # Daily P&L strategy: buy on the days whose computed P&L is positive
        data['buy_signal'] = (data['daily_pl'] > 0).astype(int)

        # Separate target from the remaining columns by name
        X = data.drop(columns=['buy_signal'])
        y = data['buy_signal']

        # Ordered (unshuffled) split: first 60% train, next 20% validation,
        # last 20% test
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Initialize XGBoost Classifier
        xgb = XGBClassifier(random_state=42, eval_metric='logloss')

        # Perform Grid Search with Cross-Validation
        # NOTE(review): an integer `cv` does not respect time order; consider
        # sklearn's TimeSeriesSplit if look-ahead inside the folds matters.
        grid_search = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=5,  # 5-fold cross-validation
            scoring='roc_auc'  # Optimize for AUC
        )

        # Fit Grid Search to training data
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True the search has already refitted the best
        # parameter set on the full training data; reuse that model instead of
        # fitting an identical one a second time.
        best_params = grid_search.best_params_
        xgb_best = grid_search.best_estimator_

        # Evaluate on validation data to find the best threshold for maximizing P&L
        y_val_prob_best = xgb_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Total P&L of the days whose predicted probability clears the
            # threshold; the mask is applied directly so the split frame is
            # never written to
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = xgb_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)

        # Calculate Total P&L for the test set using the best threshold
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,80.23629
1,0.01,0.03,112.790397
2,0.01,0.05,107.890189
3,0.01,0.07,87.343602
4,0.01,0.09,96.041793
5,0.03,0.01,92.491389
6,0.03,0.03,136.255289
7,0.03,0.05,149.520496
8,0.03,0.07,152.286703
9,0.03,0.09,151.800003


### `if not` strategy

In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Grid search over the stop-loss (alpha) and take-profit (beta) limits for an
# XGBoost classifier trained on the `if not` buy-signal definition.
# Relies on `data`, `create_conditions` and `calculate_pl` from earlier cells.

# Initialize result storage
results = []

# Define alpha and beta values for grid search
alpha_values = [0.01, 0.03, 0.05, 0.07, 0.09]
beta_values = [0.01, 0.03, 0.05, 0.07, 0.09]

# Model inputs, selected by name so the cell does not depend on the column
# order left in `data` by previously executed cells. For the frame built in
# the download cell these are the columns the former `iloc[:, 1:5]` picked.
feature_cols = ['Close', 'High', 'Low', 'Open']

# Define parameter grid for XGBoost (loop-invariant, so built once)
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate shrinks contribution of each tree
    'subsample': [0.8, 1.0],  # Subsample ratio of the training instance
    'colsample_bytree': [0.8, 1.0],  # Subsample ratio of columns when constructing each tree
    'scale_pos_weight': [1, 5, 10]  # Balance the positive and negative weights
}

# Candidate probability cut-offs for turning a score into a buy decision
thresholds = np.linspace(0.1, 0.9, 9)

# Loop through each combination of alpha and beta
for alpha in alpha_values:
    for beta in beta_values:
        # Create stop loss condition and take profit condition.
        # create_conditions writes its columns into `data` itself, so the
        # shared frame is overwritten on every iteration.
        data = create_conditions(data, alpha, beta)

        # Calculate daily P&L
        data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)

        # `if not` strategy: buy on every day the stop-loss condition is not flagged
        data['buy_signal'] = (~data['stop_loss']).astype(int)

        # Separate target from the remaining columns by name
        X = data.drop(columns=['buy_signal'])
        y = data['buy_signal']

        # Ordered (unshuffled) split: first 60% train, next 20% validation,
        # last 20% test
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

        # Initialize XGBoost Classifier
        xgb = XGBClassifier(random_state=42, eval_metric='logloss')

        # Perform Grid Search with Cross-Validation
        # NOTE(review): an integer `cv` does not respect time order; consider
        # sklearn's TimeSeriesSplit if look-ahead inside the folds matters.
        grid_search = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=5,  # 5-fold cross-validation
            scoring='roc_auc'  # Optimize for AUC
        )

        # Fit Grid Search to training data
        grid_search.fit(X_train[feature_cols], y_train)

        # With the default refit=True the search has already refitted the best
        # parameter set on the full training data; reuse that model instead of
        # fitting an identical one a second time.
        best_params = grid_search.best_params_
        xgb_best = grid_search.best_estimator_

        # Evaluate on validation data to find the best threshold for maximizing P&L
        y_val_prob_best = xgb_best.predict_proba(X_val[feature_cols])[:, 1]

        best_threshold = 0.5
        best_pl = -float('inf')

        for threshold in thresholds:
            # Total P&L of the days whose predicted probability clears the
            # threshold; the mask is applied directly so the split frame is
            # never written to
            total_pl = X_val.loc[y_val_prob_best >= threshold, 'daily_pl'].sum()

            if total_pl > best_pl:
                best_pl = total_pl
                best_threshold = threshold

        # Evaluate on test set using the best threshold
        y_test_prob_best = xgb_best.predict_proba(X_test[feature_cols])[:, 1]
        y_test_pred_best = (y_test_prob_best >= best_threshold).astype(int)

        # Calculate Total P&L for the test set using the best threshold
        total_test_pl = X_test.loc[y_test_pred_best == 1, 'daily_pl'].sum()

        # Store results
        results.append({'alpha': alpha, 'beta': beta, 'Best Test P&L': total_test_pl})

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df

# Save the results to a CSV file
# results_df.to_csv('grid_search_alpha_beta_results.csv', index=False)

Unnamed: 0,alpha,beta,Best Test P&L
0,0.01,0.01,61.061102
1,0.01,0.03,83.653495
2,0.01,0.05,83.996493
3,0.01,0.07,84.059396
4,0.01,0.09,83.787495
5,0.03,0.01,19.148018
6,0.03,0.03,38.565988
7,0.03,0.05,38.826291
8,0.03,0.07,39.842494
9,0.03,0.09,39.355794


# Evaluate Results

In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [12]:
# Compare four classifiers on the buy-signal task for the chosen (alpha, beta):
# label each day, split chronologically into train / validation / test, fit each
# model on the train split, and report P&L and AUC on validation and test.

# Best alpha and beta
alpha, beta = 0.05, 0.07

# Apply conditions and calculate P&L
data = create_conditions(data, alpha, beta)
data['daily_pl'] = data.apply(lambda row: calculate_pl(row, alpha, beta), axis=1)
# A day is labelled "buy" (1) when its P&L is strictly positive, else 0
data['buy_signal'] = (data['daily_pl'] > 0).astype(int)

# Train-Test split
# Select the label by name so the split stays correct even if other columns
# end up after 'buy_signal' in `data`.
X = data.drop(columns='buy_signal')
y = data['buy_signal']

# Fix the feature columns once, by name. They are the columns at positions 1-4
# of X (presumably Close, High, Low, Open given the download cell - confirm).
# NOTE(review): daily_pl / buy_signal are derived from the same row's prices, so
# if these features are that day's OHLC the near-perfect AUCs may reflect
# look-ahead leakage - confirm before trusting the P&L figures.
feature_cols = list(X.columns[1:5])

# shuffle=False keeps chronological order: first 60% train, next 20% validation,
# last 20% test.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, shuffle=False)

# Columns are added to these frames below; take explicit copies so the writes
# do not land on a slice of `data` (avoids SettingWithCopyWarning).
X_val = X_val.copy()
X_test = X_test.copy()

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

results = {}
final_test_pl = {}

# Run all models
for model_name, model in models.items():
    # Fit model
    model.fit(X_train[feature_cols], y_train)

    # Evaluate on validation data (probability of the positive class, cut at 0.5)
    y_val_prob = model.predict_proba(X_val[feature_cols])[:, 1]
    y_val_pred = (y_val_prob >= 0.5).astype(int)

    # Calculate In-sample P&L
    # NOTE: this is measured on the validation split (the model is fit on
    # X_train only); the "In-sample" name is kept for downstream compatibility.
    X_val['predicted_signal'] = y_val_pred
    in_sample_pl = X_val.loc[X_val['predicted_signal'] == 1, 'daily_pl'].sum()

    # Evaluate on test data
    y_test_prob = model.predict_proba(X_test[feature_cols])[:, 1]
    y_test_pred = (y_test_prob >= 0.5).astype(int)

    # Calculate Out-of-sample P&L
    X_test['predicted_signal'] = y_test_pred
    out_sample_pl = X_test.loc[X_test['predicted_signal'] == 1, 'daily_pl'].sum()

    # Compute each AUC once and reuse it for both the table and the printout
    auc_val = roc_auc_score(y_val, y_val_prob)
    auc_test = roc_auc_score(y_test, y_test_prob)

    # Store results
    results[model_name] = {
        "In-sample P&L": in_sample_pl,
        "Out-of-sample P&L": out_sample_pl,
        "AUC Validation": auc_val,
        "AUC Test": auc_test
    }
    final_test_pl[model_name] = out_sample_pl

    print(f"{model_name} - In-sample P&L: {in_sample_pl:.2f}, Out-of-sample P&L: {out_sample_pl:.2f}")
    print(f"AUC Validation: {auc_val:.4f}, AUC Test: {auc_test:.4f}")

Logistic Regression - In-sample P&L: 152.17, Out-of-sample P&L: 180.14
AUC Validation: 0.9979, AUC Test: 0.9999
KNN - In-sample P&L: 139.87, Out-of-sample P&L: 137.79
AUC Validation: 0.9424, AUC Test: 0.9026
Random Forest - In-sample P&L: 140.15, Out-of-sample P&L: 144.87
AUC Validation: 0.9571, AUC Test: 0.9097
XGBoost - In-sample P&L: 138.33, Out-of-sample P&L: 149.28
AUC Validation: 0.9451, AUC Test: 0.8952


In [17]:
# Tabulate the collected metrics: one row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print("In-sample and Out-of-sample Performance Metrics:")
results_df

In-sample and Out-of-sample Performance Metrics:


Unnamed: 0,In-sample P&L,Out-of-sample P&L,AUC Validation,AUC Test
Logistic Regression,152.172302,180.136698,0.997943,0.999908
KNN,139.872297,137.786701,0.942358,0.902561
Random Forest,140.152296,144.86671,0.957078,0.909706
XGBoost,138.332294,149.276707,0.945145,0.895229


In [19]:
# Blend the fitted models into one buy signal: each model's predicted
# probability is weighted by its share of the total test P&L, the weighted sum
# is cut at 0.5, and the P&L of the resulting signal is totalled on the test set.

# Calculate weights based on test P&L
# NOTE(review): the weights come from the same test-set P&L the ensemble is
# scored on below, so the ensemble figure is optimistically biased; weighting by
# validation P&L instead would avoid tuning on the test set.
# A model that lost money gets weight 0 rather than a negative weight, so the
# weights stay a convex combination and the 0.5 threshold keeps its meaning.
positive_pl = {model: max(pl, 0.0) for model, pl in final_test_pl.items()}
total_pl_sum = sum(positive_pl.values())
if total_pl_sum > 0:
    weights = {model: pl / total_pl_sum for model, pl in positive_pl.items()}
else:
    # No model made money: fall back to equal weights instead of dividing by zero
    weights = {model: 1.0 / len(positive_pl) for model in positive_pl}
print("Model Weights based on Out-of-sample P&L:")
print(weights)

# Ensemble Method
# Same positional feature slice the models were fitted on; taken once up front
test_features = X_test.iloc[:, 1:5]
ensemble_signal = np.zeros(len(X_test))
for model_name, weight in weights.items():
    y_test_prob = models[model_name].predict_proba(test_features)[:, 1]
    ensemble_signal += weight * y_test_prob

# Apply threshold to ensemble signal
threshold = 0.5
final_buy_signal = (ensemble_signal >= threshold).astype(int)

# Calculate Ensemble Total P&L
# assign() returns a new frame, so nothing is written into a slice of the
# original data and re-running the cell simply replaces the column.
X_test = X_test.assign(ensemble_signal=final_buy_signal)
ensemble_total_pl = X_test.loc[X_test['ensemble_signal'] == 1, 'daily_pl'].sum()

print(f"Ensemble Method Total Out-of-sample P&L: {ensemble_total_pl:.2f}")

Model Weights based on Out-of-sample P&L:
{'Logistic Regression': 0.29430887771324715, 'KNN': 0.22511709099260838, 'Random Forest': 0.23668447102285525, 'XGBoost': 0.24388956027128925}
Ensemble Method Total Out-of-sample P&L: 167.98
