In [12]:
# Importing necessary libraries
import pandas as pd  # for data manipulation and analysis
import numpy as np  # for numerical operations
from sklearn.model_selection import train_test_split, GridSearchCV  # for splitting the data and hyperparameter tuning
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor  # for machine learning models
from sklearn.metrics import mean_squared_error  # for evaluating model performance
import xgboost as xgb  # for the XGBoost model

# Function to load CSV data
def load_data(file_path):
    """
    This function loads the CSV data from the specified file path.
    """
    df = pd.read_csv(file_path)
    return df

def explore_data(df):
    # Visualize the first and last five rows of the dataset
    print("First five rows of the dataset:")
    print(df.head())
    print("\nLast five rows of the dataset:")
    print(df.tail())
    
def explore_preprocesseddata(X,y):    
    # Visualize the preprocessed data
    print("First five rows of the features (X):")
    print(X.head())
    print("\nFirst five rows of the target (y):")
    print(y.head())



# Function to preprocess the data
def preprocess_data(df):
    print("Before preprocessing:")
    print(df.head())
    
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')  # Convert 'Date' column to datetime
    print("After converting 'Date' column:")
    print(df.head())
    
    df.set_index('Date', inplace=True)  # Set 'Date' as the index
    
    # Create new features based on the difference of consecutive rows
    df['Price_diff'] = df['Price'].diff()
    df['Open_diff'] = df['Open'].diff()
    df['High_diff'] = df['High'].diff()
    df['Low_diff'] = df['Low'].diff()
    df['Change_diff'] = df['Change %'].str.replace('%', '', regex=False).astype(float).diff()
    
    # Fill missing 'Vol.' values with 0 and calculate the difference
    df['Vol.'] = df['Vol.'].replace('', 0).astype(float)
    df['Vol_diff'] = df['Vol.'].diff()

    df.dropna(inplace=True)  # Drop rows with any missing values

    print("After preprocessing:")
    print(df.head())

    # Define features (X) and target (y)
    X = df[['Open', 'High', 'Low', 'Vol.', 'Price_diff', 'Open_diff', 'High_diff', 'Low_diff', 'Vol_diff', 'Change_diff']]
    y = df['Price']
    
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    return X, y


# Function to split data into training and testing sets
def train_test_split_data(X, y):
    """
    This function splits the features (X) and target (y) into training and testing sets.
    """
    return train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

# Function to train and evaluate models
def train_evaluate_model(X_train, X_test, y_train, y_test):
    """
    This function trains and evaluates multiple models:
    1. RandomForestRegressor with GridSearchCV for hyperparameter tuning.
    2. XGBoostRegressor with GridSearchCV for hyperparameter tuning.
    3. BaggingRegressor with the best RandomForest and XGBoost models.
    4. Calculates RMSE for each model and returns the results.
    """
    # Train and tune RandomForestRegressor
    rf = RandomForestRegressor(random_state=42)
    param_grid_rf = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15], 'min_samples_split': [2, 5, 10]}
    grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search_rf.fit(X_train, y_train)
    best_rf = grid_search_rf.best_estimator_

    # Predict and calculate RMSE for RandomForestRegressor
    y_pred_rf = best_rf.predict(X_test)
    rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

    # Train and tune XGBoostRegressor
    xgb_model = xgb.XGBRegressor(random_state=42)
    param_grid_xgb = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15], 'learning_rate': [0.01, 0.1, 0.3]}
    grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search_xgb.fit(X_train, y_train)
    best_xgb = grid_search_xgb.best_estimator_

    # Predict and calculate RMSE for XGBoostRegressor
    y_pred_xgb = best_xgb.predict(X_test)
    rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

    # Train and evaluate BaggingRegressor with RandomForest
    bagging_rf = BaggingRegressor(base_estimator=best_rf, n_estimators=10, random_state=42)
    bagging_rf.fit(X_train, y_train)
    y_pred_bagging_rf = bagging_rf.predict(X_test)
    rmse_bagging_rf = np.sqrt(mean_squared_error(y_test, y_pred_bagging_rf))

    # Train and evaluate BaggingRegressor with XGBoost
    bagging_xgb = BaggingRegressor(base_estimator=best_xgb, n_estimators=10, random_state=42)
    bagging_xgb.fit(X_train, y_train)
    y_pred_bagging_xgb = bagging_xgb.predict(X_test)
    rmse_bagging_xgb = np.sqrt(mean_squared_error(y_test, y_pred_bagging_xgb))

    # Calculate final predictions and RMSE for ensemble model
    final_pred = (y_pred_bagging_rf + y_pred_bagging_xgb) / 2
    final_rmse = np.sqrt(mean_squared_error(y_test, final_pred))

    return rmse_rf, rmse_xgb, rmse_bagging_rf, rmse_bagging_xgb, final_rmse

# Main function to execute the workflow
def main():
    file_path = 'forex.csv'  # Path to your CSV file
    df = load_data(file_path)  # Load the data
    explore_data(df)
    X, y = preprocess_data(df)  # Preprocess the data
    explore_preprocesseddata(X,y)
    X_train, X_test, y_train, y_test = train_test_split_data(X, y)  # Split data into training and testing sets
    results = train_evaluate_model(X_train, X_test, y_train, y_test)  # Train and evaluate models

    # Print the RMSE for each model
    print(f'Random Forest RMSE: {results[0]}')
    print(f'XGBoost RMSE: {results[1]}')
    print(f'Bagging Random Forest RMSE: {results[2]}')
    print(f'Bagging XGBoost RMSE: {results[3]}')
    print(f'Ensemble Model RMSE: {results[4]}')

# Ensure the script runs only if executed as the main module
if __name__ == "__main__":
    main()


First five rows of the dataset:
         Date   Price    Open    High     Low  Vol. Change %
0  07/19/2024  171.30  171.48  171.90  170.90   NaN   -0.08%
1  07/18/2024  171.44  170.82  171.60  169.98   NaN    0.38%
2  07/17/2024  170.79  172.59  172.86  170.69   NaN   -1.01%
3  07/16/2024  172.54  172.17  172.96  172.13   NaN    0.23%
4  07/15/2024  172.14  172.20  172.57  171.57   NaN   -0.03%

Last five rows of the dataset:
            Date   Price    Open    High     Low  Vol. Change %
6397  01/07/2000  108.44  108.67  109.03  107.74   NaN   -0.20%
6398  01/06/2000  108.66  107.65  109.31  107.14   NaN    0.99%
6399  01/05/2000  107.59  106.34  107.75  105.65   NaN    1.04%
6400  01/04/2000  106.48  104.02  106.60  103.92   NaN    2.10%
6401  01/03/2000  104.29  102.66  104.39  102.07   NaN    1.22%
Before preprocessing:
         Date   Price    Open    High     Low  Vol. Change %
0  07/19/2024  171.30  171.48  171.90  170.90   NaN   -0.08%
1  07/18/2024  171.44  170.82  171.60  169

ValueError: With n_samples=0, test_size=0.2 and train_size=0.8, the resulting train set will be empty. Adjust any of the aforementioned parameters.