In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import os
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Function to create lag features and rolling statistics
def create_lag_features(data, lags, window=5):
    """Add lagged-Close and rolling-statistic feature columns to ``data``.

    The frame is modified in place (new columns are assigned on ``data``)
    and the same object is returned, so callers holding a reference to
    ``data`` see the new columns too.

    Columns added:
        * ``lag_1`` ... ``lag_<lags>``: ``Close`` shifted down by that many rows.
        * ``rolling_mean`` / ``rolling_std``: mean and sample standard
          deviation of the ``window`` rows *preceding* each row.

    Args:
        data: DataFrame with a ``Close`` column. Rows are presumably in
            chronological order (oldest first) -- verify against caller.
        lags: Number of lag columns to create; values <= 0 create none.
        window: Number of preceding rows used for the rolling statistics.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    close = data['Close']

    for lag in range(1, lags + 1):
        data[f'lag_{lag}'] = close.shift(lag)

    # Shift by one row before rolling so the statistics for a row are built
    # only from earlier rows. Rolling directly over Close would include the
    # row's own Close -- the value being predicted -- in its features.
    previous_close = close.shift(1).rolling(window=window)
    data['rolling_mean'] = previous_close.mean()
    data['rolling_std'] = previous_close.std()
    return data

# Function to split data into training and testing sets
def split_data(df, training_period, p):
    """Engineer features on ``df`` and split it chronologically by position.

    ``df`` is modified in place: ``create_lag_features`` assigns its columns
    on the frame it receives, and rows containing any NaN are then dropped
    from that same object. The first ``int(len(df) * p)`` remaining rows
    form the training set and the rest form the test set.

    Args:
        df: DataFrame containing a ``Close`` column (used as the target).
        training_period: Number of lag columns passed to
            ``create_lag_features``.
        p: Fraction of the cleaned rows assigned to the training set.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, split_idx, test_index)``.
        ``split_idx`` is the positional index of the first test row in the
        cleaned frame, and ``test_index`` is the index of the test rows.
    """
    target_col = 'Close'

    # Feature columns are added first; the shift/rolling operations leave
    # NaN in the leading rows, which are removed here (in place).
    df = create_lag_features(df, training_period)
    df.dropna(inplace=True)

    # Positional boundary between the two partitions.
    split_idx = int(len(df) * p)

    def _features_and_target(part):
        # Every column except the target is used as a feature.
        return part.drop(columns=[target_col]), part[target_col]

    train_part, test_part = df.iloc[:split_idx], df.iloc[split_idx:]
    X_train, y_train = _features_and_target(train_part)
    X_test, y_test = _features_and_target(test_part)

    return X_train, y_train, X_test, y_test, split_idx, test_part.index


# Function to train the XGBoost model
def train_xgboost(X_train, y_train):
    """Fit an ``xgb.XGBRegressor`` on the training data and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted regressor.
    """
    # Fixed hyperparameters; random_state pins the seed for repeatable runs.
    hyperparams = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 5,
        'objective': 'reg:squarederror',
        'random_state': 42,
    }
    regressor = xgb.XGBRegressor(**hyperparams)
    regressor.fit(X_train, y_train)
    return regressor

# Function to evaluate and visualize predictions
def evaluate_and_visualize(model, X_test, y_test, data, train_end_idx, test_idx, n_days_to_predict, p):
    """Print the test RMSE and save a plot of predictions against actual prices.

    The RMSE is computed over the whole test set, while the plot only shows
    the first ``n_days_to_predict`` test rows, preceded by a short window of
    rows from ``data`` leading up to the split point.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True test targets, aligned with ``X_test``.
        data: DataFrame with a ``Close`` column; ``train_end_idx`` is a
            position into this frame.
        train_end_idx: Positional index in ``data`` where the plotted
            training window ends (inclusive).
        test_idx: Index labels of the test rows, used as x-values.
        n_days_to_predict: Maximum number of test rows to plot.
        p: Train fraction; used in the printed message and the file name.

    Side effects:
        Prints one line to stdout, creates ``TSLA_pred/xdboost/25_15`` if
        missing and writes ``<p formatted to 2 decimals>.png`` into it.
    """
    predictions = model.predict(X_test)
    rmse = math.sqrt(mean_squared_error(y_test, predictions))
    print(f'p={p:.2f}, RMSE: {rmse}')

    # Single definition of the output location, shared by makedirs and savefig.
    output_dir = "TSLA_pred/xdboost/25_15"
    os.makedirs(output_dir, exist_ok=True)

    # Up to 20 rows before the split plus the row at train_end_idx itself.
    # Positional slicing is used because the bounds are positions already;
    # it selects the same rows as an inclusive label slice on a unique index.
    train_start_idx = max(0, train_end_idx - 20)  # Clamp at the first row
    train_window = data.iloc[train_start_idx:train_end_idx + 1]
    test_dates = test_idx[:n_days_to_predict]

    fig = plt.figure(figsize=(14, 7))
    try:
        plt.plot(train_window.index, train_window['Close'],
                 label='True Training Data', color='blue')
        plt.plot(test_dates, y_test[:n_days_to_predict],
                 label='True Test Data', color='green')
        plt.plot(test_dates, predictions[:n_days_to_predict],
                 label='XGBoost Predictions', color='orange')
        plt.xlabel('Date')
        plt.ylabel('Close Price')
        plt.title('Stock Price Prediction Using XGBoost')
        plt.legend()
        plt.savefig(f"{output_dir}/{p:.2f}.png")
    finally:
        # Always release the figure, even if plotting or saving fails, so
        # repeated calls in a sweep do not accumulate open figures.
        plt.close(fig)


# Main function to run the prediction for a given percentage split
def m_xgb(p, path="TSLA.csv", training_period=0, n_days_to_predict=145):
    """Run one load / split / train / evaluate cycle for train fraction ``p``.

    Args:
        p: Fraction of the cleaned rows used for training.
        path: CSV file to load. Its first line is skipped and the columns
            are named Date, Open, High, Low, Close, Adj Close, Volume.
        training_period: Number of lag columns to create. The default of 0
            creates none, leaving only the rolling statistics and the
            remaining CSV columns as features.
        n_days_to_predict: Maximum number of test rows shown in the plot.

    Side effects:
        Reads ``path``; prints the RMSE and writes a PNG through
        ``evaluate_and_visualize``.
    """
    headers = ["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume"]
    numeric_cols = headers[1:]

    # Treat the literal string "null" as missing while parsing, so the
    # numeric columns never hold strings and no later replace pass is needed.
    df = pd.read_csv(path, names=headers, skiprows=1, na_values=["null"])
    df[numeric_cols] = df[numeric_cols].astype(float)
    df.set_index("Date", inplace=True)

    # split_data adds feature columns to df and drops incomplete rows in
    # place, so the positions it returns refer to df as it is afterwards.
    X_train, y_train, X_test, y_test, train_end_idx, test_idx = split_data(df, training_period, p)

    model = train_xgboost(X_train, y_train)

    evaluate_and_visualize(model, X_test, y_test, df, train_end_idx, test_idx, n_days_to_predict, p)

# Sweep 15 evenly spaced train fractions from 0.60 to 0.90 (both inclusive),
# running one full train/evaluate cycle per fraction.
split_fractions = np.linspace(0.60, 0.9, 15)
for p in split_fractions:
    m_xgb(p)


p=0.60, RMSE: 142.4283748267969
p=0.62, RMSE: 146.38563389931772
p=0.64, RMSE: 150.6980704754292
p=0.66, RMSE: 155.46327809827682
p=0.69, RMSE: 160.61816332964435
p=0.71, RMSE: 166.40325227393856
p=0.73, RMSE: 172.8543926471356
p=0.75, RMSE: 180.10744012827092
p=0.77, RMSE: 188.38298917803746
p=0.79, RMSE: 179.48093810988075
p=0.81, RMSE: 178.4015834083836
p=0.84, RMSE: 164.70023200022203
p=0.86, RMSE: 117.78734933719966
p=0.88, RMSE: 78.2763825713911
p=0.90, RMSE: 35.643950150459396
