# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import os
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Function to create lag features and rolling statistics
def create_lag_features(data, lags):
    """Add percentage-change lags and 5-row rolling statistics of ``Close``.

    ``data`` is modified in place and the same object is returned. Columns
    are appended in this order: ``lag_1`` .. ``lag_<lags>`` (where ``lag_k``
    is ``Close.pct_change(k)``), then ``rolling_mean`` and ``rolling_std``
    (both over a window of 5 rows). Leading rows that lack enough history
    hold NaN.
    """
    close = data['Close']

    # One column per look-back distance, nearest lag first.
    for lag in range(1, lags + 1):
        data[f'lag_{lag}'] = close.pct_change(lag)

    # Both rolling statistics share the same 5-row window.
    window = close.rolling(window=5)
    data['rolling_mean'] = window.mean()
    data['rolling_std'] = window.std()
    return data

# Function to split the dataset into training and testing sets
def split_data(df, training_period, train_ratio=0.8, idx=None):
    """Build lag features, split chronologically, and standardize the data.

    Args:
        df: DataFrame with a ``Close`` column plus any other numeric feature
            columns. It is not modified; the work happens on a copy.
        training_period: Number of lag columns passed to
            ``create_lag_features``.
        train_ratio: Fraction of the (NaN-free) rows used for training when
            ``idx`` is None.
        idx: Optional positional split point with ``iloc`` slice semantics
            (negative values count from the end). Rows before it form the
            training set, the remaining rows the test set.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, test_index,
        feature_scaler, target_scaler)``. Features and the ``Close`` target
        are standardized; ``test_index`` is the index of the test rows and
        the two fitted ``StandardScaler`` objects allow inverting the scaling.

    Raises:
        ValueError: If the training partition contains no rows.
    """
    # Copy first: create_lag_features adds columns in place, and mutating the
    # caller's frame would make a later call (e.g. with a different
    # training_period) see stale lag columns and already-dropped rows.
    data = create_lag_features(df.copy(), training_period).dropna()

    split_idx = int(len(data) * train_ratio) if idx is None else idx

    features = data.drop(columns=['Close'])
    close = data['Close'].to_numpy().reshape(-1, 1)

    if len(features.iloc[:split_idx]) == 0:
        raise ValueError(
            "split_data: the training partition is empty; "
            "check idx/train_ratio and the amount of input data"
        )

    # Fit the scalers on the training rows only, so that statistics of the
    # test rows do not leak into the transformation applied to the training
    # data. The fitted scalers are then applied to every row.
    feature_scaler = StandardScaler().fit(features.iloc[:split_idx])
    target_scaler = StandardScaler().fit(close[:split_idx])

    scaled_features = pd.DataFrame(
        feature_scaler.transform(features),
        columns=features.columns,
        index=features.index,
    )
    scaled_close = pd.Series(
        target_scaler.transform(close).ravel(),
        index=data.index,
        name='Close',
    )

    X_train = scaled_features.iloc[:split_idx]
    X_test = scaled_features.iloc[split_idx:]
    y_train = scaled_close.iloc[:split_idx]
    y_test = scaled_close.iloc[split_idx:]

    return X_train, y_train, X_test, y_test, X_test.index, feature_scaler, target_scaler

# Function to train the Random Forest model
def train_random_forest(X_train, y_train, n_estimators=300, max_depth=80,
                        random_state=42, n_jobs=None):
    """Fit a ``RandomForestRegressor`` on the training data and return it.

    Args:
        X_train: Training features.
        y_train: Training target values.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
        random_state: Seed passed to the estimator for reproducible fits.
        n_jobs: Number of parallel jobs passed to the estimator. The default
            ``None`` keeps scikit-learn's own default; pass ``-1`` to use all
            available cores.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    model.fit(X_train, y_train)
    return model

# Function to calculate log-return volatility
def calculate_log_volatility(y_values):
    """Return the standard deviation of consecutive log returns.

    Args:
        y_values: Prices in chronological order, as a list, NumPy array or
            pandas Series. The logarithm is only defined for positive
            price ratios.

    Returns:
        ``np.std`` (population standard deviation, ``ddof=0``) of
        ``log(p[t] / p[t-1])``, or NaN when fewer than two prices are given.
    """
    # Convert to a plain array before slicing: dividing two slices of a
    # pandas Series aligns them on index labels, which divides every price by
    # itself and always yields a volatility of 0. A plain list would not
    # support the division at all.
    prices = np.asarray(y_values, dtype=float)
    if prices.size < 2:
        return np.nan

    log_returns = np.log(prices[1:] / prices[:-1])  # Logarithmic returns
    return np.std(log_returns)  # Volatility as standard deviation of log returns

# Predict on the most recent feature rows and plot the predicted close prices
def evaluate_multiple_start_points(model, X_test, y_test, test_index, n_days_to_predict, num_start_points, feature_scaler, target_scaler, X_history=None, context_days=30):
    """Predict on the last rows of the feature history and plot the result.

    For each start index (currently only ``0``) the model predicts on the
    last ``context_days`` rows of ``X_history``, the predictions are mapped
    back to price scale with ``target_scaler`` and plotted. The figure is
    saved to ``../TSLA_pred/randomforest/future.png`` and shown.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test, y_test, test_index, num_start_points, feature_scaler:
            Accepted for interface compatibility; not used by this function.
        n_days_to_predict: Only used to compute the end index shown in the
            plot title.
        target_scaler: Fitted scaler whose ``inverse_transform`` converts the
            predictions back to the original target scale.
        X_history: DataFrame of (scaled) feature rows to take the context
            from. When None, the module-level ``X_train`` is used, which is
            what this function historically relied on.
        context_days: Number of trailing rows of ``X_history`` to predict on.
            Must be positive.

    Returns:
        None. No error metric is computed.
    """
    if X_history is None:
        # Backward compatibility: earlier versions read the global X_train
        # directly instead of receiving the history as an argument.
        X_history = X_train

    # Loop-invariant: make sure the output directory exists once, up front.
    os.makedirs("../TSLA_pred/randomforest", exist_ok=True)

    start_indices = [0]  # Manually specified starting indices
    for start_idx in start_indices:
        end_idx = start_idx + n_days_to_predict

        X_sub = X_history.iloc[-context_days:]
        print("xs", X_sub)
        print("xs", X_sub.shape)

        # Pass the DataFrame itself (not .values) so the model sees the same
        # feature names it was fitted with.
        predictions = model.predict(X_sub)
        print(predictions.shape)
        predictions = target_scaler.inverse_transform(predictions.reshape(-1, 1)).ravel()

        # Plot individual predictions
        plt.figure(figsize=(10, 6))

        plt.plot(np.arange(len(predictions)), predictions, label="Predictions", color="orange")
        plt.xlabel("Date")
        plt.ylabel("Close Price")
        plt.title(f"Predictions from Index {start_idx} to {end_idx}")
        plt.legend()
        plt.savefig("../TSLA_pred/randomforest/future.png")
        plt.show()
        plt.close()





# Script body
# NOTE: these statements run at module level on purpose:
# evaluate_multiple_start_points looks up X_train as a module-level name when
# it is called the way it is called below.
training_period = 30  # Number of days for lag features
n_days_to_predict = 30  # Number of days to predict for each starting point
num_start_points = 30  # Number of unique starting points
path = "../TSLA.csv"  # Path to your stock data file

# Load and preprocess data
headers = ["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume"]
numeric_columns = headers[1:]  # every column except "Date"
df = pd.read_csv(path, names=headers, skiprows=1)
df.replace("null", np.nan, inplace=True)
df[numeric_columns] = df[numeric_columns].astype(float)
df.set_index("Date", inplace=True)

# Split the dataset; idx=-1 holds out only the last row, which is merged back
# into the training data right below so the model is fitted on every row.
X_train, y_train, X_test, y_test, test_index, feature_scaler, target_scaler = split_data(df, training_period, idx=-1)

X_train = pd.concat((X_train, X_test))
y_train = pd.concat((y_train, y_test))

# Empty test containers that keep the columns, dtypes and name of the
# training data (an untyped empty Series/DataFrame has a version-dependent
# dtype that does not match the float training data).
X_test = X_train.iloc[:0].copy()
y_test = y_train.iloc[:0].copy()

input_shape = (X_train.shape[1], 1)  # Number of features as input shape (not used by the Random Forest below)
print("XT", X_train.shape)
print("YT", y_train.shape)
print("df", df.shape)

# Train the Random Forest model
model = train_random_forest(X_train, y_train)

# Predict on the most recent rows and plot the result
evaluate_multiple_start_points(model, X_test, y_test, test_index, n_days_to_predict, num_start_points, feature_scaler, target_scaler)


# Captured cell output:
# XT (3175, 37)
# YT (3175,)
#
# KeyboardInterrupt: