In [1]:
import os
import sys

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..')))

from helpers.helpers import load_and_prepare_csv

In [3]:
# Symbol whose price history is analysed throughout the notebook.
TICKER = 'SQQQ'

# The CSV is looked up under outputs/data/, relative to the current working directory.
csv_name = f'{TICKER}.csv'
DATA_PATH = os.path.join('outputs', 'data', csv_name)

# Project helper (helpers.helpers) that reads the CSV into the frame used below.
df = load_and_prepare_csv(DATA_PATH)

# Preview the first rows as the cell's rich output.
df.head()


Unnamed: 0_level_0,SQQQ_OPEN,SQQQ_HIGH,SQQQ_LOW,SQQQ_CLOSE,SQQQ_ADJ_CLOSE,SQQQ_VOLUME,NASDAQ 100_OPEN,NASDAQ 100_HIGH,NASDAQ 100_LOW,NASDAQ 100_CLOSE,...,Dow 30_LOW,Dow 30_CLOSE,Dow 30_ADJ_CLOSE,Dow 30_VOLUME,Russell 2000_OPEN,Russell 2000_HIGH,Russell 2000_LOW,Russell 2000_CLOSE,Russell 2000_ADJ_CLOSE,Russell 2000_VOLUME
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-11,515008.0,520000.0,488384.0,492672.0,430120.6875,8.0,1747.550049,1780.050049,1739.02002,1775.73999,...,9976.709961,10144.19043,10144.19043,194470000.0,594.299988,605.599976,590.609985,605.460022,605.460022,4400870000.0
2010-02-12,507328.0,507712.0,486080.0,488576.0,426544.71875,8.0,1761.290039,1783.699951,1756.650024,1779.109985,...,9983.820312,10099.139648,10099.139648,296510000.0,600.919983,610.719971,598.030029,610.719971,610.719971,4160680000.0
2010-02-16,477952.0,485952.0,469760.0,471232.0,411402.71875,7.0,1792.900024,1802.47998,1782.72998,1802.060059,...,10100.80957,10268.80957,10268.80957,234900000.0,613.799988,620.840027,611.580017,620.840027,620.840027,4080770000.0
2010-02-17,464960.0,471936.0,462656.0,464000.0,405089.0625,7.0,1808.550049,1811.199951,1799.25,1810.859985,...,10261.480469,10309.240234,10309.240234,193270000.0,622.409973,625.169983,620.330017,624.830017,624.830017,4259230000.0
2010-02-18,463424.0,465600.0,451520.0,452864.0,395366.8125,7.0,1809.01001,1825.819946,1806.76001,1823.390015,...,10294.509766,10392.900391,10392.900391,185310000.0,623.599976,629.320007,622.890015,629.320007,629.320007,3878620000.0


In [None]:
# Features and target
# The goal is a forecast, so the target is the NEXT row's close, not the same-day
# close. Predicting the same-day close while feeding in same-day OPEN/HIGH/LOW/
# ADJ_CLOSE leaks the answer into the features and inflates the scores.
# Assumes df rows are ordered oldest -> newest, one row per trading day (the
# df.head() preview shows ascending dates) -- TODO confirm in load_and_prepare_csv.
target_col = f'{TICKER}_CLOSE'
next_close = df[target_col].shift(-1)

# The most recent row has no next-day close yet, so it cannot be a training example;
# it is held back and used further down as the input for the actual forecast.
X = df.iloc[:-1]
y = next_close.iloc[:-1]

# Splitting data into training and testing sets
# shuffle=False keeps the split chronological: the model is fitted on the earliest
# 80% of rows and evaluated on the latest 20%. A shuffled split would train on rows
# that come after the test rows (look-ahead bias).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Initializing the Random Forest Regressor (fixed seed for reproducible trees)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Training the model
rf.fit(X_train, y_train)

# Making predictions on the held-out, most recent period
y_pred = rf.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Predicting future prices
# The latest row was excluded from fitting and evaluation above, so this is a
# genuine out-of-sample input. Selecting it with a list keeps a one-row DataFrame,
# and reindexing by X.columns guarantees the same column order used for training.
future_df = df.iloc[[-1]][X.columns]

# Predict the next close after the last available row
future_price_pred = rf.predict(future_df)

print(f"Predicted Future Price: {future_price_pred[0]}")
