In [285]:
from sklearn.model_selection import train_test_split
import pandas as pd
import pendulum
from google.cloud import bigquery
from fastapi import FastAPI, HTTPException
import uvicorn
from xgboost import XGBRegressor
import joblib
from get_consume_data import get_raw_data
import logging
from shared_functions import bigquery_client
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
import numpy as np
from sklearn.ensemble import RandomForestRegressor



logging.basicConfig(level=logging.INFO)

In [255]:
dataset_id = "shabubsinc_db"
view_id = "mview_consume"

In [256]:
df = get_raw_data(bigquery_client= bigquery_client,dataset_id=dataset_id,view_id=view_id)

INFO:root:Running query on view: mview_consume in dataset: shabubsinc_db
INFO:root:Retrieved 65456 rows from view mview_consume


In [257]:
df = df.sort_values(by='time_period_start', ascending=True)

In [258]:
df.head()

Unnamed: 0,time_period_start,time_period_end,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count,fear_greed_value,fear_greed_classification
312,2017-02-01 00:00:00+00:00,2017-02-01 01:00:00+00:00,2017-02-01 00:00:02+00:00,2017-02-01 00:59:18+00:00,963.99,968.69,963.45,968.69,232.478695,199,0,Unknown
6981,2017-02-01 01:00:00+00:00,2017-02-01 02:00:00+00:00,2017-02-01 01:00:31+00:00,2017-02-01 01:59:56+00:00,968.69,976.31,967.89,975.64,483.945799,324,0,Unknown
3407,2017-02-01 02:00:00+00:00,2017-02-01 03:00:00+00:00,2017-02-01 02:00:02+00:00,2017-02-01 02:59:57+00:00,975.03,975.3,962.93,969.98,476.365029,327,0,Unknown
2,2017-02-01 03:00:00+00:00,2017-02-01 04:00:00+00:00,2017-02-01 03:00:12+00:00,2017-02-01 03:59:53+00:00,969.11,971.94,968.48,969.95,268.37259,242,0,Unknown
1238,2017-02-01 04:00:00+00:00,2017-02-01 05:00:00+00:00,2017-02-01 04:00:02+00:00,2017-02-01 04:58:07+00:00,969.51,969.94,962.56,968.17,130.829892,147,0,Unknown


In [259]:
df.drop(columns=["time_open", "time_close"], inplace=True)

In [260]:
df.head()

Unnamed: 0,time_period_start,time_period_end,price_open,price_high,price_low,price_close,volume_traded,trades_count,fear_greed_value,fear_greed_classification
312,2017-02-01 00:00:00+00:00,2017-02-01 01:00:00+00:00,963.99,968.69,963.45,968.69,232.478695,199,0,Unknown
6981,2017-02-01 01:00:00+00:00,2017-02-01 02:00:00+00:00,968.69,976.31,967.89,975.64,483.945799,324,0,Unknown
3407,2017-02-01 02:00:00+00:00,2017-02-01 03:00:00+00:00,975.03,975.3,962.93,969.98,476.365029,327,0,Unknown
2,2017-02-01 03:00:00+00:00,2017-02-01 04:00:00+00:00,969.11,971.94,968.48,969.95,268.37259,242,0,Unknown
1238,2017-02-01 04:00:00+00:00,2017-02-01 05:00:00+00:00,969.51,969.94,962.56,968.17,130.829892,147,0,Unknown


In [261]:
df['time_period_start'] = pd.to_datetime(df['time_period_start'])
df['time_period_end'] = pd.to_datetime(df['time_period_end'])

df['year'] = df['time_period_start'].dt.year
df['month'] = df['time_period_start'].dt.month
df['day'] = df['time_period_start'].dt.day
df['hour'] = df['time_period_start'].dt.hour

df['time_period_start'] = df['time_period_start'].astype(str).str.split('+').str[0]
df['time_period_end'] = df['time_period_end'].astype(str).str.split('+').str[0]

In [262]:
df.reset_index(inplace=True, drop=True)

In [263]:
df.head()

Unnamed: 0,time_period_start,time_period_end,price_open,price_high,price_low,price_close,volume_traded,trades_count,fear_greed_value,fear_greed_classification,year,month,day,hour
0,2017-02-01 00:00:00,2017-02-01 01:00:00,963.99,968.69,963.45,968.69,232.478695,199,0,Unknown,2017,2,1,0
1,2017-02-01 01:00:00,2017-02-01 02:00:00,968.69,976.31,967.89,975.64,483.945799,324,0,Unknown,2017,2,1,1
2,2017-02-01 02:00:00,2017-02-01 03:00:00,975.03,975.3,962.93,969.98,476.365029,327,0,Unknown,2017,2,1,2
3,2017-02-01 03:00:00,2017-02-01 04:00:00,969.11,971.94,968.48,969.95,268.37259,242,0,Unknown,2017,2,1,3
4,2017-02-01 04:00:00,2017-02-01 05:00:00,969.51,969.94,962.56,968.17,130.829892,147,0,Unknown,2017,2,1,4


In [264]:
# Define the split ratio (e.g., 80% train, 20% test)
split_ratio = 0.8

# Get the index to split the data
split_index = int(len(df) * split_ratio)

# Split into train and test sets
train_data = df[:split_index]
test_data = df[split_index:]

# Check the shapes of the splits
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

Train data shape: (52364, 14)
Test data shape: (13092, 14)


In [265]:
X_train, y_train = train_data.drop(columns=["price_open", "price_close"]), train_data[["price_close"]]
X_test, y_test = test_data.drop(columns=["price_open", "price_close"]), test_data[["price_close"]]

In [266]:
numeric_cols = [
        'price_high',
        'price_low',
        'volume_traded',
        'trades_count',
        'fear_greed_value',
        ]

In [267]:
numeric_cols

['price_high',
 'price_low',
 'volume_traded',
 'trades_count',
 'fear_greed_value']

In [268]:
scaler = StandardScaler().fit(X_train[numeric_cols])

In [269]:
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [270]:
X_train.tail()

Unnamed: 0,time_period_start,time_period_end,price_high,price_low,volume_traded,trades_count,fear_greed_value,fear_greed_classification,year,month,day,hour
52359,2023-03-20 16:00:00,2023-03-20 17:00:00,0.581552,0.582798,-0.249946,0.958344,1.186598,Greed,2023,3,20,16
52360,2023-03-20 17:00:00,2023-03-20 18:00:00,0.583613,0.584576,-0.484822,0.08119,1.186598,Greed,2023,3,20,17
52361,2023-03-20 18:00:00,2023-03-20 19:00:00,0.59798,0.590463,-0.245041,0.44585,1.186598,Greed,2023,3,20,18
52362,2023-03-20 19:00:00,2023-03-20 20:00:00,0.596161,0.593897,-0.406879,0.534551,1.186598,Greed,2023,3,20,19
52363,2023-03-20 20:00:00,2023-03-20 21:00:00,0.597131,0.601133,-0.383592,0.362077,1.186598,Greed,2023,3,20,20


In [271]:
columns_to_encode = ["fear_greed_classification"]

In [272]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(
    X_train[columns_to_encode]
)

encoded_cols = list(encoder.get_feature_names_out(columns_to_encode))

In [273]:
encoded_cols

['fear_greed_classification_Extreme Fear',
 'fear_greed_classification_Extreme Greed',
 'fear_greed_classification_Fear',
 'fear_greed_classification_Greed',
 'fear_greed_classification_Neutral',
 'fear_greed_classification_Unknown']

In [274]:
X_train[encoded_cols] = encoder.transform(X_train[columns_to_encode])
X_test[encoded_cols] = encoder.transform(X_test[columns_to_encode])


In [275]:
X_train.drop(columns="fear_greed_classification", inplace=True)
X_test.drop(columns="fear_greed_classification", inplace=True)

In [276]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price_high,52364.0,2.388196e-17,1.00001,-1.05171,-0.700599,-0.484135,0.359043,3.07596
price_low,52364.0,-5.4277190000000004e-17,1.00001,-1.0523,-0.700707,-0.484142,0.360517,3.09042
volume_traded,52364.0,-6.513263e-17,1.00001,-0.725114,-0.549984,-0.316432,0.159642,30.771254
trades_count,52364.0,-7.815916000000001e-17,1.00001,-1.004904,-0.583081,-0.302195,0.217445,17.233986
fear_greed_value,52364.0,2.6053050000000002e-17,1.00001,-1.41869,-0.668683,-0.15552,0.673435,2.331346
year,52364.0,2019.703,1.760798,2017.0,2018.0,2020.0,2021.0,2023.0
month,52364.0,6.434726,3.491273,1.0,3.0,6.0,10.0,12.0
day,52364.0,15.6691,8.783233,1.0,8.0,16.0,23.0,31.0
hour,52364.0,11.49257,6.922302,0.0,5.0,11.0,17.0,23.0
fear_greed_classification_Extreme Fear,52364.0,0.240948,0.427663,0.0,0.0,0.0,0.0,1.0


In [277]:
X_train.head()

Unnamed: 0,time_period_start,time_period_end,price_high,price_low,volume_traded,trades_count,fear_greed_value,year,month,day,hour,fear_greed_classification_Extreme Fear,fear_greed_classification_Extreme Greed,fear_greed_classification_Fear,fear_greed_classification_Greed,fear_greed_classification_Neutral,fear_greed_classification_Unknown
0,2017-02-01 00:00:00,2017-02-01 01:00:00,-1.048025,-1.047877,-0.190419,-0.813704,-1.41869,2017,2,1,0,0.0,0.0,0.0,0.0,0.0,1.0
1,2017-02-01 01:00:00,2017-02-01 02:00:00,-1.047563,-1.047605,0.387991,-0.690508,-1.41869,2017,2,1,1,0.0,0.0,0.0,0.0,0.0,1.0
2,2017-02-01 02:00:00,2017-02-01 03:00:00,-1.047624,-1.047909,0.370555,-0.687551,-1.41869,2017,2,1,2,0.0,0.0,0.0,0.0,0.0,1.0
3,2017-02-01 03:00:00,2017-02-01 04:00:00,-1.047828,-1.047569,-0.107858,-0.771324,-1.41869,2017,2,1,3,0.0,0.0,0.0,0.0,0.0,1.0
4,2017-02-01 04:00:00,2017-02-01 05:00:00,-1.047949,-1.047932,-0.424226,-0.864953,-1.41869,2017,2,1,4,0.0,0.0,0.0,0.0,0.0,1.0


In [278]:

# Function to create time features, including time to predict
def create_time_features(df, prediction_offset_in_hours=1):
    # Convert time_period_start and time_period_end to Unix timestamps (seconds since epoch)
    df["time_period_start_unix"] = (
        pd.to_datetime(df["time_period_start"]).astype(int) / 10**9
    )
    df["time_period_end_unix"] = (
        pd.to_datetime(df["time_period_end"]).astype(int) / 10**9
    )

    # Calculate the duration between time_period_start and time_period_end (in seconds)
    df["duration"] = (
        pd.to_datetime(df["time_period_end"]) - pd.to_datetime(df["time_period_start"])
    ).dt.total_seconds()

    # Extract useful components (hour, day, month) from time_period_start
    df["start_hour"] = pd.to_datetime(df["time_period_start"]).dt.hour
    df["start_day"] = pd.to_datetime(df["time_period_start"]).dt.day
    df["start_month"] = pd.to_datetime(df["time_period_start"]).dt.month

    # Encode hour and day of month as cyclical features
    df["hour_sin"] = np.sin(2 * np.pi * df["start_hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["start_hour"] / 24)

    df["day_sin"] = np.sin(2 * np.pi * df["start_day"] / 30)
    df["day_cos"] = np.cos(2 * np.pi * df["start_day"] / 30)

    # Time to predict: Add prediction offset to time_period_end to get target time
    prediction_offset_seconds = (
        prediction_offset_in_hours * 3600
    )  # Convert hours to seconds
    df["time_to_predict_unix"] = df["time_period_end_unix"] + prediction_offset_seconds

    # Return the modified DataFrame
    return df


# Apply the function to both X_train and X_test
X_train = create_time_features(
    X_train, prediction_offset_in_hours=1
)  # Change offset as needed
X_test = create_time_features(X_test, prediction_offset_in_hours=1)

# Now remove the original columns that we used to create the features
columns_to_remove = ['time_period_start', 'time_period_end', 'start_hour', 'start_day', 'start_month', "year", "month", "day", "hour"]

X_train = X_train.drop(columns=columns_to_remove)
X_test = X_test.drop(columns=columns_to_remove)



In [280]:
X_train.head()

Unnamed: 0,price_high,price_low,volume_traded,trades_count,fear_greed_value,fear_greed_classification_Extreme Fear,fear_greed_classification_Extreme Greed,fear_greed_classification_Fear,fear_greed_classification_Greed,fear_greed_classification_Neutral,fear_greed_classification_Unknown,time_period_start_unix,time_period_end_unix,duration,hour_sin,hour_cos,day_sin,day_cos,time_to_predict_unix
0,-1.048025,-1.047877,-0.190419,-0.813704,-1.41869,0.0,0.0,0.0,0.0,0.0,1.0,1485907000.0,1485911000.0,3600.0,0.0,1.0,0.207912,0.978148,1485914000.0
1,-1.047563,-1.047605,0.387991,-0.690508,-1.41869,0.0,0.0,0.0,0.0,0.0,1.0,1485911000.0,1485914000.0,3600.0,0.258819,0.965926,0.207912,0.978148,1485918000.0
2,-1.047624,-1.047909,0.370555,-0.687551,-1.41869,0.0,0.0,0.0,0.0,0.0,1.0,1485914000.0,1485918000.0,3600.0,0.5,0.866025,0.207912,0.978148,1485922000.0
3,-1.047828,-1.047569,-0.107858,-0.771324,-1.41869,0.0,0.0,0.0,0.0,0.0,1.0,1485918000.0,1485922000.0,3600.0,0.707107,0.707107,0.207912,0.978148,1485925000.0
4,-1.047949,-1.047932,-0.424226,-0.864953,-1.41869,0.0,0.0,0.0,0.0,0.0,1.0,1485922000.0,1485925000.0,3600.0,0.866025,0.5,0.207912,0.978148,1485929000.0


In [282]:
y_test.head()

Unnamed: 0,price_close
52364,28076.0
52365,28048.0
52366,27815.0
52367,27938.0
52368,27953.0


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, root_mean_squared_error

# Initialize the Random Forest Regressor
rf = RandomForestRegressor()

# Define a grid of hyperparameters to search
param_grid = {
    "n_estimators": [100, 200, 300],  # Number of trees in the forest
    "max_depth": [10, 20, 30, None],  # Maximum depth of each tree
    "min_samples_split": [
        2,
        5,
        10,
    ],  # Minimum number of samples required to split an internal node
    "min_samples_leaf": [
        1,
        2,
        4,
    ],  # Minimum number of samples required to be at a leaf node
}

# Set up GridSearchCV to tune hyperparameters
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring="neg_mean_squared_error",
)

# Fit GridSearchCV to find the best hyperparameters on X_train and y_train
grid_search.fit(X_train, y_train)

# Best parameters found by GridSearchCV
print(f"Best Hyperparameters: {grid_search.best_params_}")



In [None]:
best_rf = grid_search.best_estimator_

# Make predictions on X_test
test_predictions = best_rf.predict(X_test)


In [None]:
# Evaluate the model using Mean Squared Error on the test set
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = root_mean_squared_error(y_test, test_predictions)
print(f"Test Mean Squared Error: {test_mse}")
print(f"Root Mean Squared Error: {test_rmse}")