In [3]:
import pandas as pd
import xgboost as xgb
import numpy as n
from sklearn.metrics import mean_squared_error
import numpy as np
from datetime import datetime, timedelta, timezone
from data_extraction.dummy_data_extractor import extract_dummy_data

# Calendar components of the timestamp used as the model's only features.
_CALENDAR_FEATURES = ['year', 'month', 'day', 'hour', 'minute']


def _add_calendar_features(frame):
    """Return a copy of ``frame`` with one column per calendar component of ``frame['ds']``."""
    stamps = frame['ds'].dt
    return frame.assign(**{part: getattr(stamps, part) for part in _CALENDAR_FEATURES})


def xgboost_2(df, length_of_missing_data, data_logging_interval, dqStart, step=None):
    """Fit an XGBoost regressor on the history before ``dqStart`` and forecast the gap.

    The first two columns of ``df`` are treated as timestamp and value. Rows
    whose timestamp is strictly earlier than ``dqStart`` are used for training,
    with the timestamp's year, month, day, hour and minute as the only features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is datetime-like (the ``.dt`` accessor is
        applied to it) and whose second column is the target. Any further
        columns are ignored. The frame passed in is not modified.
    length_of_missing_data, data_logging_interval
        Only their ratio is used: ``int(length / interval) + 1`` forecast rows
        are produced. NOTE(review): presumably both are expressed in the same
        unit -- confirm against the caller.
    dqStart : datetime-like
        First forecast timestamp and exclusive upper bound of the training data.
    step : datetime.timedelta, optional
        Spacing between consecutive forecast timestamps. Defaults to 5 minutes,
        which is the spacing this function has always used.
        NOTE(review): the spacing is not derived from ``data_logging_interval``;
        pass ``step`` explicitly when the logging interval is not 5 minutes.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ds`` (the forecast timestamps) with a single
        column ``yhat`` holding the predictions.
    """
    if step is None:
        step = timedelta(minutes=5)

    # Work on an explicit copy of the first two columns so that the column
    # assignments below never write into a slice of the caller's frame.
    history = df.iloc[:, :2].copy()
    history.columns = ['ds', 'y']

    # Train only on observations recorded before the gap starts.
    history = history.loc[history['ds'] < dqStart].reset_index(drop=True)
    history = _add_calendar_features(history)

    # Forecast grid: starts at dqStart and covers the missing span inclusively.
    future_periods = int(length_of_missing_data / data_logging_interval) + 1
    future = pd.DataFrame({'ds': [dqStart + step * i for i in range(future_periods)]})
    future = _add_calendar_features(future)

    model = xgb.XGBRegressor(
        n_estimators=100,             # Number of boosting rounds
        max_depth=3,                  # Maximum depth of each tree
        learning_rate=0.1,            # Learning rate
        min_child_weight=1,           # Minimum sum of instance weight needed in a child
        subsample=0.8,                # Subsample ratio of the training instances
        colsample_bytree=0.8,         # Subsample ratio of columns when constructing each tree
        objective='reg:squarederror'  # Objective function for regression task
    )

    # Fit on the historical rows, then predict every timestamp of the grid.
    model.fit(history[_CALENDAR_FEATURES], history['y'])
    future['yhat'] = model.predict(future[_CALENDAR_FEATURES])

    # set_index returns a new frame, so no in-place edit of a column slice.
    return future[['ds', 'yhat']].set_index('ds')



In [5]:
# Example usage: forecast the data-quality gap described by the record at label 0.
master_table = extract_dummy_data("dummy_data")

# History of that record, reduced to its first two columns (independent copy)
df = master_table.at[0, "his"].iloc[:, :2].copy()

# Gap description, read from the first row (label 0) of master_table
length_of_missing_data, data_logging_interval, dqStart = (
    master_table.at[0, field]
    for field in ("dqDuration", "pointInterval", "dqStart")
)

# Run the forecast
predictions = xgboost_2(
    df=df,
    length_of_missing_data=length_of_missing_data,
    data_logging_interval=data_logging_interval,
    dqStart=dqStart,
)

# Show the forecast frame
print(predictions)

                                yhat
ds                                  
2023-05-09 23:55:00+04:00  15.957184
2023-05-10 00:00:00+04:00  17.960670
2023-05-10 00:05:00+04:00  18.744310
2023-05-10 00:10:00+04:00  17.010210
2023-05-10 00:15:00+04:00  16.869192
...                              ...
2023-05-11 23:40:00+04:00  15.558950
2023-05-11 23:45:00+04:00  15.573825
2023-05-11 23:50:00+04:00  15.573825
2023-05-11 23:55:00+04:00  15.564921
2023-05-12 00:00:00+04:00  17.480892

[578 rows x 1 columns]


  pythonDF.loc[i, 'dqStart'] = pd.to_datetime(df['ts'].iloc[i], format="%Y-%m-%dT%H:%M:%S%z Dubai")
