In [4]:
import pandas as pd
from prophet import Prophet
from data_extraction.dummy_data_extractor import extract_dummy_data

def facebook_pred(df, length_of_missing_data, data_logging_interval, dqStart):
    """Fit a Prophet model on a temperature history and forecast from ``dqStart`` onwards.

    Parameters
    ----------
    df : pandas.DataFrame
        History frame. After ``reset_index()`` its first column must hold
        timestamp strings matching ``%Y-%m-%dT%H:%M:%S%z`` followed by
        ``' Dubai'``, and its second column readings with a ``'°C'`` suffix.
        The caller's frame is left untouched.
    length_of_missing_data, data_logging_interval
        Their ratio (plus one) is the number of future samples requested from
        Prophet. When ``data_logging_interval`` is a timedelta it is also used
        as the spacing of those samples; otherwise 5 minutes is used.
    dqStart
        Start of the window to return, interpreted as Asia/Dubai wall time.

    Returns
    -------
    pandas.DataFrame
        Column ``yhat`` indexed by ``ds`` for every forecast row with
        ``ds >= dqStart``.

    Raises
    ------
    ValueError
        If no row with a parseable timestamp remains after cleaning.
    """
    # Local import: this cell only imports pandas and Prophet at the top.
    from datetime import timedelta

    # reset_index() without inplace returns a new frame, so repeated calls on the
    # same input keep working; .copy() avoids chained-assignment warnings below.
    history = df.reset_index().dropna().iloc[:, :2].copy()
    history.columns = ['ds', 'y']

    # Strip the ' Dubai' suffix and parse. errors='coerce' turns unparseable
    # values into NaT so that the dropna below can actually remove them.
    history['ds'] = pd.to_datetime(
        history['ds'].astype(str).str.replace(' Dubai', '', regex=False),
        format="%Y-%m-%dT%H:%M:%S%z",
        errors='coerce',
    )
    history = history.dropna(subset=['ds']).copy()
    if history.empty:
        raise ValueError("no rows with a parseable timestamp left to fit on")

    # '22.5°C' -> 22.5
    history['y'] = (
        history['y'].astype(str).str.replace('°C', '', regex=False).astype(float)
    )

    # Prophet is fed tz-naive timestamps.
    history['ds'] = history['ds'].dt.tz_localize(None)

    # Initialize Prophet model with tuned hyperparameters
    model_temp = Prophet(seasonality_mode='additive',   # Adjust based on data exploration
                         interval_width=0.95,           # Adjust prediction interval if needed
                         changepoint_prior_scale=0.01)  # Tune based on data patterns
    model_temp.fit(history)

    # Number of predictions
    samples = int(length_of_missing_data / data_logging_interval) + 1

    # Space the future rows by the logging interval so that `samples` rows span
    # length_of_missing_data; fall back to 5 minutes for non-timedelta intervals.
    if isinstance(data_logging_interval, timedelta):
        step = pd.Timedelta(data_logging_interval)
    else:
        step = pd.Timedelta(minutes=5)

    future_temp = model_temp.make_future_dataframe(periods=samples, freq=step)
    forecast_temp = model_temp.predict(future_temp)

    # Convert dq_start to timezone-naive
    dq_start = pd.Timestamp(dqStart, tz='Asia/Dubai').tz_localize(None)

    # Ensure 'ds' column in forecast_temp is timezone-naive
    forecast_temp['ds'] = forecast_temp['ds'].dt.tz_localize(None)

    # Keep only the rows from dq_start onwards, indexed by timestamp.
    from_start = forecast_temp['ds'] >= dq_start
    return forecast_temp.loc[from_start, ['ds', 'yhat']].set_index('ds')

# Example usage: run facebook_pred on one history frame from the dummy dataset.
# Replace these values with your actual data and variables
master_table = extract_dummy_data("dummy_data")
# Take the first two columns of the object stored at row label 1, column "his".
# The .copy() gives `df` its own data, separate from what master_table holds.
df = master_table.at[1, "his"].iloc[:, :2].copy()

# Inputs for the forecast: the sample count is derived from
# length_of_missing_data / data_logging_interval, and predictions are
# returned from dqStart onwards.
length_of_missing_data = pd.Timedelta('0 days 23:30:00')
data_logging_interval = pd.Timedelta('0 days 00:05:00')
dqStart = '2023-03-19 01:10:00'

# Call the function
predictions = facebook_pred(df, length_of_missing_data, data_logging_interval, dqStart)

# Display the first rows of the resulting dataframe
print(predictions.head())


  pythonDF.loc[i, 'dqStart'] = pd.to_datetime(df['ts'].iloc[i], format="%Y-%m-%dT%H:%M:%S%z Dubai")
09:14:35 - cmdstanpy - INFO - Chain [1] start processing
09:14:35 - cmdstanpy - INFO - Chain [1] done processing


                          yhat
ds                            
2023-03-19 01:10:00  22.503415
2023-03-19 01:15:00  22.493035
2023-03-19 01:20:00  22.482317
2023-03-19 01:25:00  22.471234
2023-03-19 01:30:00  22.459765


In [22]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
from datetime import datetime, timedelta, timezone

def xgboost_1(df, length_of_missing_data, data_logging_interval, dqStart, model_params=None):
    """Fit an XGBoost regressor on calendar features and predict a window from ``dqStart``.

    Parameters
    ----------
    df : pandas.DataFrame
        History frame. After ``reset_index()`` its first column must hold
        timestamp strings matching ``%Y-%m-%dT%H:%M:%S%z`` followed by
        ``' Dubai'``, and its second column readings with a ``'°C'`` suffix.
        The caller's frame is left untouched.
    length_of_missing_data, data_logging_interval
        Their ratio (plus one) is the number of rows predicted. When
        ``data_logging_interval`` is a timedelta it is also the spacing of the
        predicted rows; otherwise 5 minutes is used.
    dqStart
        First predicted timestamp, interpreted as Asia/Dubai wall time.
    model_params : dict, optional
        Keyword arguments forwarded to ``xgb.XGBRegressor``. With the default
        ``None`` the regressor is built with no arguments.

    Returns
    -------
    pandas.DataFrame
        Column ``yhat`` indexed by ``ds``.

    Raises
    ------
    ValueError
        If no row with a parseable timestamp remains after cleaning.
    """
    feature_cols = ['year', 'month', 'day', 'hour', 'minute']

    # reset_index() without inplace returns a new frame, so repeated calls on the
    # same input keep working; .copy() avoids chained-assignment warnings below.
    history = df.reset_index().dropna().iloc[:, :2].copy()
    history.columns = ['ds', 'y']

    # Strip the ' Dubai' suffix and parse. errors='coerce' turns unparseable
    # values into NaT so that the dropna below can actually remove them.
    history['ds'] = pd.to_datetime(
        history['ds'].astype(str).str.replace(' Dubai', '', regex=False),
        format="%Y-%m-%dT%H:%M:%S%z",
        errors='coerce',
    )
    history = history.dropna(subset=['ds']).copy()
    if history.empty:
        raise ValueError("no rows with a parseable timestamp left to fit on")

    # '22.5°C' -> 22.5
    history['y'] = (
        history['y'].astype(str).str.replace('°C', '', regex=False).astype(float)
    )

    # Ensure 'ds' column is timezone-naive
    history['ds'] = history['ds'].dt.tz_localize(None)

    # Space the predicted rows by the logging interval so that they span
    # length_of_missing_data; fall back to 5 minutes for non-timedelta intervals.
    future_periods = int(length_of_missing_data / data_logging_interval) + 1
    if isinstance(data_logging_interval, timedelta):
        step = pd.Timedelta(data_logging_interval)
    else:
        step = pd.Timedelta(minutes=5)

    dq_start = pd.Timestamp(dqStart, tz='Asia/Dubai').tz_localize(None)
    future_temp = pd.DataFrame(
        {'ds': pd.date_range(start=dq_start, periods=future_periods, freq=step)}
    )

    # Same calendar features for the training rows and the rows to predict.
    for col in feature_cols:
        history[col] = getattr(history['ds'].dt, col)
        future_temp[col] = getattr(future_temp['ds'].dt, col)

    # Initialize and fit the XGBoost model
    model_temp = xgb.XGBRegressor(**(model_params or {}))
    model_temp.fit(history[feature_cols], history['y'])

    # Predict the future values
    future_temp['yhat'] = model_temp.predict(future_temp[feature_cols])

    return future_temp[['ds', 'yhat']].set_index('ds')

# Example usage: run xgboost_1 on one history frame from the dummy dataset.
# Replace these values with your actual data and variables
# NOTE: extract_dummy_data is not imported in this cell; it must already be in
# the kernel namespace from an earlier cell.
master_table = extract_dummy_data("dummy_data")
# Take the first two columns of the object stored at row label 1, column "his".
# The .copy() gives `df` its own data, separate from what master_table holds.
df = master_table.at[1, "his"].iloc[:, :2].copy()

# Inputs for the prediction: the row count is derived from
# length_of_missing_data / data_logging_interval, and the first predicted
# timestamp is dqStart.
length_of_missing_data = pd.Timedelta('0 days 23:30:00')
data_logging_interval = pd.Timedelta('0 days 00:05:00')
dqStart = '2023-03-19 01:10:00'

# Call the function
predictions = xgboost_1(df, length_of_missing_data, data_logging_interval, dqStart)

# Display the first rows of the resulting dataframe
print(predictions.head())


                          yhat
ds                            
2023-03-19 01:10:00  22.641504
2023-03-19 01:15:00  22.641407
2023-03-19 01:20:00  22.652977
2023-03-19 01:25:00  22.648750
2023-03-19 01:30:00  22.621050


  pythonDF.loc[i, 'dqStart'] = pd.to_datetime(df['ts'].iloc[i], format="%Y-%m-%dT%H:%M:%S%z Dubai")


In [24]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
from datetime import datetime, timedelta, timezone

def xgboost_1(df, length_of_missing_data, data_logging_interval, dqStart, model_params=None):
    """Fit a regularised XGBoost regressor on calendar features and predict from ``dqStart``.

    Parameters
    ----------
    df : pandas.DataFrame
        History frame. After ``reset_index()`` its first column must hold
        timestamp strings matching ``%Y-%m-%dT%H:%M:%S%z`` followed by
        ``' Dubai'``, and its second column readings with a ``'°C'`` suffix.
        The caller's frame is left untouched.
    length_of_missing_data, data_logging_interval
        Their ratio (plus one) is the number of rows predicted. When
        ``data_logging_interval`` is a timedelta it is also the spacing of the
        predicted rows; otherwise 5 minutes is used.
    dqStart
        First predicted timestamp, interpreted as Asia/Dubai wall time.
    model_params : dict, optional
        Overrides merged on top of this function's default
        ``xgb.XGBRegressor`` keyword arguments.

    Returns
    -------
    pandas.DataFrame
        Column ``yhat`` indexed by ``ds``.

    Raises
    ------
    ValueError
        If no row with a parseable timestamp remains after cleaning.
    """
    feature_cols = ['year', 'month', 'day', 'hour', 'minute']

    # reset_index() without inplace returns a new frame, so repeated calls on the
    # same input keep working; .copy() avoids chained-assignment warnings below.
    history = df.reset_index().dropna().iloc[:, :2].copy()
    history.columns = ['ds', 'y']

    # Strip the ' Dubai' suffix and parse. errors='coerce' turns unparseable
    # values into NaT so that the dropna below can actually remove them.
    history['ds'] = pd.to_datetime(
        history['ds'].astype(str).str.replace(' Dubai', '', regex=False),
        format="%Y-%m-%dT%H:%M:%S%z",
        errors='coerce',
    )
    history = history.dropna(subset=['ds']).copy()
    if history.empty:
        raise ValueError("no rows with a parseable timestamp left to fit on")

    # '22.5°C' -> 22.5
    history['y'] = (
        history['y'].astype(str).str.replace('°C', '', regex=False).astype(float)
    )

    # Ensure 'ds' column is timezone-naive
    history['ds'] = history['ds'].dt.tz_localize(None)

    # Space the predicted rows by the logging interval so that they span
    # length_of_missing_data; fall back to 5 minutes for non-timedelta intervals.
    future_periods = int(length_of_missing_data / data_logging_interval) + 1
    if isinstance(data_logging_interval, timedelta):
        step = pd.Timedelta(data_logging_interval)
    else:
        step = pd.Timedelta(minutes=5)

    dq_start = pd.Timestamp(dqStart, tz='Asia/Dubai').tz_localize(None)
    future_temp = pd.DataFrame(
        {'ds': pd.date_range(start=dq_start, periods=future_periods, freq=step)}
    )

    # Same calendar features for the training rows and the rows to predict.
    for col in feature_cols:
        history[col] = getattr(history['ds'].dt, col)
        future_temp[col] = getattr(future_temp['ds'].dt, col)

    # Default hyperparameters chosen to reduce noise; model_params overrides them.
    params = {
        'n_estimators': 100,              # Number of boosting rounds
        'max_depth': 3,                   # Maximum depth of each tree
        'learning_rate': 0.1,             # Learning rate
        'min_child_weight': 1,            # Minimum sum of instance weight needed in a child
        'subsample': 0.8,                 # Subsample ratio of the training instances
        'colsample_bytree': 0.8,          # Subsample ratio of columns when constructing each tree
        'objective': 'reg:squarederror',  # Objective function for regression task
    }
    if model_params:
        params.update(model_params)
    model_temp = xgb.XGBRegressor(**params)

    # Fit the model
    model_temp.fit(history[feature_cols], history['y'])

    # Predict the future values
    future_temp['yhat'] = model_temp.predict(future_temp[feature_cols])

    return future_temp[['ds', 'yhat']].set_index('ds')

# Example usage: run xgboost_1 on one history frame from the dummy dataset.
# Replace these values with your actual data and variables
# NOTE: extract_dummy_data is not imported in this cell; it must already be in
# the kernel namespace from an earlier cell.
master_table = extract_dummy_data("dummy_data")
# Take the first two columns of the object stored at row label 1, column "his".
# The .copy() gives `df` its own data, separate from what master_table holds.
df = master_table.at[1, "his"].iloc[:, :2].copy()

# Inputs for the prediction: the row count is derived from
# length_of_missing_data / data_logging_interval, and the first predicted
# timestamp is dqStart.
length_of_missing_data = pd.Timedelta('0 days 23:30:00')
data_logging_interval = pd.Timedelta('0 days 00:05:00')
dqStart = '2023-03-19 01:10:00'

# Call the function
predictions = xgboost_1(df, length_of_missing_data, data_logging_interval, dqStart)

# Display the first rows of the resulting dataframe
print(predictions.head())


                          yhat
ds                            
2023-03-19 01:10:00  22.747757
2023-03-19 01:15:00  22.740808
2023-03-19 01:20:00  22.694866
2023-03-19 01:25:00  22.684923
2023-03-19 01:30:00  22.666439


  pythonDF.loc[i, 'dqStart'] = pd.to_datetime(df['ts'].iloc[i], format="%Y-%m-%dT%H:%M:%S%z Dubai")


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
from datetime import datetime, timedelta, timezone

def xgboost_1(df, length_of_missing_data, data_logging_interval, dqStart, model_params=None):
    """Fit a depth-1 XGBoost regressor on calendar features and predict from ``dqStart``.

    Parameters
    ----------
    df : pandas.DataFrame
        History frame. After ``reset_index()`` its first column must hold
        timestamp strings matching ``%Y-%m-%dT%H:%M:%S%z`` followed by
        ``' Dubai'``, and its second column readings with a ``'°C'`` suffix.
        The caller's frame is left untouched.
    length_of_missing_data, data_logging_interval
        Their ratio (plus one) is the number of rows predicted. When
        ``data_logging_interval`` is a timedelta it is also the spacing of the
        predicted rows; otherwise 5 minutes is used.
    dqStart
        First predicted timestamp, interpreted as Asia/Dubai wall time.
    model_params : dict, optional
        Overrides merged on top of this function's default
        ``xgb.XGBRegressor`` keyword arguments.

    Returns
    -------
    pandas.DataFrame
        Column ``yhat`` indexed by ``ds``.

    Raises
    ------
    ValueError
        If no row with a parseable timestamp remains after cleaning.
    """
    feature_cols = ['year', 'month', 'day', 'hour', 'minute']

    # reset_index() without inplace returns a new frame, so repeated calls on the
    # same input keep working; .copy() avoids chained-assignment warnings below.
    history = df.reset_index().dropna().iloc[:, :2].copy()
    history.columns = ['ds', 'y']

    # Strip the ' Dubai' suffix and parse. errors='coerce' turns unparseable
    # values into NaT so that the dropna below can actually remove them.
    history['ds'] = pd.to_datetime(
        history['ds'].astype(str).str.replace(' Dubai', '', regex=False),
        format="%Y-%m-%dT%H:%M:%S%z",
        errors='coerce',
    )
    history = history.dropna(subset=['ds']).copy()
    if history.empty:
        raise ValueError("no rows with a parseable timestamp left to fit on")

    # '22.5°C' -> 22.5
    history['y'] = (
        history['y'].astype(str).str.replace('°C', '', regex=False).astype(float)
    )

    # Ensure 'ds' column is timezone-naive
    history['ds'] = history['ds'].dt.tz_localize(None)

    # Space the predicted rows by the logging interval so that they span
    # length_of_missing_data; fall back to 5 minutes for non-timedelta intervals.
    future_periods = int(length_of_missing_data / data_logging_interval) + 1
    if isinstance(data_logging_interval, timedelta):
        step = pd.Timedelta(data_logging_interval)
    else:
        step = pd.Timedelta(minutes=5)

    dq_start = pd.Timestamp(dqStart, tz='Asia/Dubai').tz_localize(None)
    future_temp = pd.DataFrame(
        {'ds': pd.date_range(start=dq_start, periods=future_periods, freq=step)}
    )

    # Same calendar features for the training rows and the rows to predict.
    for col in feature_cols:
        history[col] = getattr(history['ds'].dt, col)
        future_temp[col] = getattr(future_temp['ds'].dt, col)

    # Default hyperparameters chosen to reduce noise; model_params overrides them.
    params = {
        'n_estimators': 90,               # Number of boosting rounds
        'max_depth': 1,                   # Maximum depth of each tree
        'learning_rate': 0.1,             # Learning rate
        'min_child_weight': 1,            # Minimum sum of instance weight needed in a child
        'subsample': 0.8,                 # Subsample ratio of the training instances
        'colsample_bytree': 0.8,          # Subsample ratio of columns when constructing each tree
        'objective': 'reg:squarederror',  # Objective function for regression task
    }
    if model_params:
        params.update(model_params)
    model_temp = xgb.XGBRegressor(**params)

    # Fit the model
    model_temp.fit(history[feature_cols], history['y'])

    # Predict the future values
    future_temp['yhat'] = model_temp.predict(future_temp[feature_cols])

    return future_temp[['ds', 'yhat']].set_index('ds')

# Example usage: run xgboost_1 on one history frame from the dummy dataset.
# Replace these values with your actual data and variables
# NOTE: extract_dummy_data is not imported in this cell; it must already be in
# the kernel namespace from an earlier cell.
master_table = extract_dummy_data("dummy_data")
# Take the first two columns of the object stored at row label 1, column "his".
# The .copy() gives `df` its own data, separate from what master_table holds.
df = master_table.at[1, "his"].iloc[:, :2].copy()

# Inputs for the prediction: the row count is derived from
# length_of_missing_data / data_logging_interval, and the first predicted
# timestamp is dqStart.
length_of_missing_data = pd.Timedelta('0 days 23:30:00')
data_logging_interval = pd.Timedelta('0 days 00:05:00')
dqStart = '2023-03-19 01:10:00'

# Call the function
predictions = xgboost_1(df, length_of_missing_data, data_logging_interval, dqStart)

# Display the first rows of the resulting dataframe
print(predictions.head())
