In [18]:
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_squared_error
import numpy as np
from data_extraction.dummy_data_extractor import extract_dummy_data

# Extract dummy data
master_table = extract_dummy_data("dummy_data")

# Select the relevant DataFrame
df = master_table.at[1, "his"]

# Reset index and drop NA values
df.reset_index(inplace=True)
df = df.dropna()

# Rename columns
df.columns = ['ds', 'temp', 'new_point']

# Remove ' Dubai' from the datetime strings
df['ds'] = df['ds'].str.replace(' Dubai', '', regex=False)

# Convert the 'ds' column to datetime format
df['ds'] = pd.to_datetime(df['ds'], format="%Y-%m-%dT%H:%M:%S%z")

# Print the DataFrame to check
print(df)


                            ds                  temp new_point
0    2023-03-14 21:10:00+04:00  24.269638061523438°C    33.13%
1    2023-03-14 21:15:00+04:00  24.269638061523438°C    33.13%
2    2023-03-14 21:20:00+04:00  24.269638061523438°C    33.13%
3    2023-03-14 21:25:00+04:00  24.269638061523438°C    33.13%
4    2023-03-14 21:30:00+04:00  24.269638061523438°C    33.13%
...                        ...                   ...       ...
1195 2023-03-19 00:45:00+04:00  22.633480072021484°C    29.43%
1196 2023-03-19 00:50:00+04:00  22.633480072021484°C    30.55%
1197 2023-03-19 00:55:00+04:00  22.633480072021484°C    30.51%
1198 2023-03-19 01:00:00+04:00  22.633480072021484°C    30.51%
1199 2023-03-19 01:05:00+04:00  22.633480072021484°C    30.51%

[1197 rows x 3 columns]


  pythonDF.loc[i, 'dqStart'] = pd.to_datetime(df['ts'].iloc[i], format="%Y-%m-%dT%H:%M:%S%z Dubai")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ds'] = df['ds'].str.replace(' Dubai', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ds'] = pd.to_datetime(df['ds'], format="%Y-%m-%dT%H:%M:%S%z")


In [46]:
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_squared_error
import numpy as np
from data_extraction.dummy_data_extractor import extract_dummy_data
import pytz

# Extract dummy data
master_table = extract_dummy_data("dummy_data")

# Select the relevant DataFrame and make a deep copy to avoid SettingWithCopyWarning
df = master_table.at[1, "his"].copy()

# Reset index and drop NA values
df.reset_index(inplace=True)
df = df.dropna()

# Rename columns
df.columns = ['ts', 'temp', 'new_point']

# Remove ' Dubai' from the datetime strings using .loc to avoid SettingWithCopyWarning
df.loc[:, 'ts'] = df['ts'].str.replace(' Dubai', '', regex=False)

# Convert the 'ds' column to datetime format using .loc to avoid SettingWithCopyWarning
df.loc[:, 'ts'] = pd.to_datetime(df['ts'], format="%Y-%m-%dT%H:%M:%S%z")

# Clean temperature column and convert to numeric
df['temp'] = df['temp'].str.replace('°C', '').astype(float)

# Clean percentage column and convert to numeric
df['new_point'] = df['new_point'].str.replace('%', '').astype(float)

# Separate data for temperature and new_point
df_temp = df[['ts', 'temp']].rename(columns={'ts': 'ds', 'temp': 'y'})
df_new_point = df[['ts', 'new_point']].rename(columns={'ts': 'ds', 'new_point': 'y'})

# Ensure 'ds' column is timezone-naive
df_temp['ds'] = df_temp['ds'].dt.tz_localize(None)
df_new_point['ds'] = df_new_point['ds'].dt.tz_localize(None)

# Initialize Prophet models with tuned hyperparameters
model_temp = Prophet(seasonality_mode='additive', interval_width=0.95, changepoint_prior_scale=0.01)
model_new_point = Prophet(seasonality_mode='additive', interval_width=0.95, changepoint_prior_scale=0.01)

# Fit the models
model_temp.fit(df_temp)
model_new_point.fit(df_new_point)


  pythonDF.loc[i, 'dqStart'] = pd.to_datetime(df['ts'].iloc[i], format="%Y-%m-%dT%H:%M:%S%z Dubai")


AttributeError: Can only use .dt accessor with datetimelike values

In [78]:
import pandas as pd
from prophet import Prophet
from data_extraction.dummy_data_extractor import extract_dummy_data
import numpy as np
from sklearn.metrics import mean_squared_error

# Extract dummy data
master_table = extract_dummy_data("dummy_data")

# Select the relevant DataFrame and make a deep copy to avoid SettingWithCopyWarning
df = master_table.at[1, "his"].copy()
# keep only the history BEFORE the start of the data quality issue, since this is a statistical model not ML model

length_of_missing_data = 400
data_logging_interval = 1

def fprophet(df, length_of_missing_data, data_logging_interval, dqStart):
    # Filter data before dqStart
    df['ts'] = df['ts'].str.replace(' Dubai', '', regex=False)
    
    # Reset index and drop NA values
    df.reset_index(inplace=True)
    df = df.dropna()

    # Rename columns for clarity
    df.columns = ['ts', 'temp', 'new_point']

    # Remove ' Dubai' from the datetime strings
    df['ds'] = pd.to_datetime(df['ts'], format="%Y-%m-%dT%H:%M:%S%z", errors='coerce')

    # Clean temperature column and convert to numeric
    df['temp'] = df['temp'].str.replace('°C', '').astype(float)

    # Clean percentage column and convert to numeric
    df['new_point'] = df['new_point'].str.replace('%', '').astype(float)

    # Drop rows where datetime conversion failed
    df = df.dropna(subset=['ds'])

    # Ensure 'ds' column is timezone-naive
    df['ds'] = df['ds'].dt.tz_localize(None)

    # Separate data for temperature and new_point
    df_temp = df[['ds', 'temp']].rename(columns={'temp': 'y'})
    df_new_point = df[['ds', 'new_point']].rename(columns={'new_point': 'y'})

    # Initialize Prophet models with tuned hyperparameters
    model_temp = Prophet(seasonality_mode='additive', interval_width=0.95, changepoint_prior_scale=0.001)
    model_new_point = Prophet(seasonality_mode='additive', interval_width=0.95, changepoint_prior_scale=0.001)

    # Fit the models
    model_temp.fit(df_temp)
    model_new_point.fit(df_new_point)

    # Create future DataFrames for both temp and new_point (next n_pred samples, assuming 5-minute intervals)
    n_pred = int(length_of_missing_data / data_logging_interval) + 1
    future_temp = model_temp.make_future_dataframe(periods=n_pred, freq='5T')
    future_new_point = model_new_point.make_future_dataframe(periods=n_pred, freq='5T')

    # Predict the future values
    forecast_temp = model_temp.predict(future_temp)
    forecast_new_point = model_new_point.predict(future_new_point)

    # Calculate residuals
    residuals_temp = df_temp['y'] - forecast_temp['yhat'][:len(df_temp)]
    residuals_new_point = df_new_point['y'] - forecast_new_point['yhat'][:len(df_new_point)]

    # Compute RMSE for temp
    actual_temp = df_temp['y'].values
    predicted_temp = forecast_temp['yhat'].values[-n_pred:]
    rmse_temp = np.sqrt(mean_squared_error(actual_temp[-n_pred:], predicted_temp))
    print(f"RMSE for temp: {rmse_temp}")

    # Compute RMSE for new_point
    actual_new_point = df_new_point['y'].values
    predicted_new_point = forecast_new_point['yhat'].values[-n_pred:]
    rmse_new_point = np.sqrt(mean_squared_error(actual_new_point[-n_pred:], predicted_new_point))
    print(f"RMSE for new_point: {rmse_new_point}")

# Example usage
dqStart = pd.Timestamp('2023-03-12 01:05:00+04:00')
fprophet(df, length_of_missing_data, data_logging_interval, dqStart)


  pythonDF.loc[i, 'dqStart'] = pd.to_datetime(df['ts'].iloc[i], format="%Y-%m-%dT%H:%M:%S%z Dubai")


KeyError: 'ts'