In [22]:
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
from data_extraction.dummy_data_extractor import extract_dummy_data

def prophet_forecasting(df, length_of_missing_data, data_logging_interval, dqStart):
    """
    Function to perform time series forecasting using Prophet.

    Parameters:
    - df: DataFrame, input data with columns ['ts', 'temp', 'new_point']
    - length_of_missing_data: int, length of missing data
    - data_logging_interval: int, data logging interval
    - dqStart: not used in the function but kept for consistency with the original signature

    Returns:
    - df: DataFrame with predictions for all rows with missing columns. Index names as ts.
    """
    df = extract_dummy_data("dummy_data")
    df = df.at[0, "his"]
    mt = df.set_index(["ts"])

    # Tag and filter rows with missing
    mt["status"] = mt.isna().any(axis=1)
    mt_predict = mt[mt["status"] == 1]
    X_predict = mt_predict.iloc[:, 0:1]

    # Filtered master table
    mt_train = mt.dropna()

    # Separate data for temperature
    df_temp = mt_train[['temp']].reset_index().rename(columns={'ts': 'ds', 'temp': 'y'})

    # Ensure 'ds' column is timezone-naive
    df_temp['ds'] = df_temp['ds'].dt.tz_localize(None)

    # Calculate periods based on length_of_missing_data and data_logging_interval
    periods = int(length_of_missing_data / data_logging_interval) + 1

    # Split the data into training and testing sets
    train_temp, test_temp = train_test_split(df_temp, test_size=0.2, shuffle=False)

    # Initialize Prophet model with tuned hyperparameters
    model_temp = Prophet(seasonality_mode='additive', interval_width=0.95, changepoint_prior_scale=0.00001)

    # Fit the model
    model_temp.fit(train_temp)

    # Create future DataFrame for temp
    future_temp = model_temp.make_future_dataframe(periods=periods, freq=f'{data_logging_interval}T')

    # Predict the future values for temp
    forecast_temp = model_temp.predict(future_temp)

    # Compute RMSE for temp
    predicted_temp = forecast_temp['yhat'].values[-len(test_temp):]
    rmse_temp = np.sqrt(mean_squared_error(test_temp['y'], predicted_temp))

    # Print RMSE value for temp
    print(f"RMSE for temp: {rmse_temp}")

    # Making predictions on the same data or new data
    X_predict = X_predict.dropna()  # Remove rows with NaN values in 'ds'
    predict_temp = model_temp.predict(X_predict.rename(columns={X_predict.columns[0]: 'ds'}))

    df_temp_pred = pd.DataFrame(data=predict_temp['yhat'].values, index=X_predict.index, columns=['temp_pred'])

    df = df_temp_pred

    # Print the head of the resulting DataFrame
    print(df.head())

    return df

# Sample data creation
data = {
    'ts': pd.date_range(start='2023-01-01', periods=100, freq='D'),
    'temp': np.random.randn(100),
    'new_point': np.random.randn(100)
}
df = pd.DataFrame(data)

# Introducing some NaNs to simulate missing data
df.loc[10:20, 'temp'] = np.nan

# Converting to the expected input format
df['his'] = df.apply(lambda x: pd.DataFrame([{'ts': x['ts'], 'temp': x['temp'], 'new_point': x['new_point']}]), axis=1)

# Define the values for length_of_missing_data and data_logging_interval
length_of_missing_data = 400  # example value
data_logging_interval = 1  # example value

# Call the forecasting function with these parameters
forecast_df = prophet_forecasting(pd.DataFrame([{'his': df}]), length_of_missing_data, data_logging_interval, dqStart=None)

# Check the results
print(forecast_df.head())


09:35:09 - cmdstanpy - INFO - Chain [1] start processing
09:35:09 - cmdstanpy - INFO - Chain [1] done processing


RMSE for temp: 1.3849935377640978


ValueError: Dataframe has no rows.