In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm

In [None]:
# Load the dataset

In [None]:
# Load the CSV; 'Standardized_Date' is parsed into datetimes while reading
df = pd.read_csv(
    "final_data_in_ML.csv",
    parse_dates=['Standardized_Date'],
)
df

In [None]:
# Join the date and time columns as text ("<date> <time>") and parse the
# result into a single 'DATETIME' column
date_text = df['Standardized_Date'].astype(str)
time_text = df['STANDARDIZED_TIME'].astype(str)
df['DATETIME'] = pd.to_datetime(date_text + ' ' + time_text)


In [None]:
# Keep only the timestamp, the flow measurement and the remark category,
# then use the timestamp as the index (needed for the resampling further down)
df = df[['DATETIME', 'CLEAR WATER PUMPING FLOW ML', 'remarks category']].set_index('DATETIME')
df

In [None]:
# One-hot encode the 'remarks category' column.
# drop='first' is used to avoid multicollinearity between the indicator columns.
encoder = OneHotEncoder(drop='first', sparse_output=False)
remarks_encoded = encoder.fit_transform(df[['remarks category']])

# Wrap the encoded array in a DataFrame that shares df's index and carries
# the encoder-generated column names
remarks_encoded_df = pd.DataFrame(
    data=remarks_encoded,
    columns=encoder.get_feature_names_out(['remarks category']),
    index=df.index,
)
remarks_encoded_df

In [None]:
# Put the flow series side by side with the one-hot remark indicators.
# NOTE: this rebinds `df`; afterwards it no longer has a 'remarks category'
# column, so the encoding cell above cannot be re-run without reloading the data.
df = pd.concat(
    objs=[df['CLEAR WATER PUMPING FLOW ML'], remarks_encoded_df],
    axis='columns',
)
df

In [None]:
# Daily totals: flow is summed per day and each indicator column becomes a
# per-day count of rows carrying that remark category
df_daily = df.resample(rule='D').sum(numeric_only=True)
df_daily

In [None]:
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm

class TSA_Multi:
    """Multivariate time-series helper: ADF reports plus VARMAX fitting/forecasting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column later named in the column lists passed
        to the methods. It is stored as-is (no copy is made).
    order : tuple, default (2, 1)
        ``(p, q)`` order forwarded to ``VARMAX`` by both fitting methods.
    test_size : int, default 6
        Number of trailing rows ``fit_model`` holds out for evaluation.
    """

    def __init__(self, df, order=(2, 1), test_size=6):
        self.df = df
        self.order = order
        self.test_size = test_size

    def adf_test(self, col_list):
        """Print an Augmented Dickey-Fuller report for each column in ``col_list``.

        For every column the test statistic, p-value, lag count, observation
        count and critical values are printed, followed by a verdict based on
        a 0.05 p-value threshold. Missing values are dropped before testing.
        Nothing is returned.
        """
        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        for col in col_list:
            print(f"ADF Test for {col}")
            result = adfuller(self.df[col].dropna(), autolag='AIC')
            out = pd.Series(result[0:4], index=labels)

            # result[4] maps significance level -> critical value
            for key, val in result[4].items():
                out[f'critical value ({key})'] = val

            print(out.to_string())

            if result[1] <= 0.05:
                print("Strong evidence against the null hypothesis")
                print("Reject the null hypothesis")
                print("Data has no unit root and is stationary")
            else:
                print("Weak evidence against the null hypothesis")
                print("Fail to reject the null hypothesis")
                print("Data has a unit root and is non-stationary")
            print()

    def fit_model(self, valcol_list):
        """Fit VARMAX on all but the last ``test_size`` rows and score the hold-out.

        Prints the split shapes, the forecast, and per-column MSE, RMSE, MAPE
        and accuracy, where accuracy is ``(1 - MAPE) * 100``.

        Parameters
        ----------
        valcol_list : list
            Columns of ``self.df`` to model jointly.

        Returns
        -------
        The forecast produced by the fitted model, one step per hold-out row.

        Raises
        ------
        ValueError
            If ``test_size`` is below 1 or leaves no rows for training.
        """
        holdout = self.test_size
        n_rows = len(self.df)
        # Fail early with a clear message: a slice such as iloc[:-6] on a
        # frame with 6 or fewer rows is silently empty.
        if holdout < 1:
            raise ValueError(f"test_size must be at least 1, got {holdout}")
        if n_rows <= holdout:
            raise ValueError(
                f"Need more than {holdout} rows to hold out {holdout} for testing, got {n_rows}"
            )

        train = self.df.iloc[:-holdout, :]
        test = self.df.iloc[-holdout:, :]

        print(f"Training data: {train.shape}, Test data: {test.shape}")

        # Fit a VARMAX model with the configured (p, q) order
        model = VARMAX(train[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        # Forecast as many steps as there are hold-out rows
        forecast = fitted_model.forecast(steps=len(test))
        print(f"Forecast results:\n{forecast}")

        # Evaluate each modelled column against the hold-out rows.
        # NOTE(review): MAPE is relative to the actual values, so hold-out
        # values equal to zero can inflate it - confirm the "accuracy" figure
        # is meaningful for every column passed in.
        for col in valcol_list:
            mse = mean_squared_error(test[col], forecast[col])
            rmse_error = rmse(test[col], forecast[col])
            mape = mean_absolute_percentage_error(test[col], forecast[col])
            accuracy = (1 - mape) * 100

            print(f"\nMetrics for {col}:")
            print(f'MSE: {mse}')
            print(f'RMSE: {rmse_error}')
            print(f'MAPE: {mape}')
            print(f'Accuracy: {accuracy}%')

        return forecast

    def full_data_model(self, valcol_list, steps=3):
        """Fit VARMAX on every row and forecast ``steps`` periods ahead.

        Parameters
        ----------
        valcol_list : list
            Columns of ``self.df`` to model jointly.
        steps : int, default 3
            Number of periods to forecast.

        Returns
        -------
        pandas.DataFrame
            The modelled columns followed by the forecast rows, with a 'Type'
            column marking each row as 'Actual' or 'Predicted'.
        """
        model = VARMAX(self.df[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        forecast = fitted_model.forecast(steps=steps)
        print(f"Full Data Forecast results:\n{forecast}")

        # Stack history and forecast, then label which rows are which
        final_DF = pd.concat([self.df[valcol_list], forecast], axis=0)
        final_DF['Type'] = ['Actual'] * len(self.df) + ['Predicted'] * len(forecast)

        print(final_DF.tail(10))
        return final_DF

# Columns modelled jointly: the flow series first, then every one-hot remark indicator
columns_to_forecast = ['CLEAR WATER PUMPING FLOW ML', *remarks_encoded_df.columns]

# Wrap the daily aggregate (flow plus encoded remarks) in the analysis helper
c2 = TSA_Multi(df_daily)

# ADF stationarity report for each modelled column
c2.adf_test(columns_to_forecast)

# Fit on the training rows and evaluate the forecast on the held-out rows
forecast_results = c2.fit_model(columns_to_forecast)

# Refit on the full dataset and forecast future periods; kept as the last
# expression so the notebook displays the returned frame
c2.full_data_model(columns_to_forecast)


In [None]:
# Monthly totals of the flow and of each remark indicator.
# NOTE(review): newer pandas releases may warn about the 'M' alias and prefer
# 'ME' - confirm against the installed pandas version before changing it.
df_monthly = df.resample(rule='M').sum(numeric_only=True)
df_monthly

In [None]:
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm

class TSA_Multi:
    """Multivariate time-series helper: ADF reports plus VARMAX fitting/forecasting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column later named in the column lists passed
        to the methods. It is stored as-is (no copy is made).
    order : tuple, default (0, 1)
        ``(p, q)`` order forwarded to ``VARMAX`` by both fitting methods.
    test_size : int, default 6
        Number of trailing rows ``fit_model`` holds out for evaluation.
    """

    def __init__(self, df, order=(0, 1), test_size=6):
        self.df = df
        self.order = order
        self.test_size = test_size

    def adf_test(self, col_list):
        """Print an Augmented Dickey-Fuller report for each column in ``col_list``.

        For every column the test statistic, p-value, lag count, observation
        count and critical values are printed, followed by a verdict based on
        a 0.05 p-value threshold. Missing values are dropped before testing.
        Nothing is returned.
        """
        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        for col in col_list:
            print(f"ADF Test for {col}")
            result = adfuller(self.df[col].dropna(), autolag='AIC')
            out = pd.Series(result[0:4], index=labels)

            # result[4] maps significance level -> critical value
            for key, val in result[4].items():
                out[f'critical value ({key})'] = val

            print(out.to_string())

            if result[1] <= 0.05:
                print("Strong evidence against the null hypothesis")
                print("Reject the null hypothesis")
                print("Data has no unit root and is stationary")
            else:
                print("Weak evidence against the null hypothesis")
                print("Fail to reject the null hypothesis")
                print("Data has a unit root and is non-stationary")
            print()

    def fit_model(self, valcol_list):
        """Fit VARMAX on all but the last ``test_size`` rows and score the hold-out.

        Prints the split shapes, the forecast, and per-column MSE, RMSE, MAPE
        and accuracy, where accuracy is ``(1 - MAPE) * 100``.

        Parameters
        ----------
        valcol_list : list
            Columns of ``self.df`` to model jointly.

        Returns
        -------
        The forecast produced by the fitted model, one step per hold-out row.

        Raises
        ------
        ValueError
            If ``test_size`` is below 1 or leaves no rows for training.
        """
        holdout = self.test_size
        n_rows = len(self.df)
        # Fail early with a clear message: a slice such as iloc[:-6] on a
        # frame with 6 or fewer rows is silently empty, which is a realistic
        # risk for coarse (e.g. monthly) aggregates.
        if holdout < 1:
            raise ValueError(f"test_size must be at least 1, got {holdout}")
        if n_rows <= holdout:
            raise ValueError(
                f"Need more than {holdout} rows to hold out {holdout} for testing, got {n_rows}"
            )

        train = self.df.iloc[:-holdout, :]
        test = self.df.iloc[-holdout:, :]

        print(f"Training data: {train.shape}, Test data: {test.shape}")

        # Fit a VARMAX model with the configured (p, q) order
        model = VARMAX(train[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        # Forecast as many steps as there are hold-out rows
        forecast = fitted_model.forecast(steps=len(test))
        print(f"Forecast results:\n{forecast}")

        # Evaluate each modelled column against the hold-out rows.
        # NOTE(review): MAPE is relative to the actual values, so hold-out
        # values equal to zero can inflate it - confirm the "accuracy" figure
        # is meaningful for every column passed in.
        for col in valcol_list:
            mse = mean_squared_error(test[col], forecast[col])
            rmse_error = rmse(test[col], forecast[col])
            mape = mean_absolute_percentage_error(test[col], forecast[col])
            accuracy = (1 - mape) * 100

            print(f"\nMetrics for {col}:")
            print(f'MSE: {mse}')
            print(f'RMSE: {rmse_error}')
            print(f'MAPE: {mape}')
            print(f'Accuracy: {accuracy}%')

        return forecast

    def full_data_model(self, valcol_list, steps=3):
        """Fit VARMAX on every row and forecast ``steps`` periods ahead.

        Parameters
        ----------
        valcol_list : list
            Columns of ``self.df`` to model jointly.
        steps : int, default 3
            Number of periods to forecast.

        Returns
        -------
        pandas.DataFrame
            The modelled columns followed by the forecast rows, with a 'Type'
            column marking each row as 'Actual' or 'Predicted'.
        """
        model = VARMAX(self.df[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        forecast = fitted_model.forecast(steps=steps)
        print(f"Full Data Forecast results:\n{forecast}")

        # Stack history and forecast, then label which rows are which
        final_DF = pd.concat([self.df[valcol_list], forecast], axis=0)
        final_DF['Type'] = ['Actual'] * len(self.df) + ['Predicted'] * len(forecast)

        print(final_DF.tail(10))
        return final_DF

# Columns modelled jointly: the flow series first, then every one-hot remark indicator
columns_to_forecast = ['CLEAR WATER PUMPING FLOW ML', *remarks_encoded_df.columns]

# Wrap the monthly aggregate (flow plus encoded remarks) in the analysis helper
c2 = TSA_Multi(df_monthly)

# ADF stationarity report for each modelled column
c2.adf_test(columns_to_forecast)

# Fit on the training rows and evaluate the forecast on the held-out rows
forecast_results = c2.fit_model(columns_to_forecast)

# Refit on the full dataset and forecast future periods; kept as the last
# expression so the notebook displays the returned frame
c2.full_data_model(columns_to_forecast)


In [None]:
# Weekly totals of the flow and of each remark indicator
df_weekly = df.resample(rule='W').sum(numeric_only=True)
df_weekly

In [None]:
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm

class TSA_Multi:
    """Multivariate time-series helper: ADF reports plus VARMAX fitting/forecasting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column later named in the column lists passed
        to the methods. It is stored as-is (no copy is made).
    order : tuple, default (0, 1)
        ``(p, q)`` order forwarded to ``VARMAX`` by both fitting methods.
    test_size : int, default 6
        Number of trailing rows ``fit_model`` holds out for evaluation.
    """

    def __init__(self, df, order=(0, 1), test_size=6):
        self.df = df
        self.order = order
        self.test_size = test_size

    def adf_test(self, col_list):
        """Print an Augmented Dickey-Fuller report for each column in ``col_list``.

        For every column the test statistic, p-value, lag count, observation
        count and critical values are printed, followed by a verdict based on
        a 0.05 p-value threshold. Missing values are dropped before testing.
        Nothing is returned.
        """
        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        for col in col_list:
            print(f"ADF Test for {col}")
            result = adfuller(self.df[col].dropna(), autolag='AIC')
            out = pd.Series(result[0:4], index=labels)

            # result[4] maps significance level -> critical value
            for key, val in result[4].items():
                out[f'critical value ({key})'] = val

            print(out.to_string())

            if result[1] <= 0.05:
                print("Strong evidence against the null hypothesis")
                print("Reject the null hypothesis")
                print("Data has no unit root and is stationary")
            else:
                print("Weak evidence against the null hypothesis")
                print("Fail to reject the null hypothesis")
                print("Data has a unit root and is non-stationary")
            print()

    def fit_model(self, valcol_list):
        """Fit VARMAX on all but the last ``test_size`` rows and score the hold-out.

        Prints the split shapes, the forecast, and per-column MSE, RMSE, MAPE
        and accuracy, where accuracy is ``(1 - MAPE) * 100``.

        Parameters
        ----------
        valcol_list : list
            Columns of ``self.df`` to model jointly.

        Returns
        -------
        The forecast produced by the fitted model, one step per hold-out row.

        Raises
        ------
        ValueError
            If ``test_size`` is below 1 or leaves no rows for training.
        """
        holdout = self.test_size
        n_rows = len(self.df)
        # Fail early with a clear message: a slice such as iloc[:-6] on a
        # frame with 6 or fewer rows is silently empty.
        if holdout < 1:
            raise ValueError(f"test_size must be at least 1, got {holdout}")
        if n_rows <= holdout:
            raise ValueError(
                f"Need more than {holdout} rows to hold out {holdout} for testing, got {n_rows}"
            )

        train = self.df.iloc[:-holdout, :]
        test = self.df.iloc[-holdout:, :]

        print(f"Training data: {train.shape}, Test data: {test.shape}")

        # Fit a VARMAX model with the configured (p, q) order
        model = VARMAX(train[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        # Forecast as many steps as there are hold-out rows
        forecast = fitted_model.forecast(steps=len(test))
        print(f"Forecast results:\n{forecast}")

        # Evaluate each modelled column against the hold-out rows.
        # NOTE(review): MAPE is relative to the actual values, so hold-out
        # values equal to zero can inflate it - confirm the "accuracy" figure
        # is meaningful for every column passed in.
        for col in valcol_list:
            mse = mean_squared_error(test[col], forecast[col])
            rmse_error = rmse(test[col], forecast[col])
            mape = mean_absolute_percentage_error(test[col], forecast[col])
            accuracy = (1 - mape) * 100

            print(f"\nMetrics for {col}:")
            print(f'MSE: {mse}')
            print(f'RMSE: {rmse_error}')
            print(f'MAPE: {mape}')
            print(f'Accuracy: {accuracy}%')

        return forecast

    def full_data_model(self, valcol_list, steps=3):
        """Fit VARMAX on every row and forecast ``steps`` periods ahead.

        Parameters
        ----------
        valcol_list : list
            Columns of ``self.df`` to model jointly.
        steps : int, default 3
            Number of periods to forecast.

        Returns
        -------
        pandas.DataFrame
            The modelled columns followed by the forecast rows, with a 'Type'
            column marking each row as 'Actual' or 'Predicted'.
        """
        model = VARMAX(self.df[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        forecast = fitted_model.forecast(steps=steps)
        print(f"Full Data Forecast results:\n{forecast}")

        # Stack history and forecast, then label which rows are which
        final_DF = pd.concat([self.df[valcol_list], forecast], axis=0)
        final_DF['Type'] = ['Actual'] * len(self.df) + ['Predicted'] * len(forecast)

        print(final_DF.tail(10))
        return final_DF

# Columns modelled jointly: the flow series first, then every one-hot remark indicator
columns_to_forecast = ['CLEAR WATER PUMPING FLOW ML', *remarks_encoded_df.columns]

# Wrap the weekly aggregate (flow plus encoded remarks) in the analysis helper
c2 = TSA_Multi(df_weekly)

# ADF stationarity report for each modelled column
c2.adf_test(columns_to_forecast)

# Fit on the training rows and evaluate the forecast on the held-out rows
forecast_results = c2.fit_model(columns_to_forecast)

# Refit on the full dataset and forecast future periods; kept as the last
# expression so the notebook displays the returned frame
c2.full_data_model(columns_to_forecast)
