In [20]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm

In [21]:
# Load the dataset from CSV, parsing 'Standardized_Date' into datetimes on read
df = pd.read_csv(
    "final_data_in_ML.csv",
    parse_dates=['Standardized_Date'],
)
df

Unnamed: 0,Standardized_Date,STANDARDIZED_TIME,RAW WATER FLOW IN ML,CLEAR WATER SUMP LEVEL IN Meter,CLEAR WATER PUMPING FLOW ML,TREATED WATER PRODUCTION IN ML,REMARKS,remarks category
0,2021-12-01,00:00:00,3.52301,2.10,2.70209,3.40957,No remarks,No remarks
1,2021-12-01,01:00:00,3.51043,2.84,2.77743,3.39739,No remarks,No remarks
2,2021-12-01,02:00:00,3.49087,3.12,2.79467,3.37846,No remarks,No remarks
3,2021-12-01,03:00:00,3.50543,3.44,2.79543,3.39256,No remarks,No remarks
4,2021-12-01,04:00:00,3.51556,3.72,2.79826,3.40236,No remarks,No remarks
...,...,...,...,...,...,...,...,...
24043,2024-08-28,19:00:00,3.13486,3.46,2.87175,3.03392,No remarks,No remarks
24044,2024-08-28,20:00:00,3.17401,3.63,2.85830,3.07181,No remarks,No remarks
24045,2024-08-28,21:00:00,3.13113,3.64,2.87098,3.03031,No remarks,No remarks
24046,2024-08-28,22:00:00,3.16261,3.49,2.89001,3.06077,No remarks,No remarks


In [22]:
# Build a single 'DATETIME' column by joining the date and time strings
# with a space and parsing the result
df['DATETIME'] = pd.to_datetime(
    df['Standardized_Date'].astype(str).str.cat(
        df['STANDARDIZED_TIME'].astype(str), sep=' '
    )
)


In [23]:
# Keep only the timestamp, the pumping-flow series and the remark category,
# then use the timestamp as the index
df = df.loc[
    :, ['DATETIME', 'CLEAR WATER PUMPING FLOW ML', 'remarks category']
].set_index('DATETIME')
df

Unnamed: 0_level_0,CLEAR WATER PUMPING FLOW ML,remarks category
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-01 00:00:00,2.70209,No remarks
2021-12-01 01:00:00,2.77743,No remarks
2021-12-01 02:00:00,2.79467,No remarks
2021-12-01 03:00:00,2.79543,No remarks
2021-12-01 04:00:00,2.79826,No remarks
...,...,...
2024-08-28 19:00:00,2.87175,No remarks
2024-08-28 20:00:00,2.85830,No remarks
2024-08-28 21:00:00,2.87098,No remarks
2024-08-28 22:00:00,2.89001,No remarks


In [24]:
# One-hot encode the 'remarks category' column.
# drop='first' leaves out the first category so the dummy columns are not
# perfectly collinear; sparse_output=False returns a dense array.
encoder = OneHotEncoder(drop='first', sparse_output=False)
remarks_encoded = encoder.fit_transform(df[['remarks category']])

# Wrap the encoded array in a DataFrame that shares df's index and carries
# the encoder-generated column names
remarks_encoded_df = pd.DataFrame(
    data=remarks_encoded,
    columns=encoder.get_feature_names_out(['remarks category']),
    index=df.index,
)
remarks_encoded_df

Unnamed: 0_level_0,remarks category_Power Failure at Intake,remarks category_Power Failure at WTP,remarks category_Power Failure at intake and WTP,remarks category_WTP cleaning,remarks category_channel cieaning,remarks category_complaint at WTP,remarks category_distribution line maintanance,remarks category_energy auditing,remarks category_inlet chamber cleaning,remarks category_intake cleaning,remarks category_intake pumping stopped,remarks category_intake sump level low,remarks category_maintanance,remarks category_maintanance at Thaliparamba,remarks category_maintanance at intake,remarks category_pipe line broken at thaliparamba,remarks category_under voltage,remarks category_voltage fluctation at intake
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-12-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-01 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-01 02:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-01 03:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-01 04:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-28 19:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-28 20:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-28 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-28 22:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Rebuild df as the pumping-flow series followed by the one-hot remark columns
df = pd.concat(
    objs=[df['CLEAR WATER PUMPING FLOW ML'], remarks_encoded_df],
    axis='columns',
)
df

Unnamed: 0_level_0,CLEAR WATER PUMPING FLOW ML,remarks category_Power Failure at Intake,remarks category_Power Failure at WTP,remarks category_Power Failure at intake and WTP,remarks category_WTP cleaning,remarks category_channel cieaning,remarks category_complaint at WTP,remarks category_distribution line maintanance,remarks category_energy auditing,remarks category_inlet chamber cleaning,remarks category_intake cleaning,remarks category_intake pumping stopped,remarks category_intake sump level low,remarks category_maintanance,remarks category_maintanance at Thaliparamba,remarks category_maintanance at intake,remarks category_pipe line broken at thaliparamba,remarks category_under voltage,remarks category_voltage fluctation at intake
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-12-01 00:00:00,2.70209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-01 01:00:00,2.77743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-01 02:00:00,2.79467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-01 03:00:00,2.79543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-01 04:00:00,2.79826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-28 19:00:00,2.87175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-28 20:00:00,2.85830,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-28 21:00:00,2.87098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-28 22:00:00,2.89001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Sum every numeric column within each calendar day
df_daily = df.resample(rule='D').sum(numeric_only=True)
df_daily

Unnamed: 0_level_0,CLEAR WATER PUMPING FLOW ML,remarks category_Power Failure at Intake,remarks category_Power Failure at WTP,remarks category_Power Failure at intake and WTP,remarks category_WTP cleaning,remarks category_channel cieaning,remarks category_complaint at WTP,remarks category_distribution line maintanance,remarks category_energy auditing,remarks category_inlet chamber cleaning,remarks category_intake cleaning,remarks category_intake pumping stopped,remarks category_intake sump level low,remarks category_maintanance,remarks category_maintanance at Thaliparamba,remarks category_maintanance at intake,remarks category_pipe line broken at thaliparamba,remarks category_under voltage,remarks category_voltage fluctation at intake
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-12-01,42.40192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-02,62.91389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-03,43.41464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-04,40.06170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-05,46.87842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-24,68.91871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-25,64.06335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-26,66.58816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-27,68.81720,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm

class TSA_Multi:
    """Stationarity tests and VARMAX forecasting for several time series at once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per series. ``fit_model`` splits it by row
        position, so rows are expected to be in chronological order.
    order : tuple of int, optional
        ``(p, q)`` order handed to ``VARMAX``. Defaults to ``(2, 1)``.
    test_size : int, optional
        Number of trailing rows ``fit_model`` holds out for evaluation.
        Defaults to 6.
    forecast_steps : int, optional
        Number of future periods ``full_data_model`` forecasts. Defaults to 3.

    Raises
    ------
    ValueError
        If ``test_size`` or ``forecast_steps`` is smaller than 1.
    """

    def __init__(self, df, order=(2, 1), test_size=6, forecast_steps=3):
        # A hold-out of 0 would make ``iloc[:-0]`` return an EMPTY training set.
        if test_size < 1:
            raise ValueError(f"test_size must be >= 1, got {test_size}")
        if forecast_steps < 1:
            raise ValueError(f"forecast_steps must be >= 1, got {forecast_steps}")

        self.df = df
        self.order = tuple(order)
        self.test_size = test_size
        self.forecast_steps = forecast_steps

    def adf_test(self, col_list):
        """Print an Augmented Dickey-Fuller report for each column in ``col_list``.

        NaNs are dropped before testing. A series that is constant (or empty)
        after dropping NaNs is skipped with a message, because ``adfuller``
        rejects constant input. The 5% level decides the printed verdict.
        """
        for col in col_list:
            print(f"ADF Test for {col}")
            series = self.df[col].dropna()

            if series.nunique() <= 1:
                print("Series is constant or empty after dropping NaNs; ADF test skipped")
                print()
                continue

            result = adfuller(series, autolag='AIC')
            labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
            out = pd.Series(result[0:4], index=labels)

            # result[4] maps significance levels to their critical values
            for key, val in result[4].items():
                out[f'critical value ({key})'] = val

            print(out.to_string())

            if result[1] <= 0.05:
                print("Strong evidence against the null hypothesis")
                print("Reject the null hypothesis")
                print("Data has no unit root and is stationary")
            else:
                print("Weak evidence against the null hypothesis")
                print("Fail to reject the null hypothesis")
                print("Data has a unit root and is non-stationary")
            print()

    def fit_model(self, valcol_list):
        """Fit VARMAX on all but the last ``test_size`` rows and score the forecast.

        Prints MSE and RMSE per column. MAPE and the derived accuracy are only
        reported when every held-out actual value is non-zero, because a
        percentage error is not meaningful for zero actuals.

        Returns
        -------
        pandas.DataFrame
            Forecast for the held-out rows, one column per modelled series.

        Raises
        ------
        ValueError
            If the frame has no more rows than ``test_size``.
        """
        if len(self.df) <= self.test_size:
            raise ValueError(
                f"Need more than {self.test_size} rows to hold out a test set, "
                f"got {len(self.df)}"
            )

        train = self.df.iloc[:-self.test_size, :]
        test = self.df.iloc[-self.test_size:, :]

        print(f"Training data: {train.shape}, Test data: {test.shape}")

        # Fit a VARMAX model
        model = VARMAX(train[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        # Forecast as many steps as there are held-out rows
        forecast = fitted_model.forecast(steps=len(test))
        print(f"Forecast results:\n{forecast}")

        # Evaluate model
        for col in valcol_list:
            actual, predicted = test[col], forecast[col]
            mse = mean_squared_error(actual, predicted)
            rmse_error = rmse(actual, predicted)

            print(f"\nMetrics for {col}:")
            print(f'MSE: {mse}')
            print(f'RMSE: {rmse_error}')

            if (actual == 0).any():
                # Percentage error divides by the actual value; with zeros the
                # metric explodes and "accuracy" becomes a huge negative number.
                print('MAPE: undefined (actual values contain zeros)')
                print('Accuracy: undefined (actual values contain zeros)')
            else:
                mape = mean_absolute_percentage_error(actual, predicted)
                accuracy = (1 - mape) * 100
                print(f'MAPE: {mape}')
                print(f'Accuracy: {accuracy}%')

        return forecast

    def full_data_model(self, valcol_list):
        """Fit VARMAX on the full frame and forecast ``forecast_steps`` periods.

        Returns
        -------
        pandas.DataFrame
            The historical rows followed by the forecast rows, with a 'Type'
            column marking each row as 'Actual' or 'Predicted'.
        """
        model = VARMAX(self.df[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        # Forecast the next periods beyond the end of the data
        forecast = fitted_model.forecast(steps=self.forecast_steps)
        print(f"Full Data Forecast results:\n{forecast}")

        # Create final DataFrame
        final_DF = pd.concat([self.df[valcol_list], forecast], axis=0)
        final_DF['Type'] = ['Actual'] * len(self.df) + ['Predicted'] * len(forecast)

        print(final_DF.tail(10))
        return final_DF

# Series to model: the pumping-flow column followed by every one-hot remark column
columns_to_forecast = ['CLEAR WATER PUMPING FLOW ML', *remarks_encoded_df.columns]

# Wrap the daily aggregate in the analysis helper
c2 = TSA_Multi(df_daily)

# ADF stationarity report for each series
c2.adf_test(columns_to_forecast)

# Hold-out evaluation: fit on the earlier rows, forecast and score the trailing rows
forecast_results = c2.fit_model(columns_to_forecast)

# Refit on the full history and forecast future periods;
# the returned frame is this cell's displayed output
c2.full_data_model(columns_to_forecast)


ADF Test for CLEAR WATER PUMPING FLOW ML
ADF test statistic       -2.809849
p-value                   0.056881
# lags used              20.000000
# observations          981.000000
critical value (1%)      -3.437033
critical value (5%)      -2.864491
critical value (10%)     -2.568341
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary

ADF Test for remarks category_Power Failure at Intake
ADF test statistic       -29.097421
p-value                    0.000000
# lags used                0.000000
# observations          1001.000000
critical value (1%)       -3.436900
critical value (5%)       -2.864432
critical value (10%)      -2.568310
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary

ADF Test for remarks category_Power Failure at WTP
ADF test statistic       -31.728733
p-value                    0.000000
# lags used                0.000000
# observations  

  warn('Estimation of VARMA(p,q) models is not generically robust,'


Forecast results:
            CLEAR WATER PUMPING FLOW ML  \
2024-08-23                    65.103252   
2024-08-24                    64.911084   
2024-08-25                    64.341903   
2024-08-26                    64.079124   
2024-08-27                    63.904566   
2024-08-28                    63.787833   

            remarks category_Power Failure at Intake  \
2024-08-23                                  0.121984   
2024-08-24                                  0.117560   
2024-08-25                                  0.123049   
2024-08-26                                  0.118558   
2024-08-27                                  0.119246   
2024-08-28                                  0.119203   

            remarks category_Power Failure at WTP  \
2024-08-23                               0.021414   
2024-08-24                               0.013944   
2024-08-25                               0.020532   
2024-08-26                               0.022463   
2024-08-27            

  warn('Estimation of VARMA(p,q) models is not generically robust,'


Full Data Forecast results:
            CLEAR WATER PUMPING FLOW ML  \
2024-08-29                    65.017312   
2024-08-30                    64.179169   
2024-08-31                    63.905511   

            remarks category_Power Failure at Intake  \
2024-08-29                                  0.095978   
2024-08-30                                  0.125268   
2024-08-31                                  0.117463   

            remarks category_Power Failure at WTP  \
2024-08-29                               0.015044   
2024-08-30                               0.016043   
2024-08-31                               0.020456   

            remarks category_Power Failure at intake and WTP  \
2024-08-29                                          0.042348   
2024-08-30                                          0.047466   
2024-08-31                                          0.050233   

            remarks category_WTP cleaning  remarks category_channel cieaning  \
2024-08-29              

Unnamed: 0,CLEAR WATER PUMPING FLOW ML,remarks category_Power Failure at Intake,remarks category_Power Failure at WTP,remarks category_Power Failure at intake and WTP,remarks category_WTP cleaning,remarks category_channel cieaning,remarks category_complaint at WTP,remarks category_distribution line maintanance,remarks category_energy auditing,remarks category_inlet chamber cleaning,remarks category_intake cleaning,remarks category_intake pumping stopped,remarks category_intake sump level low,remarks category_maintanance,remarks category_maintanance at Thaliparamba,remarks category_maintanance at intake,remarks category_pipe line broken at thaliparamba,remarks category_under voltage,remarks category_voltage fluctation at intake,Type
2021-12-01,42.401920,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2021-12-02,62.913890,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2021-12-03,43.414640,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2021-12-04,40.061700,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2021-12-05,46.878420,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-27,68.817200,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2024-08-28,65.003090,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2024-08-29,65.017312,0.095978,0.015044,0.042348,0.023214,0.008005,0.029336,0.038754,0.000742,0.004052,0.077678,-0.004458,0.019634,0.058918,0.005208,0.012472,0.020848,0.008707,0.005853,Predicted
2024-08-30,64.179169,0.125268,0.016043,0.047466,0.047068,0.012198,0.045799,0.039460,0.000671,0.004983,0.054179,-0.001158,0.016392,0.085336,0.010372,0.018896,0.050697,0.006638,0.004085,Predicted


In [9]:
# Sum every numeric column within each calendar month (month-end labels)
# NOTE(review): newer pandas releases deprecate the 'M' alias in favour of 'ME' —
# confirm against the installed version before changing it
df_monthly = df.resample(rule='M').sum(numeric_only=True)
df_monthly

Unnamed: 0_level_0,CLEAR WATER PUMPING FLOW ML,remarks category_Power Failure at Intake,remarks category_Power Failure at WTP,remarks category_Power Failure at intake and WTP,remarks category_WTP cleaning,remarks category_channel cieaning,remarks category_complaint at WTP,remarks category_distribution line maintanance,remarks category_energy auditing,remarks category_inlet chamber cleaning,remarks category_intake cleaning,remarks category_intake pumping stopped,remarks category_intake sump level low,remarks category_maintanance,remarks category_maintanance at Thaliparamba,remarks category_maintanance at intake,remarks category_pipe line broken at thaliparamba,remarks category_under voltage,remarks category_voltage fluctation at intake
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-12-31,1738.985111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-31,1669.968356,11.0,1.0,0.0,49.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-02-28,1577.52211,2.0,11.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
2022-03-31,1860.290835,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-04-30,1699.23306,8.0,2.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,6.0,0.0,0.0,0.0
2022-05-31,1757.84088,11.0,7.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-06-30,1606.90324,13.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,0.0,0.0
2022-07-31,1404.36223,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-08-31,1520.86568,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0
2022-09-30,1669.90655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm

class TSA_Multi:
    """Stationarity tests and VARMAX forecasting for several time series at once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per series. ``fit_model`` splits it by row
        position, so rows are expected to be in chronological order.
    order : tuple of int, optional
        ``(p, q)`` order handed to ``VARMAX``. Defaults to ``(0, 1)``.
    test_size : int, optional
        Number of trailing rows ``fit_model`` holds out for evaluation.
        Defaults to 6.
    forecast_steps : int, optional
        Number of future periods ``full_data_model`` forecasts. Defaults to 3.

    Raises
    ------
    ValueError
        If ``test_size`` or ``forecast_steps`` is smaller than 1.
    """

    def __init__(self, df, order=(0, 1), test_size=6, forecast_steps=3):
        # A hold-out of 0 would make ``iloc[:-0]`` return an EMPTY training set.
        if test_size < 1:
            raise ValueError(f"test_size must be >= 1, got {test_size}")
        if forecast_steps < 1:
            raise ValueError(f"forecast_steps must be >= 1, got {forecast_steps}")

        self.df = df
        self.order = tuple(order)
        self.test_size = test_size
        self.forecast_steps = forecast_steps

    def adf_test(self, col_list):
        """Print an Augmented Dickey-Fuller report for each column in ``col_list``.

        NaNs are dropped before testing. A series that is constant (or empty)
        after dropping NaNs is skipped with a message, because ``adfuller``
        rejects constant input. The 5% level decides the printed verdict.
        """
        for col in col_list:
            print(f"ADF Test for {col}")
            series = self.df[col].dropna()

            if series.nunique() <= 1:
                print("Series is constant or empty after dropping NaNs; ADF test skipped")
                print()
                continue

            result = adfuller(series, autolag='AIC')
            labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
            out = pd.Series(result[0:4], index=labels)

            # result[4] maps significance levels to their critical values
            for key, val in result[4].items():
                out[f'critical value ({key})'] = val

            print(out.to_string())

            if result[1] <= 0.05:
                print("Strong evidence against the null hypothesis")
                print("Reject the null hypothesis")
                print("Data has no unit root and is stationary")
            else:
                print("Weak evidence against the null hypothesis")
                print("Fail to reject the null hypothesis")
                print("Data has a unit root and is non-stationary")
            print()

    def fit_model(self, valcol_list):
        """Fit VARMAX on all but the last ``test_size`` rows and score the forecast.

        Prints MSE and RMSE per column. MAPE and the derived accuracy are only
        reported when every held-out actual value is non-zero, because a
        percentage error is not meaningful for zero actuals.

        Returns
        -------
        pandas.DataFrame
            Forecast for the held-out rows, one column per modelled series.

        Raises
        ------
        ValueError
            If the frame has no more rows than ``test_size``.
        """
        if len(self.df) <= self.test_size:
            raise ValueError(
                f"Need more than {self.test_size} rows to hold out a test set, "
                f"got {len(self.df)}"
            )

        train = self.df.iloc[:-self.test_size, :]
        test = self.df.iloc[-self.test_size:, :]

        print(f"Training data: {train.shape}, Test data: {test.shape}")

        # Fit a VARMAX model
        model = VARMAX(train[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        # Forecast as many steps as there are held-out rows
        forecast = fitted_model.forecast(steps=len(test))
        print(f"Forecast results:\n{forecast}")

        # Evaluate model
        for col in valcol_list:
            actual, predicted = test[col], forecast[col]
            mse = mean_squared_error(actual, predicted)
            rmse_error = rmse(actual, predicted)

            print(f"\nMetrics for {col}:")
            print(f'MSE: {mse}')
            print(f'RMSE: {rmse_error}')

            if (actual == 0).any():
                # Percentage error divides by the actual value; with zeros the
                # metric explodes and "accuracy" becomes a huge negative number.
                print('MAPE: undefined (actual values contain zeros)')
                print('Accuracy: undefined (actual values contain zeros)')
            else:
                mape = mean_absolute_percentage_error(actual, predicted)
                accuracy = (1 - mape) * 100
                print(f'MAPE: {mape}')
                print(f'Accuracy: {accuracy}%')

        return forecast

    def full_data_model(self, valcol_list):
        """Fit VARMAX on the full frame and forecast ``forecast_steps`` periods.

        Returns
        -------
        pandas.DataFrame
            The historical rows followed by the forecast rows, with a 'Type'
            column marking each row as 'Actual' or 'Predicted'.
        """
        model = VARMAX(self.df[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        # Forecast the next periods beyond the end of the data
        forecast = fitted_model.forecast(steps=self.forecast_steps)
        print(f"Full Data Forecast results:\n{forecast}")

        # Create final DataFrame
        final_DF = pd.concat([self.df[valcol_list], forecast], axis=0)
        final_DF['Type'] = ['Actual'] * len(self.df) + ['Predicted'] * len(forecast)

        print(final_DF.tail(10))
        return final_DF

# Series to model: the pumping-flow column followed by every one-hot remark column
columns_to_forecast = ['CLEAR WATER PUMPING FLOW ML', *remarks_encoded_df.columns]

# Wrap the monthly aggregate in the analysis helper
c2 = TSA_Multi(df_monthly)

# ADF stationarity report for each series
c2.adf_test(columns_to_forecast)

# Hold-out evaluation: fit on the earlier rows, forecast and score the trailing rows
# NOTE(review): the recorded run of this cell ended in
# "LinAlgError: Matrix is not positive definite"; presumably there are too few
# monthly observations for this many series — confirm and reduce the column list if so
forecast_results = c2.fit_model(columns_to_forecast)

# Refit on the full history and forecast future periods;
# the returned frame is this cell's displayed output
c2.full_data_model(columns_to_forecast)


ADF Test for CLEAR WATER PUMPING FLOW ML
ADF test statistic      -2.102548
p-value                  0.243491
# lags used              0.000000
# observations          32.000000
critical value (1%)     -3.653520
critical value (5%)     -2.957219
critical value (10%)    -2.617588
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary

ADF Test for remarks category_Power Failure at Intake
ADF test statistic      -3.776272
p-value                  0.003160
# lags used              1.000000
# observations          31.000000
critical value (1%)     -3.661429
critical value (5%)     -2.960525
critical value (10%)    -2.619319
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary

ADF Test for remarks category_Power Failure at WTP
ADF test statistic      -4.494584
p-value                  0.000201
# lags used              0.000000
# observations          32.000000
critical 

  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2


LinAlgError: Matrix is not positive definite

In [11]:
# Sum every numeric column within each week
df_weekly = df.resample(rule='W').sum(numeric_only=True)
df_weekly

Unnamed: 0_level_0,CLEAR WATER PUMPING FLOW ML,remarks category_Power Failure at Intake,remarks category_Power Failure at WTP,remarks category_Power Failure at intake and WTP,remarks category_WTP cleaning,remarks category_channel cieaning,remarks category_complaint at WTP,remarks category_distribution line maintanance,remarks category_energy auditing,remarks category_inlet chamber cleaning,remarks category_intake cleaning,remarks category_intake pumping stopped,remarks category_intake sump level low,remarks category_maintanance,remarks category_maintanance at Thaliparamba,remarks category_maintanance at intake,remarks category_pipe line broken at thaliparamba,remarks category_under voltage,remarks category_voltage fluctation at intake
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-12-05,235.670570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-12,407.530700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-19,395.666220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-12-26,407.155871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-02,403.914700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-04,349.983163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-11,411.578760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-18,438.489910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2024-08-25,415.131130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm

class TSA_Multi:
    """Stationarity tests and VARMAX forecasting for several time series at once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per series. ``fit_model`` splits it by row
        position, so rows are expected to be in chronological order.
    order : tuple of int, optional
        ``(p, q)`` order handed to ``VARMAX``. Defaults to ``(0, 1)``.
    test_size : int, optional
        Number of trailing rows ``fit_model`` holds out for evaluation.
        Defaults to 6.
    forecast_steps : int, optional
        Number of future periods ``full_data_model`` forecasts. Defaults to 3.

    Raises
    ------
    ValueError
        If ``test_size`` or ``forecast_steps`` is smaller than 1.
    """

    def __init__(self, df, order=(0, 1), test_size=6, forecast_steps=3):
        # A hold-out of 0 would make ``iloc[:-0]`` return an EMPTY training set.
        if test_size < 1:
            raise ValueError(f"test_size must be >= 1, got {test_size}")
        if forecast_steps < 1:
            raise ValueError(f"forecast_steps must be >= 1, got {forecast_steps}")

        self.df = df
        self.order = tuple(order)
        self.test_size = test_size
        self.forecast_steps = forecast_steps

    def adf_test(self, col_list):
        """Print an Augmented Dickey-Fuller report for each column in ``col_list``.

        NaNs are dropped before testing. A series that is constant (or empty)
        after dropping NaNs is skipped with a message, because ``adfuller``
        rejects constant input. The 5% level decides the printed verdict.
        """
        for col in col_list:
            print(f"ADF Test for {col}")
            series = self.df[col].dropna()

            if series.nunique() <= 1:
                print("Series is constant or empty after dropping NaNs; ADF test skipped")
                print()
                continue

            result = adfuller(series, autolag='AIC')
            labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
            out = pd.Series(result[0:4], index=labels)

            # result[4] maps significance levels to their critical values
            for key, val in result[4].items():
                out[f'critical value ({key})'] = val

            print(out.to_string())

            if result[1] <= 0.05:
                print("Strong evidence against the null hypothesis")
                print("Reject the null hypothesis")
                print("Data has no unit root and is stationary")
            else:
                print("Weak evidence against the null hypothesis")
                print("Fail to reject the null hypothesis")
                print("Data has a unit root and is non-stationary")
            print()

    def fit_model(self, valcol_list):
        """Fit VARMAX on all but the last ``test_size`` rows and score the forecast.

        Prints MSE and RMSE per column. MAPE and the derived accuracy are only
        reported when every held-out actual value is non-zero, because a
        percentage error is not meaningful for zero actuals.

        Returns
        -------
        pandas.DataFrame
            Forecast for the held-out rows, one column per modelled series.

        Raises
        ------
        ValueError
            If the frame has no more rows than ``test_size``.
        """
        if len(self.df) <= self.test_size:
            raise ValueError(
                f"Need more than {self.test_size} rows to hold out a test set, "
                f"got {len(self.df)}"
            )

        train = self.df.iloc[:-self.test_size, :]
        test = self.df.iloc[-self.test_size:, :]

        print(f"Training data: {train.shape}, Test data: {test.shape}")

        # Fit a VARMAX model
        model = VARMAX(train[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        # Forecast as many steps as there are held-out rows
        forecast = fitted_model.forecast(steps=len(test))
        print(f"Forecast results:\n{forecast}")

        # Evaluate model
        for col in valcol_list:
            actual, predicted = test[col], forecast[col]
            mse = mean_squared_error(actual, predicted)
            rmse_error = rmse(actual, predicted)

            print(f"\nMetrics for {col}:")
            print(f'MSE: {mse}')
            print(f'RMSE: {rmse_error}')

            if (actual == 0).any():
                # Percentage error divides by the actual value; with zeros the
                # metric explodes and "accuracy" becomes a huge negative number.
                print('MAPE: undefined (actual values contain zeros)')
                print('Accuracy: undefined (actual values contain zeros)')
            else:
                mape = mean_absolute_percentage_error(actual, predicted)
                accuracy = (1 - mape) * 100
                print(f'MAPE: {mape}')
                print(f'Accuracy: {accuracy}%')

        return forecast

    def full_data_model(self, valcol_list):
        """Fit VARMAX on the full frame and forecast ``forecast_steps`` periods.

        Returns
        -------
        pandas.DataFrame
            The historical rows followed by the forecast rows, with a 'Type'
            column marking each row as 'Actual' or 'Predicted'.
        """
        model = VARMAX(self.df[valcol_list], order=self.order)
        fitted_model = model.fit(disp=False)

        # Forecast the next periods beyond the end of the data
        forecast = fitted_model.forecast(steps=self.forecast_steps)
        print(f"Full Data Forecast results:\n{forecast}")

        # Create final DataFrame
        final_DF = pd.concat([self.df[valcol_list], forecast], axis=0)
        final_DF['Type'] = ['Actual'] * len(self.df) + ['Predicted'] * len(forecast)

        print(final_DF.tail(10))
        return final_DF

# Series to model: the pumping-flow column followed by every one-hot remark column
columns_to_forecast = ['CLEAR WATER PUMPING FLOW ML', *remarks_encoded_df.columns]

# Wrap the weekly aggregate in the analysis helper
c2 = TSA_Multi(df_weekly)

# ADF stationarity report for each series
c2.adf_test(columns_to_forecast)

# Hold-out evaluation: fit on the earlier rows, forecast and score the trailing rows
forecast_results = c2.fit_model(columns_to_forecast)

# Refit on the full history and forecast future periods;
# the returned frame is this cell's displayed output
c2.full_data_model(columns_to_forecast)


ADF Test for CLEAR WATER PUMPING FLOW ML
ADF test statistic       -1.919824
p-value                   0.322794
# lags used               3.000000
# observations          140.000000
critical value (1%)      -3.477945
critical value (5%)      -2.882416
critical value (10%)     -2.577902
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary

ADF Test for remarks category_Power Failure at Intake
ADF test statistic       -3.705163
p-value                   0.004041
# lags used               5.000000
# observations          138.000000
critical value (1%)      -3.478648
critical value (5%)      -2.882722
critical value (10%)     -2.578065
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary

ADF Test for remarks category_Power Failure at WTP
ADF test statistic       -3.899841
p-value                   0.002036
# lags used              11.000000
# observations          13

  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2


Forecast results:
            CLEAR WATER PUMPING FLOW ML  \
2024-07-28                   433.936372   
2024-08-04                   442.250708   
2024-08-11                   442.250708   
2024-08-18                   442.250708   
2024-08-25                   442.250708   
2024-09-01                   442.250708   

            remarks category_Power Failure at Intake  \
2024-07-28                                  0.270818   
2024-08-04                                  0.855070   
2024-08-11                                  0.855070   
2024-08-18                                  0.855070   
2024-08-25                                  0.855070   
2024-09-01                                  0.855070   

            remarks category_Power Failure at WTP  \
2024-07-28                               0.377942   
2024-08-04                               0.181157   
2024-08-11                               0.181157   
2024-08-18                               0.181157   
2024-08-25            



Full Data Forecast results:
            CLEAR WATER PUMPING FLOW ML  \
2024-09-08                   539.496185   
2024-09-15                   438.557242   
2024-09-22                   438.557242   

            remarks category_Power Failure at Intake  \
2024-09-08                                  2.154527   
2024-09-15                                  0.819446   
2024-09-22                                  0.819446   

            remarks category_Power Failure at WTP  \
2024-09-08                               0.499576   
2024-09-15                               0.173610   
2024-09-22                               0.173610   

            remarks category_Power Failure at intake and WTP  \
2024-09-08                                          1.450128   
2024-09-15                                          0.375000   
2024-09-22                                          0.375000   

            remarks category_WTP cleaning  remarks category_channel cieaning  \
2024-09-08              

Unnamed: 0,CLEAR WATER PUMPING FLOW ML,remarks category_Power Failure at Intake,remarks category_Power Failure at WTP,remarks category_Power Failure at intake and WTP,remarks category_WTP cleaning,remarks category_channel cieaning,remarks category_complaint at WTP,remarks category_distribution line maintanance,remarks category_energy auditing,remarks category_inlet chamber cleaning,remarks category_intake cleaning,remarks category_intake pumping stopped,remarks category_intake sump level low,remarks category_maintanance,remarks category_maintanance at Thaliparamba,remarks category_maintanance at intake,remarks category_pipe line broken at thaliparamba,remarks category_under voltage,remarks category_voltage fluctation at intake,Type
2021-12-05,235.670570,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2021-12-12,407.530700,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2021-12-19,395.666220,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2021-12-26,407.155871,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2022-01-02,403.914700,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-25,415.131130,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2024-09-01,200.408450,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Actual
2024-09-08,539.496185,2.154527,0.499576,1.450128,-21.789250,-0.099496,-1.291764,1.257413,0.139952,0.082261,-2.083707,0.847643,-0.062668,3.575382,0.866912,-0.151027,3.164479,-0.065496,0.022074,Predicted
2024-09-15,438.557242,0.819446,0.173610,0.375000,0.340284,0.069444,0.229167,0.243055,0.007808,0.027778,0.680561,0.090274,0.152778,0.500001,0.104180,0.152775,0.500001,0.111109,0.027784,Predicted
