In [30]:
import pandas as pd

from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse

In [31]:
data = pd.read_csv('df_treated.csv')
data.head()

Unnamed: 0,DATE,TIME,TREATED WATER PRODUCTION IN m3/h,REMARKS
0,2021-12-01,01:00:00,3397.39,No Remarks
1,2021-12-01,02:00:00,3378.46,No Remarks
2,2021-12-01,03:00:00,3392.56,No Remarks
3,2021-12-01,04:00:00,3402.36,No Remarks
4,2021-12-01,05:00:00,3394.38,No Remarks


In [32]:
data.tail()

Unnamed: 0,DATE,TIME,TREATED WATER PRODUCTION IN m3/h,REMARKS
21164,2024-04-30,20:00:00,3395.98,No Remarks
21165,2024-04-30,21:00:00,3459.08,No Remarks
21166,2024-04-30,22:00:00,3446.9,No Remarks
21167,2024-04-30,23:00:00,3388.89,No Remarks
21168,2024-04-30,00:00:00,3386.96,No Remarks


In [33]:
data.shape

(21169, 4)

In [34]:
data.isna().sum()

DATE                                0
TIME                                0
TREATED WATER PRODUCTION IN m3/h    0
REMARKS                             0
dtype: int64

In [35]:
data['DATETIME'] = data['DATE']+' '+ data['TIME']
data.head()

Unnamed: 0,DATE,TIME,TREATED WATER PRODUCTION IN m3/h,REMARKS,DATETIME
0,2021-12-01,01:00:00,3397.39,No Remarks,2021-12-01 01:00:00
1,2021-12-01,02:00:00,3378.46,No Remarks,2021-12-01 02:00:00
2,2021-12-01,03:00:00,3392.56,No Remarks,2021-12-01 03:00:00
3,2021-12-01,04:00:00,3402.36,No Remarks,2021-12-01 04:00:00
4,2021-12-01,05:00:00,3394.38,No Remarks,2021-12-01 05:00:00


In [36]:
data = data[['DATETIME','TREATED WATER PRODUCTION IN m3/h','REMARKS']]
data.head()

Unnamed: 0,DATETIME,TREATED WATER PRODUCTION IN m3/h,REMARKS
0,2021-12-01 01:00:00,3397.39,No Remarks
1,2021-12-01 02:00:00,3378.46,No Remarks
2,2021-12-01 03:00:00,3392.56,No Remarks
3,2021-12-01 04:00:00,3402.36,No Remarks
4,2021-12-01 05:00:00,3394.38,No Remarks


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21169 entries, 0 to 21168
Data columns (total 3 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   DATETIME                          21169 non-null  object 
 1   TREATED WATER PRODUCTION IN m3/h  21169 non-null  float64
 2   REMARKS                           21169 non-null  object 
dtypes: float64(1), object(2)
memory usage: 496.3+ KB


In [38]:
data['DATETIME'] = pd.to_datetime(data['DATETIME'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21169 entries, 0 to 21168
Data columns (total 3 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   DATETIME                          21169 non-null  datetime64[ns]
 1   TREATED WATER PRODUCTION IN m3/h  21169 non-null  float64       
 2   REMARKS                           21169 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 496.3+ KB


In [39]:
data.set_index('DATETIME',inplace=True)
data.head()

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h,REMARKS
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-01 01:00:00,3397.39,No Remarks
2021-12-01 02:00:00,3378.46,No Remarks
2021-12-01 03:00:00,3392.56,No Remarks
2021-12-01 04:00:00,3402.36,No Remarks
2021-12-01 05:00:00,3394.38,No Remarks


In [40]:
data.sort_index(inplace=True)

In [41]:
import plotly.express as px

In [42]:
fig = px.line(data, y = 'TREATED WATER PRODUCTION IN m3/h', title='treated water')
fig

In [43]:
data[(data.index>='2021-12-05 00:00:00')&(data.index<'2021-12-07 00:00:00')]

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h,REMARKS
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-05 00:00:00,2295.08,No Remarks
2021-12-05 01:00:00,1149.61,No Remarks
2021-12-05 02:00:00,1152.71,No Remarks
2021-12-05 03:00:00,1137.21,No Remarks
2021-12-05 04:00:00,1145.48,No Remarks
2021-12-05 05:00:00,1154.77,No Remarks
2021-12-05 06:00:00,1157.87,No Remarks
2021-12-05 07:00:00,1138.25,No Remarks
2021-12-05 08:00:00,1155.8,No Remarks
2021-12-05 09:00:00,2305.41,No Remarks


In [44]:
data1 = data.resample('H')['TREATED WATER PRODUCTION IN m3/h'].sum()
data1 = data1.reset_index()
data1 = data1.set_index('DATETIME')
data1.head()

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h
DATETIME,Unnamed: 1_level_1
2021-12-01 01:00:00,3397.39
2021-12-01 02:00:00,3378.46
2021-12-01 03:00:00,3392.56
2021-12-01 04:00:00,3402.36
2021-12-01 05:00:00,3394.38


In [45]:
data1.shape

(21167, 1)

In [46]:
data1.tail()

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h
DATETIME,Unnamed: 1_level_1
2024-04-30 19:00:00,3415.76
2024-04-30 20:00:00,3395.98
2024-04-30 21:00:00,3459.08
2024-04-30 22:00:00,3446.9
2024-04-30 23:00:00,3388.89


In [47]:
data1.isna().sum()

TREATED WATER PRODUCTION IN m3/h    0
dtype: int64

In [48]:
data_ind = data.index
data1_ind = data1.index

In [49]:
print(len(data_ind))
print(len(data1_ind))

21169
21167


In [50]:
data_ind

DatetimeIndex(['2021-12-01 01:00:00', '2021-12-01 02:00:00',
               '2021-12-01 03:00:00', '2021-12-01 04:00:00',
               '2021-12-01 05:00:00', '2021-12-01 06:00:00',
               '2021-12-01 07:00:00', '2021-12-01 08:00:00',
               '2021-12-01 09:00:00', '2021-12-01 10:00:00',
               ...
               '2024-04-30 14:00:00', '2024-04-30 15:00:00',
               '2024-04-30 16:00:00', '2024-04-30 17:00:00',
               '2024-04-30 18:00:00', '2024-04-30 19:00:00',
               '2024-04-30 20:00:00', '2024-04-30 21:00:00',
               '2024-04-30 22:00:00', '2024-04-30 23:00:00'],
              dtype='datetime64[ns]', name='DATETIME', length=21169, freq=None)

In [51]:
data1_ind

DatetimeIndex(['2021-12-01 01:00:00', '2021-12-01 02:00:00',
               '2021-12-01 03:00:00', '2021-12-01 04:00:00',
               '2021-12-01 05:00:00', '2021-12-01 06:00:00',
               '2021-12-01 07:00:00', '2021-12-01 08:00:00',
               '2021-12-01 09:00:00', '2021-12-01 10:00:00',
               ...
               '2024-04-30 14:00:00', '2024-04-30 15:00:00',
               '2024-04-30 16:00:00', '2024-04-30 17:00:00',
               '2024-04-30 18:00:00', '2024-04-30 19:00:00',
               '2024-04-30 20:00:00', '2024-04-30 21:00:00',
               '2024-04-30 22:00:00', '2024-04-30 23:00:00'],
              dtype='datetime64[ns]', name='DATETIME', length=21167, freq=None)

In [52]:
data_ind.difference(data1_ind)

DatetimeIndex([], dtype='datetime64[ns]', name='DATETIME', freq=None)

In [53]:
missing_indices = data1[~data1.index.isin(data.index)]
missing_indices

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h
DATETIME,Unnamed: 1_level_1
2022-02-06 02:00:00,0.0
2022-02-06 03:00:00,0.0
2022-03-15 00:00:00,0.0
2022-04-15 00:00:00,0.0
2022-05-15 00:00:00,0.0
...,...
2023-11-09 20:00:00,0.0
2023-11-09 21:00:00,0.0
2023-11-09 22:00:00,0.0
2023-11-09 23:00:00,0.0


In [54]:
data1[data1.index=='2023-11-09 20:00:00']

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h
DATETIME,Unnamed: 1_level_1
2023-11-09 20:00:00,0.0


In [55]:
data[data.index=='2023-11-09 20:00:00']

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h,REMARKS
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1


In [56]:
data[~data.index.isin(data1.index)]

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h,REMARKS
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1


In [57]:
data[data.index>='2024-04-30 00:00:00']

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h,REMARKS
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-04-30 00:00:00,3386.96,No Remarks
2024-04-30 01:00:00,3331.07,No Remarks
2024-04-30 02:00:00,3345.15,No Remarks
2024-04-30 03:00:00,3339.39,No Remarks
2024-04-30 04:00:00,3351.92,No Remarks
2024-04-30 05:00:00,3358.96,No Remarks
2024-04-30 06:00:00,3380.25,No Remarks
2024-04-30 07:00:00,3349.87,No Remarks
2024-04-30 08:00:00,3337.14,No Remarks
2024-04-30 09:00:00,3375.74,No Remarks


In [29]:
class TSA:
    def __init__(self,df):
        self.df = df
        
    def adf_test(self,valcol):
        """
        Pass in a time series and an optional title, returns an ADF report
        """
        result = adfuller(self.df[valcol].dropna(),autolag='AIC') # .dropna() handles differenced data
    
        labels = ['ADF test statistic','p-value','# lags used','# observations']
        out = pd.Series(result[0:4],index=labels)

        for key,val in result[4].items():
            out[f'critical value ({key})']=val
        
        print(out.to_string())          # .to_string() removes the line "dtype: float64"
    
        if result[1] <= 0.05:
            print("Strong evidence against the null hypothesis")
            print("Reject the null hypothesis")
            print("Data has no unit root and is stationary")
            state = "Stationary"
        else:
            print("Weak evidence against the null hypothesis")
            print("Fail to reject the null hypothesis")
            print("Data has a unit root and is non-stationary")
            state = "Non-stationary"
        if state == "Stationary":
            return "Yes"
        else:
            return "No"
    def determine_ARIMA_order(self,valcol):
        stepwise_fit = auto_arima(self.df[valcol], start_p=0, start_q=0,
                          error_action='ignore',   # we don't want to know if an order does not work
                          suppress_warnings=True,  # we don't want convergence warnings
                          stepwise=True)           # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order
    def fit_model(self,valcol):
#         if len(self.df[valcol]) > 70:
        train = self.df[valcol][:len(self.df[valcol])-28]
        test = self.df[valcol][len(self.df[valcol])-28:len(self.df[valcol])-16]
        val = self.df[valcol][len(self.df[valcol])-16:]
#         else:
#             train = self.df[valcol][:len(self.df[valcol])-8]
#             test = self.df[valcol][len(self.df[valcol])-8:len(self.df[valcol])-4]
#             val = self.df[valcol][len(self.df[valcol])-4:]
        start = len(train)
        end = len(train)+len(test)-1
#             print('train : {}'.format(train))
#             print('test : {}'.format(test))
        print('start : {}'.format(start))
        print('end : {}'.format(end))
        results = ARIMA(train,order=c1.determine_ARIMA_order(valcol)).fit()
        predictions = results.predict(start=start, end=end)
        predictions_val = results.predict(start=end+1, end=len(train)+len(test)+len(val)-1)
        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
#         return start, len(test)
        model_results = {"Model":"ARIMA","Stationary": c1.adf_test(valcol),"X_train": str(len(train))+" Weeks", "X_test": str(len(test))+" Weeks", "X_validation": str(len(val))+" Weeks", "ARIMA_order": c1.determine_ARIMA_order(valcol), "MSE": error1,"RMSE": error2,"MAPE":error3, "Accuracy":((1-error3)*100).round(0)}
#         with open('TSA_AQI_model_result_new.json','a') as f:
#             f.write(str(model_results)+',')
#             f.close()
        return predictions_val
        
            
    def full_data_model(self,valcol):
        results = ARIMA(self.df[valcol],order=c1.determine_ARIMA_order(valcol)).fit()
#         if len(self.df[valcol]) > 70:
        fcast = results.predict(len(self.df), len(self.df)+11,typ='levels').round(2)
#         else:
#             fcast = results.predict(len(self.df), len(self.df)+3,typ='levels').round(2)
#         ax = self.df[valcol].plot(legend=True,figsize=(12,6))
#         fcast.plot(legend=True)
        print(fcast)
#         error_rmse = rmse(self.df[valcol],fcast[0:len(self.df)])
#         error_mse = mean_squared_error(self.df[valcol],fcast[0:len(self.df)])
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
#         DF = DF.reset_index()
        DF_fcast = pd.DataFrame(fcast)
        DF_fcast['Type'] = 'Predicted'
        DF_fcast = DF_fcast.rename(columns={'predicted_mean':valcol})
        final_DF = pd.concat([DF,DF_fcast])
        final_DF = final_DF.reset_index()
#         DF_fcast = DF_fcast.rename(columns={'predicted_mean':'Predicted', 'index':'DateTime'})
        DF_val = pd.DataFrame(c1.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index':'Date','predicted_mean':'Validation'})
        final_DF = final_DF.rename(columns={'index':'Date'})
        print(DF_val)
        print(final_DF)
#         final_DF =  final_DF.merge(DF_val, on='DateTime',how='outer')
        final_DF =  final_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
        print(final_DF)
        
#         final_DF.to_json('/Users/syaminiv/Library/CloudStorage/OneDrive-BayesianWaysLLP/Documents/SiAP_ML_Application/Final_Notebooks/June2023/JSON/TSA_withval_Zone_{}.json'.format(item),orient='records')

c1 = TSA(data)
c1.adf_test("TREATED WATER PRODUCTION IN m3/h")
c1.determine_ARIMA_order("TREATED WATER PRODUCTION IN m3/h")
c1.fit_model("TREATED WATER PRODUCTION IN m3/h")
c1.full_data_model("TREATED WATER PRODUCTION IN m3/h")

ADF test statistic     -1.337151e+01
p-value                 5.186679e-25
# lags used             4.600000e+01
# observations          2.112200e+04
critical value (1%)    -3.430660e+00
critical value (5%)    -2.861677e+00
critical value (10%)   -2.566843e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
The best order is (3, 1, 1)
start : 21141
end : 21152
The best order is (3, 1, 1)



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.



MSE Error: 5829.258424
RMSE Error: 76.34958038
MAPE Error: 0.02206183022
Accuracy: 97.79381698
ADF test statistic     -1.337151e+01
p-value                 5.186679e-25
# lags used             4.600000e+01
# observations          2.112200e+04
critical value (1%)    -3.430660e+00
critical value (5%)    -2.861677e+00
critical value (10%)   -2.566843e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
The best order is (3, 1, 1)
The best order is (3, 1, 1)



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


Unknown keyword arguments: dict_keys(['typ']).Passing unknown keyword arguments will raise a TypeError beginning in version 0.15.



21169    3420.10
21170    3418.29
21171    3417.46
21172    3418.94
21173    3418.95
21174    3419.02
21175    3419.12
21176    3419.13
21177    3419.15
21178    3419.15
21179    3419.16
21180    3419.16
Name: predicted_mean, dtype: float64
start : 21141
end : 21152
The best order is (3, 1, 1)



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.



MSE Error: 5829.258424
RMSE Error: 76.34958038
MAPE Error: 0.02206183022
Accuracy: 97.79381698
ADF test statistic     -1.337151e+01
p-value                 5.186679e-25
# lags used             4.600000e+01
# observations          2.112200e+04
critical value (1%)    -3.430660e+00
critical value (5%)    -2.861677e+00
critical value (10%)   -2.566843e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
The best order is (3, 1, 1)
     Date   Validation
0   21153  3433.473135
1   21154  3433.477401
2   21155  3433.479561
3   21156  3433.480644
4   21157  3433.481187
5   21158  3433.481460
6   21159  3433.481597
7   21160  3433.481666
8   21161  3433.481701
9   21162  3433.481718
10  21163  3433.481727
11  21164  3433.481731
12  21165  3433.481733
13  21166  3433.481734
14  21167  3433.481735
15  21168  3433.481735
                      Date  TREATED WATER PRODUCTION IN m3/h       Type
0      2021-12-01 01:00:00                  

In [30]:
c1 = TSA(data1)
c1.adf_test("TREATED WATER PRODUCTION IN m3/h")
c1.determine_ARIMA_order("TREATED WATER PRODUCTION IN m3/h")
c1.fit_model("TREATED WATER PRODUCTION IN m3/h")
c1.full_data_model("TREATED WATER PRODUCTION IN m3/h")

ADF test statistic     -1.849374e+01
p-value                 2.125087e-30
# lags used             2.600000e+01
# observations          2.114000e+04
critical value (1%)    -3.430659e+00
critical value (5%)    -2.861677e+00
critical value (10%)   -2.566843e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
The best order is (3, 1, 1)
start : 21139
end : 21150
The best order is (3, 1, 1)



No frequency information was provided, so inferred frequency H will be used.


No frequency information was provided, so inferred frequency H will be used.


No frequency information was provided, so inferred frequency H will be used.



MSE Error: 5917.575328
RMSE Error: 76.92577805
MAPE Error: 0.02222529503
Accuracy:  97.7774705
ADF test statistic     -1.849374e+01
p-value                 2.125087e-30
# lags used             2.600000e+01
# observations          2.114000e+04
critical value (1%)    -3.430659e+00
critical value (5%)    -2.861677e+00
critical value (10%)   -2.566843e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
The best order is (3, 1, 1)
The best order is (3, 1, 1)



No frequency information was provided, so inferred frequency H will be used.


No frequency information was provided, so inferred frequency H will be used.


No frequency information was provided, so inferred frequency H will be used.


Unknown keyword arguments: dict_keys(['typ']).Passing unknown keyword arguments will raise a TypeError beginning in version 0.15.



2024-05-01 00:00:00    3421.46
2024-05-01 01:00:00    3419.62
2024-05-01 02:00:00    3418.59
2024-05-01 03:00:00    3420.40
2024-05-01 04:00:00    3420.42
2024-05-01 05:00:00    3420.52
2024-05-01 06:00:00    3420.66
2024-05-01 07:00:00    3420.68
2024-05-01 08:00:00    3420.71
2024-05-01 09:00:00    3420.72
2024-05-01 10:00:00    3420.73
2024-05-01 11:00:00    3420.73
Freq: H, Name: predicted_mean, dtype: float64
start : 21139
end : 21150
The best order is (3, 1, 1)



No frequency information was provided, so inferred frequency H will be used.


No frequency information was provided, so inferred frequency H will be used.


No frequency information was provided, so inferred frequency H will be used.



MSE Error: 5917.575328
RMSE Error: 76.92577805
MAPE Error: 0.02222529503
Accuracy:  97.7774705
ADF test statistic     -1.849374e+01
p-value                 2.125087e-30
# lags used             2.600000e+01
# observations          2.114000e+04
critical value (1%)    -3.430659e+00
critical value (5%)    -2.861677e+00
critical value (10%)   -2.566843e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
The best order is (3, 1, 1)
                  Date   Validation
0  2024-04-30 08:00:00  3434.990279
1  2024-04-30 09:00:00  3434.999068
2  2024-04-30 10:00:00  3435.003775
3  2024-04-30 11:00:00  3435.006275
4  2024-04-30 12:00:00  3435.007602
5  2024-04-30 13:00:00  3435.008309
6  2024-04-30 14:00:00  3435.008685
7  2024-04-30 15:00:00  3435.008884
8  2024-04-30 16:00:00  3435.008991
9  2024-04-30 17:00:00  3435.009047
10 2024-04-30 18:00:00  3435.009077
11 2024-04-30 19:00:00  3435.009093
12 2024-04-30 20:00:00  3435.009102
13 

### TSA class with seasonality

In [None]:
class TSA_seasonal:
    def __init__(self, df):
        self.df = df

    def adf_test(self, valcol):
        """
        Pass in a time series and an optional title, returns an ADF report
        """
        result = adfuller(self.df[valcol].dropna(), autolag='AIC')  # .dropna() handles differenced data

        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        out = pd.Series(result[0:4], index=labels)

        for key, val in result[4].items():
            out[f'critical value ({key})'] = val

        if result[1] <= 0.05:
            state = "Stationary"
        else:
            state = "Non-stationary"
        return state

    def determine_ARIMA_order(self, valcol):
        stepwise_fit = auto_arima(self.df[valcol], seasonal=True, m=24,
                                  start_p=0, start_q=0,
                                  error_action='ignore',  # we don't want to know if an order does not work
                                  suppress_warnings=True,  # we don't want convergence warnings
                                  stepwise=True)  # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order

    def fit_model(self, valcol):
        train = self.df[valcol][:len(self.df[valcol]) - 28]
        test = self.df[valcol][len(self.df[valcol]) - 28:len(self.df[valcol]) - 16]
        val = self.df[valcol][len(self.df[valcol]) - 16:]

        results = ARIMA(train, order=self.determine_ARIMA_order(valcol)).fit()
        predictions = results.predict(start=len(train), end=len(train) + len(test) - 1)
        predictions_val = results.predict(start=len(train) + len(test), end=len(train) + len(test) + len(val) - 1)
        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
        
        return predictions_val

    def full_data_model(self, valcol):
        results = ARIMA(self.df[valcol], order=self.determine_ARIMA_order(valcol)).fit()
        fcast = results.forecast(steps=12).round(2)  # Forecast 4 steps ahead
#         print(fcast)
#         fcast_index = pd.date_range(start=self.df.index[-1], periods=4 + 1, freq='M')[1:]  # Assuming monthly data
        DF_fcast = pd.DataFrame({valcol: fcast})
        DF_fcast['Type'] = 'Predicted'
        print(DF_fcast)
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
        # Concatenate original data and forecast data
        combined_DF = pd.concat([DF, DF_fcast])
        combined_DF = combined_DF.reset_index().rename(columns={'index':'Date'})
        print(combined_DF)
        DF_val = pd.DataFrame(c2.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index':'Date','predicted_mean':'Validation'})
#         combined_DF = combined_DF.rename(columns={'index':'Date'})
        print(DF_val)
        print(combined_DF)
#         final_DF =  final_DF.merge(DF_val, on='DateTime',how='outer')
        final_DF =  combined_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
#         print(combined_df)
        return final_DF
    
    
c2 = TSA_seasonal(data1)
c2.adf_test("TREATED WATER PRODUCTION IN m3/h")
c2.determine_ARIMA_order("TREATED WATER PRODUCTION IN m3/h")
c2.fit_model("TREATED WATER PRODUCTION IN m3/h")
c2.full_data_model("TREATED WATER PRODUCTION IN m3/h")

In [59]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
class TSA_seasonal:
    def __init__(self, df):
        self.df = df

    def adf_test(self, valcol):
        """
        Pass in a time series and an optional title, returns an ADF report
        """
        result = adfuller(self.df[valcol].dropna(), autolag='AIC')  # .dropna() handles differenced data

        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        out = pd.Series(result[0:4], index=labels)
       

        for key, val in result[4].items():
            out[f'critical value ({key})'] = val

        if result[1] <= 0.05:
            state = "Stationary"
        else:
            state = "Non-stationary"
        return state
    
#     def find_D(self, valcol):
#         # Estimate number of seasonal differences using a Canova-Hansen test
#         D = nsdiffs(self.df[valcol], m=12, test='ch')
#         return D
    

    def determine_SARIMA_order(self, valcol):
#         D = self.find_D(valcol)
        stepwise_fit = auto_arima(self.df[valcol], seasonal=True, m=24,
                               start_p=0, start_q=0,
#                               start_P=0, start_Q=0,
#                                   D=D
                                  error_action='ignore',  # we don't want to know if an order does not work
                                  suppress_warnings=True,  # we don't want convergence warnings
                                  stepwise=True)  # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        best_seasonal_order = stepwise_fit.get_params().get('seasonal_order')
        print('The best seasonal order is {}'.format(best_seasonal_order))
        print('The best order is {}'.format(best_order))
        return best_order, best_seasonal_order


    def fit_model(self, valcol):
    # Split the data into train, test, and validation sets
        train = self.df[valcol][:len(self.df[valcol]) - 28]
        test = self.df[valcol][len(self.df[valcol]) - 28:len(self.df[valcol]) - 16]
        val = self.df[valcol][len(self.df[valcol]) - 16:]

        # Determine the best SARIMA order
        best_order, best_seasonal_order = self.determine_SARIMA_order(valcol)

        # Fit the SARIMA model on the training data
        model = SARIMAX(train, order=best_order, seasonal_order=best_seasonal_order)
        results = model.fit()

        # Generate predictions for the test set
        predictions = results.predict(start=len(train), end=len(train) + len(test) - 1)

        # Generate predictions for the validation set
        predictions_val = results.predict(start=len(train) + len(test), end=len(train) + len(test) + len(val) - 1)

        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
        return predictions_val

    def full_data_model(self, valcol):
        best_order, best_seasonal_order = self.determine_SARIMA_order(valcol)
        model = SARIMAX(self.df[valcol], order=best_order, seasonal_order=best_seasonal_order)
        results = model.fit()
        fcast = results.forecast(steps=12).round(2)  # Forecast 4 steps ahead
#         print(fcast)
#         fcast_index = pd.date_range(start=self.df.index[-1], periods=4 + 1, freq='M')[1:]  # Assuming monthly data
        DF_fcast = pd.DataFrame({valcol: fcast})
        DF_fcast['Type'] = 'Predicted'
        print(DF_fcast)
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
        # Concatenate original data and forecast data
        combined_DF = pd.concat([DF, DF_fcast])
        combined_DF = combined_DF.reset_index().rename(columns={'index':'Date'})
        print(combined_DF)
        DF_val = pd.DataFrame(c2.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index':'Date','predicted_mean':'Validation'})
#         combined_DF = combined_DF.rename(columns={'index':'Date'})
        print(DF_val)
        print(combined_DF)
#         final_DF =  final_DF.merge(DF_val, on='DateTime',how='outer')
        final_DF =  combined_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
#         print(combined_df)
        return final_DF
    
    
c2 = TSA_seasonal(data1)
c2.adf_test("TREATED WATER PRODUCTION IN m3/h")
# c2.find_D('TREATED WATER PRODUCTION IN m3/h')
c2.determine_SARIMA_order("TREATED WATER PRODUCTION IN m3/h")
c2.fit_model("TREATED WATER PRODUCTION IN m3/h")
c2.full_data_model("TREATED WATER PRODUCTION IN m3/h")

KeyboardInterrupt: 