In [1]:
import pandas as pd

from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse

In [2]:
data = pd.read_csv('df_treated.csv')
data.head()

Unnamed: 0,DATE,TIME,TREATED WATER PRODUCTION IN m3/h,REMARKS
0,2021-12-01,01:00:00,3397.39,No Remarks
1,2021-12-01,02:00:00,3378.46,No Remarks
2,2021-12-01,03:00:00,3392.56,No Remarks
3,2021-12-01,04:00:00,3402.36,No Remarks
4,2021-12-01,05:00:00,3394.38,No Remarks


In [3]:
data.shape

(21169, 4)

In [4]:
data.isna().sum()

DATE                                0
TIME                                0
TREATED WATER PRODUCTION IN m3/h    0
REMARKS                             0
dtype: int64

In [5]:
data['DATETIME'] = data['DATE']+' '+ data['TIME']
data.head()

Unnamed: 0,DATE,TIME,TREATED WATER PRODUCTION IN m3/h,REMARKS,DATETIME
0,2021-12-01,01:00:00,3397.39,No Remarks,2021-12-01 01:00:00
1,2021-12-01,02:00:00,3378.46,No Remarks,2021-12-01 02:00:00
2,2021-12-01,03:00:00,3392.56,No Remarks,2021-12-01 03:00:00
3,2021-12-01,04:00:00,3402.36,No Remarks,2021-12-01 04:00:00
4,2021-12-01,05:00:00,3394.38,No Remarks,2021-12-01 05:00:00


In [6]:
data = data[['DATETIME','TREATED WATER PRODUCTION IN m3/h','REMARKS']]
data.head()

Unnamed: 0,DATETIME,TREATED WATER PRODUCTION IN m3/h,REMARKS
0,2021-12-01 01:00:00,3397.39,No Remarks
1,2021-12-01 02:00:00,3378.46,No Remarks
2,2021-12-01 03:00:00,3392.56,No Remarks
3,2021-12-01 04:00:00,3402.36,No Remarks
4,2021-12-01 05:00:00,3394.38,No Remarks


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21169 entries, 0 to 21168
Data columns (total 3 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   DATETIME                          21169 non-null  object 
 1   TREATED WATER PRODUCTION IN m3/h  21169 non-null  float64
 2   REMARKS                           21169 non-null  object 
dtypes: float64(1), object(2)
memory usage: 496.3+ KB


In [8]:
data['DATETIME'] = pd.to_datetime(data['DATETIME'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21169 entries, 0 to 21168
Data columns (total 3 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   DATETIME                          21169 non-null  datetime64[ns]
 1   TREATED WATER PRODUCTION IN m3/h  21169 non-null  float64       
 2   REMARKS                           21169 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 496.3+ KB


In [9]:
data.set_index('DATETIME',inplace=True)
data.head()

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h,REMARKS
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-01 01:00:00,3397.39,No Remarks
2021-12-01 02:00:00,3378.46,No Remarks
2021-12-01 03:00:00,3392.56,No Remarks
2021-12-01 04:00:00,3402.36,No Remarks
2021-12-01 05:00:00,3394.38,No Remarks


In [10]:
data_weekly = data.resample('W')['TREATED WATER PRODUCTION IN m3/h'].mean()
data_weekly = data_weekly.reset_index()
data_weekly = data_weekly.set_index('DATETIME')
data_weekly.head()

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h
DATETIME,Unnamed: 1_level_1
2021-12-05,2251.238
2021-12-12,2664.468036
2021-12-19,2599.487857
2021-12-26,2706.295833
2022-01-02,2639.030357


In [11]:
data_weekly[data_weekly['TREATED WATER PRODUCTION IN m3/h']==0.0]

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h
DATETIME,Unnamed: 1_level_1


In [12]:
data_weekly.shape

(127, 1)

In [13]:
class TSA:
    def __init__(self,df):
        self.df = df
        
    def adf_test(self,valcol):
        """
        Pass in a time series and an optional title, returns an ADF report
        """
        result = adfuller(self.df[valcol].dropna(),autolag='AIC') # .dropna() handles differenced data
    
        labels = ['ADF test statistic','p-value','# lags used','# observations']
        out = pd.Series(result[0:4],index=labels)

        for key,val in result[4].items():
            out[f'critical value ({key})']=val
        
        print(out.to_string())          # .to_string() removes the line "dtype: float64"
    
        if result[1] <= 0.05:
            print("Strong evidence against the null hypothesis")
            print("Reject the null hypothesis")
            print("Data has no unit root and is stationary")
            state = "Stationary"
        else:
            print("Weak evidence against the null hypothesis")
            print("Fail to reject the null hypothesis")
            print("Data has a unit root and is non-stationary")
            state = "Non-stationary"
        if state == "Stationary":
            return "Yes"
        else:
            return "No"
    def determine_ARIMA_order(self,valcol):
        stepwise_fit = auto_arima(self.df[valcol], start_p=0, start_q=0,
                          error_action='ignore',   # we don't want to know if an order does not work
                          suppress_warnings=True,  # we don't want convergence warnings
                          stepwise=True)           # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order
    def fit_model(self,valcol):
#         if len(self.df[valcol]) > 70:
        train = self.df[valcol][:len(self.df[valcol])-8]
        test = self.df[valcol][len(self.df[valcol])-8:len(self.df[valcol])-4]
        val = self.df[valcol][len(self.df[valcol])-4:]
#         else:
#             train = self.df[valcol][:len(self.df[valcol])-8]
#             test = self.df[valcol][len(self.df[valcol])-8:len(self.df[valcol])-4]
#             val = self.df[valcol][len(self.df[valcol])-4:]
        start = len(train)
        end = len(train)+len(test)-1
#             print('train : {}'.format(train))
#             print('test : {}'.format(test))
        print('start : {}'.format(start))
        print('end : {}'.format(end))
        results = ARIMA(train,order=c1.determine_ARIMA_order(valcol)).fit()
        predictions = results.predict(start=start, end=end)
        predictions_val = results.predict(start=end+1, end=len(train)+len(test)+len(val)-1)
        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
#         return start, len(test)
        model_results = {"Model":"ARIMA","Stationary": c1.adf_test(valcol),"X_train": str(len(train))+" Weeks", "X_test": str(len(test))+" Weeks", "X_validation": str(len(val))+" Weeks", "ARIMA_order": c1.determine_ARIMA_order(valcol), "MSE": error1,"RMSE": error2,"MAPE":error3, "Accuracy":((1-error3)*100).round(0)}
#         with open('TSA_AQI_model_result_new.json','a') as f:
#             f.write(str(model_results)+',')
#             f.close()
        return predictions_val
        
            
    def full_data_model(self,valcol):
        results = ARIMA(self.df[valcol],order=c1.determine_ARIMA_order(valcol)).fit()
#         if len(self.df[valcol]) > 70:
        fcast = results.predict(len(self.df), len(self.df)+3,typ='levels').round(2)
#         else:
#             fcast = results.predict(len(self.df), len(self.df)+3,typ='levels').round(2)
#         ax = self.df[valcol].plot(legend=True,figsize=(12,6))
#         fcast.plot(legend=True)
        print(fcast)
#         error_rmse = rmse(self.df[valcol],fcast[0:len(self.df)])
#         error_mse = mean_squared_error(self.df[valcol],fcast[0:len(self.df)])
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
#         DF = DF.reset_index()
        DF_fcast = pd.DataFrame(fcast)
        DF_fcast['Type'] = 'Predicted'
        DF_fcast = DF_fcast.rename(columns={'predicted_mean':valcol})
        final_DF = pd.concat([DF,DF_fcast])
        final_DF = final_DF.reset_index()
#         DF_fcast = DF_fcast.rename(columns={'predicted_mean':'Predicted', 'index':'DateTime'})
        DF_val = pd.DataFrame(c1.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index':'Date','predicted_mean':'Validation'})
        final_DF = final_DF.rename(columns={'index':'Date'})
        print(DF_val)
        print(final_DF)
#         final_DF =  final_DF.merge(DF_val, on='DateTime',how='outer')
        final_DF =  final_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
        print(final_DF)
        
#         final_DF.to_json('/Users/syaminiv/Library/CloudStorage/OneDrive-BayesianWaysLLP/Documents/SiAP_ML_Application/Final_Notebooks/June2023/JSON/TSA_withval_Zone_{}.json'.format(item),orient='records')

c1 = TSA(data_weekly)
c1.adf_test("TREATED WATER PRODUCTION IN m3/h")
c1.determine_ARIMA_order("TREATED WATER PRODUCTION IN m3/h")
c1.fit_model("TREATED WATER PRODUCTION IN m3/h")
c1.full_data_model("TREATED WATER PRODUCTION IN m3/h")

ADF test statistic       -1.736958
p-value                   0.412178
# lags used               3.000000
# observations          123.000000
critical value (1%)      -3.484667
critical value (5%)      -2.885340
critical value (10%)     -2.579463
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary
The best order is (0, 1, 1)
start : 119
end : 122
The best order is (0, 1, 1)
MSE Error: 50405.19237
RMSE Error: 224.5110072
MAPE Error: 0.04845771623
Accuracy: 95.15422838
ADF test statistic       -1.736958
p-value                   0.412178
# lags used               3.000000
# observations          123.000000
critical value (1%)      -3.484667
critical value (5%)      -2.885340
critical value (10%)     -2.579463
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


The best order is (0, 1, 1)
The best order is (0, 1, 1)
2024-05-12    3350.46
2024-05-19    3350.46
2024-05-26    3350.46
2024-06-02    3350.46
Freq: W-SUN, Name: predicted_mean, dtype: float64
start : 119
end : 122


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


The best order is (0, 1, 1)
MSE Error: 50405.19237
RMSE Error: 224.5110072
MAPE Error: 0.04845771623
Accuracy: 95.15422838
ADF test statistic       -1.736958
p-value                   0.412178
# lags used               3.000000
# observations          123.000000
critical value (1%)      -3.484667
critical value (5%)      -2.885340
critical value (10%)     -2.579463
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


The best order is (0, 1, 1)
        Date   Validation
0 2024-04-14  3349.401765
1 2024-04-21  3349.401765
2 2024-04-28  3349.401765
3 2024-05-05  3349.401765
          Date  TREATED WATER PRODUCTION IN m3/h       Type
0   2021-12-05                       2251.238000     Actual
1   2021-12-12                       2664.468036     Actual
2   2021-12-19                       2599.487857     Actual
3   2021-12-26                       2706.295833     Actual
4   2022-01-02                       2639.030357     Actual
..         ...                               ...        ...
126 2024-05-05                       3371.501667     Actual
127 2024-05-12                       3350.460000  Predicted
128 2024-05-19                       3350.460000  Predicted
129 2024-05-26                       3350.460000  Predicted
130 2024-06-02                       3350.460000  Predicted

[131 rows x 3 columns]
           Date  TREATED WATER PRODUCTION IN m3/h       Type   Validation
0    2021-12-05         

### TSA class with seasonality

In [14]:
class TSA_seasonal:
    def __init__(self, df):
        self.df = df

    def adf_test(self, valcol):
        """
        Pass in a time series and an optional title, returns an ADF report
        """
        result = adfuller(self.df[valcol].dropna(), autolag='AIC')  # .dropna() handles differenced data

        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        out = pd.Series(result[0:4], index=labels)

        for key, val in result[4].items():
            out[f'critical value ({key})'] = val

        if result[1] <= 0.05:
            state = "Stationary"
        else:
            state = "Non-stationary"
        return state

    def determine_ARIMA_order(self, valcol):
        stepwise_fit = auto_arima(self.df[valcol], seasonal=True, m=48,
                                  start_p=0, start_q=0,
                                  error_action='ignore',  # we don't want to know if an order does not work
                                  suppress_warnings=True,  # we don't want convergence warnings
                                  stepwise=True)  # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order

    def fit_model(self, valcol):
        train = self.df[valcol][:len(self.df[valcol]) - 8]
        test = self.df[valcol][len(self.df[valcol]) - 8:len(self.df[valcol]) - 4]
        val = self.df[valcol][len(self.df[valcol]) - 4:]

        results = ARIMA(train, order=self.determine_ARIMA_order(valcol)).fit()
        predictions = results.predict(start=len(train), end=len(train) + len(test) - 1)
        predictions_val = results.predict(start=len(train) + len(test), end=len(train) + len(test) + len(val) - 1)
        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
        
        return predictions_val

    def full_data_model(self, valcol):
        results = ARIMA(self.df[valcol], order=self.determine_ARIMA_order(valcol)).fit()
        fcast = results.forecast(steps=4).round(2)  # Forecast 4 steps ahead
#         print(fcast)
#         fcast_index = pd.date_range(start=self.df.index[-1], periods=4 + 1, freq='M')[1:]  # Assuming monthly data
        DF_fcast = pd.DataFrame({valcol: fcast})
        DF_fcast['Type'] = 'Predicted'
        print(DF_fcast)
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
        # Concatenate original data and forecast data
        combined_DF = pd.concat([DF, DF_fcast])
        combined_DF = combined_DF.reset_index().rename(columns={'index':'Date'})
        print(combined_DF)
        DF_val = pd.DataFrame(c2.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index':'Date','predicted_mean':'Validation'})
#         combined_DF = combined_DF.rename(columns={'index':'Date'})
        print(DF_val)
        print(combined_DF)
#         final_DF =  final_DF.merge(DF_val, on='DateTime',how='outer')
        final_DF =  combined_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
#         print(combined_df)
        return final_DF
    
    
c2 = TSA_seasonal(data_weekly)
c2.adf_test("TREATED WATER PRODUCTION IN m3/h")
c2.determine_ARIMA_order("TREATED WATER PRODUCTION IN m3/h")
c2.fit_model("TREATED WATER PRODUCTION IN m3/h")
c2.full_data_model("TREATED WATER PRODUCTION IN m3/h")

The best order is (0, 1, 1)
The best order is (0, 1, 1)
MSE Error: 50405.19237
RMSE Error: 224.5110072
MAPE Error: 0.04845771623
Accuracy: 95.15422838


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


The best order is (0, 1, 1)
            TREATED WATER PRODUCTION IN m3/h       Type
2024-05-12                           3350.46  Predicted
2024-05-19                           3350.46  Predicted
2024-05-26                           3350.46  Predicted
2024-06-02                           3350.46  Predicted
          Date  TREATED WATER PRODUCTION IN m3/h       Type
0   2021-12-05                       2251.238000     Actual
1   2021-12-12                       2664.468036     Actual
2   2021-12-19                       2599.487857     Actual
3   2021-12-26                       2706.295833     Actual
4   2022-01-02                       2639.030357     Actual
..         ...                               ...        ...
126 2024-05-05                       3371.501667     Actual
127 2024-05-12                       3350.460000  Predicted
128 2024-05-19                       3350.460000  Predicted
129 2024-05-26                       3350.460000  Predicted
130 2024-06-02                  

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


The best order is (0, 1, 1)
MSE Error: 50405.19237
RMSE Error: 224.5110072
MAPE Error: 0.04845771623
Accuracy: 95.15422838
        Date   Validation
0 2024-04-14  3349.401765
1 2024-04-21  3349.401765
2 2024-04-28  3349.401765
3 2024-05-05  3349.401765
          Date  TREATED WATER PRODUCTION IN m3/h       Type
0   2021-12-05                       2251.238000     Actual
1   2021-12-12                       2664.468036     Actual
2   2021-12-19                       2599.487857     Actual
3   2021-12-26                       2706.295833     Actual
4   2022-01-02                       2639.030357     Actual
..         ...                               ...        ...
126 2024-05-05                       3371.501667     Actual
127 2024-05-12                       3350.460000  Predicted
128 2024-05-19                       3350.460000  Predicted
129 2024-05-26                       3350.460000  Predicted
130 2024-06-02                       3350.460000  Predicted

[131 rows x 3 columns]


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Unnamed: 0,Date,TREATED WATER PRODUCTION IN m3/h,Type,Validation
0,2021-12-05,2251.238000,Actual,
1,2021-12-12,2664.468036,Actual,
2,2021-12-19,2599.487857,Actual,
3,2021-12-26,2706.295833,Actual,
4,2022-01-02,2639.030357,Actual,
...,...,...,...,...
126,2024-05-05,3371.501667,Actual,3349.401765
127,2024-05-12,3350.460000,Predicted,
128,2024-05-19,3350.460000,Predicted,
129,2024-05-26,3350.460000,Predicted,
