In [25]:
import pandas as pd

from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse

In [26]:
data = pd.read_csv('df_treated.csv')
data.head()

Unnamed: 0,DATE,TIME,TREATED WATER PRODUCTION IN m3/h,REMARKS
0,2021-12-01,01:00:00,3397.39,No Remarks
1,2021-12-01,02:00:00,3378.46,No Remarks
2,2021-12-01,03:00:00,3392.56,No Remarks
3,2021-12-01,04:00:00,3402.36,No Remarks
4,2021-12-01,05:00:00,3394.38,No Remarks


In [27]:
data.shape

(21169, 4)

In [28]:
data.isna().sum()

DATE                                0
TIME                                0
TREATED WATER PRODUCTION IN m3/h    0
REMARKS                             0
dtype: int64

In [29]:
data['DATETIME'] = data['DATE']+' '+ data['TIME']
data.head()

Unnamed: 0,DATE,TIME,TREATED WATER PRODUCTION IN m3/h,REMARKS,DATETIME
0,2021-12-01,01:00:00,3397.39,No Remarks,2021-12-01 01:00:00
1,2021-12-01,02:00:00,3378.46,No Remarks,2021-12-01 02:00:00
2,2021-12-01,03:00:00,3392.56,No Remarks,2021-12-01 03:00:00
3,2021-12-01,04:00:00,3402.36,No Remarks,2021-12-01 04:00:00
4,2021-12-01,05:00:00,3394.38,No Remarks,2021-12-01 05:00:00


In [30]:
data = data[['DATETIME','TREATED WATER PRODUCTION IN m3/h','REMARKS']]
data.head()

Unnamed: 0,DATETIME,TREATED WATER PRODUCTION IN m3/h,REMARKS
0,2021-12-01 01:00:00,3397.39,No Remarks
1,2021-12-01 02:00:00,3378.46,No Remarks
2,2021-12-01 03:00:00,3392.56,No Remarks
3,2021-12-01 04:00:00,3402.36,No Remarks
4,2021-12-01 05:00:00,3394.38,No Remarks


In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21169 entries, 0 to 21168
Data columns (total 3 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   DATETIME                          21169 non-null  object 
 1   TREATED WATER PRODUCTION IN m3/h  21169 non-null  float64
 2   REMARKS                           21169 non-null  object 
dtypes: float64(1), object(2)
memory usage: 496.3+ KB


In [32]:
data['DATETIME'] = pd.to_datetime(data['DATETIME'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21169 entries, 0 to 21168
Data columns (total 3 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   DATETIME                          21169 non-null  datetime64[ns]
 1   TREATED WATER PRODUCTION IN m3/h  21169 non-null  float64       
 2   REMARKS                           21169 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 496.3+ KB


In [33]:
data.set_index('DATETIME',inplace=True)
data.head()

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h,REMARKS
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-01 01:00:00,3397.39,No Remarks
2021-12-01 02:00:00,3378.46,No Remarks
2021-12-01 03:00:00,3392.56,No Remarks
2021-12-01 04:00:00,3402.36,No Remarks
2021-12-01 05:00:00,3394.38,No Remarks


In [34]:
data_daily = data.resample('24H')['TREATED WATER PRODUCTION IN m3/h'].mean()
data_daily = data_daily.reset_index()
data_daily = data_daily.set_index('DATETIME')
data_daily.head()

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h
DATETIME,Unnamed: 1_level_1
2021-12-01,2060.383913
2021-12-02,3005.474
2021-12-03,1968.085833
2021-12-04,1960.905417
2021-12-05,2221.962083


In [35]:
data_daily[data_daily['TREATED WATER PRODUCTION IN m3/h']==0.0]

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h
DATETIME,Unnamed: 1_level_1
2022-01-10,0.0
2022-06-15,0.0
2022-06-16,0.0
2024-01-04,0.0
2024-01-05,0.0


In [36]:
data[(data.index>='2024-01-04 00:00:00')&(data.index<'2024-01-06 00:00:00')]

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h,REMARKS
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-04 01:00:00,0.0,Pumping stopped for maintanance work & NHAI work.
2024-01-04 02:00:00,0.0,Pumping stopped for maintanance work & NHAI work.
2024-01-04 03:00:00,0.0,Pumping stopped for maintanance work & NHAI work.
2024-01-04 04:00:00,0.0,Pumping stopped for maintanance work & NHAI work.
2024-01-04 05:00:00,0.0,Pumping stopped for maintanance work & NHAI work.
2024-01-04 06:00:00,0.0,Pumping stopped for maintanance work & NHAI work.
2024-01-04 07:00:00,0.0,Pumping stopped for maintanance work & NHAI work.
2024-01-04 08:00:00,0.0,Pumping stopped for maintanance work & NHAI work.
2024-01-04 09:00:00,0.0,Pumping stopped for maintanance work & NHAI work.
2024-01-04 10:00:00,0.0,Pumping stopped for maintanance work & NHAI work.


In [37]:
data_daily[data_daily['TREATED WATER PRODUCTION IN m3/h'].isna()]

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h
DATETIME,Unnamed: 1_level_1
2022-05-28,
2023-10-09,


In [38]:
data[(data.index>='2023-10-09 00:00:00')&(data.index<'2023-10-10 00:00:00')]

Unnamed: 0_level_0,TREATED WATER PRODUCTION IN m3/h,REMARKS
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1


In [39]:
data_daily['TREATED WATER PRODUCTION IN m3/h'].fillna(0.0,inplace=True)
data_daily.isna().sum()

TREATED WATER PRODUCTION IN m3/h    0
dtype: int64

In [40]:
data_daily.shape

(882, 1)

In [22]:
class TSA:
    def __init__(self,df):
        self.df = df
        
    def adf_test(self,valcol):
        """
        Pass in a time series and an optional title, returns an ADF report
        """
        result = adfuller(self.df[valcol].dropna(),autolag='AIC') # .dropna() handles differenced data
    
        labels = ['ADF test statistic','p-value','# lags used','# observations']
        out = pd.Series(result[0:4],index=labels)

        for key,val in result[4].items():
            out[f'critical value ({key})']=val
        
        print(out.to_string())          # .to_string() removes the line "dtype: float64"
    
        if result[1] <= 0.05:
            print("Strong evidence against the null hypothesis")
            print("Reject the null hypothesis")
            print("Data has no unit root and is stationary")
            state = "Stationary"
        else:
            print("Weak evidence against the null hypothesis")
            print("Fail to reject the null hypothesis")
            print("Data has a unit root and is non-stationary")
            state = "Non-stationary"
        if state == "Stationary":
            return "Yes"
        else:
            return "No"
    def determine_ARIMA_order(self,valcol):
        stepwise_fit = auto_arima(self.df[valcol], start_p=0, start_q=0,
                          error_action='ignore',   # we don't want to know if an order does not work
                          suppress_warnings=True,  # we don't want convergence warnings
                          stepwise=True)           # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order
    def fit_model(self,valcol):
#         if len(self.df[valcol]) > 70:
        train = self.df[valcol][:len(self.df[valcol])-60]#30 test + 30 val =60 train (means except last 60 rows all others taken as train)
        test = self.df[valcol][len(self.df[valcol])-60:len(self.df[valcol])-30] #(60-30=30)second last 30 rows 
        val = self.df[valcol][len(self.df[valcol])-30:] #last 30 rows
#         else:
#             train = self.df[valcol][:len(self.df[valcol])-8]
#             test = self.df[valcol][len(self.df[valcol])-8:len(self.df[valcol])-4]
#             val = self.df[valcol][len(self.df[valcol])-4:]
        start = len(train)
        end = len(train)+len(test)-1
#             print('train : {}'.format(train))
#             print('test : {}'.format(test))
        print('start : {}'.format(start))
        print('end : {}'.format(end))
        results = ARIMA(train,order=c1.determine_ARIMA_order(valcol)).fit()
        predictions = results.predict(start=start, end=end)
        predictions_val = results.predict(start=end+1, end=len(train)+len(test)+len(val)-1)
        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
#         return start, len(test)
#         model_results = {"Model":"ARIMA","Stationary": c1.adf_test(valcol),"X_train": str(len(train))+" Weeks", "X_test": str(len(test))+" Weeks", "X_validation": str(len(val))+" Weeks", "ARIMA_order": c1.determine_ARIMA_order(valcol), "MSE": error1,"RMSE": error2,"MAPE":error3, "Accuracy":((1-error3)*100).round(0)}
#         with open('TSA_AQI_model_result_new.json','a') as f:
#             f.write(str(model_results)+',')
#             f.close()
        return predictions_val
        
            
    def full_data_model(self,valcol):
        results = ARIMA(self.df[valcol],order=c1.determine_ARIMA_order(valcol)).fit()
#         if len(self.df[valcol]) > 70:
        fcast = results.predict(len(self.df), len(self.df)+29,typ='levels').round(2)
#         else:
#             fcast = results.predict(len(self.df), len(self.df)+3,typ='levels').round(2)
#         ax = self.df[valcol].plot(legend=True,figsize=(12,6))
#         fcast.plot(legend=True)
        print(fcast)
#         error_rmse = rmse(self.df[valcol],fcast[0:len(self.df)])
#         error_mse = mean_squared_error(self.df[valcol],fcast[0:len(self.df)])
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
#         DF = DF.reset_index()
        DF_fcast = pd.DataFrame(fcast)
        DF_fcast['Type'] = 'Predicted'
        DF_fcast = DF_fcast.rename(columns={'predicted_mean':valcol})
        final_DF = pd.concat([DF,DF_fcast])
        final_DF = final_DF.reset_index()
#         DF_fcast = DF_fcast.rename(columns={'predicted_mean':'Predicted', 'index':'DateTime'})
        DF_val = pd.DataFrame(c1.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index':'Date','predicted_mean':'Validation'})
        final_DF = final_DF.rename(columns={'index':'Date'})
        print(DF_val)
        print(final_DF)
#         final_DF =  final_DF.merge(DF_val, on='DateTime',how='outer')
        final_DF =  final_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
        print(final_DF)
        
#         final_DF.to_json('/Users/syaminiv/Library/CloudStorage/OneDrive-BayesianWaysLLP/Documents/SiAP_ML_Application/Final_Notebooks/June2023/JSON/TSA_withval_Zone_{}.json'.format(item),orient='records')

c1 = TSA(data_daily)
c1.adf_test("TREATED WATER PRODUCTION IN m3/h")
c1.determine_ARIMA_order("TREATED WATER PRODUCTION IN m3/h")
c1.fit_model("TREATED WATER PRODUCTION IN m3/h")
c1.full_data_model("TREATED WATER PRODUCTION IN m3/h")

ADF test statistic       -2.669453
p-value                   0.079475
# lags used              18.000000
# observations          863.000000
critical value (1%)      -3.437950
critical value (5%)      -2.864895
critical value (10%)     -2.568556
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary
The best order is (2, 1, 1)
start : 822
end : 851
The best order is (2, 1, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


MSE Error:  80451.8808
RMSE Error: 283.6404076
MAPE Error: 0.04552011518
Accuracy: 95.44798848
ADF test statistic       -2.669453
p-value                   0.079475
# lags used              18.000000
# observations          863.000000
critical value (1%)      -3.437950
critical value (5%)      -2.864895
critical value (10%)     -2.568556
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary
The best order is (2, 1, 1)
The best order is (2, 1, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


2024-05-01    3349.97
2024-05-02    3340.47
2024-05-03    3334.76
2024-05-04    3332.26
2024-05-05    3331.06
2024-05-06    3330.50
2024-05-07    3330.23
2024-05-08    3330.11
2024-05-09    3330.05
2024-05-10    3330.02
2024-05-11    3330.01
2024-05-12    3330.00
2024-05-13    3330.00
2024-05-14    3330.00
2024-05-15    3330.00
2024-05-16    3330.00
2024-05-17    3330.00
2024-05-18    3330.00
2024-05-19    3330.00
2024-05-20    3330.00
2024-05-21    3330.00
2024-05-22    3330.00
2024-05-23    3330.00
2024-05-24    3330.00
2024-05-25    3330.00
2024-05-26    3330.00
2024-05-27    3330.00
2024-05-28    3330.00
2024-05-29    3330.00
2024-05-30    3330.00
Freq: D, Name: predicted_mean, dtype: float64
start : 822
end : 851
The best order is (2, 1, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


MSE Error:  80451.8808
RMSE Error: 283.6404076
MAPE Error: 0.04552011518
Accuracy: 95.44798848
ADF test statistic       -2.669453
p-value                   0.079475
# lags used              18.000000
# observations          863.000000
critical value (1%)      -3.437950
critical value (5%)      -2.864895
critical value (10%)     -2.568556
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary
The best order is (2, 1, 1)
         Date   Validation
0  2024-04-01  3255.278608
1  2024-04-02  3255.278608
2  2024-04-03  3255.278608
3  2024-04-04  3255.278608
4  2024-04-05  3255.278608
5  2024-04-06  3255.278608
6  2024-04-07  3255.278608
7  2024-04-08  3255.278608
8  2024-04-09  3255.278608
9  2024-04-10  3255.278608
10 2024-04-11  3255.278608
11 2024-04-12  3255.278608
12 2024-04-13  3255.278608
13 2024-04-14  3255.278608
14 2024-04-15  3255.278608
15 2024-04-16  3255.278608
16 2024-04-17  3255.278608
17 2024-04-18  3255.278608

### TSA class with seasonality

In [24]:
class TSA_seasonal:
    def __init__(self, df):
        self.df = df

    def adf_test(self, valcol):
        """
        Pass in a time series and an optional title, returns an ADF report
        """
        result = adfuller(self.df[valcol].dropna(), autolag='AIC')  # .dropna() handles differenced data

        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        out = pd.Series(result[0:4], index=labels)

        for key, val in result[4].items():
            out[f'critical value ({key})'] = val

        if result[1] <= 0.05:
            state = "Stationary"
        else:
            state = "Non-stationary"
        return state

    def determine_ARIMA_order(self, valcol):
        stepwise_fit = auto_arima(self.df[valcol], seasonal=True, m=7,
                                  start_p=0, start_q=0,
                                  error_action='ignore',  # we don't want to know if an order does not work
                                  suppress_warnings=True,  # we don't want convergence warnings
                                  stepwise=True)  # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order

    def fit_model(self, valcol):
        train = self.df[valcol][:len(self.df[valcol]) - 60]
        test = self.df[valcol][len(self.df[valcol]) - 60:len(self.df[valcol]) - 30]
        val = self.df[valcol][len(self.df[valcol]) - 30:]

        results = ARIMA(train, order=self.determine_ARIMA_order(valcol)).fit()
        predictions = results.predict(start=len(train), end=len(train) + len(test) - 1)
        predictions_val = results.predict(start=len(train) + len(test), end=len(train) + len(test) + len(val) - 1)
        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
        
        return predictions_val

    def full_data_model(self, valcol):
        results = ARIMA(self.df[valcol], order=self.determine_ARIMA_order(valcol)).fit()
        fcast = results.forecast(steps=30).round(2)  # Forecast 4 steps ahead
#         print(fcast)
#         fcast_index = pd.date_range(start=self.df.index[-1], periods=4 + 1, freq='M')[1:]  # Assuming monthly data
        DF_fcast = pd.DataFrame({valcol: fcast})
        DF_fcast['Type'] = 'Predicted'
        print(DF_fcast)
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
        # Concatenate original data and forecast data
        combined_DF = pd.concat([DF, DF_fcast])
        combined_DF = combined_DF.reset_index().rename(columns={'index':'Date'})
        print(combined_DF)
        DF_val = pd.DataFrame(c2.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index':'Date','predicted_mean':'Validation'})
#         combined_DF = combined_DF.rename(columns={'index':'Date'})
        print(DF_val)
        print(combined_DF)
#         final_DF =  final_DF.merge(DF_val, on='DateTime',how='outer')
        final_DF =  combined_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
#         print(combined_df)
        return final_DF
    
    
c2 = TSA_seasonal(data_daily)
c2.adf_test("TREATED WATER PRODUCTION IN m3/h")
c2.determine_ARIMA_order("TREATED WATER PRODUCTION IN m3/h")
c2.fit_model("TREATED WATER PRODUCTION IN m3/h")
c2.full_data_model("TREATED WATER PRODUCTION IN m3/h")

The best order is (2, 1, 1)
The best order is (2, 1, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


MSE Error:  80451.8808
RMSE Error: 283.6404076
MAPE Error: 0.04552011518
Accuracy: 95.44798848
ADF test statistic       -2.669453
p-value                   0.079475
# lags used              18.000000
# observations          863.000000
critical value (1%)      -3.437950
critical value (5%)      -2.864895
critical value (10%)     -2.568556
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary
The best order is (2, 1, 1)
The best order is (2, 1, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


            TREATED WATER PRODUCTION IN m3/h       Type
2024-05-01                           3349.97  Predicted
2024-05-02                           3340.47  Predicted
2024-05-03                           3334.76  Predicted
2024-05-04                           3332.26  Predicted
2024-05-05                           3331.06  Predicted
2024-05-06                           3330.50  Predicted
2024-05-07                           3330.23  Predicted
2024-05-08                           3330.11  Predicted
2024-05-09                           3330.05  Predicted
2024-05-10                           3330.02  Predicted
2024-05-11                           3330.01  Predicted
2024-05-12                           3330.00  Predicted
2024-05-13                           3330.00  Predicted
2024-05-14                           3330.00  Predicted
2024-05-15                           3330.00  Predicted
2024-05-16                           3330.00  Predicted
2024-05-17                           3330.00  Pr

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


MSE Error:  80451.8808
RMSE Error: 283.6404076
MAPE Error: 0.04552011518
Accuracy: 95.44798848
ADF test statistic       -2.669453
p-value                   0.079475
# lags used              18.000000
# observations          863.000000
critical value (1%)      -3.437950
critical value (5%)      -2.864895
critical value (10%)     -2.568556
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary
The best order is (2, 1, 1)
         Date   Validation
0  2024-04-01  3255.278608
1  2024-04-02  3255.278608
2  2024-04-03  3255.278608
3  2024-04-04  3255.278608
4  2024-04-05  3255.278608
5  2024-04-06  3255.278608
6  2024-04-07  3255.278608
7  2024-04-08  3255.278608
8  2024-04-09  3255.278608
9  2024-04-10  3255.278608
10 2024-04-11  3255.278608
11 2024-04-12  3255.278608
12 2024-04-13  3255.278608
13 2024-04-14  3255.278608
14 2024-04-15  3255.278608
15 2024-04-16  3255.278608
16 2024-04-17  3255.278608
17 2024-04-18  3255.278608

Unnamed: 0,Date,TREATED WATER PRODUCTION IN m3/h,Type,Validation
0,2021-12-01,2060.383913,Actual,
1,2021-12-02,3005.474000,Actual,
2,2021-12-03,1968.085833,Actual,
3,2021-12-04,1960.905417,Actual,
4,2021-12-05,2221.962083,Actual,
...,...,...,...,...
907,2024-05-26,3330.000000,Predicted,
908,2024-05-27,3330.000000,Predicted,
909,2024-05-28,3330.000000,Predicted,
910,2024-05-29,3330.000000,Predicted,


In [42]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
class TSA_seasonal:
    def __init__(self, df):
        self.df = df

    def adf_test(self, valcol):
        """
        Pass in a time series and an optional title, returns an ADF report
        """
        result = adfuller(self.df[valcol].dropna(), autolag='AIC')  # .dropna() handles differenced data

        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        out = pd.Series(result[0:4], index=labels)
       

        for key, val in result[4].items():
            out[f'critical value ({key})'] = val

        if result[1] <= 0.05:
            state = "Stationary"
        else:
            state = "Non-stationary"
        return state
    
#     def find_D(self, valcol):
#         # Estimate number of seasonal differences using a Canova-Hansen test
#         D = nsdiffs(self.df[valcol], m=12, test='ch')
#         return D
    

    def determine_SARIMA_order(self, valcol):
#         D = self.find_D(valcol)
        stepwise_fit = auto_arima(self.df[valcol], seasonal=True, m=7,
                               start_p=0, start_q=0,
#                               start_P=0, start_Q=0,
#                                   D=D
                                  error_action='ignore',  # we don't want to know if an order does not work
                                  suppress_warnings=True,  # we don't want convergence warnings
                                  stepwise=True)  # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        best_seasonal_order = stepwise_fit.get_params().get('seasonal_order')
        print('The best seasonal order is {}'.format(best_seasonal_order))
        print('The best order is {}'.format(best_order))
        return best_order, best_seasonal_order


    def fit_model(self, valcol):
    # Split the data into train, test, and validation sets
        train = self.df[valcol][:len(self.df[valcol]) - 60]
        test = self.df[valcol][len(self.df[valcol]) - 60:len(self.df[valcol]) - 30]
        val = self.df[valcol][len(self.df[valcol]) - 30:]

        # Determine the best SARIMA order
        best_order, best_seasonal_order = self.determine_SARIMA_order(valcol)

        # Fit the SARIMA model on the training data
        model = SARIMAX(train, order=best_order, seasonal_order=best_seasonal_order)
        results = model.fit()

        # Generate predictions for the test set
        predictions = results.predict(start=len(train), end=len(train) + len(test) - 1)

        # Generate predictions for the validation set
        predictions_val = results.predict(start=len(train) + len(test), end=len(train) + len(test) + len(val) - 1)

        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
        return predictions_val

    def full_data_model(self, valcol):
        best_order, best_seasonal_order = self.determine_SARIMA_order(valcol)
        model = SARIMAX(self.df[valcol], order=best_order, seasonal_order=best_seasonal_order)
        results = model.fit()
        fcast = results.forecast(steps=30).round(2)  # Forecast 4 steps ahead
#         print(fcast)
#         fcast_index = pd.date_range(start=self.df.index[-1], periods=4 + 1, freq='M')[1:]  # Assuming monthly data
        DF_fcast = pd.DataFrame({valcol: fcast})
        DF_fcast['Type'] = 'Predicted'
        print(DF_fcast)
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
        # Concatenate original data and forecast data
        combined_DF = pd.concat([DF, DF_fcast])
        combined_DF = combined_DF.reset_index().rename(columns={'index':'Date'})
        print(combined_DF)
        DF_val = pd.DataFrame(c2.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index':'Date','predicted_mean':'Validation'})
#         combined_DF = combined_DF.rename(columns={'index':'Date'})
        print(DF_val)
        print(combined_DF)
#         final_DF =  final_DF.merge(DF_val, on='DateTime',how='outer')
        final_DF =  combined_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
#         print(combined_df)
        return final_DF
    
    
c2 = TSA_seasonal(data_daily)
c2.adf_test("TREATED WATER PRODUCTION IN m3/h")
# c2.find_D('TREATED WATER PRODUCTION IN m3/h')
c2.determine_SARIMA_order("TREATED WATER PRODUCTION IN m3/h")
c2.fit_model("TREATED WATER PRODUCTION IN m3/h")
c2.full_data_model("TREATED WATER PRODUCTION IN m3/h")

The best seasonal order is (0, 0, 0, 7)
The best order is (2, 1, 1)
The best seasonal order is (0, 0, 0, 7)
The best order is (2, 1, 1)
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  7.73239D+00    |proj g|=  7.61453D-02

At iterate    5    f=  7.68513D+00    |proj g|=  3.14932D-02

At iterate   10    f=  7.68217D+00    |proj g|=  2.60888D-04


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
 This problem is unconstrained.



At iterate   15    f=  7.68175D+00    |proj g|=  4.48720D-03

At iterate   20    f=  7.67674D+00    |proj g|=  3.05314D-03

At iterate   25    f=  7.67602D+00    |proj g|=  1.60217D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     25     31      1     0     0   1.602D-05   7.676D+00
  F =   7.6760188253312327     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
MSE Error:  80451.8808
RMSE Error: 283.6404076
MAPE Error: 0.04552011518
Accuracy: 95.44798848
The best seasonal order is (0, 0, 0, 7)
The best order is (2, 1, 1)
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
 This problem is unconstrained.


At iterate   15    f=  7.67033D+00    |proj g|=  9.29679D-04

At iterate   20    f=  7.66763D+00    |proj g|=  1.18688D-02

At iterate   25    f=  7.66425D+00    |proj g|=  3.67601D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     27     33      1     0     0   4.357D-06   7.664D+00
  F =   7.6642513270806862     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
            TREATED WATER PRODUCTION IN m3/h       Type
2024-05-01                           3349.97  Predicted
2024-05-02                           3340.47  Predicted
2024-05-03                           3334.76  Predicted
2024-05-04    

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
 This problem is unconstrained.



At iterate   20    f=  7.67674D+00    |proj g|=  3.05314D-03

At iterate   25    f=  7.67602D+00    |proj g|=  1.60217D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     25     31      1     0     0   1.602D-05   7.676D+00
  F =   7.6760188253312327     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
MSE Error:  80451.8808
RMSE Error: 283.6404076
MAPE Error: 0.04552011518
Accuracy: 95.44798848
         Date   Validation
0  2024-04-01  3255.278608
1  2024-04-02  3255.278608
2  2024-04-03  3255.278608
3  2024-04-04  3255.278608
4  2024-04-05  3255.278608
5  2024-04-06  3255.278608
6  2024-04-07  

Unnamed: 0,Date,TREATED WATER PRODUCTION IN m3/h,Type,Validation
0,2021-12-01,2060.383913,Actual,
1,2021-12-02,3005.474000,Actual,
2,2021-12-03,1968.085833,Actual,
3,2021-12-04,1960.905417,Actual,
4,2021-12-05,2221.962083,Actual,
...,...,...,...,...
907,2024-05-26,3330.000000,Predicted,
908,2024-05-27,3330.000000,Predicted,
909,2024-05-28,3330.000000,Predicted,
910,2024-05-29,3330.000000,Predicted,
