In [None]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima

In [None]:
# Load the dataset

In [None]:
# Read the raw CSV; 'Standardized_Date' is parsed to datetime while loading.
df = pd.read_csv(
    "final_data_in_ML.csv",
    parse_dates=['Standardized_Date'],
)
df

In [None]:
# Build one timestamp per row: render the date and time columns as text,
# join them with a space, and let pandas parse the combined string.
date_text = df['Standardized_Date'].astype(str)
time_text = df['STANDARDIZED_TIME'].astype(str)
df['DATETIME'] = pd.to_datetime(date_text + ' ' + time_text)
df.dtypes

In [None]:
# Keep only the timestamp, the pumping-flow measurement and the remarks,
# then use the timestamp as the index so the frame can be resampled by time.
keep_cols = ['DATETIME', 'CLEAR WATER PUMPING FLOW ML', 'REMARKS']
df = df[keep_cols].set_index('DATETIME')
df

In [None]:
# Sum the numeric columns per resampling bin.
# NOTE(review): despite the `df_daily` name, rule 'W' produces weekly bins;
# the name is kept because every later cell refers to it.
df_daily = df.resample(rule='W').sum(numeric_only=True)
df_daily

In [None]:
# Slice the resampled data to calendar year 2022 and plot it.
df_2022 = df_daily.loc['2022']
df_2022.plot()

In [None]:
# Slice the resampled data to calendar year 2023 and plot it.
df_2023 = df_daily.loc['2023']
df_2023.plot()

In [None]:
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller test on the resampled flow series; NaNs are removed
# first because adfuller is given only the non-missing observations.
flow = df_daily['CLEAR WATER PUMPING FLOW ML'].dropna()
result = adfuller(flow, autolag='AIC')

# The first four entries of the result are reported under these labels; the
# fifth entry is a mapping of critical values, appended one row per level.
labels = ['Adf_daily test statistic', 'p-value', '# lags used', '# observations']
out = pd.Series(result[0:4], index=labels)
for level, threshold in result[4].items():
    out[f'critical value ({level})'] = threshold

# .to_string() drops the trailing "dtype: float64" line from the printout.
print(out.to_string())

# Interpret the p-value at the 5% level and remember the verdict in `state`.
is_stationary = result[1] <= 0.05
if is_stationary:
    state = "Stationary"
    verdict_lines = (
        "Strong evidence against the null hypothesis",
        "Reject the null hypothesis",
        "Data has no unit root and is stationary",
    )
else:
    state = "Non-stationary"
    verdict_lines = (
        "Weak evidence against the null hypothesis",
        "Fail to reject the null hypothesis",
        "Data has a unit root and is non-stationary",
    )
for verdict_line in verdict_lines:
    print(verdict_line)

In [None]:
from statsmodels.tsa.stattools import adfuller

def find_order_of_differencing(series, max_d=2):
    """Find the smallest differencing order ``d`` that makes ``series`` stationary.

    Stationarity is judged with the Augmented Dickey-Fuller test
    (``adfuller(..., autolag='AIC')``) at the 5% level: a p-value ``<= 0.05``
    counts as stationary.

    Parameters
    ----------
    series : pandas.Series
        The time series to test. Missing values are dropped before testing.
    max_d : int, default 2
        Highest differencing order to try.

    Returns
    -------
    int
        ``0`` if the undifferenced series is already stationary, otherwise the
        first order ``d`` in ``1..max_d`` whose differenced series passes the
        test. If none passes, the last order attempted is returned.
    """
    d = 0  # Start with no differencing
    current = series.dropna()

    # Test the undifferenced series first.
    result = adfuller(current, autolag='AIC')
    if result[1] <= 0.05:
        print(f"Data is already stationary at d={d}")
        return d

    while d < max_d:
        d += 1
        # d-th order differencing means taking a first difference d times.
        # `series.diff(d)` would instead compute the lag-d difference
        # x[t] - x[t-d], which is a different series for d >= 2.
        current = current.diff().dropna()

        result_diff = adfuller(current, autolag='AIC')
        if result_diff[1] <= 0.05:
            print(f"Data is stationary after {d} order differencing (d={d})")
            return d
        print(f"Data is still non-stationary at d={d}")

    print(f"Max differencing reached (d={max_d}) without stationarity")
    return d  # Return the maximum d attempted if no stationarity is found

# Apply the helper to the resampled flow column and report the chosen order.
series = df_daily['CLEAR WATER PUMPING FLOW ML']
d_value = find_order_of_differencing(series)
print(f"Optimal value of d: {d_value}")


In [None]:
# import itertools
# from statsmodels.tsa.arima.model import ARIMA

# # Define the range of p, d, q values to try
# p = range(0, 5)  # Try values from 0 to 4 for p
# d = [d_value]    # Use the differencing order found previously
# q = range(0, 5)  # Try values from 0 to 4 for q

# # Generate all possible combinations of p, d, q values
# pdq_combinations = list(itertools.product(p, d, q))

# # Search for the best ARIMA model based on AIC
# best_aic = float('inf')
# best_order = None

# for param in pdq_combinations:
#     try:
#         model = ARIMA(df_daily['CLEAR WATER PUMPING FLOW ML'], order=param)
#         model_fit = model.fit()
#         aic = model_fit.aic
        
#         if aic < best_aic:
#             best_aic = aic
#             best_order = param
#     except:
#         continue

# print(f'Best ARIMA order based on AIC: {best_order} with AIC={best_aic}')


In [None]:
# import pmdarima as pm
# model = pm.auto_arima(df_daily['CLEAR WATER PUMPING FLOW ML'],seasonal=True,
#                           start_p=0, start_q=0,max_order=4,test='adf',trace=True,
#                           error_action='ignore',   # we don't want to know if an order does not work
#                           suppress_warnings=True,  # we don't want convergence warnings
#                           stepwise=True)           # set to stepwise
       
# best_order = stepwise_fit.get_params().get('order')
# print('The best order is {}'.format(best_order))
# return best_order

In [None]:
import pmdarima as pm

# Run the stepwise auto_arima search once per candidate seasonal period m.
# NOTE(review): `model` is rebound on every pass, so after the loop it holds
# the fit for the last period tried (m=52), not necessarily the best one;
# later cells reuse that object.
search_options = dict(
    seasonal=True,
    start_p=0,
    start_q=0,
    max_order=4,
    test='adf',
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True,
)
for m_value in (7, 12, 52):
    print(f"Trying m = {m_value}")
    model = pm.auto_arima(
        df_daily['CLEAR WATER PUMPING FLOW ML'],
        m=m_value,
        **search_options,
    )
    print(model.summary())


In [None]:
# Daily
# ARIMA(2,0,0)(2,0,2)[7] intercept   : AIC=7327.873, Time=4.57 sec
# ARIMA(1,0,1)(2,0,0)[12] intercept   : AIC=7342.371, Time=7.03 sec
# ARIMA(3,0,1)(1,0,0)[52]             : AIC=7303.866, Time=14.97 sec

# Monthly
# ARIMA(2,2,1)(0,0,0)[7]             : AIC=417.601, Time=0.18 sec
# ARIMA(2,2,0)(1,0,0)[12]             : AIC=415.457, Time=0.08 sec

#Weekly
# ARIMA(0,1,1)(0,0,0)[7]             : AIC=1540.688, Time=0.05 sec
# ARIMA(0,1,1)(0,0,0)[12]             : AIC=1540.688, Time=0.06 sec
# ARIMA(0,1,1)(0,0,0)[52]             : AIC=1540.688, Time=0.04 sec

In [None]:
# Best model:  ARIMA(2,0,0)(2,0,2)[7] intercept
# Best model:  ARIMA(1,0,1)(2,0,0)[12] intercept

In [None]:
# Show the summary of whatever `model` currently holds, i.e. the fit left
# behind by the last iteration of the seasonal-period search loop.
model.summary() 

In [None]:
# train = df_daily['CLEAR WATER PUMPING FLOW ML'][:len(df_daily['CLEAR WATER PUMPING FLOW ML'])-4]
# test = df_daily['CLEAR WATER PUMPING FLOW ML'][len(df_daily['CLEAR WATER PUMPING FLOW ML'])-4:]

In [None]:
# train.shape

In [None]:
# test.shape

In [None]:
# model.fit(train)

In [None]:
# forecast=model.predict(n_periods=4)
# forecast

In [None]:
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse

# Hold out the last N_TEST_PERIODS rows of the resampled flow series for
# evaluation. The hold-out size lives in one place so the split and the
# forecast horizon below cannot drift apart.
N_TEST_PERIODS = 4
flow_series = df_daily['CLEAR WATER PUMPING FLOW ML']
split_at = len(flow_series) - N_TEST_PERIODS
train = flow_series[:split_at]
test = flow_series[split_at:]
# val = df_daily['CLEAR WATER PUMPING FLOW ML'][len(df_daily['CLEAR WATER PUMPING FLOW ML'])-2:]

# Positional bounds of the test window (used by the statsmodels-style
# predict call kept as a commented alternative below).
start = len(train)
end = len(train)+len(test)-1
print("test",test)

# Refit the previously selected `model` on the training portion only.
# results = ARIMA(train,order=(3,2,3)).fit()
results = model.fit(train)
# predictions = results.predict(start=start, end=end).round(2)
# Forecast exactly as many periods as were held out.
predictions = results.predict(n_periods=len(test))
print("predictions",predictions)
# predictions_val = results.predict(start=end+1, end=len(train)+len(test)+len(val)-1).round(2)
# predictions_val = results.predict(n_periods=2)
# print("val",val)
# print("predictions_val",predictions_val)


# Error metrics on the hold-out window; "accuracy" is defined here as
# (1 - MAPE) expressed in percent.
error1 = mean_squared_error(test, predictions)
error2 = rmse(test, predictions)
error3 = mean_absolute_percentage_error(test,predictions)
accuracy = (1-error3)*100
print(f'MSE Error: {error1:11.10}')
print(f'RMSE Error: {error2:11.10}')
print(f'MAPE Error: {error3:11.10}')
print(f'Accuracy: {accuracy:11.10}')

In [None]:
# Accuracy: 95.66054463
# Accuracy:  96.3790015

In [None]:
valcol = 'CLEAR WATER PUMPING FLOW ML'
# results = ARIMA(df_daily[valcol],order=(3,2,3)).fit()
# Refit on the target series itself (as the evaluation cell does with `train`)
# rather than on the whole frame.
results = model.fit(df_daily[valcol])
# fcast = results.predict(len(df_daily), len(df_daily)+3).round(2)
fcast = results.predict(n_periods=4)

# Observed values, tagged so they can be told apart after concatenation.
DF = pd.DataFrame(df_daily[valcol])
DF['Type'] = 'Actual'

# Name the forecast after the value column explicitly. The previous rename of
# 'predicted_mean' only matched statsmodels output; with any other name the
# forecast landed in a separate column and the value column stayed NaN for
# the predicted rows.
DF_fcast = pd.Series(fcast, name=valcol).to_frame()
DF_fcast['Type'] = 'Predicted'

# Stack actuals and forecast, then expose the time index as a 'Date' column
# regardless of what the two indexes were originally called.
final_DF = pd.concat([DF, DF_fcast])
final_DF = final_DF.rename_axis('Date').reset_index()

# DF_val = pd.DataFrame(predictions_val)
# DF_val = DF_val.reset_index()
# DF_val = DF_val.rename(columns={'index':'Date','predicted_mean':'Validation'})
# final_DF =  final_DF.merge(DF_val, on='Date',how='outer')
print(final_DF.tail(60))

In [None]:
# Predicted  65.737006
# 1003 2024-08-30                          NaN  Predicted  64.925104
# 1004 2024-08-31                          NaN  Predicted  63.479085
# 1005 2024-09-01                          NaN  Predicted  66.418586

# 1002 2024-08-29                          NaN  Predicted  65.227239
# 1003 2024-08-30                          NaN  Predicted  64.645529
# 1004 2024-08-31                          NaN  Predicted  63.547374
# 1005 2024-09-01                          NaN  Predicted  62.155024
