In [None]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima

In [None]:
# Load the dataset

In [None]:
# Read the source CSV and parse the date column while loading
df = pd.read_csv(
    "final_data_in_ML.csv",
    parse_dates=['Standardized_Date'],
)

In [None]:
# Display the loaded frame
df

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Show the dtype of each column
df.dtypes

In [None]:
# Number of rows recorded for each distinct date
df['Standardized_Date'].value_counts()

In [None]:
# Distinct values found in the 'remarks category' column
df['remarks category'].unique()

In [None]:
# Join the date and time columns as text, then parse the result into one timestamp column
df['DATETIME'] = pd.to_datetime(
    df['Standardized_Date'].astype(str).str.cat(df['STANDARDIZED_TIME'].astype(str), sep=' ')
)
df.dtypes

In [None]:
# Keep only the timestamp, the flow measurement and the remarks, indexed by timestamp
df = df[['DATETIME', 'CLEAR WATER PUMPING FLOW ML', 'REMARKS']].set_index('DATETIME')
df

# Time Series Resampling

Daily Resampling

In [None]:
# Sum the numeric columns into one row per calendar day
df_daily = df.resample(rule='D').sum(numeric_only=True)
df_daily

In [None]:
# Line plot of the daily totals
df_daily.plot()

In [None]:
from pylab import rcParams
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Default figure size used by the decomposition plot below
rcParams['figure.figsize'] = (12, 8)

# Additive seasonal decomposition of the daily flow series
decomposition = sm.tsa.seasonal_decompose(
    df_daily['CLEAR WATER PUMPING FLOW ML'],
    model='additive',
)
fig = decomposition.plot()
plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm
class TSA:
    """ARIMA workflow for one column of a time-indexed DataFrame.

    Bundles an Augmented Dickey-Fuller report, an ``auto_arima`` order search,
    a hold-out evaluation and a forecast from a model fitted on every row.
    Each method takes ``valcol``, the name of the column of ``self.df`` to model.
    """

    def __init__(self, df):
        # Frame whose columns are looked up by name in the methods below
        self.df = df

    def adf_test(self, valcol):
        """
        Print an Augmented Dickey-Fuller report for column ``valcol``.

        Missing values are dropped before the test. Returns "Yes" when the
        p-value is at most 0.05 (unit root rejected) and "No" otherwise.
        """
        result = adfuller(self.df[valcol].dropna(), autolag='AIC')  # .dropna() handles differenced data

        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        out = pd.Series(result[0:4], index=labels)

        for key, val in result[4].items():
            out[f'critical value ({key})'] = val

        print(out.to_string())          # .to_string() removes the line "dtype: float64"

        if result[1] <= 0.05:
            print("Strong evidence against the null hypothesis")
            print("Reject the null hypothesis")
            print("Data has no unit root and is stationary")
            return "Yes"

        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")
        return "No"

    def determine_ARIMA_order(self, valcol):
        """Run a stepwise ``auto_arima`` search on column ``valcol``; print and return the chosen order."""
        stepwise_fit = auto_arima(self.df[valcol],
                                  error_action='ignore',   # we don't want to know if an order does not work
                                  suppress_warnings=True,  # we don't want convergence warnings
                                  stepwise=True)           # set to stepwise

        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order

    def fit_model(self, valcol):
        """
        Fit ARIMA on all but the last 6 rows and evaluate it on the held-back rows.

        Rows [-6:-3] form the test window, scored with MSE, RMSE and MAPE
        (printed). Rows [-3:] form the validation window; the rounded
        predictions for that window are returned.
        """
        series = self.df[valcol]
        n_obs = len(series)
        train = series[:n_obs-6]
        test = series[n_obs-6:n_obs-3]
        val = series[n_obs-3:]

        start = len(train)
        end = len(train)+len(test)-1
        print('start : {}'.format(start))
        print('end : {}'.format(end))

        # Resolve the order through this instance rather than a notebook-global object.
        # NOTE(review): the order search runs on the full column, including the
        # held-back rows — confirm that is intended.
        order = self.determine_ARIMA_order(valcol)
        results = ARIMA(train, order=order).fit()

        # One forecast over the test window serves both the metrics and the printed copy
        predictions = results.predict(start=start, end=end)
        predictions_test = predictions.round(2)
        print("predictions_test", predictions_test)
        predictions_val = results.predict(start=end+1, end=len(train)+len(test)+len(val)-1).round(2)

        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test, predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')

        return predictions_val

    def full_data_model(self, valcol):
        """
        Fit ARIMA on every row, forecast 4 steps ahead and print the combined table.

        The printed table stacks the actual values and the forecast (column
        ``Type``) and adds a ``Validation`` column holding the predictions
        returned by ``fit_model``, matched on date.
        """
        order = self.determine_ARIMA_order(valcol)
        results = ARIMA(self.df[valcol], order=order).fit()
        fcast = results.predict(len(self.df), len(self.df)+3).round(2)
        print(results.summary())

        print(fcast)
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
        DF_fcast = pd.DataFrame(fcast)
        DF_fcast['Type'] = 'Predicted'
        DF_fcast = DF_fcast.rename(columns={'predicted_mean': valcol})
        final_DF = pd.concat([DF, DF_fcast])
        final_DF = final_DF.reset_index()

        DF_val = pd.DataFrame(self.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index': 'Date', 'predicted_mean': 'Validation'})
        final_DF = final_DF.rename(columns={'index': 'Date'})
        print(DF_val)
        print(final_DF)
        final_DF = final_DF.merge(DF_val, on='Date', how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
        print(final_DF)
        print(final_DF.tail(60))
        
# Daily ARIMA run: ADF report, order search, hold-out evaluation, then the full-data forecast.
# full_data_model calls determine_ARIMA_order and fit_model itself, so their output is printed again.
c1 = TSA(df_daily)
c1.adf_test("CLEAR WATER PUMPING FLOW ML")
c1.determine_ARIMA_order("CLEAR WATER PUMPING FLOW ML")
c1.fit_model('CLEAR WATER PUMPING FLOW ML')
c1.full_data_model('CLEAR WATER PUMPING FLOW ML')

In [None]:
from prophet import Prophet
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse

class TSA:
    """Prophet workflow for one column of a time-indexed DataFrame.

    Each method takes ``valcol``, the name of the column of ``self.df`` to model.
    """

    def __init__(self, df):
        # Frame whose index supplies the dates and whose columns supply the values
        self.df = df

    def prepare_data_for_prophet(self, valcol):
        """Return a two-column frame in Prophet's layout: 'ds' (from the index) and 'y' (from ``valcol``)."""
        df_prophet = self.df[[valcol]].reset_index()
        df_prophet.columns = ['ds', 'y']
        return df_prophet

    def _holdout_forecast(self, valcol):
        """Fit on all but the last 6 rows, print test metrics, and return the last 3 forecast rows ('ds', 'yhat')."""
        df_prophet = self.prepare_data_for_prophet(valcol)

        # Rows [-6:-3] are scored below; the final 3 rows are the validation window
        train = df_prophet.iloc[:-6]
        test = df_prophet.iloc[-6:-3]

        model = Prophet()
        model.fit(train)

        # Extend 6 periods past the training data.
        # NOTE(review): relies on make_future_dataframe's default frequency
        # matching the spacing of self.df — confirm for non-daily data.
        future = model.make_future_dataframe(periods=6)
        forecast = model.predict(future)
        forecast_test = forecast.iloc[-6:-3]['yhat'].round(2)

        error1 = mean_squared_error(test['y'], forecast_test)
        error2 = rmse(test['y'], forecast_test)
        error3 = mean_absolute_percentage_error(test['y'], forecast_test)
        accuracy = (1 - error3) * 100

        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')

        # Keep 'ds' next to 'yhat' so callers can align the values by date
        return forecast.iloc[-3:][['ds', 'yhat']].round(2)

    def fit_prophet(self, valcol):
        """
        Evaluate Prophet on a hold-out split and return the validation forecast.

        Prints MSE, RMSE, MAPE and accuracy for the test window (rows [-6:-3]).
        Returns the rounded 'yhat' values for the last 3 forecast rows as a
        Series that keeps the forecast frame's positional index.
        """
        return self._holdout_forecast(valcol)['yhat']

    def full_data_prophet(self, valcol):
        """
        Fit Prophet on every row, forecast 4 periods ahead and print the combined table.

        The printed table stacks the actual values and the forecast (column
        ``Type``) and adds a ``Validation`` column holding the hold-out
        forecast, matched on date.
        """
        df_prophet = self.prepare_data_for_prophet(valcol)

        # Fit the model to the full dataset
        model = Prophet()
        model.fit(df_prophet)

        # Forecast the next 4 periods
        future = model.make_future_dataframe(periods=4)
        forecast = model.predict(future)
        forecast = forecast[['ds', 'yhat']].tail(4).round(2)

        print(forecast)

        # Stack actual and predicted values under the same column name
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
        forecast['Type'] = 'Predicted'
        forecast = forecast.rename(columns={'yhat': valcol})
        final_DF = pd.concat([DF, forecast.set_index('ds')])
        # Name the index explicitly so the date column is 'Date' whatever the inputs' index names were
        final_DF = final_DF.rename_axis('Date').reset_index()

        # Take the validation dates from the forecast's own 'ds' column. Resetting the
        # positional index and parsing those integers as dates would yield 1970-epoch
        # timestamps that never match a real date in the merge.
        DF_val = self._holdout_forecast(valcol).rename(columns={'ds': 'Date', 'yhat': 'Validation'})
        DF_val['Date'] = pd.to_datetime(DF_val['Date'])
        final_DF['Date'] = pd.to_datetime(final_DF['Date'])

        final_DF = final_DF.merge(DF_val, on='Date', how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')  # Convert Date to string if needed for final output

        print(final_DF)
        print(final_DF.tail(60))


In [None]:
# Assuming df_daily is your dataframe and 'CLEAR WATER PUMPING FLOW ML' is the target column
# c1 is rebound here to the Prophet-based TSA defined in the preceding cell.
# full_data_prophet repeats the hold-out evaluation, so the metrics are printed twice.
c1 = TSA(df_daily)
c1.fit_prophet('CLEAR WATER PUMPING FLOW ML')
c1.full_data_prophet('CLEAR WATER PUMPING FLOW ML')


Weekly Resampling

In [None]:
# Weekly totals of the flow column, rebuilt as a one-column frame indexed by DATETIME
df_weekly = (
    df_daily.resample('W')['CLEAR WATER PUMPING FLOW ML']
    .sum()
    .reset_index()
    .set_index('DATETIME')
)
df_weekly

In [None]:
# Line plot of the weekly totals
df_weekly.plot()

In [None]:
from pylab import rcParams
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Default figure size used by the decomposition plot below
rcParams['figure.figsize'] = (12, 8)

# Additive seasonal decomposition of the weekly flow series
decomposition = sm.tsa.seasonal_decompose(
    df_weekly['CLEAR WATER PUMPING FLOW ML'],
    model='additive',
)
fig = decomposition.plot()
plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm
class TSA:
    """ARIMA-with-trend workflow for one column of a time-indexed DataFrame.

    Bundles an Augmented Dickey-Fuller report, an ``auto_arima`` order search,
    a hold-out evaluation and a forecast from a model fitted on every row.
    Each method takes ``valcol``, the name of the column of ``self.df`` to model.
    """

    def __init__(self, df):
        # Frame whose columns are looked up by name in the methods below
        self.df = df

    def adf_test(self, valcol):
        """
        Print an Augmented Dickey-Fuller report for column ``valcol``.

        Missing values are dropped before the test. Returns "Yes" when the
        p-value is at most 0.05 (unit root rejected) and "No" otherwise.
        """
        result = adfuller(self.df[valcol].dropna(), autolag='AIC')  # .dropna() handles differenced data

        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        out = pd.Series(result[0:4], index=labels)

        for key, val in result[4].items():
            out[f'critical value ({key})'] = val

        print(out.to_string())          # .to_string() removes the line "dtype: float64"

        if result[1] <= 0.05:
            print("Strong evidence against the null hypothesis")
            print("Reject the null hypothesis")
            print("Data has no unit root and is stationary")
            return "Yes"

        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")
        return "No"

    def determine_ARIMA_order(self, valcol):
        """Run a stepwise ``auto_arima`` search on column ``valcol``; print and return the chosen order."""
        stepwise_fit = auto_arima(self.df[valcol],
                                  error_action='ignore',   # we don't want to know if an order does not work
                                  suppress_warnings=True,  # we don't want convergence warnings
                                  stepwise=True)           # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order

    def fit_model(self, valcol):
        """
        Fit ARIMA (``trend='t'``) on all but the last 8 rows and evaluate it on the held-back rows.

        Rows [-8:-4] form the test window, scored with MSE, RMSE and MAPE
        (printed). Rows [-4:] form the validation window; the rounded
        predictions for that window are returned.
        """
        series = self.df[valcol]
        n_obs = len(series)
        train = series[:n_obs-8]
        test = series[n_obs-8:n_obs-4]
        val = series[n_obs-4:]

        start = len(train)
        end = len(train)+len(test)-1
        print('start : {}'.format(start))
        print('end : {}'.format(end))

        # Resolve the order through this instance rather than a notebook-global object.
        # NOTE(review): the order search runs on the full column, including the
        # held-back rows — confirm that is intended.
        order = self.determine_ARIMA_order(valcol)
        results = ARIMA(train, order=order, trend='t').fit()
        predictions = results.predict(start=start, end=end)
        predictions_val = results.predict(start=end+1, end=len(train)+len(test)+len(val)-1).round(2)

        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test, predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')

        return predictions_val

    def full_data_model(self, valcol):
        """
        Fit ARIMA (``trend='t'``) on every row, forecast 4 steps ahead and print the combined table.

        The printed table stacks the actual values and the forecast (column
        ``Type``) and adds a ``Validation`` column holding the predictions
        returned by ``fit_model``, matched on date.
        """
        order = self.determine_ARIMA_order(valcol)
        results = ARIMA(self.df[valcol], order=order, trend='t').fit()
        fcast = results.predict(len(self.df), len(self.df)+3).round(2)
        print(fcast)
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'
        DF_fcast = pd.DataFrame(fcast)
        DF_fcast['Type'] = 'Predicted'
        DF_fcast = DF_fcast.rename(columns={'predicted_mean': valcol})
        final_DF = pd.concat([DF, DF_fcast])
        final_DF = final_DF.reset_index()

        DF_val = pd.DataFrame(self.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index': 'Date', 'predicted_mean': 'Validation'})
        final_DF = final_DF.rename(columns={'index': 'Date'})
        print(DF_val)
        print(final_DF)
        final_DF = final_DF.merge(DF_val, on='Date', how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
        print(final_DF)
        print(final_DF.tail(50))
        
# Weekly ARIMA run: ADF report, order search, hold-out evaluation, then the full-data forecast.
# full_data_model calls determine_ARIMA_order and fit_model itself, so their output is printed again.
c1 = TSA(df_weekly)
c1.adf_test("CLEAR WATER PUMPING FLOW ML")
c1.determine_ARIMA_order("CLEAR WATER PUMPING FLOW ML")
c1.fit_model('CLEAR WATER PUMPING FLOW ML')
c1.full_data_model('CLEAR WATER PUMPING FLOW ML')

Monthly Resampling

In [None]:
# Monthly totals of the flow column, rebuilt as a one-column frame indexed by DATETIME.
# NOTE(review): newer pandas releases deprecate the 'M' alias in favour of 'ME' —
# confirm against the installed version.
df_monthly = (
    df.resample('M')['CLEAR WATER PUMPING FLOW ML']
    .sum()
    .reset_index()
    .set_index('DATETIME')
)
df_monthly

In [None]:
# Line plot of the monthly totals
df_monthly.plot()

In [None]:
import matplotlib.pyplot as plt

# Store a 6-month rolling mean of the monthly totals as the 'Trend' column
df_monthly['Trend'] = (
    df_monthly['CLEAR WATER PUMPING FLOW ML'].rolling(window=6).mean()
)

fig, ax = plt.subplots(figsize=(10, 6))

# Draw the raw totals first, then the smoothed trend, on the same axes
for column, label, color in (
    ('CLEAR WATER PUMPING FLOW ML', 'Monthly Flow', 'blue'),
    ('Trend', 'Trend (6-month Rolling Avg)', 'orange'),
):
    df_monthly[column].plot(ax=ax, label=label, color=color)

# Title, axis labels and legend
ax.set_title('Monthly CLEAR WATER PUMPING FLOW ML with Trend')
ax.set_xlabel('Date')
ax.set_ylabel('CLEAR WATER PUMPING FLOW ML')
ax.legend()

plt.show()


In [None]:
from pylab import rcParams
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Default figure size used by the decomposition plot below
rcParams['figure.figsize'] = (12, 8)

# Additive seasonal decomposition of the monthly flow series
decomposition = sm.tsa.seasonal_decompose(
    df_monthly['CLEAR WATER PUMPING FLOW ML'],
    model='additive',
)
fig = decomposition.plot()
plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm
class TSA:
    """ARIMA-with-trend workflow for one column of a time-indexed DataFrame.

    Bundles an Augmented Dickey-Fuller report, an ``auto_arima`` order search,
    a hold-out evaluation and a forecast from a model fitted on every row.
    Each method takes ``valcol``, the name of the column of ``self.df`` to model.
    """

    def __init__(self, df):
        # Frame whose columns are looked up by name in the methods below
        self.df = df

    def adf_test(self, valcol):
        """
        Print an Augmented Dickey-Fuller report for column ``valcol``.

        Missing values are dropped before the test. Returns "Yes" when the
        p-value is at most 0.05 (unit root rejected) and "No" otherwise.
        """
        result = adfuller(self.df[valcol].dropna(), autolag='AIC')  # .dropna() handles differenced data

        labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
        out = pd.Series(result[0:4], index=labels)

        for key, val in result[4].items():
            out[f'critical value ({key})'] = val

        print(out.to_string())          # .to_string() removes the line "dtype: float64"

        if result[1] <= 0.05:
            print("Strong evidence against the null hypothesis")
            print("Reject the null hypothesis")
            print("Data has no unit root and is stationary")
            return "Yes"

        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")
        return "No"

    def determine_ARIMA_order(self, valcol):
        """Run a stepwise ``auto_arima`` search on column ``valcol``; print and return the chosen order."""
        stepwise_fit = auto_arima(self.df[valcol],
                                  error_action='ignore',   # we don't want to know if an order does not work
                                  suppress_warnings=True,  # we don't want convergence warnings
                                  stepwise=True)           # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order

    def fit_model(self, valcol):
        """
        Fit ARIMA (``trend='t'``) on all but the last 8 rows and evaluate it on the held-back rows.

        Rows [-8:-4] form the test window, scored with MSE, RMSE and MAPE
        (printed). Rows [-4:] form the validation window; the rounded
        predictions for that window are returned.
        """
        series = self.df[valcol]
        n_obs = len(series)
        train = series[:n_obs-8]
        test = series[n_obs-8:n_obs-4]
        val = series[n_obs-4:]

        start = len(train)
        end = len(train)+len(test)-1
        print('start : {}'.format(start))
        print('end : {}'.format(end))

        # Resolve the order through this instance rather than a notebook-global object.
        # NOTE(review): the order search runs on the full column, including the
        # held-back rows — confirm that is intended.
        order = self.determine_ARIMA_order(valcol)
        results = ARIMA(train, order=order, trend='t').fit()
        predictions = results.predict(start=start, end=end)
        predictions_val = results.predict(start=end+1, end=len(train)+len(test)+len(val)-1).round(2)

        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test, predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')

        return predictions_val

    def full_data_model(self, valcol):
        """
        Fit ARIMA (``trend='t'``) on every row, forecast 4 steps ahead and print the combined table.

        The printed table stacks the actual values and the forecast (column
        ``Type``) and adds a ``Validation`` column holding the predictions
        returned by ``fit_model``, matched on date.
        """
        order = self.determine_ARIMA_order(valcol)
        results = ARIMA(self.df[valcol], order=order, trend='t').fit()
        fcast = results.predict(len(self.df), len(self.df)+3).round(2)
        print(fcast)
        DF = pd.DataFrame(self.df[valcol])
        DF['Type'] = 'Actual'

        DF_fcast = pd.DataFrame(fcast)
        DF_fcast['Type'] = 'Predicted'
        DF_fcast = DF_fcast.rename(columns={'predicted_mean': valcol})
        final_DF = pd.concat([DF, DF_fcast])
        final_DF = final_DF.reset_index()

        DF_val = pd.DataFrame(self.fit_model(valcol))
        DF_val = DF_val.reset_index()
        DF_val = DF_val.rename(columns={'index': 'Date', 'predicted_mean': 'Validation'})
        final_DF = final_DF.rename(columns={'index': 'Date'})
        print(DF_val)
        print(final_DF)
        final_DF = final_DF.merge(DF_val, on='Date', how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
        print(final_DF)
        print(final_DF.tail(50))
        
# Monthly ARIMA run: ADF report, order search, hold-out evaluation, then the full-data forecast.
# full_data_model calls determine_ARIMA_order and fit_model itself, so their output is printed again.
c1 = TSA(df_monthly)
c1.adf_test("CLEAR WATER PUMPING FLOW ML")
c1.determine_ARIMA_order("CLEAR WATER PUMPING FLOW ML")
c1.fit_model('CLEAR WATER PUMPING FLOW ML')
c1.full_data_model('CLEAR WATER PUMPING FLOW ML')