In [None]:
import pandas as pd
# Load the raw readings, build a single timestamp from the date and time
# columns, keep only the columns used below and index the frame by timestamp.
df = (
    pd.read_csv("final_data_in_ML.csv", parse_dates=['Standardized_Date'])
    .assign(
        DATETIME=lambda raw: pd.to_datetime(
            raw['Standardized_Date'].astype(str) + ' ' + raw['STANDARDIZED_TIME'].astype(str)
        )
    )
    [['DATETIME', 'CLEAR WATER PUMPING FLOW ML', 'remarks category']]
    .set_index('DATETIME')
)
df

In [None]:
# Aggregate to one row per calendar day by summing the numeric columns
# (numeric_only=True leaves non-numeric columns out of the result).
df_daily = df.resample('D').sum(numeric_only=True)
df_daily

# Replace low flows using a 30-day rolling mean

In [None]:
# Calculate the 30-day rolling mean for 'CLEAR WATER PUMPING FLOW ML'.
# min_periods=1 lets the first rows average over however many days are available,
# so the new column has no leading NaN values.
df_daily['Rolling_Mean'] = df_daily['CLEAR WATER PUMPING FLOW ML'].rolling(window=30, min_periods=1).mean()
df_daily

In [None]:
# Replace values below 40 with the 30-day rolling mean (vectorized: rows where
# the condition holds take the value from 'Rolling_Mean', all others are kept).
df_daily['CLEAR WATER PUMPING FLOW ML'] = df_daily['CLEAR WATER PUMPING FLOW ML'].mask(
    df_daily['CLEAR WATER PUMPING FLOW ML'] < 40,
    df_daily['Rolling_Mean'],
)


In [None]:
# Inspect the rows for August 2024 (a 'YYYY-MM' label on the datetime index selects the whole month).
df_daily.loc['2024-08']

In [None]:
# Inspect the rows for July 2022 (a 'YYYY-MM' label on the datetime index selects the whole month).
df_daily.loc['2022-07']

In [None]:
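# Reference values (date, daily flow): July 2022 days whose flow is below 40.
# Presumably copied from df_daily before the replacement step; kept as comments
# so that the cell is valid Python.
# 2022-07-04	33.713360
# 2022-07-06	36.777820
# 2022-07-07	7.301890
# 2022-07-08	28.993830
# 2022-07-14	39.644030
# 2022-07-15	37.129840
# 2022-07-16	32.285300
# 2022-07-18	33.522280
# 2022-07-19	37.071320
# 2022-07-20	30.800720
# 2022-07-21	22.001960
# 2022-07-27	39.910050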

In [None]:
from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse

class TSA:
    """ARIMA helper for one DataFrame of time-indexed observations.

    Bundles a stationarity check, automatic order selection, a hold-out
    evaluation and a short forecast for a chosen value column of ``df``.
    Every method works on the instance's own data through ``self``, so the
    class does not depend on the name of the variable an instance is bound to.
    """

    HOLDOUT_ROWS = 30    # size of the test window and of the validation window
    FORECAST_STEPS = 4   # number of future points produced by full_data_model

    def __init__(self,df):
        self.df = df

    def adf_test(self,valcol):
        """Run the Augmented Dickey-Fuller test on ``self.df[valcol]`` and print a report.

        Parameters
        ----------
        valcol : str
            Name of the column to test. Missing values are dropped first.

        Returns
        -------
        str
            "Yes" when the p-value is <= 0.05 (reported as stationary),
            otherwise "No".
        """
        result = adfuller(self.df[valcol].dropna(),autolag='AIC') # .dropna() handles differenced data

        labels = ['ADF test statistic','p-value','# lags used','# observations']
        out = pd.Series(result[0:4],index=labels)

        for key,val in result[4].items():
            out[f'critical value ({key})']=val

        print(out.to_string())          # .to_string() removes the line "dtype: float64"

        if result[1] <= 0.05:
            print("Strong evidence against the null hypothesis")
            print("Reject the null hypothesis")
            print("Data has no unit root and is stationary")
            return "Yes"

        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")
        return "No"

    def determine_ARIMA_order(self,valcol):
        """Search for an ARIMA order with ``auto_arima`` on the whole column.

        Prints and returns the ``order`` entry of the selected model's parameters.
        """
        stepwise_fit = auto_arima(self.df[valcol], start_p=0, start_q=0,
                          error_action='ignore',   # we don't want to know if an order does not work
                          suppress_warnings=True,  # we don't want convergence warnings
                          stepwise=True)           # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order

    def fit_model(self,valcol):
        """Fit ARIMA on the training rows, score it on the test rows, predict the validation rows.

        The column is split by position: everything except the last 60 rows is
        the training set, the next 30 rows are the test set and the final 30
        rows are the validation set. MSE, RMSE, MAPE and ``(1 - MAPE) * 100``
        are printed for the test window.

        NOTE: the order comes from ``determine_ARIMA_order``, which searches on
        the full column, so the hold-out rows influence order selection.

        Returns
        -------
        The model's predictions for the validation window (last 30 rows).

        Raises
        ------
        ValueError
            If the column has 60 rows or fewer, leaving nothing to train on.
        """
        series = self.df[valcol]
        n_obs = len(series)
        held_out = 2 * self.HOLDOUT_ROWS
        if n_obs <= held_out:
            raise ValueError(
                'fit_model needs more than {} rows, got {}'.format(held_out, n_obs)
            )

        train = series[:n_obs - held_out]
        test = series[n_obs - held_out:n_obs - self.HOLDOUT_ROWS]

        # Positions (relative to the start of train) covered by the test window.
        start = len(train)
        end = start + len(test) - 1
        print('start : {}'.format(start))
        print('end : {}'.format(end))

        # Use this instance's data for the order search, not a global object.
        results = ARIMA(train,order=self.determine_ARIMA_order(valcol)).fit()
        predictions = results.predict(start=start, end=end)
        # The validation window runs from just after the test window to the last row.
        predictions_val = results.predict(start=end+1, end=n_obs-1)

        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
        return predictions_val

    def full_data_model(self,valcol):
        """Fit ARIMA on the whole column, forecast 4 steps ahead and assemble a result table.

        The table stacks the observed values (``Type == 'Actual'``) and the
        forecast (``Type == 'Predicted'``) and is outer-merged on ``Date`` with
        the validation predictions returned by ``fit_model`` (column
        ``Validation``). ``Date`` is converted to ``str`` at the end.

        Returns
        -------
        pandas.DataFrame
            The combined table (also printed).
        """
        series = self.df[valcol]
        results = ARIMA(series,order=self.determine_ARIMA_order(valcol)).fit()

        # Out-of-sample forecast starting right after the last observation.
        # No `typ` argument: the results object of statsmodels.tsa.arima.model
        # has no such parameter and already predicts on the original scale.
        first_step = len(self.df)
        fcast = results.predict(first_step, first_step + self.FORECAST_STEPS - 1).round(2)
        print(fcast)

        actual = series.to_frame().assign(Type='Actual')
        predicted = fcast.rename(valcol).to_frame().assign(Type='Predicted')
        # Name the index explicitly so the column is 'Date' whatever the index was called.
        final_DF = pd.concat([actual, predicted]).rename_axis('Date').reset_index()

        DF_val = self.fit_model(valcol).rename('Validation').rename_axis('Date').reset_index()
        print(DF_val)
        print(final_DF)

        final_DF =  final_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
        print(final_DF)
        return final_DF

# Run the workflow on the daily series: stationarity check, order search,
# hold-out evaluation, then the combined forecast table.
# Note: fit_model and full_data_model each repeat the order search internally, and
# full_data_model also calls fit_model, so the standalone calls below repeat work and output.
c1 = TSA(df_daily)
c1.adf_test("CLEAR WATER PUMPING FLOW ML")
c1.determine_ARIMA_order("CLEAR WATER PUMPING FLOW ML")
c1.fit_model("CLEAR WATER PUMPING FLOW ML")
c1.full_data_model("CLEAR WATER PUMPING FLOW ML")

# Replace low flows using the calendar-month mean

In [None]:
# Extract the year and month for grouping:
# to_period('M') maps every daily timestamp to its calendar month.
df_daily['YearMonth'] = df_daily.index.to_period('M')
df_daily

In [None]:
# Calculate monthly means only for 'CLEAR WATER PUMPING FLOW ML'.
# transform('mean') broadcasts each month's mean back onto every row of that month,
# so the result has the same index as df_daily.
monthly_means = df_daily.groupby('YearMonth')['CLEAR WATER PUMPING FLOW ML'].transform('mean')
monthly_means

In [None]:
# Replace values below 40 with the mean of the same calendar month (vectorized:
# monthly_means is aligned to df_daily's index, so mask picks the matching row).
df_daily['CLEAR WATER PUMPING FLOW ML'] = df_daily['CLEAR WATER PUMPING FLOW ML'].mask(
    df_daily['CLEAR WATER PUMPING FLOW ML'] < 40,
    monthly_means,
)
df_daily

In [None]:
# Inspect July 2022 again after the replacement step above.
df_daily.loc['2022-07']

In [None]:
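# Reference values (date, daily flow): July 2022 days whose flow is below 40.
# Presumably copied from df_daily before the replacement step; kept as comments
# so that the cell is valid Python.
# 2022-07-04	33.713360
# 2022-07-06	36.777820
# 2022-07-07	7.301890
# 2022-07-08	28.993830
# 2022-07-14	39.644030
# 2022-07-15	37.129840
# 2022-07-16	32.285300
# 2022-07-18	33.522280
# 2022-07-19	37.071320
# 2022-07-20	30.800720
# 2022-07-21	22.001960
# 2022-07-27	39.910050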

In [None]:
# List the days whose flow is still below 40 after the replacement steps.
df_daily[df_daily['CLEAR WATER PUMPING FLOW ML']<40]

In [None]:
# mean_value = df_daily['CLEAR WATER PUMPING FLOW ML'].mean()
# # Replace values less than 40 in 'CLEAR WATER PUMPING FLOW ML' with the calculated mean
# df_daily['CLEAR WATER PUMPING FLOW ML'] = df_daily['CLEAR WATER PUMPING FLOW ML'].apply(lambda x: mean_value if x < 40 else x)
# df_daily

In [None]:
from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA,ARIMAResults
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse

class TSA:
    """ARIMA helper for one DataFrame of time-indexed observations.

    Bundles a stationarity check, automatic order selection, a hold-out
    evaluation and a short forecast for a chosen value column of ``df``.
    Every method works on the instance's own data through ``self``, so the
    class does not depend on the name of the variable an instance is bound to.
    """

    HOLDOUT_ROWS = 30    # size of the test window and of the validation window
    FORECAST_STEPS = 4   # number of future points produced by full_data_model

    def __init__(self,df):
        self.df = df

    def adf_test(self,valcol):
        """Run the Augmented Dickey-Fuller test on ``self.df[valcol]`` and print a report.

        Parameters
        ----------
        valcol : str
            Name of the column to test. Missing values are dropped first.

        Returns
        -------
        str
            "Yes" when the p-value is <= 0.05 (reported as stationary),
            otherwise "No".
        """
        result = adfuller(self.df[valcol].dropna(),autolag='AIC') # .dropna() handles differenced data

        labels = ['ADF test statistic','p-value','# lags used','# observations']
        out = pd.Series(result[0:4],index=labels)

        for key,val in result[4].items():
            out[f'critical value ({key})']=val

        print(out.to_string())          # .to_string() removes the line "dtype: float64"

        if result[1] <= 0.05:
            print("Strong evidence against the null hypothesis")
            print("Reject the null hypothesis")
            print("Data has no unit root and is stationary")
            return "Yes"

        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")
        return "No"

    def determine_ARIMA_order(self,valcol):
        """Search for an ARIMA order with ``auto_arima`` on the whole column.

        Prints and returns the ``order`` entry of the selected model's parameters.
        """
        stepwise_fit = auto_arima(self.df[valcol], start_p=0, start_q=0,
                          error_action='ignore',   # we don't want to know if an order does not work
                          suppress_warnings=True,  # we don't want convergence warnings
                          stepwise=True)           # set to stepwise
        best_order = stepwise_fit.get_params().get('order')
        print('The best order is {}'.format(best_order))
        return best_order

    def fit_model(self,valcol):
        """Fit ARIMA on the training rows, score it on the test rows, predict the validation rows.

        The column is split by position: everything except the last 60 rows is
        the training set, the next 30 rows are the test set and the final 30
        rows are the validation set. MSE, RMSE, MAPE and ``(1 - MAPE) * 100``
        are printed for the test window.

        NOTE: the order comes from ``determine_ARIMA_order``, which searches on
        the full column, so the hold-out rows influence order selection.

        Returns
        -------
        The model's predictions for the validation window (last 30 rows).

        Raises
        ------
        ValueError
            If the column has 60 rows or fewer, leaving nothing to train on.
        """
        series = self.df[valcol]
        n_obs = len(series)
        held_out = 2 * self.HOLDOUT_ROWS
        if n_obs <= held_out:
            raise ValueError(
                'fit_model needs more than {} rows, got {}'.format(held_out, n_obs)
            )

        train = series[:n_obs - held_out]
        test = series[n_obs - held_out:n_obs - self.HOLDOUT_ROWS]

        # Positions (relative to the start of train) covered by the test window.
        start = len(train)
        end = start + len(test) - 1
        print('start : {}'.format(start))
        print('end : {}'.format(end))

        # Use this instance's data for the order search, not a global object.
        results = ARIMA(train,order=self.determine_ARIMA_order(valcol)).fit()
        predictions = results.predict(start=start, end=end)
        # The validation window runs from just after the test window to the last row.
        predictions_val = results.predict(start=end+1, end=n_obs-1)

        error1 = mean_squared_error(test, predictions)
        error2 = rmse(test, predictions)
        error3 = mean_absolute_percentage_error(test,predictions)
        accuracy = (1-error3)*100
        print(f'MSE Error: {error1:11.10}')
        print(f'RMSE Error: {error2:11.10}')
        print(f'MAPE Error: {error3:11.10}')
        print(f'Accuracy: {accuracy:11.10}')
        return predictions_val

    def full_data_model(self,valcol):
        """Fit ARIMA on the whole column, forecast 4 steps ahead and assemble a result table.

        The table stacks the observed values (``Type == 'Actual'``) and the
        forecast (``Type == 'Predicted'``) and is outer-merged on ``Date`` with
        the validation predictions returned by ``fit_model`` (column
        ``Validation``). ``Date`` is converted to ``str`` at the end.

        Returns
        -------
        pandas.DataFrame
            The combined table (also printed).
        """
        series = self.df[valcol]
        results = ARIMA(series,order=self.determine_ARIMA_order(valcol)).fit()

        # Out-of-sample forecast starting right after the last observation.
        # No `typ` argument: the results object of statsmodels.tsa.arima.model
        # has no such parameter and already predicts on the original scale.
        first_step = len(self.df)
        fcast = results.predict(first_step, first_step + self.FORECAST_STEPS - 1).round(2)
        print(fcast)

        actual = series.to_frame().assign(Type='Actual')
        predicted = fcast.rename(valcol).to_frame().assign(Type='Predicted')
        # Name the index explicitly so the column is 'Date' whatever the index was called.
        final_DF = pd.concat([actual, predicted]).rename_axis('Date').reset_index()

        DF_val = self.fit_model(valcol).rename('Validation').rename_axis('Date').reset_index()
        print(DF_val)
        print(final_DF)

        final_DF =  final_DF.merge(DF_val, on='Date',how='outer')
        final_DF['Date'] = final_DF['Date'].astype('str')
        print(final_DF)
        return final_DF

# Run the workflow again on the daily series after the monthly-mean replacement:
# stationarity check, order search, hold-out evaluation, then the combined forecast table.
# Note: fit_model and full_data_model each repeat the order search internally, and
# full_data_model also calls fit_model, so the standalone calls below repeat work and output.
c1 = TSA(df_daily)
c1.adf_test("CLEAR WATER PUMPING FLOW ML")
c1.determine_ARIMA_order("CLEAR WATER PUMPING FLOW ML")
c1.fit_model("CLEAR WATER PUMPING FLOW ML")
c1.full_data_model("CLEAR WATER PUMPING FLOW ML")

In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
import pmdarima as pm
import seaborn as sns

def forecast_with_arima(df, column, freq, forecast_periods,m):
    """Resample ``df``, evaluate a seasonal auto-ARIMA on a hold-out and forecast ahead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime index; resampled with ``freq`` by summing numeric columns.
    column : str
        Column to model.
    freq : str
        Pandas resampling frequency (e.g. 'D', 'W', 'M').
    forecast_periods : int
        Size of the hold-out used for the printed metrics and number of
        future periods forecast after refitting on all data.
    m : int
        Seasonal period passed to ``pm.auto_arima``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``column`` and ``Type`` ('Actual' for the resampled
        observations, 'Predicted' for the forecast rows).
    """
    # Resample the data
    series = df.resample(freq).sum(numeric_only=True)[column]

    # Split data into training and testing
    train = series[:-forecast_periods]
    test = series[-forecast_periods:]

    # Select the order on the training rows only, so the hold-out stays unseen
    # during model selection. auto_arima returns the model fitted on `train`.
    model = pm.auto_arima(train, m=m, seasonal=True, start_p=0, start_q=0, max_order=4, test='adf', trace=True,
                          error_action='ignore', suppress_warnings=True, stepwise=True)
    predictions = model.predict(n_periods=forecast_periods)

    # Calculate performance metrics
    mse_error = mean_squared_error(test, predictions)
    rmse_error = rmse(test, predictions)
    mape_error = mean_absolute_percentage_error(test, predictions)
    accuracy = (1 - mape_error) * 100

    # Print the results
    print(f'Frequency: {freq}')
    print(f'MSE Error: {mse_error:11.10}')
    print(f'RMSE Error: {rmse_error:11.10}')
    print(f'MAPE Error: {mape_error:11.10}')
    print(f'Accuracy: {accuracy:11.10}')

    # Re-fit the selected model on the full data and forecast future values
    forecast = model.fit(series).predict(n_periods=forecast_periods)

    # Combine actual and predicted data.
    # pd.Series(..., name=column) renames the forecast whether predict returned an
    # array or a named Series; DataFrame(forecast, columns=[column]) would produce
    # an all-NaN column for a Series carrying a different name.
    actual = series.to_frame().assign(Type='Actual')
    predicted = pd.Series(forecast, name=column).to_frame().assign(Type='Predicted')

    final_df = pd.concat([actual, predicted]).rename_axis('Date').reset_index()
    return final_df

# Example of usage
df=pd.read_csv("final_data_in_ML.csv",parse_dates=['Standardized_Date'])
df['DATETIME'] = pd.to_datetime(df['Standardized_Date'].astype(str) + ' ' + df['STANDARDIZED_TIME'].astype(str))
df=df[['DATETIME','CLEAR WATER PUMPING FLOW ML']]
df.set_index('DATETIME', inplace=True)

# Forecast for daily, weekly, and monthly.
# The last argument is the seasonal period m, i.e. the number of observations
# per seasonal cycle: 7 for daily data (weekly cycle), 52 for weekly data
# (yearly cycle) and 12 for monthly data (yearly cycle).
print("Daily_resampled_data")
daily_results = forecast_with_arima(df, 'CLEAR WATER PUMPING FLOW ML', 'D',4,7)
print("Weekly_resampled_data")
weekly_results = forecast_with_arima(df, 'CLEAR WATER PUMPING FLOW ML', 'W',4,52)
print("Monthly_resampled_data")
monthly_results = forecast_with_arima(df, 'CLEAR WATER PUMPING FLOW ML', 'M',4,12)

# Show the results
print("Daily_resampled_data")
print(daily_results.tail(60))
print("Weekly_resampled_data")
print(weekly_results.tail(60))
print("Monthly_resampled_data")
print(monthly_results.tail(60))


In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from statsmodels.tools.eval_measures import rmse
import pmdarima as pm
import seaborn as sns

def forecast_with_arima(df, column, freq, forecast_periods, m, threshold=40, rolling_window=30):
    """Resample ``df``, evaluate a seasonal auto-ARIMA on a hold-out and forecast ahead.

    For daily data (``freq == 'D'``) values below ``threshold`` are first
    replaced by the rolling mean of the resampled series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime index; resampled with ``freq`` by summing numeric columns.
    column : str
        Column to model.
    freq : str
        Pandas resampling frequency (e.g. 'D', 'W', 'M').
    forecast_periods : int
        Size of the hold-out used for the printed metrics and number of
        future periods forecast after refitting on all data.
    m : int
        Seasonal period passed to ``pm.auto_arima``.
    threshold : float, default 40
        Daily values strictly below this are replaced by the rolling mean.
    rolling_window : int, default 30
        Window length (in resampled rows) of that rolling mean.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``column`` and ``Type`` ('Actual' for the resampled,
        cleaned observations, 'Predicted' for the forecast rows).
    """
    # Resample the data
    series = df.resample(freq).sum(numeric_only=True)[column]

    # Apply threshold logic only for daily frequency
    if freq == 'D':
        # min_periods=1 so the first rows average over however many rows exist so far.
        rolling_mean = series.rolling(window=rolling_window, min_periods=1).mean()
        series = series.mask(series < threshold, rolling_mean)

    # Split data into training and testing
    train = series[:-forecast_periods]
    test = series[-forecast_periods:]

    # Select the order on the training rows only, so the hold-out stays unseen
    # during model selection. auto_arima returns the model fitted on `train`.
    model = pm.auto_arima(train, m=m, seasonal=True, start_p=0, start_q=0, max_order=4, test='adf', trace=True,
                          error_action='ignore', suppress_warnings=True, stepwise=True)
    predictions = model.predict(n_periods=forecast_periods)

    # Calculate performance metrics
    mse_error = mean_squared_error(test, predictions)
    rmse_error = rmse(test, predictions)
    mape_error = mean_absolute_percentage_error(test, predictions)
    accuracy = (1 - mape_error) * 100

    # Print the results
    print(f'Frequency: {freq}')
    print(f'MSE Error: {mse_error:11.10}')
    print(f'RMSE Error: {rmse_error:11.10}')
    print(f'MAPE Error: {mape_error:11.10}')
    print(f'Accuracy: {accuracy:11.10}')

    # Re-fit the selected model on the full data and forecast future values
    forecast = model.fit(series).predict(n_periods=forecast_periods)

    # Combine actual and predicted data.
    # pd.Series(..., name=column) renames the forecast whether predict returned an
    # array or a named Series; DataFrame(forecast, columns=[column]) would produce
    # an all-NaN column for a Series carrying a different name.
    actual = series.to_frame().assign(Type='Actual')
    predicted = pd.Series(forecast, name=column).to_frame().assign(Type='Predicted')

    final_df = pd.concat([actual, predicted]).rename_axis('Date').reset_index()
    return final_df

# Example of usage
df = pd.read_csv("final_data_in_ML.csv", parse_dates=['Standardized_Date'])
df['DATETIME'] = pd.to_datetime(df['Standardized_Date'].astype(str) + ' ' + df['STANDARDIZED_TIME'].astype(str))
df = df[['DATETIME', 'CLEAR WATER PUMPING FLOW ML']]
df.set_index('DATETIME', inplace=True)

# Forecast for daily, weekly, and monthly.
# The last argument is the seasonal period m, i.e. the number of observations
# per seasonal cycle: 7 for daily data (weekly cycle), 52 for weekly data
# (yearly cycle) and 12 for monthly data (yearly cycle).
print("Daily_resampled_data")
daily_results = forecast_with_arima(df, 'CLEAR WATER PUMPING FLOW ML', 'D', 4, 7)
print("Weekly_resampled_data")
weekly_results = forecast_with_arima(df, 'CLEAR WATER PUMPING FLOW ML', 'W', 4, 52)
print("Monthly_resampled_data")
monthly_results = forecast_with_arima(df, 'CLEAR WATER PUMPING FLOW ML', 'M', 4, 12)

# Show the results
print("Daily_resampled_data")
print(daily_results.tail(60))
print("Weekly_resampled_data")
print(weekly_results.tail(60))
print("Monthly_resampled_data")
print(monthly_results.tail(60))
