# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from scipy.stats import boxcox

In [2]:
# Read the dataset
cleaned_df = pd.read_csv('/Users/priyakundu/Documents/NYU Capstone WaterVue Files/Data Processing/Datasets/Ultimate_Dataframe_WaterQual.csv')
cleaned_df

Unnamed: 0,Location,Sample Date,Chlorophyll A,Dissolved Oxygen,Salinity,Specific Conductance,Total Nitrogen,Total Phosphorus,Turbidity
0,#1 HILLSBORO CANAL US 1,2006-02-28,12.100000,6.980000,19.400000,31300.000000,0.830000,0.086000,2.400000
1,#1 HILLSBORO CANAL US 1,2006-08-31,4.430000,5.540000,15.750000,26000.000000,0.981000,0.109000,1.400000
2,#1 HILLSBORO CANAL US 1,2007-02-28,2.605000,6.195000,23.200000,36700.000000,0.754000,0.083500,1.650000
3,#1 HILLSBORO CANAL US 1,2007-08-31,15.995000,6.015000,19.790000,31400.000000,0.978500,0.102000,2.250000
4,#1 HILLSBORO CANAL US 1,2008-02-29,5.925000,5.770000,12.500000,21050.000000,1.440000,0.102000,2.050000
...,...,...,...,...,...,...,...,...,...
1315,#89 NOB HILL RD POMPANO CANAL,2020-02-29,3.933333,5.703333,0.310000,642.666667,1.097133,0.014000,0.750000
1316,#89 NOB HILL RD POMPANO CANAL,2020-08-31,1.570000,7.760000,0.250000,526.000000,0.910000,0.011000,0.000000
1317,#89 NOB HILL RD POMPANO CANAL,2021-02-28,2.966667,5.833333,0.233333,482.000000,0.984033,0.003667,0.566667
1318,#89 NOB HILL RD POMPANO CANAL,2021-08-31,5.510000,5.370000,0.270000,558.000000,1.410500,0.049000,0.775000


## Splitting

In [3]:
sorted_df = cleaned_df.sort_values(by='Sample Date', ascending=True)

In [4]:
# Split data into train and test sets
train_data = sorted_df.iloc[:-3*cleaned_df["Location"].nunique()].sort_values(by=['Location', 'Sample Date'], ascending=True)  # Use all but the last 12 months for training
test_data = sorted_df.iloc[-3*cleaned_df["Location"].nunique():].sort_values(by=['Location', 'Sample Date'], ascending=True)   # Use the last 12 months for testing

# Forecasting

## Case 1 - Trying 'Additive' as the hyperparameter

In [5]:
attributes = ['Chlorophyll A', 'Dissolved Oxygen', 'Salinity', 'Specific Conductance', 'Total Nitrogen', 'Total Phosphorus', 'Turbidity']

In [6]:
# Initialize dictionary to store percentage error values for each attribute
att_ADD_percentage_error = {attribute: [] for attribute in attributes}

# Initialize list to store locations
locations = []

for location in train_data["Location"].unique():
    
    # Store location for each iteration
    locations.append(location)
    
    # Filter data for current location
    train_df = train_data[train_data["Location"] == location].set_index("Sample Date")
    train_df.index = pd.to_datetime(train_df.index)  # Convert index to DateTimeIndex
    test_df = test_data[test_data["Location"]==location]

    for attribute in attributes:
        # Fit ETS model 
        trend = 'additive'  # or 'multiplicative'
        seasonal_periods = 12  # Assuming half yearly data
        model = ExponentialSmoothing(train_df[attribute], trend=trend, seasonal='add', seasonal_periods=seasonal_periods)
        result = model.fit()

        # Forecast future values
        forecast = result.forecast(steps=3)  # Forecasting next 3 periods into the future
        forecast_index = pd.date_range(start=train_df.index[-1], periods=4, freq='6ME')[1:]

        # Convert test and forecast values to NumPy arrays
        test_values = np.array(test_df[attribute])
        forecast_values = np.array(forecast)

        # Calculate percentage error
        percentage_error = (test_values - forecast_values) / test_values
        att_ADD_percentage_error[attribute].append(np.mean(percentage_error))  # Storing mean percentage error

# Create DataFrame from att_ADD_percentage_error dictionary
final_add_error_df = pd.DataFrame(att_ADD_percentage_error)

# Add 'Location' column to the DataFrame
final_add_error_df['Location'] = locations

# Reorder columns
final_add_error_df = final_add_error_df[['Location'] + attributes]

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

In [7]:
# Print the final results
final_add_error_df

Unnamed: 0,Location,Chlorophyll A,Dissolved Oxygen,Salinity,Specific Conductance,Total Nitrogen,Total Phosphorus,Turbidity
0,#1 HILLSBORO CANAL US 1,0.386003,0.054176,-0.285631,-0.495912,-0.19202,0.1156,-0.423868
1,#10 MIDDLE RIVER E SUNRISE,-0.551797,0.07925,-0.174205,-0.037106,-0.004934,-0.25258,-2.806117
2,#11 MIDDLE RIVER NW 21ST AVE,-0.966445,0.178165,-4.434345,-4.151832,-0.300885,-0.664569,-0.084211
3,#110 POMPANO CANAL AT DIXIE AN,0.683774,0.183682,-0.199493,-0.202982,0.09869,-0.490506,0.266515
4,#111 S. FORK MID R. @ N.E. 15,0.811976,0.211401,-3.044695,-2.899949,0.105011,-0.315378,-0.279013
5,#112 N. FORK MID R. @ N.E. 16,-0.75048,0.176799,-4.334864,-3.51906,0.138193,-0.038974,-0.382001
6,#12 MIDDLE RIVER NW 31ST AVE,-0.164215,0.01464,-0.13075,-0.156107,-0.009262,-1.233946,0.527768
7,#14 MIDDLE RIVER UNIVERS. DRV,-0.056655,0.143148,-0.114638,-0.127858,-0.110751,-1.921988,0.056638
8,#15 NEW RIVER ANDREWS AVE,-0.654741,0.185806,-0.126039,-0.141147,-0.007476,-0.105323,0.038024
9,#16 NORTH FORK BROWARD BLVD,-1.65555,0.188976,-6.76388,-3.704419,0.420423,-0.032401,-0.17651


## Case 2 - Trying 'Multiplicative' as the hyperparameter

In [8]:
# Initialize dictionary to store RMSE values for each attribute
att_MUL_percentage_error = {attribute: [] for attribute in attributes}

# Initialize list to store locations
locations = []

for location in train_data["Location"].unique():
    
    # Store location for each iteration
    locations.append(location)
    
    # Filter data for current location
    train_df = train_data[train_data["Location"] == location].set_index("Sample Date")
    train_df.index = pd.to_datetime(train_df.index)  # Convert index to DateTimeIndex
    test_df = test_data[test_data["Location"]==location]

    for attribute in attributes:
        # Shift the data to ensure positivity
        min_val = train_df[attribute].min()
        if min_val <= 0:
            train_df[attribute] = train_df[attribute] - min_val + 1  # Shift and add 1 to ensure positivity
            test_df[attribute] = test_df[attribute] - min_val + 1

        # Fit ETS model 
        trend = 'multiplicative'  # Case 2
        seasonal_periods = 12  # Assuming half yearly data
        model = ExponentialSmoothing(train_df[attribute], trend=trend, seasonal='mul', seasonal_periods=seasonal_periods)
        result = model.fit()

        # Forecast future values
        forecast = result.forecast(steps=3)  # Forecasting next 3 periods into the future
        forecast_index = pd.date_range(start=train_df.index[-1], periods=4, freq='6ME')[1:]

        # Convert test and forecast values to NumPy arrays
        test_values = np.array(test_df[attribute])
        forecast_values = np.array(forecast)

        # Calculate percentage error
        percentage_error = (test_values - forecast_values) / test_values
        att_MUL_percentage_error[attribute].append(np.mean(percentage_error))  # Storing mean percentage error

# Create DataFrame from att_MUL_percentage_error dictionary
final_mul_error_df = pd.DataFrame(att_MUL_percentage_error)

# Add 'Location' column to the DataFrame
final_mul_error_df['Location'] = locations

# Reorder columns
final_mul_error_df = final_mul_error_df[['Location'] + attributes]

  self._init_dates(dates, freq)
  return err.T @ err
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return err.T @ err
  self._init_dates(dates, freq)
  return err.T @ err
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return err.T @ err
  self._init_dates(dates, freq)
  return err.T @ err
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return err.T @ err
  self._init_dates(dates, freq)
  return err.T @ err
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return err.T @ err
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return err.T @ err
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._ini

In [9]:
# Print the final df
final_mul_error_df

Unnamed: 0,Location,Chlorophyll A,Dissolved Oxygen,Salinity,Specific Conductance,Total Nitrogen,Total Phosphorus,Turbidity
0,#1 HILLSBORO CANAL US 1,0.326884,0.044437,-0.387223,-3.770149e+23,-0.252254,0.059865,-0.038681
1,#10 MIDDLE RIVER E SUNRISE,-0.446806,0.069082,-0.23138,-0.05838216,-0.078661,-0.57609,-1.971499
2,#11 MIDDLE RIVER NW 21ST AVE,-1.293459,0.16531,-4.585412,-4.476451,-0.306001,-0.798302,-0.090231
3,#110 POMPANO CANAL AT DIXIE AN,0.37266,0.14519,0.22036,-0.2992427,0.111893,-0.448304,0.217827
4,#111 S. FORK MID R. @ N.E. 15,-0.678244,0.178136,-3.408449,-2.370879,0.077536,-0.588842,-0.373857
5,#112 N. FORK MID R. @ N.E. 16,-0.839147,0.170452,-5.085011,-3.565078,0.096118,-0.235681,-0.461247
6,#12 MIDDLE RIVER NW 31ST AVE,0.407767,0.002316,-0.130772,-0.1550886,-0.027315,-0.012015,0.451464
7,#14 MIDDLE RIVER UNIVERS. DRV,-0.303275,0.118957,-0.115577,-0.1273073,-0.109658,-0.003286,-0.187893
8,#15 NEW RIVER ANDREWS AVE,-1.724626,0.111752,-0.124604,-0.4387539,-0.700022,0.044153,-0.013355
9,#16 NORTH FORK BROWARD BLVD,-0.74332,0.114832,-6.612136,-4.129801,0.292743,-0.060252,-0.315747
