In [None]:
!pip install tensorflow tensorrt

In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

In [7]:
# Load the input data from CSV; skip this cell if the dataframe is already loaded.
# 'Unnamed: 0' is removed right after reading (presumably a saved index column).
df = pd.read_csv('df_data_filter.csv').drop(columns=['Unnamed: 0'])

# Water-quality columns; these are dropped from the frames below to build the
# feature matrices.
wq = [
    'BOD', 'AmNi', 'Chla', 'DO', 'Ecoli', 'FC', 'NitraNi', 'NitriNi',
    'OrPh', 'pH', 'Sal', 'SDD', 'Si', 'SS', 'Temp', 'TIN',
    'TKN', 'ToNi', 'ToPh', 'Tur', 'UnAm', 'VSS',
]

In [8]:
# Define train and test datasets: years up to 2019 train, 2020 is held out.
df_train = df[df['Image_Year'] <= 2019].drop(columns=['Image_Year']).copy()
df_test = df[df['Image_Year'] == 2020].drop(columns=['Image_Year']).copy()
X_train = df_train.drop(columns=wq)
X_test = df_test.drop(columns=wq)

# Each frame gets its own scaler. Re-fitting a single shared scaler silently
# discards the statistics of the previous fit.
train_scaler = StandardScaler().set_output(transform='pandas')
df_train_scaled = train_scaler.fit_transform(df_train)

# Centre/scale the features with statistics learned on the training years only,
# then apply the same transform to the test year. Calling fit_transform on the
# test set would standardise it with its own mean/std, which makes train and
# test features incomparable and leaks test information.
feature_scaler = StandardScaler().set_output(transform='pandas')
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# Whole-table scaling (all years, all columns including 'Image_Year').
# NOTE(review): this scaler is fitted on every row, so any later split of
# df_scaled sees statistics computed from its own held-out rows — confirm that
# this is acceptable for the cross-validation that consumes df_scaled.
std_scaler = StandardScaler()
std_scaler.set_output(transform='pandas')
df_scaled = std_scaler.fit_transform(df)

In [None]:
def _regression_metrics(y_true, y_pred):
    """Return ``(r2, rmse, mae, smape)`` for one set of predictions."""
    r2 = r2_score(y_true, y_pred)
    # np.sqrt(MSE) works on every scikit-learn version; the `squared=False`
    # keyword is deprecated and rejected by newer releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    smape = np.mean(2 * (np.abs(y_pred - y_true)) / (np.abs(y_true) + np.abs(y_pred)))
    return r2, rmse, mae, smape


def LSTM_TimeSeries(df, wq_name, first, seed, n_splits=5, epochs=50, batch_size=32):
    """Cross-validate a single-layer LSTM regressor for one target column.

    The rows of ``df`` are split with ``TimeSeriesSplit``. For every fold the
    ``first`` predictors with the highest absolute correlation to ``wq_name``
    (measured on the training rows of that fold) are selected, an LSTM is
    trained with early stopping, and train/test metrics are recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the target and the candidate predictors. Candidate
        predictors are taken by position from columns 22..112 — assumes the
        predictor columns sit at those positions; TODO confirm against the
        input file layout.
    wq_name : str
        Name of the target column.
    first : int
        Number of top-correlated predictors to keep.
    seed : int
        Seed passed to ``set_random_seed`` (seeds Python, NumPy and TensorFlow
        globally) and stored in the ``random_state`` result column.
    n_splits : int, default 5
        Number of ``TimeSeriesSplit`` folds.
    epochs : int, default 50
        Maximum number of training epochs per fold.
    batch_size : int, default 32
        Mini-batch size used for training.

    Returns
    -------
    pandas.DataFrame
        One row per fold with columns ``WQ``, ``nvar``, ``var``,
        ``random_state``, ``r2``, ``rmse``, ``mae``, ``smape`` and the
        corresponding ``*_test`` columns.
    """
    print(wq_name, "Started")
    print("nvar", first)
    print("Seed", seed)

    # Actually apply the seed; previously it was only written to the results.
    set_random_seed(seed)

    # Define the TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=n_splits)

    fold_records = []

    # Split the frame that was passed in, not a notebook-level global.
    for train_index, test_index in tscv.split(df):
        df_train, df_test = df.iloc[train_index], df.iloc[test_index]
        X_train, X_test = df_train.drop(columns=[wq_name]), df_test.drop(columns=[wq_name])
        Y_train, Y_test = df_train[wq_name], df_test[wq_name]
        print("\n--------------------------------------")
        print(f'Training split: {train_index[0]} to {train_index[-1]}, Testing split: {test_index[0]} to {test_index[-1]}')

        # Correlation-based feature selection on the training rows only:
        # rank candidate columns by |corr| with the target and keep the top `first`.
        target_corr = df_train.corr()[wq_name].iloc[22:113]
        var = target_corr.abs().sort_values(ascending=False).index[:first].tolist()

        # Reshape data for LSTM [samples, time steps, features] with one time step
        X_train2 = np.expand_dims(X_train[var], axis=1)
        X_test2 = np.expand_dims(X_test[var], axis=1)

        model = Sequential()
        model.add(LSTM(50, activation='relu', input_shape=(X_train2.shape[1], X_train2.shape[2])))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')

        # Early stopping to avoid overfitting
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

        model.fit(X_train2, Y_train, epochs=epochs, batch_size=batch_size,
                  validation_split=0.2, verbose=1, callbacks=[es])

        # Evaluate model on training data
        # NOTE(review): predictions are clipped at 0. If `df` holds standardised
        # targets, 0 is the mean rather than a physical lower bound — confirm
        # the clip is still intended.
        Y_train_pred = model.predict(X_train2).flatten()
        Y_train_pred[Y_train_pred < 0] = 0.0
        # R2 is computed from the predictions; model.evaluate() would return the
        # MSE loss, not a coefficient of determination.
        r_squared, rmse, mae, smape = _regression_metrics(Y_train, Y_train_pred)

        print(f'R2: {r_squared}, RMSE: {rmse}, MAE: {mae}, SMAPE: {smape}')

        # Test model on testing data
        Y_test_pred = model.predict(X_test2).flatten()
        Y_test_pred[Y_test_pred < 0] = 0.0
        r_squared_test, rmse_test, mae_test, smape_test = _regression_metrics(Y_test, Y_test_pred)

        print(f'R2 Test: {r_squared_test}, RMSE Test: {rmse_test}, MAE Test: {mae_test}, SMAPE Test: {smape_test}')

        fold_records.append({
            'WQ': wq_name, 'nvar': len(var), 'var': var, 'random_state': seed,
            'r2': r_squared, 'rmse': rmse, 'mae': mae, 'smape': smape,
            'r2_test': r_squared_test, 'rmse_test': rmse_test, 'mae_test': mae_test, 'smape_test': smape_test
        })
        print("--------------------------------------\n")

    print(wq_name, "Finished")

    # Build the result table once from the per-fold records
    return pd.DataFrame(fold_records)

# Targets modelled in this run. A separate name is used so the full `wq` column
# list defined when the data was loaded is not overwritten; re-running the
# train/test split cell would otherwise drop only these three columns.
wq_targets = ['Chla', 'SS', 'Tur']

# One run per (target, number of selected predictors) combination.
LSTM_result_list = [
    LSTM_TimeSeries(df_scaled, wq_name=value, first=f, seed=1)
    for value in wq_targets
    for f in range(4, 13)
]

# ignore_index gives every row a unique label instead of repeating the
# per-run fold indices.
LSTM_result = pd.concat(LSTM_result_list, ignore_index=True)

LSTM_result

In [None]:
# Write the combined results table to a CSV file in the working directory.
results_csv = "LSTM_result_list.csv"
LSTM_result.to_csv(results_csv)