In [None]:

# Cell 1: Install/Import Dependencies

# If you need any additional libraries (for example xgboost or others), install here with pip:
# !pip install xgboost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Print the installed TensorFlow version string so the run records which build was used.
print(tf.__version__)

In [None]:

# Cell 2: Load CSV & Basic Preprocessing

def load_and_preprocess_data(filepath: str,
                            date_col: str = 'date',
                            index_col: str = 'date',
                            interp_method: str = 'linear'
                            ) -> pd.DataFrame:
    """
    Load a CSV file, parse its date column, set the index, and fill gaps
    in the numeric columns.

    Parameters
    ----------
    filepath : str
        Path (or any object accepted by ``pd.read_csv``) of the CSV file.
    date_col : str
        Name of the column to parse as dates.
    index_col : str
        Name of the column to use as the DataFrame index.
    interp_method : str
        Interpolation method forwarded to ``DataFrame.interpolate``.

    Returns
    -------
    pd.DataFrame
        The loaded frame. Numeric columns are interpolated; gaps at the very
        start or end of a column (which interpolation alone can leave as NaN)
        are filled with the nearest valid observation. Non-numeric columns are
        returned untouched. A column that is entirely NaN stays NaN.
    """
    # Read CSV, parse dates
    df = pd.read_csv(filepath, parse_dates=[date_col], index_col=index_col)

    # Only numeric columns can be interpolated; text columns (station names,
    # quality flags, ...) would otherwise warn or raise inside interpolate().
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        interpolated = df[numeric_cols].interpolate(method=interp_method)
        # Interpolation does not fill NaNs that precede the first valid value
        # (and, for some methods, those after the last one). Close those edge
        # gaps so no NaN reaches the scaler or the network.
        df[numeric_cols] = interpolated.ffill().bfill()
    return df

# Example usage: point `csv_file` at your own data and adjust the column names.
csv_file = 'your_data.csv'  # Replace with your file path
df = load_and_preprocess_data(csv_file,
                              date_col='date',
                              index_col='date',
                              interp_method='linear')

# Sanity check: first rows, then the per-column count of remaining NaNs.
print(df.head())
print(df.isna().sum())

In [None]:

# Cell 3: Scaling & Sequence Creation

from sklearn.preprocessing import MinMaxScaler

def scale_data(df: pd.DataFrame):
    """
    Fit a MinMaxScaler on every column of `df` and scale the frame with it.

    Returns a tuple ``(scaler, df_scaled)`` where `scaler` is the fitted
    MinMaxScaler and `df_scaled` is a new DataFrame holding the transformed
    values under the same index and column labels as `df`.
    """
    fitted_scaler = MinMaxScaler()
    scaled_values = fitted_scaler.fit_transform(df)
    # Re-wrap the ndarray so index/column labels survive the transform.
    scaled_frame = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)
    return fitted_scaler, scaled_frame

def create_sequences(data: np.ndarray, seq_length: int, target_index: int):
    """
    Slice a 2D array [time_steps, features] into overlapping windows.

    Window ``i`` covers rows ``i .. i + seq_length - 1`` and its target is the
    value in column `target_index` of row ``i + seq_length`` (the step right
    after the window).

    Parameters
    ----------
    data : np.ndarray
        2D array of shape [time_steps, features].
    seq_length : int
        Number of consecutive rows per window; must be >= 1.
    target_index : int
        Column of `data` to use as the prediction target.

    Returns
    -------
    (X, y)
        ``X`` has shape [n_windows, seq_length, features] and ``y`` has shape
        [n_windows], with ``n_windows = max(time_steps - seq_length, 0)``.
        When the input is too short the arrays are empty but keep those ranks.

    Raises
    ------
    ValueError
        If `seq_length` is smaller than 1 or `data` is not 2-dimensional.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError(f"data must be 2D [time_steps, features], got ndim={data.ndim}")

    n_windows = max(data.shape[0] - seq_length, 0)

    # Row indices of every window, shape [n_windows, seq_length]; indexing with
    # this matrix gathers all windows at once and keeps the 3D shape even when
    # n_windows is 0.
    window_rows = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]
    X = data[window_rows]

    # Targets are simply the target column shifted by seq_length.
    y = data[seq_length:, target_index].copy()
    return X, y

# 1) Define features and target columns
feature_cols = ['precipitation', 'temperature', 'flow']  # Example columns
target_col = 'flow'  # We want to predict 'flow'

# 2) Scale the Data
# Fit the scaler on exactly the model's feature columns, in `feature_cols`
# order. `target_index` below is a position within `feature_cols`, and the
# inverse-scaling step builds arrays of width len(feature_cols); fitting on
# the full `df` would break that correspondence as soon as the CSV has extra
# or differently ordered columns.
# NOTE(review): the scaler is still fitted on the whole series, i.e. it also
# sees the validation/test period. Consider fitting on the training span only.
scaler, df_scaled = scale_data(df[feature_cols])

seq_len = 14  # e.g., 14 days history
data_array = df_scaled[feature_cols].values
target_index = feature_cols.index(target_col)

X_all, y_all = create_sequences(data_array, seq_length=seq_len, target_index=target_index)

print("Shape of X_all:", X_all.shape)
print("Shape of y_all:", y_all.shape)

In [None]:

# Cell 4: Train/Val/Test Split (Time Series)

# Chronological 70 / 15 / 15 split: no shuffling, so validation and test
# windows always come after the training windows in time.
n_samples = X_all.shape[0]
train_size = int(n_samples * 0.70)
val_size   = int(n_samples * 0.15)
val_end    = train_size + val_size  # test takes whatever remains after this

X_train, y_train = X_all[:train_size], y_all[:train_size]
X_val,   y_val   = X_all[train_size:val_end], y_all[train_size:val_end]
X_test,  y_test  = X_all[val_end:], y_all[val_end:]

# The original f-strings used backslash-escaped quotes (f\"...\"), which is a
# syntax error in Python; plain quotes are required.
print(f"Train samples: {X_train.shape[0]}")
print(f"Val samples:   {X_val.shape[0]}")
print(f"Test samples:  {X_test.shape[0]}")

In [None]:

# Cell 5: Model Definition

def build_lstm_model(seq_length: int,
                     num_features: int,
                     lstm_units_1: int = 64,
                     lstm_units_2: int = 32,
                     dropout_rate: float = 0.2) -> Sequential:
    """
    Assemble and compile a two-layer stacked LSTM regressor.

    The stack is LSTM(lstm_units_1) -> Dropout -> LSTM(lstm_units_2) ->
    Dropout -> Dense(1). Inputs are declared with shape
    (seq_length, num_features); the model is compiled with the Adam
    optimizer and mean-squared-error loss.
    """
    stack = [
        # The first recurrent layer emits the full sequence so the second
        # LSTM has a time axis to consume.
        LSTM(lstm_units_1, activation='relu', return_sequences=True,
             input_shape=(seq_length, num_features)),
        Dropout(dropout_rate),
        # The second recurrent layer returns only its final output.
        LSTM(lstm_units_2, activation='relu'),
        Dropout(dropout_rate),
        # Single linear unit: one regression value per input window.
        Dense(1),
    ]

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(optimizer='adam', loss='mse')
    return network

# Instantiate the network for the configured window length and feature count.
n_features = len(feature_cols)
model = build_lstm_model(seq_length=seq_len,
                         num_features=n_features,
                         lstm_units_1=64,
                         lstm_units_2=32,
                         dropout_rate=0.2)

# Print the layer-by-layer summary.
model.summary()

In [None]:

# Cell 6: Model Training

def train_model(model,
                X_train, y_train,
                X_val, y_val,
                epochs: int = 50,
                batch_size: int = 32,
                patience: int = 10,
                model_save_path: str = 'best_model.h5'):
    """
    Fit `model` on the training data while monitoring validation loss.

    Two callbacks are attached, both watching 'val_loss': an EarlyStopping
    with the given `patience` (restore_best_weights=True) and a
    ModelCheckpoint writing to `model_save_path` with save_best_only=True.
    After fitting, the weights stored at `model_save_path` are loaded back
    into `model`.

    Returns a tuple ``(model, history)`` where `history` is the object
    returned by ``model.fit``.
    """
    callbacks = [
        EarlyStopping(monitor='val_loss',
                      patience=patience,
                      restore_best_weights=True,
                      verbose=1),
        ModelCheckpoint(filepath=model_save_path,
                        monitor='val_loss',
                        save_best_only=True,
                        verbose=1),
    ]

    fit_history = model.fit(X_train, y_train,
                            validation_data=(X_val, y_val),
                            epochs=epochs,
                            batch_size=batch_size,
                            callbacks=callbacks,
                            verbose=1)

    # Reload the checkpointed weights so the returned model matches the file.
    model.load_weights(model_save_path)
    return model, fit_history

# File that receives the best checkpoint during training.
model_save_path = 'best_model.h5'

trained_model, history = train_model(model,
                                     X_train, y_train,
                                     X_val, y_val,
                                     epochs=50,
                                     batch_size=32,
                                     patience=10,
                                     model_save_path=model_save_path)

In [None]:

# Cell 7: Evaluation & Visualization

def evaluate_model(model,
                   X_test: np.ndarray,
                   y_test: np.ndarray,
                   label: str = 'Test'):
    """
    Predict on `X_test`, print RMSE, MAE and Nash-Sutcliffe Efficiency (NSE)
    against `y_test`, and return the flattened predictions.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)`` that returns one value per sample.
    X_test : np.ndarray
        Inputs passed unchanged to ``model.predict``.
    y_test : np.ndarray
        Ground-truth targets; flattened to 1D before comparison.
    label : str
        Name of the data split, used only in the printed header.

    Returns
    -------
    np.ndarray
        1D array of predictions, in the same order as `y_test`.

    Raises
    ------
    ValueError
        If `y_test` is empty or its length differs from the number of
        predictions.
    """
    y_pred = np.asarray(model.predict(X_test)).flatten()
    # Flatten the targets too: a (n, 1) y_test minus a (n,) y_pred would
    # silently broadcast to (n, n) and corrupt every metric below.
    y_true = np.asarray(y_test, dtype=float).flatten()

    if y_true.size == 0:
        raise ValueError("y_test is empty; nothing to evaluate")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_test has {y_true.size} values but the model returned {y_pred.size} predictions"
        )

    # All three metrics are derived from the same residual vector.
    residuals = y_true - y_pred
    squared_error_sum = np.sum(residuals ** 2)

    rmse_val = np.sqrt(squared_error_sum / residuals.size)
    mae_val  = np.mean(np.abs(residuals))

    # Nash-Sutcliffe Efficiency: 1 - SSE / variance-sum of the observations.
    # It is undefined when the observations are constant (zero denominator),
    # so report NaN instead of dividing by zero.
    denominator = np.sum((y_true - np.mean(y_true)) ** 2)
    nse_val = 1 - (squared_error_sum / denominator) if denominator > 0 else float('nan')

    print(f"[{label} Evaluation]")
    print(f"RMSE: {rmse_val:.4f}")
    print(f"MAE:  {mae_val:.4f}")
    print(f"NSE:  {nse_val:.4f}")
    return y_pred

# Score the held-out test windows and keep the predictions for plotting.
y_pred_test = evaluate_model(trained_model, X_test, y_test, label='Test')

# Figure 1: training vs. validation loss per epoch.
loss_fig, loss_ax = plt.subplots(figsize=(8,4))
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Val Loss')
loss_ax.set_title('Training History')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss (MSE)')
loss_ax.legend()
loss_ax.grid(True)
plt.show()

# Figure 2: test-set targets against predictions, both still in scaled units.
norm_fig, norm_ax = plt.subplots(figsize=(10,5))
norm_ax.plot(y_test, label='Actual (Normalized)', color='black')
norm_ax.plot(y_pred_test, label='Predicted (Normalized)', linestyle='--', color='blue')
norm_ax.set_title('Normalized Flow - Actual vs. Predicted')
norm_ax.set_xlabel('Sample Index')
norm_ax.set_ylabel('Normalized Flow')
norm_ax.legend()
norm_ax.grid(True)
plt.show()

In [None]:

# Cell 8: Inverse Scale Predictions (Flow in original units)

# Reconstruct arrays so that we can apply 'scaler.inverse_transform'
# We'll fill other feature columns with 0, only place predictions in the flow column.

def inverse_transform_predictions(scaler, y_pred, y_actual, target_index, feature_cols):
    """
    Map scaled predictions and targets back to the target's original units.

    The scaler inverts whole rows, so each 1D series is placed into an
    otherwise-zero array at the target's column, inverted, and read back
    from that same column.

    Parameters
    ----------
    scaler
        Fitted scaler exposing ``inverse_transform``. If it also exposes
        ``feature_names_in_`` / ``n_features_in_``, those are used to find
        the target's column and the expected array width, so the scaler may
        have been fitted on more (or differently ordered) columns than
        `feature_cols`.
    y_pred, y_actual
        Scaled predictions and scaled ground truth (flattened to 1D).
    target_index : int
        Position of the target within `feature_cols`.
    feature_cols : list
        Names of the model's feature columns.

    Returns
    -------
    (inv_pred, inv_actual)
        Two 1D arrays in the original units of the target column.
    """
    target_name = feature_cols[target_index]

    # Locate the target by name among the columns the scaler was fitted on.
    # Fall back to the positional index when the scaler carries no names
    # (e.g. it was fitted on a plain ndarray) or does not know this name.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    name_list = list(fitted_names) if fitted_names is not None else []
    scaler_col = name_list.index(target_name) if target_name in name_list else target_index

    # inverse_transform requires exactly as many columns as were seen in fit.
    n_scaler_features = getattr(scaler, 'n_features_in_', len(feature_cols))

    def _invert(values):
        # Place one scaled series in the target column and invert it.
        flat = np.asarray(values, dtype=float).reshape(-1)
        padded = np.zeros((flat.shape[0], n_scaler_features))
        padded[:, scaler_col] = flat
        return scaler.inverse_transform(padded)[:, scaler_col]

    return _invert(y_pred), _invert(y_actual)

# Bring test predictions and targets back to the flow's original units.
inv_pred, inv_actual = inverse_transform_predictions(
    scaler, y_pred_test, y_test, target_index, feature_cols
)

# Observed vs. predicted flow in original units.
flow_fig, flow_ax = plt.subplots(figsize=(10,5))
flow_ax.plot(inv_actual, label='Actual Flow', color='black')
flow_ax.plot(inv_pred, label='Predicted Flow', color='blue', linestyle='--')
flow_ax.set_title('Actual vs. Predicted Flow (Original Units)')
flow_ax.set_xlabel('Sample Index')
flow_ax.set_ylabel('Flow (Original Units)')
flow_ax.legend()
flow_ax.grid(True)
plt.show()