In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from scipy.stats import zscore

# Load dataset
df = pd.read_csv("jena_climate_2009_2016.csv")

# Convert "Date Time" to datetime format
df["Date Time"] = pd.to_datetime(df["Date Time"], format="%d.%m.%Y %H:%M:%S")

# Drop duplicate rows if any
df = df.drop_duplicates()

# Fill missing values (forward fill method)
df.fillna(method='ffill', inplace=True)

# Downsample the dataset (taking every 6th row to get hourly readings)
df_downsampled = df.iloc[::6, :].reset_index(drop=True)

# Select key features
selected_features = ["p (mbar)", "T (degC)", "rh (%)", "wv (m/s)"]

# Remove outliers using z-score method
df_downsampled = df_downsampled[(np.abs(zscore(df_downsampled[selected_features])) < 3).all(axis=1)]

# Normalize selected features
scaler = MinMaxScaler()
df_downsampled[selected_features] = scaler.fit_transform(df_downsampled[selected_features])

# Prepare sequences for LSTM
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length, 1])  # Predicting temperature (T (degC))
    return np.array(X), np.array(y)

sequence_length = 144  # Using past 144 hours (~6 days) to predict the next step
data = df_downsampled[selected_features].values
X, y = create_sequences(data, sequence_length)

# Split into training and testing sets (80% train, 20% test)
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# ==============================
# Build LSTM Model with Bidirectional Layer
# ==============================
model = Sequential([
    Bidirectional(LSTM(100, return_sequences=True, input_shape=(sequence_length, len(selected_features)))),
    Dropout(0.1),
    LSTM(50, return_sequences=False),
    Dropout(0.1),
    Dense(25, activation='relu'),
    Dense(1)
])

# Compile Model
optimizer = Adam(learning_rate=0.001, decay=1e-6)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# Define EarlyStopping
early_stopping = EarlyStopping(
    monitor='val_mae',  # Monitor validation MAE (mean absolute error)
    patience=5,          # Stop training if no improvement for 5 epochs
    restore_best_weights=True,  # Restore the best weights
    mode='min'  # Minimize the validation MAE
)

# Train Model with EarlyStopping
history = model.fit(
    X_train, y_train, 
    epochs=20,  # Set a high epoch limit, early stopping will stop it earlier
    batch_size=32,  # Try a smaller batch size for better updates
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]  # Add EarlyStopping callback
)

# ==============================
# Evaluate Model
# ==============================
loss, lstm_mae = model.evaluate(X_test, y_test)
print(f"LSTM Test MAE: {lstm_mae:.4f}")

# ==============================
# BASELINE MODEL COMPARISON
# ==============================

# 1. Moving Average Baseline (Predicts next value as mean of last n values)
y_pred_baseline = np.array([np.mean(y_train[-sequence_length:])] * len(y_test))
baseline_mae = mean_absolute_error(y_test, y_pred_baseline)
print(f"Moving Average Baseline MAE: {baseline_mae:.4f}")

# 2. Linear Regression Baseline
lr_model = LinearRegression()
X_train_lr = np.mean(X_train, axis=1)  # Flatten time-series to single values
X_test_lr = np.mean(X_test, axis=1)

lr_model.fit(X_train_lr, y_train)
y_pred_lr = lr_model.predict(X_test_lr)

lr_mae = mean_absolute_error(y_test, y_pred_lr)
print(f"Linear Regression Baseline MAE: {lr_mae:.4f}")

# ==============================
# COMPARISON RESULTS
# ==============================
print("\nModel Performance Comparison:")
print(f"LSTM Model MAE: {lstm_mae:.4f}")
print(f"Moving Average Baseline MAE: {baseline_mae:.4f}")
print(f"Linear Regression Baseline MAE: {lr_mae:.4f}")


  df.fillna(method='ffill', inplace=True)
  super().__init__(**kwargs)


Epoch 1/20
[1m1731/1731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 131ms/step - loss: 0.0050 - mae: 0.0423 - val_loss: 0.0011 - val_mae: 0.0260
Epoch 2/20
[1m1731/1731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 146ms/step - loss: 6.4566e-04 - mae: 0.0193 - val_loss: 0.0010 - val_mae: 0.0262
Epoch 3/20
[1m1731/1731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 137ms/step - loss: 4.6088e-04 - mae: 0.0161 - val_loss: 4.1746e-04 - val_mae: 0.0155
Epoch 4/20
[1m1731/1731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 153ms/step - loss: 3.7972e-04 - mae: 0.0145 - val_loss: 3.8637e-04 - val_mae: 0.0145
Epoch 5/20
[1m1731/1731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 146ms/step - loss: 3.3757e-04 - mae: 0.0136 - val_loss: 7.2546e-04 - val_mae: 0.0208
Epoch 6/20
[1m1731/1731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 143ms/step - loss: 3.1932e-04 - mae: 0.0131 - val_loss: 0.0011 - val_mae: 0.0267
Epoch 7/20
[1m1731/

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression

# ==============================
# DATA LOADING & PREPROCESSING
# ==============================

# Load dataset
df = pd.read_csv("jena_climate_2009_2016.csv")

# Convert "Date Time" to datetime format
df["Date Time"] = pd.to_datetime(df["Date Time"], format="%d.%m.%Y %H:%M:%S")

# Drop duplicate rows if any
df = df.drop_duplicates()

# Fill missing values (forward fill method)
df.fillna(method='ffill', inplace=True)

# Downsample the dataset (taking every 6th row to get hourly readings)
df_downsampled = df.iloc[::6, :].reset_index(drop=True)

# Select key features
selected_features = ["p (mbar)", "T (degC)", "rh (%)", "wv (m/s)"]

# Remove outliers using z-score method
df_downsampled = df_downsampled[(np.abs(zscore(df_downsampled[selected_features])) < 3).all(axis=1)]

# Normalize selected features
scaler = MinMaxScaler()
df_downsampled[selected_features] = scaler.fit_transform(df_downsampled[selected_features])

# ==============================
# SEQUENCE CREATION FOR RNN
# ==============================

def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length, 1])  # Predicting temperature (T (degC))
    return np.array(X), np.array(y)

sequence_length = 144  # Using past 144 hours (~6 days) to predict the next step
data = df_downsampled[selected_features].values
X, y = create_sequences(data, sequence_length)

# Split into training and testing sets (80% train, 20% test)
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# ==============================
# BUILDING & TRAINING RNN MODEL
# ==============================

rnn_model = Sequential([
    SimpleRNN(50, return_sequences=True, input_shape=(sequence_length, len(selected_features))),
    Dropout(0.2),
    SimpleRNN(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)
])

# Compile Model
rnn_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train Model
history_rnn = rnn_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Evaluate Model
y_pred_rnn = rnn_model.predict(X_test)
mae_rnn = mean_absolute_error(y_test, y_pred_rnn)
r2_rnn = r2_score(y_test, y_pred_rnn)

print(f"Test MAE (RNN): {mae_rnn:.4f}")
print(f"R² Score (RNN): {r2_rnn:.4f}")

# ==============================
# BASELINE MODEL COMPARISON
# ==============================

# 1. Moving Average Baseline
y_pred_baseline = np.array([np.mean(y_train[-sequence_length:])] * len(y_test))
baseline_mae = mean_absolute_error(y_test, y_pred_baseline)

# 2. Linear Regression Baseline
lr_model = LinearRegression()
X_train_lr = np.mean(X_train, axis=1)  # Flatten time-series to single values
X_test_lr = np.mean(X_test, axis=1)

lr_model.fit(X_train_lr, y_train)
y_pred_lr = lr_model.predict(X_test_lr)

lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

# ==============================
# COMPARISON RESULTS
# ==============================
print("\nModel Performance Comparison:")
print(f"RNN Model MAE: {mae_rnn:.4f}, R² Score: {r2_rnn:.4f}")
print(f"Moving Average Baseline MAE: {baseline_mae:.4f}")
print(f"Linear Regression Baseline MAE: {lr_mae:.4f}, R² Score: {lr_r2:.4f}")


  df.fillna(method='ffill', inplace=True)
  super().__init__(**kwargs)


Epoch 1/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 41ms/step - loss: 0.0195 - mae: 0.0934 - val_loss: 0.0010 - val_mae: 0.0236
Epoch 2/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 44ms/step - loss: 0.0025 - mae: 0.0387 - val_loss: 0.0023 - val_mae: 0.0421
Epoch 3/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 42ms/step - loss: 0.0016 - mae: 0.0310 - val_loss: 0.0017 - val_mae: 0.0358
Epoch 4/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 43ms/step - loss: 0.0012 - mae: 0.0267 - val_loss: 5.1830e-04 - val_mae: 0.0166
Epoch 5/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 42ms/step - loss: 0.0010 - mae: 0.0240 - val_loss: 4.6816e-04 - val_mae: 0.0166
Epoch 6/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 43ms/step - loss: 9.1649e-04 - mae: 0.0228 - val_loss: 3.3486e-04 - val_mae: 0.0133
Epoch 7/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [1]:
!jupyter nbconvert --to script project_modelbuild.ipynb

[NbConvertApp] Converting notebook project_modelbuild.ipynb to script
[NbConvertApp] Writing 8132 bytes to project_modelbuild.py
