In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from scipy.stats import zscore

# Load dataset
df = pd.read_csv("jena_climate_2009_2016.csv")

# Convert "Date Time" to datetime format
df["Date Time"] = pd.to_datetime(df["Date Time"], format="%d.%m.%Y %H:%M:%S")

# Drop duplicate rows if any
df = df.drop_duplicates()

# Fill missing values (forward fill method)
df.fillna(method='ffill', inplace=True)

# Downsample the dataset (taking every 6th row to get hourly readings)
df_downsampled = df.iloc[::6, :].reset_index(drop=True)

# Select key features
selected_features = ["p (mbar)", "T (degC)", "rh (%)", "wv (m/s)"]

# Remove outliers using z-score method
df_downsampled = df_downsampled[(np.abs(zscore(df_downsampled[selected_features])) < 3).all(axis=1)]

# Normalize selected features
scaler = MinMaxScaler()
df_downsampled[selected_features] = scaler.fit_transform(df_downsampled[selected_features])

# Prepare sequences for LSTM
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length, 1])  # Predicting temperature (T (degC))
    return np.array(X), np.array(y)

sequence_length = 144  # Using past 144 hours (~6 days) to predict the next step
data = df_downsampled[selected_features].values
X, y = create_sequences(data, sequence_length)

# Split into training and testing sets (80% train, 20% test)
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# Build LSTM Model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(sequence_length, len(selected_features))),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)
])

# Compile Model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train Model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Evaluate Model
loss, lstm_mae = model.evaluate(X_test, y_test)
print(f"LSTM Test MAE: {lstm_mae:.4f}")

# ==============================
# BASELINE MODEL COMPARISON
# ==============================

# 1. Moving Average Baseline (Predicts next value as mean of last n values)
y_pred_baseline = np.array([np.mean(y_train[-sequence_length:])] * len(y_test))
baseline_mae = mean_absolute_error(y_test, y_pred_baseline)
print(f"Moving Average Baseline MAE: {baseline_mae:.4f}")

# 2. Linear Regression Baseline
lr_model = LinearRegression()
X_train_lr = np.mean(X_train, axis=1)  # Flatten time-series to single values
X_test_lr = np.mean(X_test, axis=1)

lr_model.fit(X_train_lr, y_train)
y_pred_lr = lr_model.predict(X_test_lr)

lr_mae = mean_absolute_error(y_test, y_pred_lr)
print(f"Linear Regression Baseline MAE: {lr_mae:.4f}")

# ==============================
# COMPARISON RESULTS
# ==============================
print("\nModel Performance Comparison:")
print(f"LSTM Model MAE: {lstm_mae:.4f}")
print(f"Moving Average Baseline MAE: {baseline_mae:.4f}")
print(f"Linear Regression Baseline MAE: {lr_mae:.4f}")

  df.fillna(method='ffill', inplace=True)
  super().__init__(**kwargs)


Epoch 1/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 181ms/step - loss: 0.0134 - mae: 0.0747 - val_loss: 0.0015 - val_mae: 0.0305
Epoch 2/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 190ms/step - loss: 0.0016 - mae: 0.0313 - val_loss: 8.0103e-04 - val_mae: 0.0220
Epoch 3/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 183ms/step - loss: 9.5740e-04 - mae: 0.0236 - val_loss: 0.0010 - val_mae: 0.0246
Epoch 4/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 88ms/step - loss: 7.2916e-04 - mae: 0.0205 - val_loss: 8.4586e-04 - val_mae: 0.0218
Epoch 5/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 80ms/step - loss: 6.1351e-04 - mae: 0.0186 - val_loss: 8.2432e-04 - val_mae: 0.0219
Epoch 6/10
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 79ms/step - loss: 5.4326e-04 - mae: 0.0175 - val_loss: 8.4605e-04 - val_mae: 0.0227
Epoch 7/10
[1m866/866[0m [32m━━━━━━