
# DEEP NEURAL NETWORKS – ASSIGNMENT 3  
## RNN vs TRANSFORMER FOR TIME SERIES PREDICTION



**BITS ID:** 2025AA05421  
**Name:** Sagar Ganpati Powar  
**Email:** 2025aa05421@wilp.bits-pilani.ac.in  
**Date:** 07-02-2026


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import json
import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

!pip install tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers Amaz(Dense, LSTM, GRU, Input,
    MultiHeadAttention, LayerNormalization,
    GlobalAveragePooling1D
)
from tensorflow.keras.optimizers import Adam

SyntaxError: invalid syntax (ipython-input-4201011217.py, line 14)

## PART 1: DATASET LOADING AND EXPLORATION

### 1.1 Dataset Selection and Loading

In [None]:
# load dataset
url = "https://raw.githubusercontent.com/plotly/datasets/master/2016-weather-data-seattle.csv"
df = pd.read_csv(url)
df.head()

In [None]:
print(df.columns)
print("Original Data shape:", df.shape)

In [None]:
# select features & subset
data = df[['Mean_TemperatureC']].dropna().values
data = data[:1500]
print("Subset records :", len(data))

In [None]:
dataset_name = "Seattle Weather 2016"
dataset_source = url
n_samples = len(data)
n_features = 1
problem_type = "time_series_forecasting"

# ===============================
# Hyperparameters
# ===============================
sequence_length = 30        # Lookback window (10–50)
prediction_horizon = 1      # Steps ahead to predict (1–10)

In [None]:
# Primary metric selection
primary_metric = "RMSE"
metric_justification = "RMSE penalizes larger temperature prediction errors."

In [None]:
print("\n" + "="*70)
print("DATASET INFORMATION")
print("="*70)
print(f"Dataset: {dataset_name}")
print(f"Source: {dataset_source}")
print(f"Total Samples: {n_samples}")
print(f"Number of Features: {n_features}")
print(f"Sequence Length: {sequence_length}")
print(f"Prediction Horizon: {prediction_horizon}")
print(f"Primary Metric: {primary_metric}")
print(f"Metric Justification: {metric_justification}")
print("="*70)


### 1.2 Time Series Exploration

#### 1.2.1 Plot Time Series Data

In [None]:
# 1. Plot Time Series Data
plt.figure(figsize=(10, 4))
plt.plot(data, label="Mean Temperature (°C)")
plt.title("Seattle Mean Temperature Time Series (First 1500 Samples)")
plt.xlabel("Time Steps (Days)")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.grid(True)
plt.show()


#### 1.2.2 Check for Trend and Seasonality

In [None]:
# 2. Check for Trend and Seasonality
series = pd.Series(data.flatten())

rolling_mean = series.rolling(window=30).mean()
rolling_std = series.rolling(window=30).std()

plt.figure(figsize=(10, 4))
plt.plot(series, label="Original Series")
plt.plot(rolling_mean, label="30-Day Rolling Mean", linewidth=2)
plt.plot(rolling_std, label="30-Day Rolling Std", linewidth=2)
plt.title("Trend and Variability Analysis (Rolling Statistics)")
plt.xlabel("Time Steps (Days)")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.grid(True)
plt.show()



#### 1.2.3 Stationarity Test

In [None]:
from statsmodels.tsa.stattools import adfuller

adf_result = adfuller(series)

print("ADF Statistic:", adf_result[0])
print("p-value:", adf_result[1])
print("Critical Values:")
for key, value in adf_result[4].items():
    print(f"{key}: {value}")


### 1.3 Data Preprocessing

In [None]:
def preprocess_timeseries(data):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)
    return data_scaled, scaler

def create_sequences(data, seq_length, pred_horizon):
    X, y = [], []
    for i in range(len(data) - seq_length - pred_horizon):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length:i+seq_length+pred_horizon])
    return np.array(X), np.array(y)


In [None]:
# Temporal train/test split (NO SHUFFLING)
data_scaled, scaler = preprocess_timeseries(data)
X, y = create_sequences(data_scaled, sequence_length, prediction_horizon)

split = int(len(X) * 0.9)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [None]:
# Temporal train/test split (NO SHUFFLING)
train_test_ratio = "90/10"
train_samples = len(X_train)
test_samples = len(X_test)

In [None]:
print(f"\nTrain/Test Split: {train_test_ratio}")
print(f"Training Samples: {train_samples}")
print(f"Test Samples: {test_samples}")
print("⚠️  IMPORTANT: Temporal split used (NO shuffling)")

## PART 2 : LSTM Model

### 2.1 Architecture Design

In [None]:
# LSTM/GRU Architecture Design
def build_rnn_model(model_type, input_shape, hidden_units, n_layers, output_size):
    model = Sequential()
    for i in range(n_layers):
        return_seq = i < n_layers - 1
        if model_type == "LSTM":
            model.add(LSTM(hidden_units, return_sequences=return_seq,
                           input_shape=input_shape if i == 0 else None))
        else:
            model.add(GRU(hidden_units, return_sequences=return_seq,
                          input_shape=input_shape if i == 0 else None))
    model.add(Dense(output_size))
    return model


In [None]:
# Create LSTM model
lstm_model = build_rnn_model("LSTM", (sequence_length, n_features), 64, 2, prediction_horizon)

# Compile LSTM model
lstm_model.compile(optimizer=Adam(0.001), loss="mse")

lstm_model.summary()

### 2.2 Train LSTM Model

In [None]:
print("\n" + "="*70)
print("RNN MODEL TRAINING")
print("="*70)

In [None]:
# Track training time
rnn_start_time = time.time()

# Train
hist_lstm = lstm_model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1
)
rnn_training_time = time.time() - rnn_start_time

In [None]:
rnn_initial_loss = hist_lstm.history['loss'][0]
rnn_final_loss = hist_lstm.history['loss'][-1]

In [None]:
print(f"Training completed in {rnn_training_time:.2f} seconds")
print(f"Initial Loss: {rnn_initial_loss:.4f}")
print(f"Final Loss: {rnn_final_loss:.4f}")
print("="*70)

### 2.3 Evaluate RNN Model

In [None]:
# Make LSTM predictions on test set
y_pred_lstm = lstm_model.predict(X_test)

# Inverse transform (since data was normalized)
y_test_inv = scaler.inverse_transform(
    y_test.reshape(-1, 1)
).flatten()

y_pred_lstm_inv = scaler.inverse_transform(
    y_pred_lstm.reshape(-1, 1)
).flatten()

def calculate_mape(y_true, y_pred, threshold=1.0):
    """
    MAPE calculated only where |y_true| >= threshold
    This avoids instability near zero.
    """
    mask = np.abs(y_true) >= threshold
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# Calculate all 4 metrics
rnn_mae = mean_absolute_error(y_test_inv, y_pred_lstm_inv)
rnn_rmse = np.sqrt(mean_squared_error(y_test_inv, y_pred_lstm_inv))
rnn_mape = calculate_mape(y_test_inv, y_pred_lstm_inv)
rnn_r2 = r2_score(y_test_inv, y_pred_lstm_inv)

In [None]:
print("\nRNN Model Performance:")
print(f"MAE:   {rnn_mae:.4f}")
print(f"RMSE:  {rnn_rmse:.4f}")
print(f"MAPE:  {rnn_mape:.4f}%")
print(f"R² Score: {rnn_r2:.4f}")

## PART 3: TRANSFORMER IMPLEMENTATION

### 3.1 Positional Encoding Implementation

In [None]:
def positional_encoding(seq_len, d_model):
    pos = np.arange(seq_len)[:, np.newaxis]
    i = np.arange(d_model)[np.newaxis, :]
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    angle_rads = pos * angle_rates

    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    return tf.cast(angle_rads, dtype=tf.float32)

### 3.2 Transformer Encoder Architecture

In [None]:
from tensorflow.keras import layers, Model

def build_transformer_model(
    seq_length,
    n_features,
    d_model,
    n_heads,
    n_layers,
    d_ff,
    output_size
):
    inputs = layers.Input(shape=(seq_length, n_features))

    # Project input to d_model
    x = layers.Dense(d_model)(inputs)

    # Add positional encoding
    x = x + positional_encoding(seq_length, d_model)

    # Stack Transformer encoder layers
    for _ in range(n_layers):
        # Multi-head self-attention
        attn_output = layers.MultiHeadAttention(
            num_heads=n_heads,
            key_dim=d_model // n_heads
        )(x, x)
        x = layers.LayerNormalization()(x + attn_output)

        # Feed-forward network
        ffn_output = layers.Dense(d_ff, activation="relu")(x)
        ffn_output = layers.Dense(d_model)(ffn_output)
        x = layers.LayerNormalization()(x + ffn_output)

    # Output layer
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(output_size)(x)

    return Model(inputs=inputs, outputs=outputs)


### 3.3 Build Transformer Model

In [None]:
transformer_model = build_transformer_model(sequence_length, n_features, d_model=64, n_heads=4, n_layers=2, d_ff=256, output_size=prediction_horizon)

transformer_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="mse"
)

transformer_model.summary()

### 3.4 Train Transformer Model

In [None]:
print("\n" + "="*70)
print("TRANSFORMER MODEL TRAINING")
print("="*70)

In [None]:
# Track training time
transformer_start_time = time.time()

hist_transformer = transformer_model.fit(X_train, y_train, epochs=50, batch_size=32)

transformer_training_time = time.time() - transformer_start_time

In [None]:
# Track initial and final loss
transformer_initial_loss = hist_transformer.history['loss'][0]
transformer_final_loss = hist_transformer.history['loss'][-1]

In [None]:
print(f"Training completed in {transformer_training_time:.2f} seconds")
print(f"Initial Loss: {transformer_initial_loss:.4f}")
print(f"Final Loss: {transformer_final_loss:.4f}")
print("="*70)

### 3.5 Evaluate Transformer Model

In [None]:
# Make transaformer predictions on test set
y_pred_tr = transformer_model.predict(X_test)

# Inverse transform predictions and true values
y_test_tr_inv = scaler.inverse_transform(
    y_test.reshape(-1, 1)
).flatten()

y_pred_tr_inv = scaler.inverse_transform(
    y_pred_tr.reshape(-1, 1)
).flatten()

# Calculate all 4 metrics (Transformer)
transformer_mae = mean_absolute_error(y_test_tr_inv, y_pred_tr_inv)
transformer_rmse = np.sqrt(mean_squared_error(y_test_tr_inv, y_pred_tr_inv))
transformer_mape = calculate_mape(y_test_tr_inv, y_pred_tr_inv)
transformer_r2 = r2_score(y_test_tr_inv, y_pred_tr_inv)

In [None]:
print("\nTransformer Model Performance:")
print(f"MAE:   {transformer_mae:.4f}")
print(f"RMSE:  {transformer_rmse:.4f}")
print(f"MAPE:  {transformer_mape:.4f}%")
print(f"R² Score: {transformer_r2:.4f}")

### 3.6 Visualize Transformer Results

In [None]:
plt.figure(figsize=(8, 4))
plt.plot(hist_transformer.history['loss'], label='Training Loss')
plt.title("Transformer Training Loss")
plt.xlabel("Epochs")
plt.ylabel("MSE Loss")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
plt.figure(figsize=(10, 4))
plt.plot(y_test_tr_inv, label="Actual")
plt.plot(y_pred_tr_inv, label="Predicted")
plt.title("Transformer: Actual vs Predicted Temperature")
plt.xlabel("Time Steps")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.show()


In [None]:
residuals_tr = y_test_tr_inv - y_pred_tr_inv

plt.figure(figsize=(8, 4))
plt.plot(residuals_tr)
plt.title("Transformer Residuals")
plt.xlabel("Time Steps")
plt.ylabel("Error (°C)")
plt.axhline(0, color='red', linestyle='--')
plt.show()


## PART 4: MODEL COMPARISON AND VISUALIZATION

### 4.1 Metrics Comparison

In [None]:
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)

In [None]:
# Total trainable parameters
rnn_total_params = lstm_model.count_params()
transformer_total_params = transformer_model.count_params()

comparison_df = pd.DataFrame({
    'Metric': ['MAE', 'RMSE', 'MAPE (%)', 'R² Score', 'Training Time (s)', 'Parameters'],
    'RNN (LSTM/GRU)': [
        rnn_mae,
        rnn_rmse,
        rnn_mape,
        rnn_r2,
        rnn_training_time,
        rnn_total_params
    ],
    'Transformer': [
        transformer_mae,
        transformer_rmse,
        transformer_mape,
        transformer_r2,
        transformer_training_time,
        transformer_total_params
    ]
})

In [None]:
print(comparison_df.to_string(index=False))
print("="*70)

### 4.2 Visual Comparison

In [None]:
metrics = ['MAE', 'RMSE', 'R²']
lstm_values = [rnn_mae, rnn_rmse, rnn_r2]
transformer_values = [ transformer_mae,  transformer_rmse,  transformer_r2]

x = np.arange(len(metrics))
width = 0.35

plt.figure(figsize=(8, 4))
plt.bar(x - width/2, lstm_values, width, label='LSTM')
plt.bar(x + width/2, transformer_values, width, label='Transformer')

plt.xticks(x, metrics)
plt.ylabel("Metric Value")
plt.title("Model Performance Comparison")
plt.legend()
plt.grid(axis='y')
plt.show()


In [None]:
plt.figure(figsize=(10, 4))

plt.plot(y_test_inv, label='Actual', color='black')
plt.plot(y_pred_lstm_inv, label='LSTM Prediction')
plt.plot(y_pred_tr_inv, label='Transformer Prediction')

plt.title("Actual vs Predictions (LSTM vs Transformer)")
plt.xlabel("Time Steps")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.show()


In [None]:
plt.figure(figsize=(8, 4))

plt.plot(hist_lstm.history['loss'], label='LSTM Training Loss')
plt.plot(hist_transformer.history['loss'], label='Transformer Training Loss')

plt.title("Training Loss Comparison")
plt.xlabel("Epochs")
plt.ylabel("MSE Loss")
plt.legend()
plt.grid(True)
plt.show()


## PART 5: ANALYSIS

In [None]:
analysis_text = """
TODO: Write your analysis here (maximum 200 words guideline)

Address:
1. Which model performed better and by how much?
   [Compare specific metrics]

2. RNN vs Transformer architecture advantages?
   [Discuss sequential processing vs parallel processing]

3. Impact of attention mechanism?
   [Discuss how attention captures dependencies]

4. Long-term dependency handling?
   [Compare vanishing gradients vs attention]

5. Computational cost comparison?
   [Compare training time, parameters]

6. Convergence behavior?
   [Discuss training stability, loss curves]
"""

In [None]:
#  Print analysis with word count
print("\n" + "="*70)
print("ANALYSIS")
print("="*70)
print(analysis_text)
print("="*70)
print(f"Analysis word count: {len(analysis_text.split())} words")
if len(analysis_text.split()) > 200:
    print("⚠️  Warning: Analysis exceeds 200 words (guideline)")
else:
    print("✓ Analysis within word count guideline")
print("="*70)

## PART 6: ASSIGNMENT RESULTS SUMMARY

In [None]:
def get_assignment_results():
    """
    Generate complete assignment results in required format

    Returns:
        dict: Complete results with all required fields
    """

    framework_used = "keras"
    rnn_model_type = "LSTM"

    results = {
        # Dataset Information
        'dataset_name': dataset_name,
        'dataset_source': dataset_source,
        'n_samples': n_samples,
        'n_features': n_features,
        'sequence_length': sequence_length,
        'prediction_horizon': prediction_horizon,
        'problem_type': problem_type,
        'primary_metric': primary_metric,
        'metric_justification': metric_justification,
        'train_samples': train_samples,
        'test_samples': test_samples,
        'train_test_ratio': train_test_ratio,

        # RNN Model Results
        'rnn_model': {
            'framework': framework_used,
            'model_type': rnn_model_type,
            'architecture': {
                'n_layers': 2,
                'hidden_units': 64,
                'total_parameters': rnn_total_params
            },
            'training_config': {
                'learning_rate': 0.001,
                'n_epochs': 50,
                'batch_size': 32,
                'optimizer': 'Adam',
                'loss_function': 'MSE'
            },
            'initial_loss': rnn_initial_loss,
            'final_loss': rnn_final_loss,
            'training_time_seconds': rnn_training_time,
            'mae': rnn_mae,
            'rmse': rnn_rmse,
            'mape': rnn_mape,
            'r2_score': rnn_r2
        },

        # Transformer Model Results
        'transformer_model': {
            'framework': framework_used,
            'architecture': {
                'n_layers': 2,
                'n_heads': 4,
                'd_model': 64,
                'd_ff': 256,
                'has_positional_encoding': True,
                'has_attention': True,
                'total_parameters': transformer_total_params
            },
            'training_config': {
                'learning_rate': 0.001,
                'n_epochs': 50,
                'batch_size': 32,
                'optimizer': 'Adam',
                'loss_function': 'MSE'
            },
            'initial_loss': transformer_initial_loss,
            'final_loss': transformer_final_loss,
            'training_time_seconds': transformer_training_time,
            'mae': transformer_mae,
            'rmse': transformer_rmse,
            'mape': transformer_mape,
            'r2_score': transformer_r2
        },

        # Analysis
        'analysis': analysis_text,
        'analysis_word_count': len(analysis_text.split()),

        # Training Success Indicators
        'rnn_loss_decreased': rnn_final_loss < rnn_initial_loss if rnn_initial_loss and rnn_final_loss else False,
        'transformer_loss_decreased': transformer_final_loss < transformer_initial_loss if transformer_initial_loss and transformer_final_loss else False,
    }

    return results

In [None]:
# Generate and print results
try:
    assignment_results = get_assignment_results()

    print("\n" + "="*70)
    print("ASSIGNMENT RESULTS SUMMARY")
    print("="*70)
    print(json.dumps(assignment_results, indent=2))
    print("="*70)

except Exception as e:
    print(f"\n⚠️  ERROR generating results: {str(e)}")
    print("Please ensure all variables are properly defined")

In [None]:
# Display system information
import platform
import sys
from datetime import datetime

In [None]:
# Environment Details
env_info = {
    "python_version": sys.version,
    "tensorflow_version": tf.__version__,
    "platform": platform.platform()
}

In [None]:
print("="*70)
print("ENVIRONMENT INFORMATION")
print(json.dumps(env_info, indent=2))
print("="*70)
print("\n⚠️  REQUIRED: Add screenshot of your Google Colab/BITS Virtual Lab")
print("showing your account details in the cell below this one.")
print("="*70)