### Model 3: Attention-based Neural Network

### Final Model

In [32]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, MultiHeadAttention, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [33]:
# Build feature + target
X = model_data.drop(columns=['Severity'])
y = model_data['Severity']

# Drop datetime
datetime_cols = X.select_dtypes(include=['datetime64[ns]', 'datetime64']).columns
X = X.drop(columns=datetime_cols)

# Identify numeric / categorical / boolean
numeric = [
    'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
    'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
    'Precipitation(in)', 'kde_ix', 'kde_iy',
    'kde_1km', 'kde_density_m2', 'kde_grid_count',
    'kde_grid_density_m2', 'kde_grid_x', 'kde_grid_y',
    'cell1_count', 'cell1_mean_sev'
]

categorical = [
    'Wind_Direction', 'Weather_Condition', 'Sunrise_Sunset',
    'Weather_Clean', 'Weather_Intensity',
    'cell_1km', 'cell_5km', 'kde_cell_kdegrid'
]

boolean = [
    'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
    'Railway', 'Roundabout', 'Station', 'Stop',
    'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
    'Weather_Windy', 'Weather_Thunder'
]


In [38]:
# Drop datetime columns
datetime_cols = X.select_dtypes(include=['datetime64[ns]', 'datetime64']).columns
X = X.drop(columns=datetime_cols)

In [39]:
X_encoded = X.copy()

# Identify high-cardinality columns ( >50 unique values )
high_cardinality = [col for col in categorical if X[col].nunique() > 50]

# Low-cardinality columns (safe to one-hot)
low_cardinality = [col for col in categorical if X[col].nunique() <= 50]

print("High-cardinality:", high_cardinality)
print("Low-cardinality:", low_cardinality)

# One-hot encode only low-cardinality
X_encoded = pd.get_dummies(X_encoded, columns=low_cardinality, drop_first=True)

# Handle high-cardinality with frequency encoding
for col in high_cardinality:
    freqs = X[col].value_counts()
    X_encoded[col] = X[col].map(freqs)

# Ensure boolean are ints
for col in boolean:
    X_encoded[col] = X_encoded[col].astype(int)

# Fill NaN
X_encoded = X_encoded.fillna(0)


High-cardinality: ['Weather_Condition', 'cell_1km', 'cell_5km', 'kde_cell_kdegrid']
Low-cardinality: ['Wind_Direction', 'Sunrise_Sunset', 'Weather_Clean', 'Weather_Intensity']


In [52]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.3, random_state=1, stratify=y
)

In [53]:
print("Original labels:", np.unique(y_train), np.unique(y_test))

# Map labels to 0-based indices for 3 classes
y_train = y_train.copy()
y_test = y_test.copy()

# Example mapping: 1->0, 2->1, 3->2, 4->2 (merge Severe & Extra Severe)
y_train[y_train == 1] = 0
y_train[y_train == 2] = 1
y_train[y_train == 3] = 2
y_train[y_train == 4] = 2

y_test[y_test == 1] = 0
y_test[y_test == 2] = 1
y_test[y_test == 3] = 2
y_test[y_test == 4] = 2

print("Mapped labels:", np.unique(y_train), np.unique(y_test))

Original labels: [1 2 3 4] [1 2 3 4]
Mapped labels: [0 1 2] [0 1 2]


In [54]:
# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [55]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

feature_names = X_train.columns.tolist()  # store for attention visualization

In [56]:
from tensorflow.keras import layers

class FeatureAttentionLayer(layers.Layer):
    
    def __init__(self, attention_dim=64, **kwargs):
        super().__init__(**kwargs)
        self.attention_dim = attention_dim
        
    def build(self, input_shape):
        # Attention scoring network
        self.attention_weights = self.add_weight(
            name='attention_weights',
            shape=(input_shape[-1], self.attention_dim),
            initializer='glorot_uniform',
            trainable=True
        )
        
        self.attention_bias = self.add_weight(
            name='attention_bias',
            shape=(self.attention_dim,),
            initializer='zeros',
            trainable=True
        )
        
        # Output projection
        self.output_weights = self.add_weight(
            name='output_weights',
            shape=(self.attention_dim, input_shape[-1]),
            initializer='glorot_uniform',
            trainable=True
        )
        
        super().build(input_shape)
        
    def call(self, inputs):
        # Compute attention scores
        attention_hidden = tf.nn.relu(
            tf.matmul(inputs, self.attention_weights) + self.attention_bias
        )
        
        attention_scores = tf.matmul(attention_hidden, self.output_weights)
        
        # Normalize to probabilities (sum to 1)
        attention_scores = attention_scores - tf.reduce_max(attention_scores, axis=-1, keepdims=True)
        attention_probs = tf.nn.softmax(attention_scores, axis=-1)

        
        # Apply attention: weight each feature by its importance
        attended_features = inputs * attention_probs  
        attended_features = tf.clip_by_value(attended_features, -5.0, 5.0)

        # Store for visualization 
        self.last_attention_probs = tf.stop_gradient(attention_probs)
        
        return attended_features  
    
    def get_config(self):
        config = super().get_config()
        config.update({'attention_dim': self.attention_dim})
        return config
    
    def compute_output_shape(self, input_shape):
        # Output shape is same as input shape
        return input_shape

print(" FeatureAttentionLayer defined!")


 FeatureAttentionLayer defined!


In [57]:
print("BUILDING NEURAL NETWORK ARCHITECTURE")

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

class FeatureAttentionLayer(layers.Layer):
    def __init__(self, attention_dim=64, **kwargs):
        super(FeatureAttentionLayer, self).__init__(**kwargs)
        self.attention_dim = attention_dim

    def build(self, input_shape):
        self.W = self.add_weight(
            shape=(input_shape[-1], self.attention_dim),
            initializer='glorot_uniform',
            trainable=True,
            name='W_attention'
        )
        self.b = self.add_weight(
            shape=(self.attention_dim,),
            initializer='zeros',
            trainable=True,
            name='b_attention'
        )
        self.u = self.add_weight(
            shape=(self.attention_dim, 1),
            initializer='glorot_uniform',
            trainable=True,
            name='u_attention'
        )
        super(FeatureAttentionLayer, self).build(input_shape)

    def call(self, inputs):
        # Compute attention scores
        v = tf.tanh(tf.matmul(inputs, self.W) + self.b)
        vu = tf.matmul(v, self.u)
        alphas = tf.nn.softmax(vu, axis=1)
        # Weighted sum of inputs
        output = inputs * alphas
        return output

# Build attention model
def build_attention_model(input_dim, num_classes=3, attention_dim=64):
    
    # Input
    inputs = keras.Input(shape=(input_dim,), name='input_features')
    
    # Attention
    attended = FeatureAttentionLayer(attention_dim=attention_dim, name='feature_attention')(inputs)
    
    # Dense layers
    x = layers.Dense(128, activation='relu', name='dense1')(attended)
    x = layers.BatchNormalization(name='bn1')(x)
    x = layers.Dropout(0.3, name='dropout1')(x)
    
    x = layers.Dense(64, activation='relu', name='dense2')(x)
    x = layers.BatchNormalization(name='bn2')(x)
    x = layers.Dropout(0.3, name='dropout2')(x)
    
    x = layers.Dense(32, activation='relu', name='dense3')(x)
    x = layers.BatchNormalization(name='bn3')(x)
    x = layers.Dropout(0.2, name='dropout3')(x)
    
    # Output
    outputs = layers.Dense(num_classes, activation='softmax', name='output')(x)
    
    # Create model
    model = keras.Model(inputs=inputs, outputs=outputs, name='AttentionAccidentModel')
    return model

print("\nBuilding model...")
model = build_attention_model(
    input_dim=X_train_scaled.shape[1],
    num_classes=3,
    attention_dim=64
)

print("\nModel architecture:")
model.summary()


BUILDING NEURAL NETWORK ARCHITECTURE

Building model...

Model architecture:


In [58]:
print("setting up training callbacks")

from tensorflow.keras import callbacks
import os

os.makedirs('models', exist_ok=True)

training_callbacks = [
    callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),
    callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-6,
        verbose=1
    ),
    callbacks.ModelCheckpoint(
        'models/best_attention_model.keras',  
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    )
]

print("Callbacks configured")


setting up training callbacks
Callbacks configured


In [59]:
import numpy as np

print("DATA VALIDATION CHECK")

# 1. Check NaN / Inf
train_nan = np.isnan(X_train_scaled).sum()
test_nan = np.isnan(X_test_scaled).sum()
train_inf = np.isinf(X_train_scaled).sum()
test_inf = np.isinf(X_test_scaled).sum()

print(f"Train NaN: {train_nan}, Train Inf: {train_inf}")
print(f"Test NaN:  {test_nan}, Test Inf:  {test_inf}")

if train_nan > 0 or train_inf > 0:
    X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0, posinf=0.0, neginf=0.0)
    print("Train data cleaned")

if test_nan > 0 or test_inf > 0:
    X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0, posinf=0.0, neginf=0.0)
    print("Test data cleaned")

# 2. Check data ranges
print(f"Train min: {X_train_scaled.min():.4f}, max: {X_train_scaled.max():.4f}")
print(f"Train mean: {X_train_scaled.mean():.4f}, std: {X_train_scaled.std():.4f}")

# Clip extreme values
if np.abs(X_train_scaled).max() > 10:
    X_train_scaled = np.clip(X_train_scaled, -10, 10)
    X_test_scaled = np.clip(X_test_scaled, -10, 10)
    print("Extreme values clipped to [-10, 10]")

# 3. Check target variable
print(f"Train target unique: {np.unique(y_train)}, range: [{y_train.min()}, {y_train.max()}]")
print(f"Test target unique: {np.unique(y_test)}")

print("Data validation complete!")


DATA VALIDATION CHECK
Train NaN: 0, Train Inf: 0
Test NaN:  0, Test Inf:  0
Train min: -9.2053, max: 307.0920
Train mean: -0.0000, std: 0.9938
Extreme values clipped to [-10, 10]
Train target unique: [0 1 2], range: [0, 2]
Test target unique: [0 1 2]
Data validation complete!


In [60]:
model = build_attention_model(
    input_dim=X_train_scaled.shape[1],
    num_classes=3,
    attention_dim=64
)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=256,
    callbacks=training_callbacks,
    verbose=1
)

Epoch 1/50
[1m582/590[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.5791 - loss: 0.9298
Epoch 1: val_accuracy improved from None to 0.74626, saving model to models/best_attention_model.keras
[1m590/590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6585 - loss: 0.7772 - val_accuracy: 0.7463 - val_loss: 0.6117 - learning_rate: 0.0010
Epoch 2/50
[1m590/590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7300 - loss: 0.6472
Epoch 2: val_accuracy improved from 0.74626 to 0.75681, saving model to models/best_attention_model.keras
[1m590/590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7380 - loss: 0.6333 - val_accuracy: 0.7568 - val_loss: 0.5902 - learning_rate: 0.0010
Epoch 3/50
[1m578/590[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.7477 - loss: 0.6165
Epoch 3: val_accuracy improved from 0.75681 to 0.76113, saving model to models/be

In [61]:
y_pred_proba = model.predict(X_test_scaled, verbose=0)
y_pred = np.argmax(y_pred_proba, axis=1)

In [63]:
import tensorflow as tf
import pandas as pd

# Get attention layer
attention_layer = model.get_layer("feature_attention")

# Define a new model to output the attention scores
attention_model = tf.keras.Model(
    inputs=model.input,
    outputs=attention_layer.output  # this will return the attention scores
)

# Run prediction to get attention scores
attention_scores = attention_model.predict(X_test_scaled[:500], verbose=0)

# Average over batch and queries to get feature importance
mean_attention = attention_scores.mean(axis=0)  # adjust axis depending on shape
mean_attention = mean_attention.mean(axis=0) if mean_attention.ndim > 1 else mean_attention

# Build dataframe
att_df = pd.DataFrame({
    'Feature': feature_names,
    'Attention_Weight': mean_attention
}).sort_values('Attention_Weight', ascending=False)

print(att_df.head(20))


                   Feature  Attention_Weight
16                    Stop          0.136232
2              Humidity(%)          0.104751
35     Wind_Direction_Calm          0.087900
51    Wind_Direction_South          0.080873
70      Weather_Clean_Rain          0.074341
42      Wind_Direction_NNE          0.059703
41       Wind_Direction_NE          0.058721
46        Wind_Direction_S          0.048469
34          cell1_mean_sev          0.048229
15                 Station          0.045527
60  Sunrise_Sunset_Unknown          0.033037
45    Wind_Direction_North          0.030213
61    Weather_Clean_Cloudy          0.030132
3             Pressure(in)          0.028501
43      Wind_Direction_NNW          0.026514
38      Wind_Direction_ESE          0.025654
57      Wind_Direction_WSW          0.022289
1            Wind_Chill(F)          0.021082
11                Junction          0.019924
58     Wind_Direction_West          0.019304


In [64]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

print("MODEL EVALUATION")

# Make predictions
print("\n1. Making predictions on test set...")
y_pred_proba = model.predict(X_test_scaled, verbose=0)

# Handle multi-class or binary
if y_pred_proba.shape[1] > 1:
    y_pred = np.argmax(y_pred_proba, axis=1)
else:
    y_pred = (y_pred_proba > 0.5).astype(int).flatten()

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_pred, average=None, zero_division=0
)

precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    y_test, y_pred, average='macro', zero_division=0
)

cm = confusion_matrix(y_test, y_pred)

print("\nOVERALL METRICS:")
print(f"   Accuracy:  {accuracy:.4f}")
print(f"   Precision: {precision_macro:.4f} (macro)")
print(f"   Recall:    {recall_macro:.4f} (macro)")
print(f"   F1-Score:  {f1_macro:.4f} (macro)")

print("\nPER-CLASS METRICS:")
class_names = ['Minor', 'Moderate', 'Severe']
for i, name in enumerate(class_names):
    print(f"\n   {name}:")
    print(f"      Precision: {precision[i]:.4f}")
    print(f"      Recall:    {recall[i]:.4f}")
    print(f"      F1-Score:  {f1[i]:.4f}")
    print(f"      Support:   {support[i]:,}")

print("\nCONFUSION MATRIX:")
print("   (Rows = True, Columns = Predicted)")
print(f"\n   {'':12} {'Minor':>10} {'Moderate':>10} {'Severe':>10}")
for i, name in enumerate(class_names):
    row = f"   {name:12}"
    for j in range(3):
        row += f" {cm[i,j]:10,}"
    print(row)


MODEL EVALUATION

1. Making predictions on test set...

OVERALL METRICS:
   Accuracy:  0.7864
   Precision: 0.7766 (macro)
   Recall:    0.7446 (macro)
   F1-Score:  0.7543 (macro)

PER-CLASS METRICS:

   Minor:
      Precision: 0.7933
      Recall:    0.7805
      F1-Score:  0.7868
      Support:   20,209

   Moderate:
      Precision: 0.7374
      Recall:    0.5414
      F1-Score:  0.6244
      Support:   20,209

   Severe:
      Precision: 0.7991
      Recall:    0.9118
      F1-Score:  0.8518
      Support:   40,417

CONFUSION MATRIX:
   (Rows = True, Columns = Predicted)

                     Minor   Moderate     Severe
   Minor            15,773      1,683      2,753
   Moderate          2,758     10,942      6,509
   Severe            1,353      2,213     36,851
