In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, Model #type: ignore
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense, LSTM, Bidirectional, Input, RepeatVector, Concatenate # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from tensorflow.keras.preprocessing.image import ImageDataGenerator # type: ignore
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from functools import partial
from tqdm import tqdm   # type: ignore
import math

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df['event_id'] = train_df['event_id'].apply(lambda x: '_'.join(x.split('_')[0:2]))
train_df['event_idx'] = train_df.groupby('event_id', sort=False).ngroup()
test_df['event_id'] = test_df['event_id'].apply(lambda x: '_'.join(x.split('_')[0:2]))
test_df['event_idx'] = test_df.groupby('event_id', sort=False).ngroup()

train_df['event_t'] = train_df.groupby('event_id').cumcount()
test_df['event_t'] = test_df.groupby('event_id').cumcount()

print(train_df.head())
print(test_df.head())

          event_id  precipitation  label  event_idx  event_t
0  id_spictby0jfsb       0.000000      0          0        0
1  id_spictby0jfsb       0.095438      0          0        1
2  id_spictby0jfsb       1.949560      0          0        2
3  id_spictby0jfsb       3.232160      0          0        3
4  id_spictby0jfsb       0.000000      0          0        4
          event_id  precipitation  event_idx  event_t
0  id_j7b6sokflo4k        0.00000          0        0
1  id_j7b6sokflo4k        3.01864          0        1
2  id_j7b6sokflo4k        0.00000          0        2
3  id_j7b6sokflo4k       16.61520          0        3
4  id_j7b6sokflo4k        2.56706          0        4


In [4]:
BAND_NAMES = ('B2', 'B3','B4', 'B8', 'B11', 'slope')
H, W, NUM_CHANNELS = IMG_DIM= (128, 128, len(BAND_NAMES))
_MAX_INT = np.iinfo(np.int16).max

def decode_slope(X: np.ndarray) -> np.ndarray:
    return (X / _MAX_INT * (math.pi / 2.0 )).astype(np.float32)

def normalize(x: np.ndarray, mean: int, std: int) -> np.ndarray:
    return (x - mean) / std
rough_S2_normalize = partial(normalize, mean=1250, std=500)

In [5]:
def preprocess_image(x: np.ndarray) -> np.ndarray:
    return np.concatenate([
        rough_S2_normalize(x[..., :-1].astype(np.float32)),
        decode_slope(x[..., -1:]),
    ], axis=-1, dtype=np.float32)
composite_images = np.load('composite_images.npz')
images_path = 'composite_images.npz'

In [6]:
def preprocess_data_and_images(data_df, composite_images):
    event_ids = data_df['event_id'].unique()

    timeseries = []
    labels = []
    images = []

    for event_id in tqdm(event_ids, desc="Processing data"):
        event_data = data_df[data_df['event_id'] == event_id]
        timeseries.append(event_data['precipitation'].values)  # Shape: (730,)
        if 'label' in event_data.columns:
            labels.append(event_data['label'].values)  # Shape: (730,)
        images.append(preprocess_image(composite_images[event_id]))  # Shape: (128, 128, 6)

    timeseries = np.array(timeseries)
    labels = np.array(labels) if labels else None
    images = np.stack(images, axis=0)

    return timeseries, labels, images

In [7]:
train_timeseries, train_labels, train_images = preprocess_data_and_images(train_df, composite_images)

Processing data: 100%|██████████| 674/674 [00:50<00:00, 13.40it/s]


In [8]:
data_gen = ImageDataGenerator(
    rotation_range =15,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    shear_range = 0.1,
    zoom_range = 0.1,
    horizontal_flip = True,
    fill_mode = 'nearest'
)

In [9]:
# Split into training and validation sets
train_split, val_split = train_test_split(
    np.arange(len(train_timeseries)), test_size=0.1, random_state=42
)

X_precip_train, X_precip_val = train_timeseries[train_split], train_timeseries[val_split]
y_train, y_val = train_labels[train_split], train_labels[val_split]
X_img_train, X_img_val = train_images[train_split], train_images[val_split]

In [10]:
from sklearn.utils.class_weight import compute_class_weight # type: ignore
# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train.argmax(axis=1)), y=y_train.argmax(axis=1))
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}


In [11]:
# Image encoder
image_input = Input(shape=(128, 128, 6), name='image_input')
x = Conv2D(32, (3, 3), activation='relu', padding='same')(image_input)
x = MaxPooling2D((2, 2))(x)
x = Dropout(0.2)(x)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
encoded_image = Dense(128, activation='relu')(x)


In [12]:
# Time-series input
precip_input = Input(shape=(730,), name='precip_input')

# Custom layer to wrap tf.expand_dims
class ExpandDimsLayer(layers.Layer):
    def call(self, inputs):
        return tf.expand_dims(inputs, axis=-1)

# Repeat the image encoding vector 730 times and concatenate with precipitation data
repeated_image_vector = RepeatVector(730)(encoded_image)
expanded_precip_input = ExpandDimsLayer()(precip_input)
concatenated = Concatenate(axis=-1)([repeated_image_vector, expanded_precip_input])

# Bidirectional LSTM
x = Bidirectional(LSTM(64, return_sequences=True))(concatenated)
x = Dense(64, activation='relu')(x)
day_probabilities = Dense(1, activation='sigmoid')(x)




In [13]:
# Define the model
model = Model(inputs=[image_input, precip_input], outputs=day_probabilities)

In [14]:
from keras_tuner import RandomSearch

In [15]:
def build_model (hp):
    hp_filters = hp.Int('filters', min_value=32, max_value=128, step = 16)
    hp_lstm_units = hp.Int('lstm_units', min_value =32, max_value = 128, step = 16)
    x = Conv2D(hp_filters, (3, 3), activation='relu', padding='same')(image_input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(hp_filters * 2, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
    encoded_image = Dense(128, activation='relu')(x)

    repeated_image_vector = RepeatVector(730)(encoded_image)
    expanded_precip_input = ExpandDimsLayer()(precip_input)
    concatenated = Concatenate(axis=-1)([repeated_image_vector, expanded_precip_input])

    x = Bidirectional(LSTM(hp_lstm_units, return_sequences=True))(concatenated)
    x = Dense(64, activation='relu')(x)
    day_probabilities = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[image_input, precip_input], outputs=day_probabilities)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model


In [16]:
tuner = RandomSearch(
    build_model,
    objective="val_accuracy",
    max_trials=5,
    executions_per_trial=1,
    directory="hyperparam_tuning",
    project_name="flood_prediction"
)

Reloading Tuner from hyperparam_tuning\flood_prediction\tuner0.json


In [17]:
tuner.search(
    x=[X_img_train, X_precip_train],
    y=y_train,
    epochs=10,
    validation_data=([X_img_val, X_precip_val], y_val),
    class_weight=class_weight_dict
)

Trial 5 Complete [00h 04m 04s]
val_accuracy: 0.9994158148765564

Best val_accuracy So Far: 0.9994158148765564
Total elapsed time: 01h 00m 32s


In [18]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:", best_hps.values)

Best Hyperparameters: {'filters': 112, 'lstm_units': 96}
