In [1]:
import numpy as np # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore
import tensorflow as tf # type: ignore
from tensorflow.keras import layers, Model # type: ignore
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense, LSTM, Bidirectional, Input, RepeatVector, Concatenate, LeakyReLU # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from functools import partial
from tqdm import tqdm   # type: ignore
import math

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:

train_df['event_id'] = train_df['event_id'].apply(lambda x: '_'.join(x.split('_')[0:2]))
train_df['event_idx'] = train_df.groupby('event_id', sort=False).ngroup()
test_df['event_id'] = test_df['event_id'].apply(lambda x: '_'.join(x.split('_')[0:2]))
test_df['event_idx'] = test_df.groupby('event_id', sort=False).ngroup()

train_df['event_t'] = train_df.groupby('event_id').cumcount()
test_df['event_t'] = test_df.groupby('event_id').cumcount()

print(train_df.head())
print(test_df.head())

          event_id  precipitation  label  event_idx  event_t
0  id_spictby0jfsb       0.000000      0          0        0
1  id_spictby0jfsb       0.095438      0          0        1
2  id_spictby0jfsb       1.949560      0          0        2
3  id_spictby0jfsb       3.232160      0          0        3
4  id_spictby0jfsb       0.000000      0          0        4
          event_id  precipitation  event_idx  event_t
0  id_j7b6sokflo4k        0.00000          0        0
1  id_j7b6sokflo4k        3.01864          0        1
2  id_j7b6sokflo4k        0.00000          0        2
3  id_j7b6sokflo4k       16.61520          0        3
4  id_j7b6sokflo4k        2.56706          0        4


In [4]:
BAND_NAMES = ('B2', 'B3','B4', 'B8', 'B11', 'slope')
H, W, NUM_CHANNELS = IMG_DIM= (128, 128, len(BAND_NAMES))
_MAX_INT = np.iinfo(np.int16).max


In [5]:
def decode_slope(X: np.ndarray) -> np.ndarray:
    return (X / _MAX_INT * (math.pi / 2.0 )).astype(np.float32)

In [6]:
def normalize(x: np.ndarray, mean: int, std: int) -> np.ndarray:
    return (x - mean) / std


In [7]:
rough_S2_normalize = partial(normalize, mean=1250, std=500)

In [8]:
def preprocess_image(x: np.ndarray) -> np.ndarray:
    return np.concatenate([
        rough_S2_normalize(x[..., :-1].astype(np.float32)),
        decode_slope(x[..., -1:]),
    ], axis=-1, dtype=np.float32)


In [9]:
composite_images = np.load('composite_images.npz')
images_path = 'composite_images.npz'

In [10]:
def preprocess_data_and_images(data_df, composite_images):
    event_ids = data_df['event_id'].unique()
    timeseries = []
    labels = []
    images = []

    for event_id in tqdm(event_ids, desc="Processing data"):
        event_data = data_df[data_df['event_id'] == event_id]
        timeseries.append(event_data['precipitation'].values)  # Shape: (730,)
        if 'label' in event_data.columns:
            labels.append(event_data['label'].values)  # Shape: (730,)
        images.append(preprocess_image(composite_images[event_id]))  # Shape: (128, 128, 6)

    timeseries = np.array(timeseries)
    labels = np.array(labels) if labels else None
    images = np.stack(images, axis=0)

    return timeseries, labels, images

In [11]:
train_timeseries, train_labels, train_images = preprocess_data_and_images(train_df, composite_images)


Processing data: 100%|██████████| 674/674 [01:19<00:00,  8.43it/s]


In [12]:
# Split into training and validation sets
train_split, val_split = train_test_split(
    np.arange(len(train_timeseries)), test_size=0.1, random_state=42
)

X_precip_train, X_precip_val = train_timeseries[train_split], train_timeseries[val_split]
y_train, y_val = train_labels[train_split], train_labels[val_split]
X_img_train, X_img_val = train_images[train_split], train_images[val_split]

In [13]:
from sklearn.utils.class_weight import compute_class_weight

In [33]:
# Build the custom neural network
# Image encoder
image_input = Input(shape=(128, 128, 6), name='image_input')
x = Conv2D(32, (3, 3), activation='relu', padding='same')(image_input)
x = MaxPooling2D((2, 2))(x)
x = Dropout(0.2)(x)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
encoded_image = Dense(128, activation='relu')(x)

In [34]:
# Time-series input
precip_input = Input(shape=(730,), name='precip_input')
# Custom layer to wrap tf.expand_dims
class ExpandDimsLayer(layers.Layer):
	def call(self, inputs):
		return tf.expand_dims(inputs, axis=-1)

# Repeat the image encoding vector 730 times and concatenate with precipitation data
repeated_image_vector = RepeatVector(730)(encoded_image)
expanded_precip_input = ExpandDimsLayer()(precip_input)
concatenated = Concatenate(axis=-1)([repeated_image_vector, expanded_precip_input])


In [36]:
# Bidirectional LSTM
x = Bidirectional(LSTM(64, return_sequences=True))(concatenated)
x = Dense(64, activation='relu')(x)
day_probabilities = Dense(1, activation='sigmoid')(x)


In [37]:
# Define the model
model = Model(inputs=[image_input, precip_input], outputs=day_probabilities)


In [38]:
# Early stopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)

In [39]:
# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels.argmax(axis=1)),
    y=train_labels.argmax(axis=1)
)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

In [40]:
from tensorflow.keras.losses import binary_crossentropy #type: ignore


def focal_loss(gamma=2., alpha=0.25):
    def loss(y_true, y_pred):
        bce = binary_crossentropy(y_true, y_pred)
        pt = tf.exp(-bce)  # Probabilities of predictions
        focal = alpha * (1 - pt) ** gamma * bce
        return tf.reduce_mean(focal)
    return loss

# Update model compilation
model.compile(optimizer='adam', loss=focal_loss(gamma=2., alpha=0.25), metrics=['accuracy'])


In [41]:
model.fit(
    [X_img_train, X_precip_train], y_train,
    validation_data=([X_img_val, X_precip_val], y_val),
    epochs=50,
    batch_size= 32,
    callbacks=[early_stopping], 
    class_weight=class_weight_dict  
)


Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 5s/step - accuracy: 0.9002 - loss: 0.0049 - val_accuracy: 0.9994 - val_loss: 2.4701e-06
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 4s/step - accuracy: 0.9993 - loss: 1.1308e-06 - val_accuracy: 0.9994 - val_loss: 1.1541e-06
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 3s/step - accuracy: 0.9994 - loss: 5.0257e-07 - val_accuracy: 0.9994 - val_loss: 8.2543e-07
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 3s/step - accuracy: 0.9993 - loss: 4.1647e-07 - val_accuracy: 0.9994 - val_loss: 7.5034e-07
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2s/step - accuracy: 0.9994 - loss: 3.5544e-07 - val_accuracy: 0.9994 - val_loss: 7.0835e-07
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 2s/step - accuracy: 0.9993 - loss: 3.7652e-07 - val_accuracy: 0.9994 - val_loss: 6.6806

: 

In [136]:
model.save('flood_prediction_model.keras')

In [24]:
# Preprocess test data
test_timeseries, _, test_images = preprocess_data_and_images(test_df, composite_images)


Processing data: 100%|██████████| 224/224 [00:13<00:00, 16.07it/s]


In [26]:
predictions = model.predict([test_images, test_timeseries])

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 421ms/step


In [31]:
predictions.shape

(224, 730, 1)

In [29]:
sample_submission = pd.read_csv('SampleSubmission (2).csv')
sample_submission.shape

(163520, 2)

In [28]:
sample_submission['label'] = predictions.flatten()
sample_submission.head()

Unnamed: 0,event_id,label
0,id_j7b6sokflo4k_X_0,0.003385
1,id_j7b6sokflo4k_X_1,0.002081
2,id_j7b6sokflo4k_X_2,0.001912
3,id_j7b6sokflo4k_X_3,0.001895
4,id_j7b6sokflo4k_X_4,0.001876


In [30]:
sample_submission.to_csv("submission_435.csv", index=False)
