In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model #type: ignore
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, Dropout, GlobalAveragePooling2D, concatenate #type:ignore
from tensorflow.keras.applications import ResNet50  #type: ignore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import math
from functools import partial
from tqdm import tqdm

In [2]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')
composite_images = np.load('composite_images.npz')
images_path = 'composite_images.npz'

In [3]:
images = np.load(images_path)

In [4]:

train_data['event_id_2'] = train_data['event_id'].apply(lambda x: '_'.join(x.split('_')[0:2]))
train_data['event_idx'] = train_data.groupby('event_id', sort=False).ngroup()
test_data['event_id_2'] = test_data['event_id'].apply(lambda x: '_'.join(x.split('_')[0:2]))
test_data['event_idx'] = test_data.groupby('event_id', sort=False).ngroup()

train_data['event_t'] = train_data.groupby('event_id').cumcount()
test_data['event_t'] = test_data.groupby('event_id').cumcount()

print(train_data.head())
print(test_data.head())

              event_id  precipitation  label       event_id_2  event_idx  \
0  id_spictby0jfsb_X_0       0.000000      0  id_spictby0jfsb          0   
1  id_spictby0jfsb_X_1       0.095438      0  id_spictby0jfsb          1   
2  id_spictby0jfsb_X_2       1.949560      0  id_spictby0jfsb          2   
3  id_spictby0jfsb_X_3       3.232160      0  id_spictby0jfsb          3   
4  id_spictby0jfsb_X_4       0.000000      0  id_spictby0jfsb          4   

   event_t  
0        0  
1        0  
2        0  
3        0  
4        0  
              event_id  precipitation       event_id_2  event_idx  event_t
0  id_j7b6sokflo4k_X_0        0.00000  id_j7b6sokflo4k          0        0
1  id_j7b6sokflo4k_X_1        3.01864  id_j7b6sokflo4k          1        0
2  id_j7b6sokflo4k_X_2        0.00000  id_j7b6sokflo4k          2        0
3  id_j7b6sokflo4k_X_3       16.61520  id_j7b6sokflo4k          3        0
4  id_j7b6sokflo4k_X_4        2.56706  id_j7b6sokflo4k          4        0


In [5]:
#Preprocessing functions
_MAX_INT = np.iinfo(np.uint16).max

In [6]:
def decode_slope(x: np.ndarray) -> np.ndarray:
    # Convert 16-bit discretized slope to float32 radians
    return (x / _MAX_INT * (math.pi / 2.0)).astype(np.float32)


In [7]:
def normalize(x: np.ndarray, mean: int, std: int) -> np.ndarray:
    return (x - mean) / std


In [8]:
rough_S2_normalize = partial(normalize, mean=1250, std=500)

In [9]:
def preprocess_image(x: np.ndarray) -> np.ndarray:
    return np.concatenate([
        rough_S2_normalize(x[..., :-1].astype(np.float32)),
        decode_slope(x[..., -1:]),
    ], axis=-1, dtype=np.float32)


In [10]:
scaler = MinMaxScaler()
train_data['precipitation'] = scaler.fit_transform(train_data[['precipitation']])
test_data['precipitation'] = scaler.transform(test_data[['precipitation']])

In [11]:
# Prepare training and validation splits
rng = np.random.default_rng(seed=0xf100d)
event_ids = train_data['event_id'].unique()
new_split = pd.Series(
    data=rng.choice(['train', 'valid'], size=len(event_ids), p=[0.9, 0.1]),
    index=event_ids,
    name='split',
)
data_new = train_data.join(new_split, on='event_id')


In [12]:
train_df = data_new[(data_new['split'] == 'train')]
train_timeseries = train_df.pivot(index='event_id', columns='event_t', values='precipitation').to_numpy()
train_labels = train_df.pivot(index='event_id', columns='event_t', values='label').to_numpy()

valid_df = data_new[data_new['split'] == 'valid']
valid_timeseries = valid_df.pivot(index='event_id', columns='event_t', values='precipitation').to_numpy()
valid_labels = valid_df.pivot(index='event_id', columns='event_t', values='label').to_numpy()


In [13]:
# For the test set there are no labels
test_timeseries = test_data.pivot(index='event_id', columns='event_t', values='precipitation').to_numpy()

In [14]:
train_images, valid_images, test_images = [], [], []


In [15]:
# Process train and validation images
for event_id in tqdm(data_new['event_id_2'].unique()):
    img = preprocess_image(images[event_id])
    if data_new[data_new['event_id_2'] == event_id]['split'].iloc[0] == 'train':
        train_images.append(img)
    else:
        valid_images.append(img)


100%|██████████| 674/674 [00:40<00:00, 16.53it/s]


In [16]:
# Process test images
for event_id in tqdm(test_data['event_id_2'].unique()):
    img = preprocess_image(images[event_id])
    test_images.append(img)

  0%|          | 0/224 [00:00<?, ?it/s]

100%|██████████| 224/224 [00:01<00:00, 181.97it/s]


In [17]:
train_images = np.stack(train_images, axis=0)
valid_images = np.stack(valid_images, axis=0)
test_images = np.stack(test_images, axis=0)

In [18]:
def create_time_series_resnet(input_shape):
    inputs = Input(shape=input_shape)
    x = Conv1D(64, kernel_size = 3, activation='selu', padding ='same')(inputs)
    x = MaxPooling1D(pool_size=2)(x)
    x = Conv1D(128, kernel_size=3, activation = 'selu', padding = 'same')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Conv1D(256, kernel_size=3, activation='selu', padding='same')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Flatten()(x)
    x = Dropout(0.3)(x)
    outputs = Dense(64, activation='relu')(x)
    return Model(inputs, outputs)

In [19]:
# Assuming the input shape should be (steps, input_dim)
# For example, if you have 10 time steps and 1 feature, the input shape should be (10, 1)
precipitation_model = create_time_series_resnet((train_timeseries.shape[1], 1))

In [20]:
# Define image model (ResNet)
image_input = Input(shape=(train_images.shape[1], train_images.shape[2], train_images.shape[3]))
base_model = ResNet50(weights=None, include_top=False, input_tensor=image_input)
image_features = GlobalAveragePooling2D()(base_model.output)
image_model = Model(inputs=image_input, outputs=image_features)


In [21]:
# Combine models
combined_input_precipitation = Input(shape=(train_timeseries.shape[1], 1))
combined_precipitation_features = precipitation_model(combined_input_precipitation)

combined_input_image = Input(shape=(train_images.shape[1], train_images.shape[2], train_images.shape[3]))
combined_image_features = image_model(combined_input_image)

combined = concatenate([combined_precipitation_features, combined_image_features])
x = Dense(128, activation="relu")(combined)
x = Dropout(0.3)(x)
output = Dense(1, activation="sigmoid")(x)

model = Model(inputs=[combined_input_precipitation, combined_input_image], outputs=output)

In [22]:
# Compile model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [24]:
# Reduce target labels to match the model's output shape
train_labels_reduced = np.max(train_labels, axis=1)
valid_labels_reduced = np.max(valid_labels, axis=1)

# Ensure the shapes of the input arrays match
train_labels_reduced = train_labels_reduced[:train_timeseries.shape[0]]
valid_labels_reduced = valid_labels_reduced[:valid_timeseries.shape[0]]

# Ensure the shapes of the image arrays match
train_images = train_images[:train_timeseries.shape[0]]
valid_images = valid_images[:valid_timeseries.shape[0]]

# Ensure the model is compiled before fitting
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train model
history = model.fit(
    [train_timeseries[..., np.newaxis], train_images],
    train_labels_reduced,
    validation_data=([valid_timeseries[..., np.newaxis], valid_images], valid_labels_reduced),
    epochs=20,
    batch_size=32
)

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 442716, 614
'y' sizes: 442716


In [36]:
# Evaluate model
y_val_pred = model.predict([valid_timeseries[..., np.newaxis], valid_images])
print(f"Log Loss: {log_loss(valid_labels_reduced, y_val_pred)}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 729ms/step
Log Loss: 1.6936372370331882


In [37]:
# Predict on test data
y_test_pred = model.predict([test_timeseries[..., np.newaxis], test_images])


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step


In [41]:
# Ensure the lengths match

# Save predictions for submission
submission = pd.DataFrame({
    "event_id": test_data["event_id"].unique(),
    "label": y_test_pred.flatten()[:len(test_data["event_id"])]
})

In [43]:
submission.to_csv("submission_300.csv", index=False)
