In [22]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model #type: ignore
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM #type: ignore
from tensorflow.keras.callbacks import EarlyStopping #type: ignore
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight


In [25]:
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')

In [26]:

train_df['event_idx'] = train_df.groupby('event_id', sort=False).ngroup()
test_df['event_idx'] = test_df.groupby('event_id', sort=False).ngroup()

train_df['event_t'] = train_df.groupby('event_id').cumcount()
test_df['event_t'] = test_df.groupby('event_id').cumcount()

print(train_df.head())
print(test_df.head())

              event_id  precipitation  label  event_idx  event_t
0  id_spictby0jfsb_X_0       0.000000      0          0        0
1  id_spictby0jfsb_X_1       0.095438      0          1        0
2  id_spictby0jfsb_X_2       1.949560      0          2        0
3  id_spictby0jfsb_X_3       3.232160      0          3        0
4  id_spictby0jfsb_X_4       0.000000      0          4        0
              event_id  precipitation  event_idx  event_t
0  id_j7b6sokflo4k_X_0        0.00000          0        0
1  id_j7b6sokflo4k_X_1        3.01864          1        0
2  id_j7b6sokflo4k_X_2        0.00000          2        0
3  id_j7b6sokflo4k_X_3       16.61520          3        0
4  id_j7b6sokflo4k_X_4        2.56706          4        0


In [27]:
# Ensure data is sorted by event_id and event_t to maintain sequence order
train_df.sort_values(by=['event_id', 'event_t'], inplace=True)
test_df.sort_values(by=['event_id', 'event_t'], inplace=True)


In [28]:
# Keep precipitation values row-wise (No aggregation)
X_train = train_df[['precipitation', 'event_t']].values  # Keep both precipitation & time
X_test = test_df[['precipitation', 'event_t']].values    # Same for test
y_train = train_df['label'].values  # Keep row-wise labels


In [29]:
# Normalize precipitation
X_train[:, 0] = (X_train[:, 0] - np.mean(X_train[:, 0])) / np.std(X_train[:, 0])
X_test[:, 0] = (X_test[:, 0] - np.mean(X_test[:, 0])) / np.std(X_test[:, 0])


In [30]:
# Reshape for LSTM (samples, timesteps, features)
X_train = X_train.reshape(X_train.shape[0], 1, 2)
X_test = X_test.reshape(X_test.shape[0], 1, 2)

In [31]:
# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Compute class weights for imbalance handling
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}


In [32]:
# Define LSTM model
input_precip = Input(shape=(1, 2))  # Single timestep with 2 features (precipitation, event_t)
x = LSTM(32, return_sequences=True)(input_precip)
x = LSTM(16)(x)
x = Dropout(0.3)(x)
x = Dense(8, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)  # Binary classification

model = Model(inputs=input_precip, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [33]:
# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [43]:
# Train the model
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    #callbacks=[early_stopping],
    class_weight=class_weight_dict
)


Epoch 1/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 6ms/step - accuracy: 0.8653 - loss: 0.6063 - val_accuracy: 0.8525 - val_loss: 0.6273
Epoch 2/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 5ms/step - accuracy: 0.8576 - loss: 0.6001 - val_accuracy: 0.8620 - val_loss: 0.5457
Epoch 3/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 6ms/step - accuracy: 0.8613 - loss: 0.5562 - val_accuracy: 0.8518 - val_loss: 0.6881
Epoch 4/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 6ms/step - accuracy: 0.8596 - loss: 0.5919 - val_accuracy: 0.8692 - val_loss: 0.5338
Epoch 5/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 5ms/step - accuracy: 0.8659 - loss: 0.5730 - val_accuracy: 0.8887 - val_loss: 0.4022
Epoch 6/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 6ms/step - accuracy: 0.8717 - loss: 0.5688 - val_accuracy: 0.8530 - val_loss:

<keras.src.callbacks.history.History at 0x2cf09649d60>

In [44]:
# Make predictions on the test set
test_predictions = model.predict(X_test)

[1m5110/5110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step


In [45]:
test_predictions.shape

(163520, 1)

In [47]:
submission_df = pd.DataFrame({'event_id': test_df['event_id'], 'label': test_predictions.flatten()})


In [48]:
submission_df.to_csv('predictions_four.csv', index=False)


In [21]:
X_test.shape

(163520,)

In [20]:
sample_submission = pd.read_csv('Test.csv')
sample_submission.shape

(163520, 2)

In [17]:
sample_submission['label'] = test_predictions.flatten()
sample_submission.head()

ValueError: Length of values (224) does not match length of index (163520)

In [44]:
test_df['event_id'].shape

(163520,)

In [59]:
# Ensure final_test_predictions has the correct length
num_test_events = len(test_df['event_id'].unique())

if len(test_predictions) != num_test_events:
    print(f"Warning: Expected {num_test_events} predictions, but got {len(test_predictions)}")
    test_predictions = test_predictions[:num_test_events]  # Trim if necessary


In [None]:
sample_submission = pd.read_csv('SampleSubmission (2).csv')
sample_submission.head()

In [62]:
# Save predictions
submission_df = pd.DataFrame({'event_id': sample_submission['event_id'].unique(), 'prediction': test_predictions.flatten()})
submission_df.to_csv('predictions.csv', index=False)

print("Predictions saved to predictions.csv")


ValueError: All arrays must be of the same length