In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE
import optuna  # Hyperparameter tuning


In [19]:
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')

In [20]:
def create_event_features(df):
    df['event_idx'] = df.groupby('event_id', sort=False).ngroup()
    df['event_t'] = df.groupby('event_id').cumcount()
    return df

In [21]:
train_df = create_event_features(train_df)
test_df = create_event_features(test_df)    

In [22]:
def create_lag_features(df):
    pivoted = df.pivot(index='event_id', columns='event_t', values='precipitation').fillna(0)
    pivoted.columns = [f'lag_{i}' for i in range(pivoted.shape[1])]
    pivoted.reset_index(inplace=True)
    return pivoted

train_pivot = create_lag_features(train_df)
test_pivot = create_lag_features(test_df)


In [23]:
train_pivot.head

<bound method NDFrame.head of                      event_id     lag_0
0         id_05v6zjuaf300_X_0   0.00000
1         id_05v6zjuaf300_X_1   9.45176
2        id_05v6zjuaf300_X_10   8.49763
3       id_05v6zjuaf300_X_100   0.00000
4       id_05v6zjuaf300_X_101   0.00000
...                       ...       ...
492015   id_zyy86zjmrnx2_X_95   9.61178
492016   id_zyy86zjmrnx2_X_96  12.31130
492017   id_zyy86zjmrnx2_X_97  19.92590
492018   id_zyy86zjmrnx2_X_98   0.00000
492019   id_zyy86zjmrnx2_X_99  16.21550

[492020 rows x 2 columns]>

In [24]:
# Merge labels back to train set
train_labels = train_df[['event_id', 'label']].groupby('event_id').max().reset_index()
train_data = train_pivot.merge(train_labels, on='event_id', how='left')
test_data = test_pivot.copy()


In [39]:
# Prepare training features and target
X = train_data.drop(columns=['event_id', 'label'])
y = train_data['label']
X_test = test_data.drop(columns=['event_id'])

# Train LightGBM with stratified 10-fold CV
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.08,
    'num_leaves': 100,
    'max_depth': 12,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}


In [40]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for train_idx, val_idx in kf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)
    
    model = lgb.train({**params, 'early_stopping_rounds': 50}, dtrain, valid_sets=[dval])
    
    oof_preds[val_idx] = model.predict(X_val)
    test_preds += model.predict(X_test) / kf.n_splits


In [44]:
test_df['Target'] = test_preds
submission = test_df[['event_id', 'event_t', 'Target']]
submission[['event_id', 'Target']].to_csv('submission_light2.csv', index=False)


In [45]:
sample_submission = pd.read_csv('submission_light2.csv')
sample_submission.head(30)

Unnamed: 0,event_id,Target
0,id_j7b6sokflo4k_X_0,0.00047
1,id_j7b6sokflo4k_X_1,0.00047
2,id_j7b6sokflo4k_X_2,0.00047
3,id_j7b6sokflo4k_X_3,0.00047
4,id_j7b6sokflo4k_X_4,0.00047
5,id_j7b6sokflo4k_X_5,0.00047
6,id_j7b6sokflo4k_X_6,0.00047
7,id_j7b6sokflo4k_X_7,0.00047
8,id_j7b6sokflo4k_X_8,0.00047
9,id_j7b6sokflo4k_X_9,0.001768


In [None]:
sample_submission['label'] = pred.flatten()
sample_submission.head()

## Model 3

In [11]:
# Load Data
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')

# Create Unique Identifiers
train_df['event_idx'] = train_df.groupby('event_id', sort=False).ngroup()
test_df['event_idx'] = test_df.groupby('event_id', sort=False).ngroup()

train_df['event_t'] = train_df.groupby('event_id').cumcount()
test_df['event_t'] = test_df.groupby('event_id').cumcount()

# Pivot Data to Create Features
train_pivot = train_df.pivot(index='event_idx', columns='event_t', values='precipitation').fillna(0)
test_pivot = test_df.pivot(index='event_idx', columns='event_t', values='precipitation').fillna(0)

# Merge Labels
train_labels = train_df.groupby('event_idx')['label'].max()

X_train = train_pivot.values
y_train = train_labels.values
X_test = test_pivot.values


In [12]:
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)


In [13]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 200),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
        'early_stopping_rounds': 50,
        'verbose': -1
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in kf.split(X_train_balanced, y_train_balanced):
        X_tr, X_val = X_train_balanced[train_idx], X_train_balanced[val_idx]
        y_tr, y_val = y_train_balanced[train_idx], y_train_balanced[val_idx]
        
        train_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
        
        model = lgb.train(params, train_data, valid_sets=[val_data])
        preds = model.predict(X_val)
        log_loss = -np.mean(y_val * np.log(preds) + (1 - y_val) * np.log(1 - preds))
        scores.append(log_loss)
    
    return np.mean(scores)

# Run Optuna to Find Best Hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# Get Best Parameters
best_params = study.best_params
print("Best Parameters:", best_params)


[I 2025-02-06 18:05:55,330] A new study created in memory with name: no-name-aa55fab2-8d04-4cc7-9dba-1ee9fc1b79dc


[I 2025-02-06 18:06:29,742] Trial 0 finished with value: 0.6607155204844395 and parameters: {'learning_rate': 0.0013467363364897973, 'num_leaves': 137, 'max_depth': 7, 'min_data_in_leaf': 188, 'bagging_fraction': 0.9994598284724872, 'feature_fraction': 0.6239714877455595, 'lambda_l1': 3.1692407575521053, 'lambda_l2': 3.719479414960052}. Best is trial 0 with value: 0.6607155204844395.
[I 2025-02-06 18:07:08,072] Trial 1 finished with value: 0.6470949382952879 and parameters: {'learning_rate': 0.0020172083681393948, 'num_leaves': 156, 'max_depth': 15, 'min_data_in_leaf': 27, 'bagging_fraction': 0.872610397887672, 'feature_fraction': 0.7865425543851855, 'lambda_l1': 8.967865690517106, 'lambda_l2': 9.12471837924321}. Best is trial 1 with value: 0.6470949382952879.
[I 2025-02-06 18:07:28,421] Trial 2 finished with value: 0.5446228119068692 and parameters: {'learning_rate': 0.06273332639124533, 'num_leaves': 98, 'max_depth': 3, 'min_data_in_leaf': 21, 'bagging_fraction': 0.8081730695611941, 

Best Parameters: {'learning_rate': 0.0984542538328925, 'num_leaves': 118, 'max_depth': 12, 'min_data_in_leaf': 43, 'bagging_fraction': 0.781492386940516, 'feature_fraction': 0.9944985980172595, 'lambda_l1': 1.9497704829672813, 'lambda_l2': 1.6793691820602854}


In [14]:
# Final Models with Best Params
lgb_model = lgb.LGBMClassifier(**best_params)
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', learning_rate=0.01, max_depth=6, n_estimators=1000)
cat_model = CatBoostClassifier(iterations=1000, learning_rate=0.01, depth=6, loss_function='Logloss', verbose=100)

# Train Models
lgb_model.fit(X_train_balanced, y_train_balanced)
xgb_model.fit(X_train_balanced, y_train_balanced)
cat_model.fit(X_train_balanced, y_train_balanced)

# Make Predictions
lgb_preds = lgb_model.predict_proba(X_test)[:, 1]
xgb_preds = xgb_model.predict_proba(X_test)[:, 1]
cat_preds = cat_model.predict_proba(X_test)[:, 1]

# Ensembling (Averaging Predictions)
final_preds = (lgb_preds + xgb_preds + cat_preds) / 3


0:	learn: 0.6902778	total: 231ms	remaining: 3m 50s
100:	learn: 0.5692601	total: 22.9s	remaining: 3m 23s
200:	learn: 0.5520777	total: 46s	remaining: 3m 2s
300:	learn: 0.5473462	total: 1m 6s	remaining: 2m 33s
400:	learn: 0.5448753	total: 1m 28s	remaining: 2m 11s
500:	learn: 0.5430828	total: 1m 49s	remaining: 1m 48s
600:	learn: 0.5421686	total: 2m 9s	remaining: 1m 26s
700:	learn: 0.5406604	total: 2m 34s	remaining: 1m 5s
800:	learn: 0.5400093	total: 2m 56s	remaining: 43.9s
900:	learn: 0.5391987	total: 3m 20s	remaining: 22s
999:	learn: 0.5385795	total: 3m 44s	remaining: 0us


In [15]:
submission = pd.DataFrame({'event_id': test_df['event_id'], 'label': final_preds})
submission.to_csv('submission_ensemble2.csv', index=False)


In [16]:
submission.head()

Unnamed: 0,event_id,label
0,id_j7b6sokflo4k_X_0,0.319678
1,id_j7b6sokflo4k_X_1,0.503323
2,id_j7b6sokflo4k_X_2,0.319678
3,id_j7b6sokflo4k_X_3,0.69224
4,id_j7b6sokflo4k_X_4,0.427484


In [28]:
# Keep precipitation values row-wise (No aggregation)
X_train = train_df[['precipitation', 'event_t']].values  # Keep both precipitation & time
X_test = test_df[['precipitation', 'event_t']].values    # Same for test
y_train = train_df['label'].values  # Keep row-wise labels


In [29]:
# Normalize precipitation
X_train[:, 0] = (X_train[:, 0] - np.mean(X_train[:, 0])) / np.std(X_train[:, 0])
X_test[:, 0] = (X_test[:, 0] - np.mean(X_test[:, 0])) / np.std(X_test[:, 0])


In [30]:
# Reshape for LSTM (samples, timesteps, features)
X_train = X_train.reshape(X_train.shape[0], 1, 2)
X_test = X_test.reshape(X_test.shape[0], 1, 2)

In [31]:
# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Compute class weights for imbalance handling
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}


In [32]:
# Define LSTM model
input_precip = Input(shape=(1, 2))  # Single timestep with 2 features (precipitation, event_t)
x = LSTM(32, return_sequences=True)(input_precip)
x = LSTM(16)(x)
x = Dropout(0.3)(x)
x = Dense(8, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)  # Binary classification

model = Model(inputs=input_precip, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [33]:
# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [43]:
# Train the model
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    #callbacks=[early_stopping],
    class_weight=class_weight_dict
)


Epoch 1/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 6ms/step - accuracy: 0.8653 - loss: 0.6063 - val_accuracy: 0.8525 - val_loss: 0.6273
Epoch 2/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 5ms/step - accuracy: 0.8576 - loss: 0.6001 - val_accuracy: 0.8620 - val_loss: 0.5457
Epoch 3/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 6ms/step - accuracy: 0.8613 - loss: 0.5562 - val_accuracy: 0.8518 - val_loss: 0.6881
Epoch 4/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 6ms/step - accuracy: 0.8596 - loss: 0.5919 - val_accuracy: 0.8692 - val_loss: 0.5338
Epoch 5/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 5ms/step - accuracy: 0.8659 - loss: 0.5730 - val_accuracy: 0.8887 - val_loss: 0.4022
Epoch 6/100
[1m13839/13839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 6ms/step - accuracy: 0.8717 - loss: 0.5688 - val_accuracy: 0.8530 - val_loss:

<keras.src.callbacks.history.History at 0x2cf09649d60>

In [44]:
# Make predictions on the test set
test_predictions = model.predict(X_test)

[1m5110/5110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step


In [45]:
test_predictions.shape

(163520, 1)

In [47]:
submission_df = pd.DataFrame({'event_id': test_df['event_id'], 'label': test_predictions.flatten()})


In [48]:
submission_df.to_csv('predictions_four.csv', index=False)


In [21]:
X_test.shape

(163520,)

In [20]:
sample_submission = pd.read_csv('Test.csv')
sample_submission.shape

(163520, 2)

In [17]:
sample_submission['label'] = test_predictions.flatten()
sample_submission.head()

ValueError: Length of values (224) does not match length of index (163520)

In [44]:
test_df['event_id'].shape

(163520,)

In [59]:
# Ensure final_test_predictions has the correct length
num_test_events = len(test_df['event_id'].unique())

if len(test_predictions) != num_test_events:
    print(f"Warning: Expected {num_test_events} predictions, but got {len(test_predictions)}")
    test_predictions = test_predictions[:num_test_events]  # Trim if necessary


In [None]:
sample_submission = pd.read_csv('SampleSubmission (2).csv')
sample_submission.head()

In [62]:
# Save predictions
submission_df = pd.DataFrame({'event_id': sample_submission['event_id'].unique(), 'prediction': test_predictions.flatten()})
submission_df.to_csv('predictions.csv', index=False)

print("Predictions saved to predictions.csv")


ValueError: All arrays must be of the same length