In [1]:
import pandas as pd

In [2]:
train_data = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [4]:
test_data = pd.read_csv("/kaggle/input/playground-series-s4e5/test.csv")

In [5]:
X = train_data.drop(columns=['FloodProbability'])
y = train_data['FloodProbability']

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Support Vector Machine": SVR(),
    "XGBoost": XGBRegressor(),
    "LightGBM": LGBMRegressor(),
    "CatBoost": CatBoostRegressor()
}

best_model = None
best_rmse = float('inf')

# Train and evaluate models
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)
    y_pred_train = model.predict(X_train_scaled)
    y_pred_val = model.predict(X_val_scaled)
    
    val_rmse = mean_squared_error(y_val, y_pred_val, squared=False)
    
    print(f"Validation RMSE: {val_rmse:.2f}")
    print()
    
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        best_model = model

# Make predictions on test data using the best model and save to CSV
print(f"Making predictions using the best model...")
predictions = best_model.predict(test_data_scaled)
submission = pd.DataFrame({'id': test_data['id'], 'FloodProbability': predictions})
submission.to_csv("best_model_submission.csv", index=False)
print(f"Predictions saved for the best model")

Training Linear Regression...
Validation RMSE: 0.02

Training Decision Tree...
Validation RMSE: 0.05

Training Gradient Boosting...
Validation RMSE: 0.03

Training Support Vector Machine...
Validation RMSE: 0.03

Training XGBoost...
Validation RMSE: 0.02

Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.152583 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 21
[LightGBM] [Info] Start training from score 0.504480
Validation RMSE: 0.02

Training CatBoost...
Learning rate set to 0.119817
0:	learn: 0.0501810	total: 184ms	remaining: 3m 4s
1:	learn: 0.0493271	total: 302ms	remaining: 2m 30s
2:	learn: 0.0484614	total: 419ms	remaining: 2m 19s
3:	learn: 0.0476903	total: 532ms	remaining: 2m 12s
4:	learn: 0.0469429	total: 640ms	remaining: 2m 7s
5:	learn: 0.0462105	total: 750ms	remaining: 2m 4s
6:	l

In [6]:
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

2024-05-05 06:04:18.326394: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-05 06:04:18.326512: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-05 06:04:18.464909: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1)
])

# Compile the model
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_data=(X_val_scaled, y_val), verbose=1)

# Evaluate the model


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m  101/27949[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m42s[0m 2ms/step - loss: 0.2124

I0000 00:00:1714889073.885719      85 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m27949/27949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2ms/step - loss: 0.0067 - val_loss: 4.0523e-04
Epoch 2/50
[1m27949/27949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - loss: 4.3941e-04 - val_loss: 4.0450e-04
Epoch 3/50
[1m27949/27949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - loss: 4.2892e-04 - val_loss: 4.5228e-04
Epoch 4/50
[1m27949/27949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - loss: 4.2628e-04 - val_loss: 4.3166e-04
Epoch 5/50
[1m27949/27949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - loss: 4.2679e-04 - val_loss: 4.3757e-04
Epoch 6/50
[1m27949/27949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - loss: 4.2660e-04 - val_loss: 4.4419e-04
Epoch 7/50
[1m27949/27949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - loss: 4.2356e-04 - val_loss: 4.5740e-04
Epoch 8/50
[1m27949/27949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 2m

In [8]:
# train_loss = model.evaluate(X_train_scaled, y_train, verbose=0)
# val_loss = model.evaluate(X_val_scaled, y_val, verbose=0)
# print(f"Train loss: {train_loss:.4f}")
# print(f"Validation loss: {val_loss:.4f}")

# Make predictions on test data
predictions = model.predict(test_data_scaled)

# Save predictions to CSV
submission = pd.DataFrame({'id': test_data['id'], 'FloodProbability': predictions.flatten()})
submission.to_csv("deep_learning_submission.csv", index=False)
print("Predictions saved for the deep learning model")

[1m23291/23291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 1ms/step
Predictions saved for the deep learning model


In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5]
}

# Initialize CatBoostRegressor
catboost = CatBoostRegressor(verbose=0)

# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=catboost, param_grid=param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Initialize CatBoost with the best parameters
best_catboost = CatBoostRegressor(**best_params, verbose=0)

# Train the model
best_catboost.fit(X_train, y_train)

# Evaluate the model
train_rmse = mean_squared_error(y_train, best_catboost.predict(X_train), squared=False)
val_rmse = mean_squared_error(y_val, best_catboost.predict(X_val), squared=False)
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")

# Make predictions on test data
predictions = best_catboost.predict(test_data)

# Save predictions to CSV
submission = pd.DataFrame({'id': test_data['id'], 'FloodProbability': predictions})
submission.to_csv("best_catboost_submission.csv", index=False)
print("Predictions saved for the best CatBoost model")

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END depth=6, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   8.0s
[CV] END depth=6, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   8.4s
[CV] END depth=6, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   8.0s
[CV] END depth=6, iterations=100, l2_leaf_reg=1, learning_rate=0.05; total time=   6.9s
[CV] END depth=6, iterations=100, l2_leaf_reg=1, learning_rate=0.05; total time=   8.1s
[CV] END depth=6, iterations=100, l2_leaf_reg=1, learning_rate=0.05; total time=   7.5s
[CV] END depth=6, iterations=100, l2_leaf_reg=1, learning_rate=0.1; total time=   6.6s
[CV] END depth=6, iterations=100, l2_leaf_reg=1, learning_rate=0.1; total time=   6.5s
[CV] END depth=6, iterations=100, l2_leaf_reg=1, learning_rate=0.1; total time=   6.5s
[CV] END depth=6, iterations=100, l2_leaf_reg=3, learning_rate=0.01; total time=   9.0s
[CV] END depth=6, iterations=100, l2_leaf_reg=3, learning_rat