In [None]:
import pandas as pd

In [None]:
train_data = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
test_data = pd.read_csv("/kaggle/input/playground-series-s4e5/test.csv")

In [None]:
X = train_data.drop(columns=['FloodProbability'])
y = train_data['FloodProbability']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Support Vector Machine": SVR(),
    "XGBoost": XGBRegressor(),
    "LightGBM": LGBMRegressor(),
    "CatBoost": CatBoostRegressor()
}

best_model = None
best_rmse = float('inf')

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)
    y_pred_train = model.predict(X_train_scaled)
    y_pred_val = model.predict(X_val_scaled)
    
    val_rmse = mean_squared_error(y_val, y_pred_val, squared=False)
    
    print(f"Validation RMSE: {val_rmse:.2f}")
    print()
    
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        best_model = model

print(f"Making predictions using the best model...")
predictions = best_model.predict(test_data_scaled)
submission = pd.DataFrame({'id': test_data['id'], 'FloodProbability': predictions})
submission.to_csv("best_model_submission.csv", index=False)
print(f"Predictions saved for the best model")

In [None]:
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1)
])

model.compile(optimizer=Adam(), loss='mean_squared_error')

history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_data=(X_val_scaled, y_val), verbose=1)



In [None]:

predictions = model.predict(test_data_scaled)

submission = pd.DataFrame({'id': test_data['id'], 'FloodProbability': predictions.flatten()})
submission.to_csv("deep_learning_submission.csv", index=False)
print("Predictions saved for the deep learning model")

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5]
}

# Initialize CatBoostRegressor
catboost = CatBoostRegressor(verbose=0)

# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=catboost, param_grid=param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Initialize CatBoost with the best parameters
best_catboost = CatBoostRegressor(**best_params, verbose=0)

# Train the model
best_catboost.fit(X_train, y_train)

# Evaluate the model
train_rmse = mean_squared_error(y_train, best_catboost.predict(X_train), squared=False)
val_rmse = mean_squared_error(y_val, best_catboost.predict(X_val), squared=False)
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")

# Make predictions on test data
predictions = best_catboost.predict(test_data)

# Save predictions to CSV
submission = pd.DataFrame({'id': test_data['id'], 'FloodProbability': predictions})
submission.to_csv("best_catboost_submission.csv", index=False)
print("Predictions saved for the best CatBoost model")

In [48]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [49]:
df = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv')
df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [50]:
df.isna().sum()

id                                 0
MonsoonIntensity                   0
TopographyDrainage                 0
RiverManagement                    0
Deforestation                      0
Urbanization                       0
ClimateChange                      0
DamsQuality                        0
Siltation                          0
AgriculturalPractices              0
Encroachments                      0
IneffectiveDisasterPreparedness    0
DrainageSystems                    0
CoastalVulnerability               0
Landslides                         0
Watersheds                         0
DeterioratingInfrastructure        0
PopulationScore                    0
WetlandLoss                        0
InadequatePlanning                 0
PoliticalFactors                   0
FloodProbability                   0
dtype: int64

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117957 entries, 0 to 1117956
Data columns (total 22 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   id                               1117957 non-null  int64  
 1   MonsoonIntensity                 1117957 non-null  int64  
 2   TopographyDrainage               1117957 non-null  int64  
 3   RiverManagement                  1117957 non-null  int64  
 4   Deforestation                    1117957 non-null  int64  
 5   Urbanization                     1117957 non-null  int64  
 6   ClimateChange                    1117957 non-null  int64  
 7   DamsQuality                      1117957 non-null  int64  
 8   Siltation                        1117957 non-null  int64  
 9   AgriculturalPractices            1117957 non-null  int64  
 10  Encroachments                    1117957 non-null  int64  
 11  IneffectiveDisasterPreparedness  1117957 non-null 

In [52]:
for column in df.columns : 
    print(column)
    print(df[column].unique())
    print("---------------------------------------------------------------")

id
[      0       1       2 ... 1117954 1117955 1117956]
---------------------------------------------------------------
MonsoonIntensity
[ 5  6  3  8  4  7  9  2 10  1  0 11 12 13 15 14 16]
---------------------------------------------------------------
TopographyDrainage
[ 8  7  5  4  3  6  2  1  9 10 12  0 11 14 13 16 15 17 18]
---------------------------------------------------------------
RiverManagement
[ 5  4  6  2  1  8  3  0  9  7 10 11 12 15 13 14 16]
---------------------------------------------------------------
Deforestation
[ 8  4  7  5  6  2  3  9  0 10  1 13 11 12 14 15 16 17]
---------------------------------------------------------------
Urbanization
[ 6  8  3  4  2  5 10  7  9 11  1  0 12 13 16 14 15 17]
---------------------------------------------------------------
ClimateChange
[ 4  8  7  5  6  3  2  1  0 10  9 12 11 13 14 15 16 17]
---------------------------------------------------------------
DamsQuality
[ 4  3  1  6  2  5  8  7  9 12 11 10  0 14 13 15 16]
----

In [53]:
test_df = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv')
test_df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,1117957,4,6,3,5,6,7,8,7,8,...,8,5,7,5,6,3,6,4,4,5
1,1117958,4,4,2,9,5,5,4,7,5,...,2,4,7,4,5,1,7,4,4,3
2,1117959,1,3,6,5,7,2,4,6,4,...,7,9,2,5,5,2,3,6,8,3
3,1117960,2,4,4,6,4,5,4,3,4,...,7,8,4,6,7,6,4,2,4,4
4,1117961,6,3,2,4,6,4,5,5,3,...,4,3,2,6,4,6,8,4,5,5


In [54]:

for column in df.columns : 
    if column !='FloodProbability':
        df[column] = (df[column]-df[column].min())/(df[column].max()-df[column].min())

In [55]:
df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0.0,0.3125,0.444444,0.3125,0.470588,0.352941,0.235294,0.25,0.1875,0.1875,...,0.294118,0.176471,0.1875,0.3125,0.235294,0.388889,0.263158,0.4375,0.1875,0.445
1,8.944896e-07,0.375,0.388889,0.25,0.235294,0.470588,0.470588,0.1875,0.3125,0.25,...,0.411765,0.117647,0.0,0.1875,0.294118,0.166667,0.157895,0.25,0.1875,0.45
2,1.788979e-06,0.375,0.277778,0.375,0.411765,0.176471,0.411765,0.0625,0.3125,0.25,...,0.411765,0.176471,0.4375,0.3125,0.352941,0.444444,0.105263,0.1875,0.1875,0.53
3,2.683469e-06,0.1875,0.222222,0.375,0.294118,0.235294,0.470588,0.25,0.4375,0.375,...,0.117647,0.235294,0.4375,0.25,0.235294,0.333333,0.263158,0.4375,0.3125,0.535
4,3.577958e-06,0.3125,0.166667,0.125,0.352941,0.235294,0.235294,0.1875,0.1875,0.1875,...,0.117647,0.117647,0.375,0.375,0.235294,0.055556,0.105263,0.1875,0.3125,0.415


In [56]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()


In [57]:
df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0.0,0.3125,0.444444,0.3125,0.470588,0.352941,0.235294,0.25,0.1875,0.1875,...,0.294118,0.176471,0.1875,0.3125,0.235294,0.388889,0.263158,0.4375,0.1875,0.445
1,8.944896e-07,0.375,0.388889,0.25,0.235294,0.470588,0.470588,0.1875,0.3125,0.25,...,0.411765,0.117647,0.0,0.1875,0.294118,0.166667,0.157895,0.25,0.1875,0.45
2,1.788979e-06,0.375,0.277778,0.375,0.411765,0.176471,0.411765,0.0625,0.3125,0.25,...,0.411765,0.176471,0.4375,0.3125,0.352941,0.444444,0.105263,0.1875,0.1875,0.53
3,2.683469e-06,0.1875,0.222222,0.375,0.294118,0.235294,0.470588,0.25,0.4375,0.375,...,0.117647,0.235294,0.4375,0.25,0.235294,0.333333,0.263158,0.4375,0.3125,0.535
4,3.577958e-06,0.3125,0.166667,0.125,0.352941,0.235294,0.235294,0.1875,0.1875,0.1875,...,0.117647,0.117647,0.375,0.375,0.235294,0.055556,0.105263,0.1875,0.3125,0.415


In [58]:
df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0.0,0.3125,0.444444,0.3125,0.470588,0.352941,0.235294,0.25,0.1875,0.1875,...,0.294118,0.176471,0.1875,0.3125,0.235294,0.388889,0.263158,0.4375,0.1875,0.445
1,8.944896e-07,0.375,0.388889,0.25,0.235294,0.470588,0.470588,0.1875,0.3125,0.25,...,0.411765,0.117647,0.0,0.1875,0.294118,0.166667,0.157895,0.25,0.1875,0.45
2,1.788979e-06,0.375,0.277778,0.375,0.411765,0.176471,0.411765,0.0625,0.3125,0.25,...,0.411765,0.176471,0.4375,0.3125,0.352941,0.444444,0.105263,0.1875,0.1875,0.53
3,2.683469e-06,0.1875,0.222222,0.375,0.294118,0.235294,0.470588,0.25,0.4375,0.375,...,0.117647,0.235294,0.4375,0.25,0.235294,0.333333,0.263158,0.4375,0.3125,0.535
4,3.577958e-06,0.3125,0.166667,0.125,0.352941,0.235294,0.235294,0.1875,0.1875,0.1875,...,0.117647,0.117647,0.375,0.375,0.235294,0.055556,0.105263,0.1875,0.3125,0.415


In [59]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['FloodProbability'])
y = df['FloodProbability']

In [60]:
X.shape

(1117957, 21)

In [61]:
df.shape

(1117957, 22)

In [62]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.3)

In [63]:
dt.fit(X_train , y_train)
y_pred = dt.predict(X_test)

In [66]:
from sklearn.metrics import r2_score , mean_squared_error
print(r2_score(y_test , y_pred))

0.033002851179616766


In [67]:
print(mean_squared_error(y_test , y_pred))

0.0025138540287666833


In [68]:
test_df

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,1117957,4,6,3,5,6,7,8,7,8,...,8,5,7,5,6,3,6,4,4,5
1,1117958,4,4,2,9,5,5,4,7,5,...,2,4,7,4,5,1,7,4,4,3
2,1117959,1,3,6,5,7,2,4,6,4,...,7,9,2,5,5,2,3,6,8,3
3,1117960,2,4,4,6,4,5,4,3,4,...,7,8,4,6,7,6,4,2,4,4
4,1117961,6,3,2,4,6,4,5,5,3,...,4,3,2,6,4,6,8,4,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745300,1863257,5,4,8,3,5,4,4,5,5,...,5,6,1,3,5,6,4,4,6,6
745301,1863258,4,4,2,12,4,3,4,3,5,...,3,7,4,4,3,5,5,3,5,4
745302,1863259,5,7,9,5,5,6,7,5,5,...,6,11,3,11,4,5,9,5,5,4
745303,1863260,4,7,6,3,5,2,3,8,6,...,6,6,8,6,2,3,8,7,5,5


In [69]:
sample = pd.read_csv('/kaggle/input/playground-series-s4e5/sample_submission.csv')

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import numpy as np

# Load data
train_data = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv')

# Separate features and target
X = train_data.drop(columns=['FloodProbability'])
y = train_data['FloodProbability']

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_data)


In [11]:
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score

# Initialize K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
lgbm_r2_scores = []

for train_index, val_index in kf.split(X_scaled):
    X_train_fold, X_val_fold = X_scaled[train_index], X_scaled[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    
    # Initialize and train LightGBM model
    lgbm = LGBMRegressor()
    lgbm.fit(X_train_fold, y_train_fold)
    
    # Predict on validation set
    lgbm_val_predictions = lgbm.predict(X_val_fold)
    lgbm_r2 = r2_score(y_val_fold, lgbm_val_predictions)
    lgbm_r2_scores.append(lgbm_r2)

print(f"LightGBM Mean Validation R²: {np.mean(lgbm_r2_scores):.2f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.151285 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 621
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 21
[LightGBM] [Info] Start training from score 0.504480
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.153427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 623
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 21
[LightGBM] [Info] Start training from score 0.504511
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.157745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 623
[LightGBM] [Info] Number of data points in the train set: 894366, number of used features: 21
[LightGBM] [Info] Start tra

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import r2_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
dl_r2_scores = []

for train_index, val_index in kf.split(X_scaled):
    X_train_fold, X_val_fold = X_scaled[train_index], X_scaled[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    
    # Build and compile the deep learning model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train_fold.shape[1],)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer=Adam(), loss='mean_squared_error')

    # Train the model
    model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=64, validation_data=(X_val_fold, y_val_fold), verbose=1)

    # Predict on validation set
    dl_val_predictions = model.predict(X_val_fold).flatten()
    dl_r2 = r2_score(y_val_fold, dl_val_predictions)
    dl_r2_scores.append(dl_r2)

print(f"Deep Learning Mean Validation R²: {np.mean(dl_r2_scores):.2f}")


In [None]:
# Combine predictions
combined_val_predictions = (lgbm_val_predictions + dl_val_predictions) / 2
combined_rmse = mean_squared_error(y_val, combined_val_predictions, squared=False)
print(f"Combined Validation RMSE: {combined_rmse:.2f}")

# Train both models on the entire training data
lgbm.fit(X, y)
model.fit(scaler.transform(X), y, epochs=50, batch_size=32, verbose=1)

# Make predictions on test data
lgbm_test_predictions = lgbm.predict(test_data_scaled)
dl_test_predictions = model.predict(test_data_scaled).flatten()

# Combine test predictions
combined_test_predictions = (lgbm_test_predictions + dl_test_predictions) / 2

# Save predictions to CSV
submission = pd.DataFrame({'id': test_data['id'], 'FloodProbability': combined_test_predictions})
submission.to_csv("combined_model_submission.csv", index=False)
print("Predictions saved for the combined model")


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import numpy as np

# Load data
train_data = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv')

# Separate features and target
X = train_data.drop(columns=['FloodProbability'])
y = train_data['FloodProbability']

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)


In [16]:
from lightgbm import LGBMRegressor

# Initialize and train LightGBM model
lgbm = LGBMRegressor()
lgbm.fit(X_train_scaled, y_train)

# Predict on validation set
lgbm_val_predictions = lgbm.predict(X_val_scaled)
lgbm_r2 = r2_score(y_val, lgbm_val_predictions)
print(f"LightGBM Validation R²: {lgbm_r2:.2f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.152620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 21
[LightGBM] [Info] Start training from score 0.504480
LightGBM Validation R²: 0.77


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Build and compile the deep learning model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1)
])
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=64, validation_data=(X_val_scaled, y_val), verbose=1)

# Predict on validation set
dl_val_predictions = model.predict(X_val_scaled).flatten()
dl_r2 = r2_score(y_val, dl_val_predictions)
print(f"Deep Learning Validation R²: {dl_r2:.2f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m13975/13975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2ms/step - loss: 0.0098 - val_loss: 3.9320e-04
Epoch 2/10
[1m13975/13975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - loss: 4.4248e-04 - val_loss: 3.8776e-04
Epoch 3/10
[1m13975/13975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - loss: 4.2254e-04 - val_loss: 3.9074e-04
Epoch 4/10
[1m13975/13975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2ms/step - loss: 4.2037e-04 - val_loss: 4.4119e-04
Epoch 5/10
[1m13975/13975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - loss: 4.1738e-04 - val_loss: 3.8642e-04
Epoch 6/10
[1m13975/13975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2ms/step - loss: 4.1644e-04 - val_loss: 4.4174e-04
Epoch 7/10
[1m13975/13975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2ms/step - loss: 4.1481e-04 - val_loss: 4.1463e-04
Epoch 8/10
[1m13975/13975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [20]:
# Combine predictions
combined_val_predictions = (lgbm_val_predictions + dl_val_predictions) / 2
combined_r2 = r2_score(y_val, combined_val_predictions)
print(f"Combined Validation R²: {combined_r2:.2f}")

# Train both models on the entire training data
lgbm.fit(X, y)
model.fit(scaler.transform(X), y, epochs=10, batch_size=32, verbose=1)

# Make predictions on test data
lgbm_test_predictions = lgbm.predict(test_data_scaled)
dl_test_predictions = model.predict(test_data_scaled).flatten()

# Combine test predictions
combined_test_predictions = (lgbm_test_predictions + dl_test_predictions) / 2

# Save predictions to CSV
submission = pd.DataFrame({'id': test_data['id'], 'FloodProbability': combined_test_predictions})
submission.to_csv("combined_model_submission.csv", index=False)
print("Predictions saved for the combined model")


Combined Validation R²: 0.82
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.201513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 602
[LightGBM] [Info] Number of data points in the train set: 1117957, number of used features: 21
[LightGBM] [Info] Start training from score 0.504480
Epoch 1/10
[1m34937/34937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1ms/step - loss: 4.2517e-04
Epoch 2/10
[1m34937/34937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1ms/step - loss: 4.2456e-04
Epoch 3/10
[1m34937/34937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1ms/step - loss: 4.2446e-04
Epoch 4/10
[1m34937/34937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1ms/step - loss: 4.2381e-04
Epoch 5/10
[1m34937/34937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1ms/step - loss: 4.2336e-04
Epoch 6/10
[1m34937/34937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor

# Load data
train_data = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv')

# Separate features and target
X = train_data.drop(columns=['FloodProbability'])
y = train_data['FloodProbability']

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 70],
    'max_depth': [6, 8, 10],
    'min_child_samples': [20, 30, 50],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0.0, 0.1, 1.0],
    'reg_lambda': [0.0, 0.1, 1.0]
}

# Initialize the model
lgbm = LGBMRegressor()

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=3, scoring='r2', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

best_lgbm = LGBMRegressor(**best_params)

# Train the model
best_lgbm.fit(X_train_scaled, y_train)

# Predict on validation set
val_predictions = best_lgbm.predict(X_val_scaled)
val_r2 = r2_score(y_val, val_predictions)
print(f"Validation R²: {val_r2:.2f}")

# Make predictions on test data
test_predictions = best_lgbm.predict(test_data_scaled)

# Save predictions to CSV
submission = pd.DataFrame({'id': test_data['id'], 'FloodProbability': test_predictions})
submission.to_csv("best_lgbm_submission.csv", index=False)
print("Predictions saved for the best LightGBM model")


Fitting 3 folds for each of 19683 candidates, totalling 59049 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.336415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 596244, number of used features: 21
[LightGBM] [Info] Start training from score 0.504488
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, min_child_samples=20, n_estimators=100, num_leaves=31, reg_alpha=0.0, reg_lambda=0.0, subsample=0.7; total time=  33.5s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.254496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 621
[LightGBM] [Info] Number of data points in the train set: 596243, number of used features: 21
[LightGBM] [Info] Start training from score 0.504496
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, m