In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('/kaggle/input/flood-prediction-dataset/flood.csv')

# Preview the data
print(df.head())
print(df.columns)  # Check column names

# Check for missing values
print(df.isnull().sum())

# Handle missing values.
# Forward-fill copies the previous row's value into each gap, but it cannot
# fill a gap in the first row(s) because nothing precedes them. The trailing
# back-fill covers those leading gaps; it is a no-op when there are none.
df = df.ffill().bfill()

# Check again (a non-zero count here means a column has no values at all)
print(df.isnull().sum())

# Name of the regression target; every other column is used as a feature.
target = 'FloodProbability'
y = df[target]
X = df.drop(columns=[target])

# Reserve 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training rows.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)

# Report test-set error and goodness of fit.
print("Mean Squared Error:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("R^2 Score:", r2_score(y_true=y_test, y_pred=y_pred))

# Horizontal bar chart of the random forest's feature importances,
# sorted ascending so the most important feature is drawn at the top.
feat_importance = pd.Series(data=model.feature_importances_, index=X.columns)
feat_importance.sort_values().plot.barh(
    figsize=(10, 8),
    title='Feature Importance',
)
plt.tight_layout()
plt.show()


In [None]:
# Scatter the held-out targets against the predictions stored in y_pred.
sns.scatterplot(x=y_test, y=y_pred)

# Dashed red diagonal spanning the observed target range: points on it are
# predictions that equal the actual value.
diag = [y_test.min(), y_test.max()]
plt.plot(diag, diag, 'r--')

plt.title("Actual vs Predicted Flood Probability")
plt.xlabel("Actual Flood Probability")
plt.ylabel("Predicted Flood Probability")
plt.tight_layout()
plt.show()


In [None]:
import joblib
# Persist the fitted random forest to the working directory.
model_path = 'flood_predictor.pkl'
joblib.dump(model, model_path)
print(f"Model saved as {model_path}")


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the random forest: 3 tree counts x 3 depth limits.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
)

# Exhaustive search with 3-fold cross-validation on the training split,
# ranking candidates by R^2.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=3,
)
grid_search.fit(X_train, y_train)

print("Best Params:", grid_search.best_params_)
print("Best R^2:", grid_search.best_score_)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit an ordinary least-squares model on the training split.
linear_reg = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows.
y_pred_lr = linear_reg.predict(X_test)

# Test-set metrics for the linear model.
r2_lr = r2_score(y_true=y_test, y_pred=y_pred_lr)
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)

print(f"Linear Regression MSE: {mse_lr}")
print(f"Linear Regression R^2: {r2_lr}")


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a gradient-boosting regressor (150 trees of depth 3) on the training split.
gbr = GradientBoostingRegressor(
    random_state=42,
    max_depth=3,
    n_estimators=150,
).fit(X_train, y_train)

# Predict the held-out rows.
y_pred_gbr = gbr.predict(X_test)

# Test-set metrics for the boosted model.
r2_gbr = r2_score(y_true=y_test, y_pred=y_pred_gbr)
mse_gbr = mean_squared_error(y_true=y_test, y_pred=y_pred_gbr)

print(f"Gradient Boosting MSE: {mse_gbr}")
print(f"Gradient Boosting R^2: {r2_gbr}")


In [None]:
# Recap of the test-set metrics computed above for both models.
for label, value in (
    ("Linear Regression R²:", r2_lr),
    ("Linear Regression MSE:", mse_lr),
    ("Gradient Boosting R²:", r2_gbr),
    ("Gradient Boosting MSE:", mse_gbr),
):
    print(label, value)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full feature matrix and target.
# Linear regression first...
cv_scores_lr = cross_val_score(estimator=linear_reg, X=X, y=y, cv=5)
print("Linear Regression Cross-Validation Scores:", cv_scores_lr)

# ...then gradient boosting, with the same fold count.
cv_scores_gbr = cross_val_score(estimator=gbr, X=X, y=y, cv=5)
print("Gradient Boosting Cross-Validation Scores:", cv_scores_gbr)


In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with alpha=1.0, fitted on the training split.
ridge_reg = Ridge(alpha=1.0).fit(X_train, y_train)

# 5-fold cross-validation scores over the full feature matrix and target.
cv_scores_ridge = cross_val_score(estimator=ridge_reg, X=X, y=y, cv=5)
print(f"Ridge Regression Cross-Validation Scores: {cv_scores_ridge}")


In [None]:
# Score both fitted linear models on the held-out split, then report them.
test_score_lr = linear_reg.score(X=X_test, y=y_test)
test_score_ridge = ridge_reg.score(X=X_test, y=y_test)

print(f"Linear Regression Test Set R²: {test_score_lr}")
print(f"Ridge Regression Test Set R²: {test_score_ridge}")


In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the ridge model's predictions on the held-out split.
test_preds_ridge = ridge_reg.predict(X_test)
test_mse_ridge = mean_squared_error(y_true=y_test, y_pred=test_preds_ridge)
print(f"Ridge Regression Test Set MSE: {test_mse_ridge}")


In [None]:
# Lasso is imported here because no earlier cell imports it; without this
# line the cell raises NameError when the notebook is run top to bottom.
from sklearn.linear_model import Lasso

# Adjust alpha for Lasso Regression
lasso_reg = Lasso(alpha=0.01)
lasso_reg.fit(X_train, y_train)


In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Train a fresh Ridge model (alpha=1.0) on the training split.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)

# Predict the held-out rows with it.
test_preds_ridge = ridge_model.predict(X_test)

# Report the test-set mean squared error.
test_mse_ridge = mean_squared_error(y_true=y_test, y_pred=test_preds_ridge)
print(f"Ridge Regression Test Set MSE: {test_mse_ridge}")

# Actual vs predicted scatter with a dashed red diagonal marking the ideal
# case where the prediction equals the actual value.
plt.figure(figsize=(8, 5))
plt.scatter(y_test, test_preds_ridge, alpha=0.6, color='blue')
diag = [y_test.min(), y_test.max()]
plt.plot(diag, diag, 'r--')
plt.title("Actual vs Predicted (Ridge Regression)")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.grid(True)
plt.show()


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a second gradient-boosting model with 150 trees and no depth limit.
# NOTE(review): max_depth=None differs from the max_depth=3 model trained
# earlier in this notebook; confirm unbounded trees are intended, since
# training may take much longer.
gbr_model = GradientBoostingRegressor(
    random_state=42,
    max_depth=None,
    n_estimators=150,
).fit(X_train, y_train)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Importances from the fitted gradient-boosting model, paired with column names.
features = X_train.columns
feature_importances = gbr_model.feature_importances_

# Column positions ordered from most to least important.
indices = np.argsort(feature_importances)[::-1]
positions = range(len(features))

# Vertical bar chart, tallest bar first, with slanted tick labels.
plt.figure(figsize=(10, 6))
plt.bar(positions, feature_importances[indices], align="center")
plt.xticks(positions, features[indices], rotation=45, ha='right')
plt.title("Feature Importance - Gradient Boosting")
plt.tight_layout()
plt.show()


In [None]:
# Held-out predictions from gbr_model.
# Note: this rebinds y_pred, which previously held the random forest's predictions.
y_pred = gbr_model.predict(X_test)

# Actual vs predicted scatter with a dashed red reference diagonal.
import matplotlib.pyplot as plt
plt.scatter(y_test, y_pred, alpha=0.6)
diag = [y_test.min(), y_test.max()]
plt.plot(diag, diag, 'r--')
plt.title("Actual vs Predicted - Gradient Boosting")
plt.xlabel("Actual Flood Risk")
plt.ylabel("Predicted Flood Risk")
plt.show()


In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with alpha=0.1, fitted on the training split.
# Note: this rebinds lasso_reg, replacing the alpha=0.01 model fitted earlier.
lasso_reg = Lasso(alpha=0.1).fit(X_train, y_train)

# 5-fold cross-validation scores over the full feature matrix and target.
cv_scores_lasso = cross_val_score(estimator=lasso_reg, X=X, y=y, cv=5)
print(f"Lasso Regression Cross-Validation Scores: {cv_scores_lasso}")


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for gradient boosting (3 x 3 x 3 = 27 combinations).
# Note: this cell rebinds param_grid, gbr and grid_search, replacing the
# objects that earlier cells created under the same names.
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}

# Base estimator for the search. The seed matches the random_state=42 used by
# the other models in this notebook, so repeated runs give the same search result.
gbr = GradientBoostingRegressor(random_state=42)

# 5-fold grid search on the training split; candidates are ranked by
# negative mean squared error (higher is better).
grid_search = GridSearchCV(gbr, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Evaluate the refitted best model on the held-out split. The regressor's
# .score() returns R^2, so this number is not on the same scale as the
# MSE-based score used to select the parameters.
best_gbr = grid_search.best_estimator_
best_gbr_score = best_gbr.score(X_test, y_test)
print(f"Best Gradient Boosting Test Set R²: {best_gbr_score}")


In [None]:
# R² of the fitted linear model on the held-out split
# (the same computation appears in an earlier cell).
test_score_lr = linear_reg.score(X=X_test, y=y_test)
print(f"Linear Regression Test Set R²: {test_score_lr}")
