In [1]:
# =======================
# URBAN FLOOD RISK ANALYSIS
# =======================

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
sns.set(style='whitegrid')

# 2. Load Dataset
# The CSV location is kept in one named constant so it is easy to spot and edit;
# replace it with the path to your own copy of the file.
DATA_PATH = r"C:\Users\lenovo\Desktop\urban_pluvial_flood_risk_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Show the first rows as a quick check that the load worked.
print("Dataset Preview:")
display(df.head())

# 3. Explore Data
# Column names, dtypes and non-null counts (df.info() writes straight to stdout).
print("Dataset Info:")
df.info()

# Number of missing entries in each column.
missing_counts = df.isna().sum()
print("\nMissing Values:")
print(missing_counts)

# Summary statistics as produced by describe().
print("\nStatistics:")
stats_table = df.describe()
display(stats_table)

# 4. Handle Missing Values
# Numeric gaps are filled with each column's median; gaps in object (text) columns
# are filled with each column's most frequent value.
num_cols = df.select_dtypes(include=np.number).columns
if len(num_cols) > 0:  # nothing to fill when the frame has no numeric columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

cat_cols = df.select_dtypes(include='object').columns
if len(cat_cols) > 0:
    cat_modes = df[cat_cols].mode()
    # mode() returns zero rows when every object column is entirely NaN, in which
    # case .iloc[0] would raise IndexError; skip the fill instead of crashing.
    if not cat_modes.empty:
        # Row 0 holds the first mode of each column (ties resolve to the smallest value).
        df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

# 5. Encode Categorical Variables
# Each object column is replaced by integer codes. A separate fitted encoder is kept
# per column so the codes can be translated back to the original categories later,
# e.g. label_encoders[col].inverse_transform(codes) or label_encoders[col].classes_.
label_encoders = {}
le = LabelEncoder()  # keeps `le` bound even when there are no categorical columns
for col in cat_cols:
    le = LabelEncoder()  # fresh encoder per column; refitting a shared one discards earlier mappings
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 6. Correlation Heatmap
# Pairwise correlations between all columns, annotated to two decimals.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# 7. Feature Importance using Random Forest
# The target column is 'risk_labels' (adjust if your dataset names it differently);
# every other column is used as a feature.
X = df.drop('risk_labels', axis=1)
y = df['risk_labels']

# Hold out 30% of the rows; the fixed seeds keep the split and the forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Report accuracy on the held-out rows so the importance ranking below can be judged
# against how well the model actually predicts.
print(f"Hold-out accuracy: {rf.score(X_test, y_test):.3f}")

# Impurity-based importances from the fitted forest, largest first.
feat_importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Top 10 Features Contributing to Floods:")
display(feat_importances.head(10))

# 8. Plot Top Factors
# Horizontal bars for the ten largest importances, one bar per feature name.
top_ten = feat_importances.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_ten, y=top_ten.index, palette='viridis', ax=ax)
ax.set_title("Top 10 Factors Contributing to Floods")
ax.set_xlabel("Importance")
ax.set_ylabel("Factors")
plt.show()

# 9. Identify High-Risk Areas
# Rows are grouped by the 'soil_group' column (swap in another column to group differently)
# and ranked by the mean of 'risk_labels', highest first.
# NOTE(review): if 'risk_labels' was label-encoded in step 5, its codes follow sorted
# class order rather than severity - confirm the average is meaningful for your data.
area_risk = df.groupby('soil_group')['risk_labels'].mean().sort_values(ascending=False)
print("Average Flood Risk by Area Type:")
display(area_risk)  # show the computed table itself, not the column name

# After step 5 the group keys are numeric. When both x and y are numeric seaborn
# treats x as the category axis, which would contradict the axis labels below.
# Passing the keys as strings makes y the category axis and keeps the risk ordering.
area_labels = area_risk.index.astype(str)

plt.figure(figsize=(8,5))
sns.barplot(x=area_risk.values, y=area_labels, palette='magma')
plt.title("Average Flood Risk by Area Type")
plt.xlabel("Average Flood Risk")
plt.ylabel("Area Type")
plt.show()

# 10. Policy Suggestions
# The five most important features and the five highest-ranked groups feed the summary.
top_factors = feat_importances.index[:5].tolist()

print("\n📌 Key Factors Contributing to Floods:")
for factor in top_factors:
    print("-", factor)

print("\n📌 High-Risk Area Types:")
for area_name in area_risk.index[:5]:
    print("-", area_name)

# Comma-separated lists that are interpolated into the recommendation text.
focus_areas = ', '.join(str(name) for name in area_risk.index[:3])
key_factors = ', '.join(str(name) for name in top_factors)

print("\n💡 Policy Suggestions for Policymakers:")
print(f"""
1. Focus flood prevention measures on high-risk areas: {focus_areas}.
2. Improve drainage, infrastructure, and early warning systems considering key factors: {key_factors}.
3. Implement land use planning and restrict construction in the most vulnerable zones.
4. Promote public awareness and community-level flood preparedness programs.
""")

ModuleNotFoundError: No module named 'sklearn'