In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# ---------------- Paths ----------------
# Input CSV and the folder that receives every file written by this script.
csv_path = r"C:\Users\sagni\Downloads\Flood Detection\archive (1)\modis_flood_features_paling cleaning (1).csv"
output_dir = r"C:\Users\sagni\Downloads\Flood Detection"

# ---------------- Load Dataset ----------------
df = pd.read_csv(csv_path)

print("Dataset Shape:", df.shape)
print("Columns:", list(df.columns))

# ---------------- Save Dataset in Multiple Formats ----------------
# Resolve every destination first, then write the copies in the same order:
# HDF5, pickle, YAML (one mapping per row), tab-separated text.
h5_path = os.path.join(output_dir, "flood_data.h5")
pkl_path = os.path.join(output_dir, "flood_data.pkl")
yaml_path = os.path.join(output_dir, "flood_data.yaml")
txt_path = os.path.join(output_dir, "flood_data.txt")

df.to_hdf(h5_path, key="flood_data", mode="w")
df.to_pickle(pkl_path)

row_records = df.to_dict(orient="records")
with open(yaml_path, "w") as yaml_file:
    yaml.dump(row_records, yaml_file, default_flow_style=False)

df.to_csv(txt_path, sep="\t", index=False)

print("✅ Dataset saved in H5, PKL, YAML, and TXT formats.")

# ---------------- Data Preparation ----------------
target_column = "target"
# Every column except the label and the date; lat/lon stay in this list for
# reference but are removed from the model inputs on the next line.
feature_columns = [col for col in df.columns if col not in [target_column, "date"]]
X = df[feature_columns].drop(columns=["lat", "lon"])  # remove lat/lon from model training
y = df[target_column]

# Guard against target leakage: a feature that matches the label row for row
# lets the classifier score perfectly by copying it, which makes the reported
# accuracy meaningless. Such columns are excluded before splitting.
# NOTE(review): the 'flooded' column looks like a candidate - confirm how
# 'target' is derived from it.
leaky_columns = [col for col in X.columns if X[col].eq(y).all()]
if leaky_columns:
    print(f"⚠️ Dropping features identical to '{target_column}': {leaky_columns}")
    X = X.drop(columns=leaky_columns)

# Store lat/lon separately for mapping later
coords = df[["lat", "lon"]]

# ---------------- Train-Test Split ----------------
# coords is split together with X and y so that coords_test stays row-aligned
# with the predictions made on X_test.
X_train, X_test, y_train, y_test, coords_train, coords_test = train_test_split(
    X, y, coords, test_size=0.2, random_state=42
)

# ---------------- Train Model ----------------
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# ---------------- Predictions & Accuracy ----------------
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Final Accuracy: {accuracy:.4f}")

# ---------------- Heatmap ----------------
# Pairwise correlation of the model input columns, written to heatmap.png.
plt.figure(figsize=(12, 8))
sns.heatmap(X.corr(), annot=False, cmap="coolwarm")
plt.title("Feature Correlation Heatmap", fontsize=16)
heatmap_path = os.path.join(output_dir, "heatmap.png")
plt.savefig(heatmap_path, dpi=300)
plt.close()

# ---------------- Accuracy Graph ----------------
# For each fraction, train on that share of the data and score on the rest.
train_sizes = [0.1, 0.2, 0.4, 0.6, 0.8]
acc_scores = []
for size in train_sizes:
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=size, random_state=42)
    # Fit an unfitted copy with the same hyperparameters, so `model` keeps the
    # classifier trained on the main split instead of being overwritten by
    # whichever training size happens to run last.
    size_model = clone(model)
    size_model.fit(X_tr, y_tr)
    y_pred_temp = size_model.predict(X_te)
    acc_scores.append(accuracy_score(y_te, y_pred_temp))

plt.figure(figsize=(8, 6))
plt.plot(train_sizes, acc_scores, marker="o", color="b", linewidth=2)
plt.xlabel("Training Size")
plt.ylabel("Accuracy")
plt.title("Accuracy vs Training Size")
plt.grid(True)
accuracy_graph_path = os.path.join(output_dir, "accuracy_graph.png")
plt.savefig(accuracy_graph_path, dpi=300)
plt.close()

print(f"✅ Heatmap saved at: {heatmap_path}")
print(f"✅ Accuracy graph saved at: {accuracy_graph_path}")

# ---------------- Extract Flooded Locations ----------------
# y_pred is positionally aligned with coords_test (both come from the same
# split), so a boolean mask over the predictions selects the matching rows.
is_flooded = y_pred == 1
flooded_coords = coords_test.loc[is_flooded].reset_index(drop=True)

# Save to CSV
flooded_coords_path = os.path.join(output_dir, "flooded_locations.csv")
flooded_coords.to_csv(flooded_coords_path, index=False)
print(f"✅ Flooded locations saved at: {flooded_coords_path}")

# Show top 10 flooded locations
print("\n🔍 Sample Flooded Locations:")
print(flooded_coords.head(10))


Dataset Shape: (1025801, 16)
Columns: ['date', 'lon', 'lat', 'flooded', 'jrc_perm_water', 'precip_1d', 'precip_3d', 'NDVI', 'NDWI', 'landcover', 'elevation', 'slope', 'aspect', 'upstream_area', 'TWI', 'target']
✅ Dataset saved in H5, PKL, YAML, and TXT formats.
✅ Final Accuracy: 1.0000
✅ Heatmap saved at: C:\Users\sagni\Downloads\Flood Detection\heatmap.png
✅ Accuracy graph saved at: C:\Users\sagni\Downloads\Flood Detection\accuracy_graph.png
✅ Flooded locations saved at: C:\Users\sagni\Downloads\Flood Detection\flooded_locations.csv

🔍 Sample Flooded Locations:
        lat         lon
0 -5.804240  120.460711
1 -4.036804  119.986850
2 -3.848158  119.845365
3 -4.104178  120.325964
4 -4.881221  119.557904
5 -4.090703  120.013799
6 -2.635432  121.035633
7 -3.780784  119.508497
8 -2.599500  121.410679
9 -4.187272  120.375371
