In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import yaml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ---------------- Paths ----------------
# The defaults are the locations used on the original author's machine. Set the
# FLOOD_CSV_PATH and/or FLOOD_OUTPUT_DIR environment variables to run this
# notebook elsewhere without editing the cell; unset or empty values fall back
# to the defaults.
csv_path = os.environ.get("FLOOD_CSV_PATH") or (
    r"C:\Users\sagni\Downloads\Flood Detection\archive (1)\modis_flood_features_paling cleaning (1).csv"
)
output_dir = os.environ.get("FLOOD_OUTPUT_DIR") or (
    r"C:\Users\sagni\Downloads\Flood Detection"
)

# Every artifact written below goes into output_dir, so create it up front
# rather than failing part-way through the exports. No-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

# ---------------- Load Dataset ----------------
# Read the whole CSV into memory; every later section works from this frame.
df = pd.read_csv(csv_path)

# Quick orientation: (rows, columns), then the column names in file order.
dataset_shape = df.shape
column_names = list(df.columns)
print("Dataset Shape:", dataset_shape)
print("Columns:", column_names)

# ---------------- Save Dataset in Multiple Formats ----------------
# Resolve the four destinations first; the writes below then run in the same
# order as before (HDF5, pickle, YAML, tab-separated text).
h5_path = os.path.join(output_dir, "flood_data.h5")
pkl_path = os.path.join(output_dir, "flood_data.pkl")
yaml_path = os.path.join(output_dir, "flood_data.yaml")
txt_path = os.path.join(output_dir, "flood_data.txt")

# HDF5: the frame is stored under the key "flood_data"; mode="w" opens the
# file for writing.
df.to_hdf(h5_path, key="flood_data", mode="w")

# Pickle: pandas' native binary dump of the frame.
df.to_pickle(pkl_path)

# YAML: one mapping per row, written in block (non-flow) style. The row
# records are built inline so the large intermediate list is not kept alive.
with open(yaml_path, "w") as yaml_file:
    yaml.dump(
        df.to_dict(orient="records"),
        stream=yaml_file,
        default_flow_style=False,
    )

# TXT: tab-separated values, without the row index.
df.to_csv(txt_path, sep="\t", index=False)

print("✅ Dataset saved in H5, PKL, YAML, and TXT formats.")

# ---------------- Data Preparation ----------------
# Split the frame into the label (`y`) and the feature matrix (`X`).
target_column = "target"
y = df[target_column]
# Drop the label itself and the non-numeric date column from the features.
X = df.drop(columns=[target_column, "date"])

# Target-leakage guard: a feature whose values equal the label row-for-row lets
# the model "predict" by copying, which makes the reported accuracy meaningless.
# NOTE(review): the recorded run shows accuracy 1.0000 with a `flooded` column
# among the features -- presumably that column duplicates `target`; confirm
# against the dataset description.
leaky_columns = [
    col
    for col in X.select_dtypes(include=["number", "bool"]).columns
    if X[col].eq(y).all()
]
if leaky_columns:
    print(
        f"⚠️ Dropping features identical to '{target_column}' "
        f"(target leakage): {leaky_columns}"
    )
    X = X.drop(columns=leaky_columns)

# ---------------- Train-Test Split ----------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# ---------------- Train Model ----------------
# 100-tree random forest, seeded for repeatability; n_jobs=-1 uses every
# available core. `fit` returns the estimator itself, so it can be chained.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# ---------------- Predictions & Accuracy ----------------
# Score the held-out rows and report the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"✅ Final Accuracy: {accuracy:.4f}")

# ---------------- Heatmap ----------------
# Pairwise correlation of the feature columns, drawn on an explicitly created
# figure/axes pair instead of pyplot's implicit "current figure".
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(X.corr(), annot=False, cmap="coolwarm", ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Heatmap", fontsize=16)

# Write the PNG, then close the figure so it is neither rendered inline nor
# kept in memory.
heatmap_path = os.path.join(output_dir, "heatmap.png")
heatmap_fig.savefig(heatmap_path, dpi=300)
plt.close(heatmap_fig)

# ---------------- Accuracy Graph ----------------
# Learning-curve style plot: accuracy as a function of the training fraction.
# Each fraction gets its own split, so the test rows are whatever is left over
# for that fraction; the scores are therefore not measured on one common
# hold-out set.
train_sizes = [0.1, 0.2, 0.4, 0.6, 0.8]
acc_scores = []

for size in train_sizes:
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, train_size=size, random_state=42
    )
    # Fit a fresh, unfitted forest with the same hyper-parameters as `model`.
    # Refitting `model` itself here would overwrite the estimator evaluated
    # above, and later sections would silently use whichever fit ran last.
    curve_model = RandomForestClassifier(**model.get_params(deep=False))
    curve_model.fit(X_tr, y_tr)
    y_pred_temp = curve_model.predict(X_te)
    acc_scores.append(accuracy_score(y_te, y_pred_temp))

# Draw on an explicit figure/axes pair so the plot cannot land on a stale figure.
curve_fig, curve_ax = plt.subplots(figsize=(8, 6))
curve_ax.plot(train_sizes, acc_scores, marker="o", color="b", linewidth=2)
curve_ax.set_xlabel("Training Size")
curve_ax.set_ylabel("Accuracy")
curve_ax.set_title("Accuracy vs Training Size")
curve_ax.grid(True)
accuracy_graph_path = os.path.join(output_dir, "accuracy_graph.png")
curve_fig.savefig(accuracy_graph_path, dpi=300)
plt.close(curve_fig)

print(f"✅ Heatmap saved at: {heatmap_path}")
print(f"✅ Accuracy graph saved at: {accuracy_graph_path}")

# ---------------- Sample Prediction ----------------
# Run the current `model` on the first five rows of the held-out features.
sample_input = X_test.head(5)
sample_prediction = model.predict(sample_input)

# Attach the predictions as a new column; `assign` returns a new frame, so
# `sample_input` itself is left untouched.
prediction_df = sample_input.assign(Predicted_Flood=sample_prediction)
print("\n🔍 Sample Predictions:")
print(prediction_df)

# Persist the same table as CSV (the row index is not written).
predictions_path = os.path.join(output_dir, "sample_predictions.csv")
prediction_df.to_csv(predictions_path, index=False)
print(f"✅ Sample predictions saved at: {predictions_path}")


Dataset Shape: (1025801, 16)
Columns: ['date', 'lon', 'lat', 'flooded', 'jrc_perm_water', 'precip_1d', 'precip_3d', 'NDVI', 'NDWI', 'landcover', 'elevation', 'slope', 'aspect', 'upstream_area', 'TWI', 'target']
✅ Dataset saved in H5, PKL, YAML, and TXT formats.
✅ Final Accuracy: 1.0000
✅ Heatmap saved at: C:\Users\sagni\Downloads\Flood Detection\heatmap.png
✅ Accuracy graph saved at: C:\Users\sagni\Downloads\Flood Detection\accuracy_graph.png

🔍 Sample Predictions:
               lon       lat  flooded  jrc_perm_water  precip_1d   precip_3d  \
970389  120.179987 -5.543728      0.0             0.0   0.000000    0.000000   
737426  120.316980 -4.270366      0.0             0.0   9.481877   24.978163   
816063  119.405190 -5.411227      0.0             0.0  46.054840  120.380428   
433181  121.529706 -2.792638      0.0             1.0  19.889261   39.778522   
544074  119.506251 -3.372051      0.0             0.0   0.000000   12.783460   

               NDVI      NDWI  landcover  elevati