In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# 📁 Path to your directory with CSV files
input_dir = "Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2021_22/AEZ_6/"

# Files this script writes back into input_dir. They must never be read as
# sources, otherwise a second run would split (and pool) its own outputs.
COMBINED_OUTPUTS = ("combined_train.csv", "combined_test.csv")
SPLIT_SUFFIXES = ("_train.csv", "_test.csv")


def select_source_csvs(names):
    """Return the source CSV file names from *names*, sorted.

    A name is skipped when it is one of the combined outputs, or when it
    ends in ``_train.csv`` / ``_test.csv`` and the file it would have been
    derived from (same base name + ``.csv``) is also present in *names*.
    Non-CSV names are ignored.

    Sorting makes the order of the pooled rows — and therefore the seeded
    combined split — independent of the order ``os.listdir`` happens to
    return on a given filesystem.
    """
    csv_names = {name for name in names if name.endswith(".csv")}
    sources = []
    for name in sorted(csv_names):
        if name in COMBINED_OUTPUTS:
            continue
        derived = any(
            name.endswith(suffix) and name[: -len(suffix)] + ".csv" in csv_names
            for suffix in SPLIT_SUFFIXES
        )
        if not derived:
            sources.append(name)
    return sources


def split_by_crop(df, test_size=0.2, random_state=42, min_group_size=5):
    """Split *df* into train/test separately within each ``Crop_Name`` group.

    Groups with fewer than *min_group_size* rows are placed entirely in
    train. Returns ``(train_df, test_df)`` with fresh 0..n-1 indexes; either
    frame may be empty (columns preserved) when no group contributes to it.

    NOTE: ``groupby`` omits rows whose ``Crop_Name`` is missing (pandas
    default ``dropna=True``), so such rows appear in neither output.
    """
    train_parts, test_parts = [], []
    for _, group in df.groupby("Crop_Name"):
        if len(group) >= min_group_size:
            train, test = train_test_split(
                group, test_size=test_size, random_state=random_state
            )
            train_parts.append(train)
            test_parts.append(test)
        else:
            # Too small to split meaningfully: keep every row for training.
            train_parts.append(group)

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # still carries the columns (the saved CSV keeps its header row).
    empty = df.iloc[0:0]
    train_df = pd.concat(train_parts) if train_parts else empty
    test_df = pd.concat(test_parts) if test_parts else empty
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)


def split_combined(combined_df, test_size=0.2, random_state=42):
    """Split the pooled rows 80/20, stratified by ``Crop_Name``.

    Stratified splitting requires at least two rows per class. Rows whose
    crop occurs only once (or whose ``Crop_Name`` is missing) are therefore
    assigned to train, mirroring how the per-file split treats tiny groups;
    the remaining rows are split with stratification.

    Returns ``(combined_train, combined_test)``.
    """
    crops = combined_df["Crop_Name"]
    stratifiable = crops.map(crops.value_counts()) >= 2
    if stratifiable.all():
        train, test = train_test_split(
            combined_df,
            test_size=test_size,
            random_state=random_state,
            stratify=crops,
        )
        return train, test

    splittable = combined_df[stratifiable]
    train, test = train_test_split(
        splittable,
        test_size=test_size,
        random_state=random_state,
        stratify=splittable["Crop_Name"],
    )
    return pd.concat([train, combined_df[~stratifiable]]), test


# The guard holds when this runs as a notebook cell or as a script, so the
# results below are still bound as globals for later cells.
if __name__ == "__main__":
    # 📂 List of all source CSV files (sorted, earlier outputs excluded)
    csv_files = select_source_csvs(os.listdir(input_dir))
    if not csv_files:
        raise FileNotFoundError(f"No source CSV files found in {input_dir!r}")

    # 🔄 Iterate through files
    frames = []
    for file in csv_files:
        df = pd.read_csv(os.path.join(input_dir, file))

        # 🧪 80-20 split per Crop_Name group
        train_df, test_df = split_by_crop(df)

        # 💾 Save split files
        base_name = os.path.splitext(file)[0]
        train_df.to_csv(os.path.join(input_dir, f"{base_name}_train.csv"), index=False)
        test_df.to_csv(os.path.join(input_dir, f"{base_name}_test.csv"), index=False)

        # 📥 Collect for the combined dataset (concatenated once, after the loop)
        frames.append(df)

    # 🧺 Combined data from every source file
    combined_df = pd.concat(frames, ignore_index=True)

    # 🧪 Combined 80-20 split across all files stratified by Crop_Name
    combined_train, combined_test = split_combined(combined_df)

    # 💾 Save combined splits
    combined_train.to_csv(os.path.join(input_dir, "combined_train.csv"), index=False)
    combined_test.to_csv(os.path.join(input_dir, "combined_test.csv"), index=False)

    print("✅ Done: Per-file and combined splits saved.")


✅ Done: Per-file and combined splits saved.
