In [1]:
import pandas as pd
import os
import glob

In [4]:
# --- Input and Output Directory ---
# Source CSVs are read from this folder; the balanced outputs for each file
# are written into a sub-folder of this same directory, named after the
# file-name prefix.
base_dir = 'Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/Unbalanced/'

# --- Balancing Function Based on Minimum Total Count ---
def _allocate_quotas(sizes, budget):
    """Split *budget* rows across crops in proportion to their sizes.

    Uses exact integer arithmetic (largest-remainder method) so the quotas
    always add up to ``min(budget, sizes.sum())`` and no crop is asked for
    more rows than it has.

    Args:
        sizes: Series mapping crop name -> number of available rows.
        budget: Total number of rows to hand out.

    Returns:
        dict mapping crop name -> number of rows to sample, in the same
        order as ``sizes.index``.
    """
    total = int(sizes.sum())
    budget = int(budget)

    if total == 0 or budget <= 0:
        return {crop: 0 for crop in sizes.index}
    if budget >= total:
        # Not enough rows to be selective: take everything that exists.
        return {crop: int(size) for crop, size in sizes.items()}

    quotas = {}
    remainders = []
    for crop, size in sizes.items():
        quota, remainder = divmod(int(size) * budget, total)
        quotas[crop] = quota
        remainders.append((remainder, crop))

    # Flooring leaves a few rows unassigned; give one extra row to the crops
    # with the largest fractional share. The sort is stable, so ties go to
    # the crop that comes first in ``sizes``. Because budget < total here,
    # quota + 1 never exceeds the crop's size.
    leftover = budget - sum(quotas.values())
    remainders.sort(key=lambda pair: pair[0], reverse=True)
    for _, crop in remainders[:leftover]:
        quotas[crop] += 1

    return quotas


def balance_df(df, column, small_crop_fraction=0.15, random_state=42):
    """Down-sample *df* so every class of *column* has the same row count.

    The target size is the row count of the rarest class in *column*.
    Within each class, crops (``Crop_Name``) are handled in two tiers:

    * "small" crops, with fewer rows than ``small_crop_fraction`` of the
      target, are kept in full so rare crops are not lost;
    * the remaining budget is split across the "large" crops in proportion
      to their sizes and each one is randomly sampled without replacement.

    Notes:
        * Rows whose *column* value or ``Crop_Name`` is missing are not
          part of any class/crop group and are therefore not returned.
        * If the small crops of a class already exceed the target, they are
          still all kept, so that class ends up larger than the target.

    Args:
        df: Input frame; must contain *column* and ``Crop_Name``.
        column: Name of the column whose classes are balanced.
        small_crop_fraction: Fraction of the target below which a crop is
            considered small and kept entirely.
        random_state: Seed passed to ``DataFrame.sample`` for
            reproducible sampling.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. It is empty (same
        columns as *df*) when there is nothing to balance.
    """
    # Step 1: Get target total per class based on min count among column's values
    class_counts = df[column].value_counts()
    if class_counts.empty:
        return df.iloc[0:0].reset_index(drop=True)

    target_total = int(class_counts.min())
    small_threshold = target_total * small_crop_fraction

    balanced_data = []
    for value in class_counts.index:
        subset = df[df[column] == value]
        crop_names = subset['Crop_Name']
        crop_sizes = subset.groupby('Crop_Name').size()

        # Identify small and large crops
        is_small = crop_sizes < small_threshold
        small_crops = crop_sizes[is_small]
        large_crops = crop_sizes[~is_small]

        # Step 2: Add all small crops fully
        for crop, size in small_crops.items():
            if size:
                balanced_data.append(subset[crop_names == crop])

        # Step 3: Proportionally sample from large crops to fill remaining.
        # Quotas are computed with integer arithmetic: truncating a float
        # share per crop would leave the class short of the target.
        remaining = max(target_total - int(small_crops.sum()), 0)
        quotas = _allocate_quotas(large_crops, remaining)
        for crop, quota in quotas.items():
            if quota == 0:
                continue
            crop_df = subset[crop_names == crop]
            balanced_data.append(
                crop_df.sample(n=quota, random_state=random_state)
            )

    if not balanced_data:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(balanced_data, ignore_index=True)

# --- Process All CSVs in the Directory ---
# Each CSV directly under base_dir is balanced once per target column and the
# result is written to base_dir/<prefix>/balanced_on_<column>.csv, where
# <prefix> is the part of the file name before the first underscore.
target_columns = ['Height', 'Structure', 'Duration']

# glob returns paths in arbitrary order; sort so the run log is reproducible.
csv_files = sorted(glob.glob(os.path.join(base_dir, "*.csv")))

for file_path in csv_files:
    filename = os.path.basename(file_path)
    # Drop the extension first so a name without an underscore
    # (e.g. "combined.csv") maps to "combined", not "combined.csv".
    prefix = os.path.splitext(filename)[0].split('_')[0]
    sub_dir = os.path.join(base_dir, prefix)

    # Top-level boundary: a failure in one file is reported and must not stop
    # the remaining files from being processed.
    try:
        df = pd.read_csv(file_path)
        saved_count = 0

        for col in target_columns:
            if col not in df.columns or 'Crop_Name' not in df.columns:
                print(f"❌ Missing {col} or Crop_Name in {filename}, skipping...")
                continue

            balanced_df = balance_df(df, col)
            # Create the output folder only once there is something to write,
            # so unreadable or unusable inputs leave no empty directories.
            os.makedirs(sub_dir, exist_ok=True)
            output_path = os.path.join(sub_dir, f"balanced_on_{col}.csv")
            balanced_df.to_csv(output_path, index=False)
            saved_count += 1

        # Report success only when at least one balanced file was written.
        if saved_count:
            print(f"✅ {filename} processed and saved to {sub_dir}")
        else:
            print(f"⚠️ {filename}: no balanced files written")

    except Exception as e:
        print(f"❌ Error processing {filename}: {e}")


✅ bidar_merged_train.csv processed and saved to Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/Unbalanced/bidar
✅ combined_train.csv processed and saved to Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/Unbalanced/combined
✅ dharwad_merged_train.csv processed and saved to Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/Unbalanced/dharwad
✅ raichur_merged_train.csv processed and saved to Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/Unbalanced/raichur
