In [2]:
import pandas as pd
import os
import glob

# Downsampling on the Combination of All Label Columns (Structure, Height, Duration)

In [8]:

# Read the 2018 Kharif training table that is going to be balanced.
df = pd.read_csv(
    "Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2018/combined_train.csv"
)

# One group per distinct (Structure, Height, Duration) combination.
group_cols = ['Structure', 'Height', 'Duration']
grouped = df.groupby(by=group_cols)

# The cap is the median group size plus 30% headroom; it stays a float here
# and is truncated to an int only where a row count is needed.
group_sizes = grouped.size()
median_size = group_sizes.median()
upper_limit = 1.3 * median_size

print(f"Median group size: {median_size}")
print(f"Upper limit for downsampling: {int(upper_limit)}")

# Controlled downsampling
def controlled_downsample(group):
    """Trim one group down to the module-level ``upper_limit``.

    A group with more than ``upper_limit`` rows is replaced by a random sample
    of ``int(upper_limit)`` rows (fixed seed 42); any other group is returned
    unchanged, as the very same object.
    """
    exceeds_cap = len(group) > upper_limit
    return group.sample(n=int(upper_limit), random_state=42) if exceeds_cap else group

# Cap every group and stitch the pieces back together in group order.
# The groups are iterated explicitly instead of going through
# ``grouped.apply``: iteration always hands over the full sub-frame, grouping
# columns included, so Structure/Height/Duration are guaranteed to be present
# in the output. ``DataFrameGroupBy.apply`` is deprecated for functions that
# operate on the grouping columns, and combined with
# ``reset_index(drop=True)`` those columns would be lost once pandas excludes
# them from the applied frame.
balanced_parts = [controlled_downsample(group) for _, group in grouped]
if balanced_parts:
    balanced_df = pd.concat(balanced_parts, ignore_index=True)
else:
    # No groups at all (e.g. empty input): keep the columns, write no rows.
    balanced_df = df.iloc[0:0].copy()

# Persist the balanced table next to the input file.
balanced_df.to_csv("Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2018/Combined_Train_Balanced.csv", index=False)


Median group size: 4087.5
Upper limit for downsampling: 5313


# Downsampling on a Single Label Column (Structure)

In [10]:


# Column whose classes are going to be balanced.
target_col = "Structure"

# Read the merged Bidar table.
df = pd.read_csv(
    "Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2018_19/AEZ_6/bidar_merged.csv"
)

# One group per class; no class may end up with more than 1.5x the rows of
# the smallest one.
grouped = df.groupby(by=target_col)
group_sizes = grouped.size()
min_size = group_sizes.min()
max_allowed = int(1.5 * min_size)

print(group_sizes)
print(
    f"Minimum group size: {min_size}",
    f"Maximum allowed: {max_allowed}",
    sep="\n",
)

# Controlled downsampling
def downsample(group):
    """Trim one group down to the module-level ``max_allowed`` row count.

    A group with more than ``max_allowed`` rows is replaced by a random sample
    of exactly ``max_allowed`` rows (fixed seed 42); any other group is
    returned unchanged, as the very same object.
    """
    too_large = len(group) > max_allowed
    return group.sample(n=max_allowed, random_state=42) if too_large else group

# Cap every class and stitch the pieces back together in class order.
# The groups are iterated explicitly instead of going through
# ``grouped.apply``: iteration always hands over the full sub-frame, so the
# target column is guaranteed to be present in the output.
# ``DataFrameGroupBy.apply`` is deprecated for functions that operate on the
# grouping column, and combined with ``reset_index(drop=True)`` that column
# would be lost once pandas excludes it from the applied frame.
balanced_parts = [downsample(group) for _, group in grouped]
if balanced_parts:
    balanced_df = pd.concat(balanced_parts, ignore_index=True)
else:
    # No classes at all (e.g. empty input): keep the columns, write no rows.
    balanced_df = df.iloc[0:0].copy()

# Check new distribution (of whichever column was balanced, not a fixed name)
print(balanced_df[target_col].value_counts())

# Save the balanced dataset
# NOTE(review): this output name has no ".csv" extension, unlike the other
# files this notebook writes - left as is in case something downstream reads
# it by this exact name; confirm and rename if it is an oversight.
balanced_df.to_csv("Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2018/Bidar_Structure", index=False)


Structure
Grass    16599
Root      2777
Shrub    11035
Tree      5313
dtype: int64
Minimum group size: 2777
Maximum allowed: 4165
Structure
Grass    4165
Shrub    4165
Tree     4165
Root     2777
Name: count, dtype: int64


In [6]:
import os
import glob
import pandas as pd

# --- Folder that is scanned for input CSVs; balanced files go into sub-folders of it ---
base_dir = "Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/"  # point this at your own data folder

# --- Balancing Function ---
def balance_df(df, column, ratio=1.3, random_state=42):
    """Downsample the over-represented classes of ``column``.

    Every class is capped at ``int(smallest_class_size * ratio)`` rows. Each
    class is drawn with ``DataFrame.sample``, so rows come back in sampled
    order within their class, and the classes are concatenated in group-key
    order. Original index labels are kept. Rows whose ``column`` value is
    missing belong to no class and are therefore absent from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to balance; it is not modified.
    column : str
        Name of the column whose classes are balanced.
    ratio : float, default 1.3
        Largest allowed class size, as a multiple of the smallest class size.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for every class.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all classes.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    ValueError
        If ``column`` holds no non-missing values, so there is nothing to
        balance on.
    """
    class_counts = df[column].value_counts()
    if class_counts.empty:
        # Without this guard the minimum is NaN and int() fails with an
        # opaque "cannot convert float NaN to integer" error.
        raise ValueError(
            f"Cannot balance on {column!r}: the column has no non-missing values"
        )

    cap = int(class_counts.min() * ratio)

    return pd.concat([
        members.sample(n=min(len(members), cap), random_state=random_state)
        for _, members in df.groupby(column)
    ])

# --- Process All CSVs in the Directory ---
# Only CSVs directly inside base_dir are matched; sorting makes the processing
# order (and the log below) stable, since glob returns files in no defined
# order.
csv_files = sorted(glob.glob(os.path.join(base_dir, "*.csv")))

for file_path in csv_files:
    filename = os.path.basename(file_path)
    # Take the prefix from the name without its extension. Splitting the raw
    # filename turns an underscore-free name such as "data.csv" into the
    # prefix "data.csv", which makes sub_dir point at the CSV file itself.
    stem = os.path.splitext(filename)[0]
    prefix = stem.split('_')[0]
    # NOTE: files sharing a prefix share a sub-directory and the same output
    # names, so the one processed later overwrites the earlier one's files.
    sub_dir = os.path.join(base_dir, prefix)

    try:
        # Inside the try so that a directory error on one file is reported
        # like any other failure instead of aborting the remaining files.
        os.makedirs(sub_dir, exist_ok=True)
        df = pd.read_csv(file_path)

        saved_cols = []
        for col in ['Height', 'Structure', 'Duration']:
            if col not in df.columns:
                print(f"Column {col} not found in {filename}, skipping...")
                continue

            # Each label column is balanced independently from the same input.
            balanced_df = balance_df(df, col)
            output_path = os.path.join(sub_dir, f"balanced_on_{col}.csv")
            balanced_df.to_csv(output_path, index=False)
            saved_cols.append(col)

        # Report success only when at least one balanced file was written.
        if saved_cols:
            print(f"✅ {filename} processed and saved to {sub_dir}")
        else:
            print(f"⚠️ {filename} has none of the expected columns, nothing saved")

    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"❌ Error processing {filename}: {e}")


✅ bidar_merged_train.csv processed and saved to Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/bidar
✅ combined_train.csv processed and saved to Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/combined
✅ dharwad_merged_train.csv processed and saved to Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/dharwad
✅ raichur_merged_train.csv processed and saved to Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/raichur
