In [None]:
import pandas as pd

# Downsampling across all grouping variables (Structure, Height, Duration)

In [2]:

# Read the combined training set from disk
df = pd.read_csv("Karnataka_Datasets/Across/Train_Test_Datasets/Combined_Train.csv")

# Partition the rows by every (Structure, Height, Duration) combination
group_cols = ['Structure', 'Height', 'Duration']
grouped = df.groupby(group_cols)

# Row count per combination; the cap is the median count plus a 30% allowance
group_sizes = grouped.size()
median_size = group_sizes.median()
upper_limit = 1.3 * median_size

# Report the figures that drive the downsampling step
print(f"Median group size: {median_size}")
print(f"Upper limit for downsampling: {int(upper_limit)}")

# Controlled downsampling
def controlled_downsample(group, limit=None, random_state=42):
    """Return ``group`` as-is, or a random subset of it capped at ``limit`` rows.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group.
    limit : int or float, optional
        Largest number of rows a group may keep. The group length is compared
        against this value directly; the sample size is ``int(limit)``.
        When omitted, the module-level ``upper_limit`` is used (looked up at
        call time), which is the behaviour of the single-argument call.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself when it has no more than ``limit`` rows, otherwise
        ``int(limit)`` rows sampled from it without replacement.
    """
    if limit is None:
        # Fall back to the notebook-level cap computed from the median group size.
        limit = upper_limit
    if len(group) > limit:
        return group.sample(n=int(limit), random_state=random_state)
    return group

# Downsample every (Structure, Height, Duration) group and stitch the pieces
# back together in group order with a fresh 0..n-1 index.
# The groups are iterated explicitly instead of going through
# `grouped.apply(...)`: pandas 2.2 deprecated `DataFrameGroupBy.apply`
# operating on the grouping columns, and once those columns are no longer part
# of the frame handed to the callback, `reset_index(drop=True)` would discard
# them from the result. Iterating always yields complete group frames.
# NOTE: with groupby's default `dropna=True`, rows whose key columns contain
# NaN belong to no group and therefore do not appear in `balanced_df`.
sampled_groups = [controlled_downsample(group) for _, group in grouped]
balanced_df = (
    pd.concat(sampled_groups, ignore_index=True)
    if sampled_groups
    else df.iloc[0:0].copy()  # no groups at all: keep the columns, zero rows
)

# Save or return
balanced_df.to_csv("Karnataka_Datasets/Across/Train_Test_Datasets/Combined_Train_Balanced.csv", index=False)


Median group size: 4379.0
Upper limit for downsampling: 5692


# Downsampling on a single column (Structure)

In [4]:


# Read the balanced training CSV
df = pd.read_csv("Karnataka_Datasets/Across/Train_Test_Datasets/Combined_Train_Balanced.csv")

# Column whose classes are to be evened out
target_col = "Structure"

# Rows per class; the cap is 130% of the smallest class, truncated to an int
grouped = df.groupby(target_col)
group_sizes = grouped.size()
min_size = group_sizes.min()
max_allowed = int(1.3 * min_size)

# Show the class distribution and the derived limits
print(group_sizes)
print(f"Minimum group size: {min_size}")
print(f"Maximum allowed: {max_allowed}")

# Controlled downsampling
def downsample(group, limit=None, random_state=42):
    """Return ``group`` as-is, or a random subset of it capped at ``limit`` rows.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group.
    limit : int or float, optional
        Largest number of rows a group may keep. The group length is compared
        against this value directly; the sample size is ``int(limit)``.
        When omitted, the module-level ``max_allowed`` is used (looked up at
        call time), which is the behaviour of the single-argument call.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself when it has no more than ``limit`` rows, otherwise
        ``int(limit)`` rows sampled from it without replacement.
    """
    if limit is None:
        # Fall back to the notebook-level cap derived from the smallest class.
        limit = max_allowed
    if len(group) > limit:
        return group.sample(n=int(limit), random_state=random_state)
    return group

# Cap every class of `target_col` and stitch the pieces back together in group
# order with a fresh 0..n-1 index.
# The groups are iterated explicitly instead of going through
# `grouped.apply(...)`: pandas 2.2 deprecated `DataFrameGroupBy.apply`
# operating on the grouping columns, and once the grouping column is no longer
# part of the frame handed to the callback, `reset_index(drop=True)` would
# discard it and the distribution check below would fail with a KeyError.
# NOTE: with groupby's default `dropna=True`, rows whose `target_col` is NaN
# belong to no group and therefore do not appear in `balanced_df`.
sampled_groups = [downsample(group) for _, group in grouped]
balanced_df = (
    pd.concat(sampled_groups, ignore_index=True)
    if sampled_groups
    else df.iloc[0:0].copy()  # no groups at all: keep the columns, zero rows
)

# Check new distribution (uses `target_col` so the check follows the grouping column)
print(balanced_df[target_col].value_counts())

# Save the balanced dataset
# NOTE: this writes to the same path the cell loaded `df` from, replacing that file.
balanced_df.to_csv("Karnataka_Datasets/Across/Train_Test_Datasets/Combined_Train_Balanced.csv", index=False)


Structure
Grass Crop    21319
Shrub          8174
Tree           5692
dtype: int64
Minimum group size: 5692
Maximum allowed: 7399
Structure
Grass Crop    7399
Shrub         7399
Tree          5692
Name: count, dtype: int64
