In [1]:
import pandas as pd
import os
import glob

In [7]:
# Source CSV; the cleaned table is written back to this same location.
input_file = 'Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2018/combined_train.csv'
output_file = input_file

# Read the table and keep only the rows whose Structure is not 'Herb'.
df = pd.read_csv(input_file)
is_not_herb = df['Structure'].ne('Herb')
df_filtered = df.loc[is_not_herb]

# Persist the remaining rows (without the pandas index column).
df_filtered.to_csv(output_file, index=False)

print("✅ Rows with Structure = 'Herb' removed and file saved.")


✅ Rows with Structure = 'Herb' removed and file saved.


# Downsampling across all grouping variables (Structure, Height, Duration)

In [8]:

# Load dataset
df = pd.read_csv("Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2018/combined_train.csv")

# Every distinct (Structure, Height, Duration) combination forms one group.
group_cols = ['Structure', 'Height', 'Duration']
grouped = df.groupby(group_cols)

# The cap is 1.3x the median group size; groups above it get downsampled.
group_sizes = grouped.size()
median_size = group_sizes.median()
upper_limit = median_size * 1.3  # Allow 30% variation

print(f"Median group size: {median_size}")
print(f"Upper limit for downsampling: {int(upper_limit)}")


def controlled_downsample(group):
    """Cap one group at ``int(upper_limit)`` rows.

    Reads the cell-level ``upper_limit``. A group larger than the limit is
    replaced by a random sample of ``int(upper_limit)`` rows (fixed seed 42,
    so the draw is repeatable); any other group is returned unchanged.
    """
    size = len(group)
    if size > upper_limit:
        return group.sample(n=int(upper_limit), random_state=42)
    return group


# Iterate over the groups instead of calling ``grouped.apply``: since
# pandas 2.2 ``apply`` warns when the function operates on the grouping
# columns, and newer releases exclude those columns from the frames handed
# to the function, which would silently drop Structure/Height/Duration from
# the saved file. Iteration always yields complete sub-frames, and the
# groups still come out in sorted key order.
balanced_df = pd.concat(
    [controlled_downsample(group) for _, group in grouped],
    ignore_index=True,
)

# Save or return
balanced_df.to_csv("Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2018/Combined_Train_Balanced.csv", index=False)


Median group size: 4087.5
Upper limit for downsampling: 5313


# Downsampling on a single column (Structure)

In [10]:


# Load the dataset (this cell later overwrites the same file with its result)
df = pd.read_csv("Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2018/Combined_Train_Balanced.csv")

# Define the target column
target_col = "Structure"

# One group per distinct value of the target column.
grouped = df.groupby(target_col)
group_sizes = grouped.size()
min_size = group_sizes.min()
# No class may keep more than 1.5x the rows of the smallest class.
max_allowed = int(min_size * 1.5)

print(group_sizes)
print(f"Minimum group size: {min_size}")
print(f"Maximum allowed: {max_allowed}")


def downsample(group):
    """Cap one group at ``max_allowed`` rows.

    Reads the cell-level ``max_allowed``. Larger groups are replaced by a
    random sample of exactly ``max_allowed`` rows (fixed seed 42); smaller
    or equal groups are returned unchanged.
    """
    if len(group) > max_allowed:
        return group.sample(n=max_allowed, random_state=42)
    return group


# Iterate over the groups instead of calling ``grouped.apply``: since
# pandas 2.2 ``apply`` warns when the function operates on the grouping
# column, and newer releases exclude that column from the frames handed to
# the function, which would drop the target column from the result.
balanced_df = pd.concat(
    [downsample(group) for _, group in grouped],
    ignore_index=True,
)

# Check new distribution
print(balanced_df[target_col].value_counts())

# Save the balanced dataset
balanced_df.to_csv("Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2018/Combined_Train_Balanced.csv", index=False)


Structure
Grass    16599
Root      2777
Shrub    11035
Tree      5313
dtype: int64
Minimum group size: 2777
Maximum allowed: 4165
Structure
Grass    4165
Shrub    4165
Tree     4165
Root     2777
Name: count, dtype: int64


In [3]:
# Folder whose CSV files are filtered and overwritten in place.
input_dir = 'Karnataka_Datasets/Across/Kharif/2019-20/SAR_GCVI/AEZ_6/'  # ← change this to your actual path
all_files = glob.glob(os.path.join(input_dir, '*.csv'))

for file_path in all_files:
    # Any failure is reported for that file only; the loop carries on.
    try:
        df = pd.read_csv(file_path)

        if 'Crop_Name' in df.columns:
            # Rows per crop, then the crops that have at least 250 rows.
            crop_counts = df['Crop_Name'].value_counts()
            valid_crops = crop_counts.loc[crop_counts >= 250].index

            # Keep only rows belonging to those crops and write them back.
            filtered_df = df.loc[df['Crop_Name'].isin(valid_crops)]
            filtered_df.to_csv(file_path, index=False)
            print(f"✅ Processed and saved: {file_path}")
        else:
            print(f"⚠️ Skipping {file_path}: 'Crop_Name' column not found.")

    except Exception as err:
        print(f"❌ Error processing {file_path}: {err}")

✅ Processed and saved: Karnataka_Datasets/Across/Kharif/2019-20/SAR_GCVI/AEZ_6\bidar_merged.csv
✅ Processed and saved: Karnataka_Datasets/Across/Kharif/2019-20/SAR_GCVI/AEZ_6\bijapura_merged.csv
✅ Processed and saved: Karnataka_Datasets/Across/Kharif/2019-20/SAR_GCVI/AEZ_6\dharwad_merged.csv
✅ Processed and saved: Karnataka_Datasets/Across/Kharif/2019-20/SAR_GCVI/AEZ_6\raichur_merged.csv


In [4]:
import pandas as pd

# Training table that is filtered and then overwritten in place.
input_path = "Karnataka_Datasets/Across/Kharif/2020-21/SAR_GCVI/AEZ_6/Final/Train/Train.csv"
df = pd.read_csv(input_path)

# Crop names that are allowed to remain in the table.
allowed_crops = [
    "Cotton",
    "Ginger",
    "Greengram",
    "Jowar",
    "Maize",
    "Paddy",
    "Redgram",
    "Soybean",
    "Sugarcane",
    "Urad",
]

# Keep the rows whose Crop_Name is in the allow-list.
is_allowed = df['Crop_Name'].isin(allowed_crops)
filtered_df = df.loc[is_allowed]

# Write the result back over the input file.
filtered_df.to_csv(input_path, index=False)

print("✅ Filtered CSV saved successfully.")


✅ Filtered CSV saved successfully.


In [6]:
import pandas as pd

# Test table that is cleaned and then overwritten in place.
input_path = "Karnataka_Datasets/Across/Kharif/2020-21/SAR_GCVI/AEZ_6/Final/Test/Bidar_Data_Final.csv"
df = pd.read_csv(input_path)

# Crop whose rows are dropped.
crop_to_remove = "Ginger"  # 🔁 change as needed

# Keep every row whose Crop_Name differs from the crop being removed.
is_other_crop = df['Crop_Name'].ne(crop_to_remove)
filtered_df = df.loc[is_other_crop]

# Write the result back over the input file.
filtered_df.to_csv(input_path, index=False)

print(f"✅ Rows with Crop_Name = '{crop_to_remove}' removed and file saved.")


✅ Rows with Crop_Name = 'Ginger' removed and file saved.


In [7]:
import pandas as pd

# Load your CSV (the balanced result is written back to this same file)
input_file="Karnataka_Datasets/Across/Kharif/2020-21/SAR_GCVI/AEZ_6/Final/Train/Train.csv"
df = pd.read_csv(input_file)

# Split the data by Height
medium_df = df[df['Height'] == 'Medium']
other_df = df[df['Height'] != 'Medium']

# Desired downsample size
target_size = 11500

# Every crop is sampled with the same fraction, target_size / len(medium_df),
# capped at 1, so the crop mix inside "Medium" is preserved while the total
# shrinks to roughly target_size rows.
medium_frac = min(1, target_size / len(medium_df)) if len(medium_df) else 1

# Sample crop by crop through plain iteration rather than ``groupby.apply``:
# since pandas 2.2 ``apply`` warns when the function operates on the grouping
# column, and newer releases exclude that column from the sampled frames,
# which would leave Crop_Name empty for these rows after the concat below.
medium_parts = [
    crop_rows.sample(frac=medium_frac, random_state=42)
    for _, crop_rows in medium_df.groupby("Crop_Name")
]
medium_sampled = (
    pd.concat(medium_parts, ignore_index=True)
    if medium_parts
    else medium_df.iloc[0:0]  # no "Medium" rows with a crop name: nothing to sample
)

# Combine back with other height categories
balanced_df = pd.concat([other_df, medium_sampled], ignore_index=True)

# Save result
balanced_df.to_csv(input_file, index=False)

print("✅ Downsampling complete and saved.")


✅ Downsampling complete and saved.


In [8]:
import pandas as pd

# Load your CSV. ``input_file`` is not defined in this cell; it must already
# exist in the kernel from an earlier cell.
df = pd.read_csv(input_file)

# Split the data by Duration
long_df = df[df['Duration'] == 'Long']
other_df = df[df['Duration'] != 'Long']

# Desired downsample size
target_size = 7500

# Every crop is sampled with the same fraction, target_size / len(long_df),
# capped at 1, so the crop mix inside "Long" is preserved.
long_frac = min(1, target_size / len(long_df)) if len(long_df) else 1

# Sample crop by crop through plain iteration rather than ``groupby.apply``:
# since pandas 2.2 ``apply`` warns when the function operates on the grouping
# column, and newer releases exclude that column from the sampled frames,
# which would leave Crop_Name empty for these rows after the concat below.
long_parts = [
    crop_rows.sample(frac=long_frac, random_state=42)
    for _, crop_rows in long_df.groupby("Crop_Name")
]
long_sampled = (
    pd.concat(long_parts, ignore_index=True)
    if long_parts
    else long_df.iloc[0:0]  # no "Long" rows with a crop name: nothing to sample
)

# Combine back with the remaining Duration categories
balanced_df = pd.concat([other_df, long_sampled], ignore_index=True)

# Save the result
balanced_df.to_csv(input_file, index=False)

print("✅ Duration balanced: 'Long' downsampled to ~7500 while preserving Crop_Name distribution.")


✅ Duration balanced: 'Long' downsampled to ~7500 while preserving Crop_Name distribution.


In [9]:
# Define limits
height_limit = 5000
duration_limit = 7500


# Function to downsample a group while preserving Crop_Name distribution
def stratified_sample(group_df, limit, level_name):
    """Shrink ``group_df`` to roughly ``limit`` rows, keeping its crop mix.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Rows of one category; must contain a ``Crop_Name`` column.
    limit : int
        Row count above which the frame is downsampled.
    level_name : str
        Not used by the function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        ``group_df`` itself when it has at most ``limit`` rows. Otherwise a
        new frame with a fresh 0..n-1 index in which every crop was sampled
        with the same fraction ``limit / len(group_df)`` (fixed seed 42).
    """
    if len(group_df) <= limit:
        return group_df

    frac = min(1, limit / len(group_df))
    # Plain iteration instead of ``groupby.apply``: since pandas 2.2 ``apply``
    # warns when the function operates on the grouping column, and newer
    # releases exclude that column from the frames it passes on, which would
    # drop Crop_Name from the result.
    parts = [
        crop_rows.sample(frac=frac, random_state=42)
        for _, crop_rows in group_df.groupby("Crop_Name")
    ]
    if not parts:
        # No row carries a crop name, so there is nothing to sample from.
        return group_df.iloc[0:0]
    return pd.concat(parts, ignore_index=True)

# NOTE(review): ``df`` and ``input_file`` are not defined in this cell; they
# are whatever earlier cells left in the kernel. Confirm ``df`` still matches
# the current contents of ``input_file`` before relying on this output.

# Step 1: Downsample each Height group
# The groups are iterated and concatenated instead of going through
# ``groupby.apply``: since pandas 2.2 ``apply`` warns when the function
# operates on the grouping column, and newer releases exclude that column
# from the frames it passes on, which would drop Height (and Duration below)
# from the saved file. Rows come out grouped by level, in sorted level order.
height_parts = [
    stratified_sample(level_rows, height_limit, 'Height')
    for _, level_rows in df.groupby('Height')
]
height_balanced = pd.concat(height_parts) if height_parts else df.iloc[0:0]

# Step 2: From the height-balanced data, downsample each Duration group
duration_parts = [
    stratified_sample(level_rows, duration_limit, 'Duration')
    for _, level_rows in height_balanced.groupby('Duration')
]
final_balanced = (
    pd.concat(duration_parts) if duration_parts else height_balanced.iloc[0:0]
)

# Save the final balanced dataset
final_balanced.to_csv(input_file, index=False)

In [10]:
import pandas as pd

# Load already height-balanced CSV
# Without this read the cell would run on whatever ``df`` an earlier cell
# left in the kernel and then overwrite ``input_file`` with that result.
# ``input_file`` itself must already be defined by an earlier cell.
df = pd.read_csv(input_file)

# Target duration size
target_duration = 3300


# Stratified sampling by Crop_Name within each Duration
def stratified_duration_sample(group):
    """Shrink one Duration group to roughly ``target_duration`` rows.

    Reads the cell-level ``target_duration``. A group with at most that many
    rows is returned unchanged. A larger group is rebuilt by sampling every
    crop with the same fraction ``target_duration / len(group)`` (fixed
    seed 42), which keeps the crop mix, and gets a fresh 0..n-1 index.
    """
    if len(group) <= target_duration:
        return group

    frac = min(1, target_duration / len(group))
    # Plain iteration instead of ``groupby.apply``: since pandas 2.2 ``apply``
    # warns when the function operates on the grouping column, and newer
    # releases exclude that column from the frames it passes on, which would
    # drop Crop_Name from the result.
    parts = [
        crop_rows.sample(frac=frac, random_state=42)
        for _, crop_rows in group.groupby("Crop_Name")
    ]
    if not parts:
        # No row carries a crop name, so there is nothing to sample from.
        return group.iloc[0:0]
    return pd.concat(parts, ignore_index=True)


# Apply sampling to each Duration group (iterated for the same reason as
# above, so the Duration column stays in the output).
duration_parts = [
    stratified_duration_sample(level_rows)
    for _, level_rows in df.groupby("Duration")
]
duration_balanced = pd.concat(duration_parts) if duration_parts else df.iloc[0:0]

# Save final result
duration_balanced.to_csv(input_file, index=False)

print("✅ Final duration balancing complete. All Duration categories now have ~3300 rows.")


✅ Final duration balancing complete. All Duration categories now have ~3300 rows.


In [19]:
import pandas as pd
import os

# Folder that receives the balanced CSVs, and the combined table read from it.
output_dir = "Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/"
input_path = "Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/combined_train.csv"

# Create the output folder when it is missing; an existing folder is fine.
os.makedirs(output_dir, exist_ok=True)

# Function to balance by a single column using min_size * factor
def balance_by_column(df, column, factor=1.3):
    """Downsample the over-represented categories of ``column``.

    The cap is ``int(min_size * factor)``, where ``min_size`` is the row
    count of the smallest category. Categories above the cap are replaced by
    a random sample of exactly that many rows (fixed seed 42); the others
    are kept whole. The cap and the minimum size are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to balance; must contain ``column``.
    column : str
        Column whose categories are balanced.
    factor : float, default 1.3
        Multiplier applied to the smallest category size to get the cap.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df``, rows grouped by category in sorted category
        order, with a fresh 0..n-1 index.
    """
    grouped = df.groupby(column)
    group_sizes = grouped.size()
    min_size = group_sizes.min()
    upper_limit = int(min_size * factor)

    print(f"\nBalancing by: {column}")
    print(f"Minimum group size: {min_size}")
    print(f"Upper limit (min_size * {factor}): {upper_limit}")

    # Plain iteration instead of ``grouped.apply``: since pandas 2.2 ``apply``
    # warns when the function operates on the grouping column, and newer
    # releases exclude that column from the frames it passes on, which would
    # drop ``column`` from the balanced table.
    parts = [
        rows.sample(n=upper_limit, random_state=42) if len(rows) > upper_limit else rows
        for _, rows in grouped
    ]
    return pd.concat(parts, ignore_index=True)

# Read the combined table once; each balancing pass starts from this frame.
df = pd.read_csv(input_path)

# Column to balance on, paired with the file that receives the result.
balance_targets = (
    ('Height', "Height_Balanced.csv"),
    ('Duration', "Duration_Balanced.csv"),
    ('Structure', "Structure_Balanced.csv"),
)

balanced_frames = {}
saved_paths = {}
for balance_col, file_name in balance_targets:
    # Balance on this column alone, then write the result straight away.
    balanced_frames[balance_col] = balance_by_column(df, balance_col, factor=1.3)
    saved_paths[balance_col] = os.path.join(output_dir, file_name)
    balanced_frames[balance_col].to_csv(saved_paths[balance_col], index=False)

# Keep the per-column names available for any later cell.
height_balanced = balanced_frames['Height']
height_file = saved_paths['Height']
duration_balanced = balanced_frames['Duration']
duration_file = saved_paths['Duration']
structure_balanced = balanced_frames['Structure']
structure_file = saved_paths['Structure']

print(f"\n✅ Files saved in: {output_dir}")



Balancing by: Height
Minimum group size: 15711
Upper limit (min_size * 1.3): 20424

Balancing by: Duration
Minimum group size: 19029
Upper limit (min_size * 1.3): 24737

Balancing by: Structure
Minimum group size: 1580
Upper limit (min_size * 1.3): 2054

✅ Files saved in: Karnataka_Datasets/Across/Kharif/Cropland_Masked/Train/2021/
