In [5]:
import os
import pandas as pd

input_dir = "Karnataka_Datasets/Across/SAR/"
output_file = os.path.join(input_dir, "SAR_Data.csv")


def _most_frequent(values):
    """Return the most frequent value of a Series, or its first element if it has no mode."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else values.iloc[0]


# Read the per-file CSVs found in the input directory.
# The listing is sorted so the concatenation order does not depend on the
# arbitrary order in which os.listdir returns names.
dfs = []
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(input_dir, filename)
    # output_file is written into input_dir, so a re-run must not read back
    # the result of a previous run.
    if os.path.abspath(file_path) == os.path.abspath(output_file):
        continue
    df = pd.read_csv(file_path)
    # Rows without a parseable date are dropped below, so a CSV that has no
    # 'date' column at all (for example an already-flattened table stored in
    # this folder) would contribute nothing; skip it instead of concatenating it.
    if 'date' not in df.columns:
        continue
    dfs.append(df)

if not dfs:
    raise ValueError(f"No CSV files with a 'date' column found in: {input_dir}")

# Combine all into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Parse dates (mixed formats, day first); unparseable values become NaT and are dropped
combined_df['date'] = pd.to_datetime(combined_df['date'], format='mixed', dayfirst=True, errors='coerce')
combined_df = combined_df.dropna(subset=['date'])

# Order the rows chronologically within each (Latitude, Longitude) location
combined_df = combined_df.sort_values(by=['Latitude', 'Longitude', 'date'])

# 1-based position of each observation within its location's time series
combined_df['time_index'] = combined_df.groupby(['Latitude', 'Longitude']).cumcount() + 1

# Most frequent Crop_Name for each (Latitude, Longitude)
crop_name_df = combined_df.groupby(['Latitude', 'Longitude'])['Crop_Name'].agg(_most_frequent)
crop_name_df = crop_name_df.reset_index()

# Pivot VH: one row per location, one column per time index
vh_pivot = combined_df.pivot(index=['Latitude', 'Longitude'], columns='time_index', values='VH')
vh_pivot.columns = [f'VH_{i}' for i in vh_pivot.columns]

# Pivot VV the same way
vv_pivot = combined_df.pivot(index=['Latitude', 'Longitude'], columns='time_index', values='VV')
vv_pivot.columns = [f'VV_{i}' for i in vv_pivot.columns]

# Merge all parts on the shared (Latitude, Longitude) index
result = pd.concat([crop_name_df.set_index(['Latitude', 'Longitude']), vh_pivot, vv_pivot], axis=1).reset_index()

# Sort VH and VV columns by their numeric suffix (VH_2 before VH_10)
vh_columns = sorted([col for col in result.columns if col.startswith('VH_')], key=lambda x: int(x.split('_')[1]))
vv_columns = sorted([col for col in result.columns if col.startswith('VV_')], key=lambda x: int(x.split('_')[1]))

# Final column order
final_columns = ['Latitude', 'Longitude', 'Crop_Name'] + vh_columns + vv_columns
result = result[final_columns]

# Save the result
result.to_csv(output_file, index=False)
print(f"Flattened time series with Crop_Name saved to: {output_file}")


Flattened time series with Crop_Name saved to: Karnataka_Datasets/Across/SAR/SAR_Data.csv


In [8]:
import pandas as pd

# Read the CSV file into a DataFrame
# Read the flattened SAR table.
# NOTE(review): this is the same path the cell writes to at the end, so the
# input is overwritten in place. The flattening cell saves "SAR_Data.csv" —
# confirm which file is meant to be the input here.
input_file = "Karnataka_Datasets/Across/SAR/SAR_Data_Clean.csv"
df = pd.read_csv(input_file)

# Candidate time-step columns: VV_1..VV_31 followed by VH_1..VH_31
vv_columns = [f'VV_{i}' for i in range(1, 32)]
vh_columns = [f'VH_{i}' for i in range(1, 32)]

# Combine all VV and VH columns into a list
all_columns = vv_columns + vh_columns

# Keep a candidate only if it is present and holds more than one distinct value.
# The presence check matters because this cell overwrites its own input: once a
# constant column has been dropped from the file, re-running the cell would
# otherwise fail with a KeyError on that column.
columns_to_keep = [col for col in all_columns if col in df.columns and df[col].nunique() > 1]

# Keep the identifying columns plus the surviving VV/VH columns
df_cleaned = df[['Latitude', 'Longitude', 'Crop_Name'] + columns_to_keep]

# Save the cleaned DataFrame back to a CSV file
output_file = "Karnataka_Datasets/Across/SAR/SAR_Data_Clean.csv"
df_cleaned.to_csv(output_file, index=False)
print(f"Cleaned data saved to: {output_file}")


Cleaned data saved to: Karnataka_Datasets/Across/SAR/SAR_Data_Clean.csv


In [10]:
import pandas as pd

# Read both CSV files into DataFrames
# Input tables
file1 = "Karnataka_Datasets/Across/SAR/SAR_Data_Clean.csv"  # SAR table; every column is carried into the merge
file2 = "Karnataka_Datasets/Across/Merged_Karnataka_S2_Kharif_Mapped_Clean.csv"  # supplies Structure and Structure_Numeric

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# Attach the structure labels to the SAR rows; the inner join keeps only
# coordinates that occur in both tables
join_keys = ['Latitude', 'Longitude']
structure_labels = df2[join_keys + ['Structure', 'Structure_Numeric']]
merged_df = df1.merge(structure_labels, on=join_keys, how='inner')

# Write the merged table to its own CSV
output_file = "Karnataka_Datasets/Across/SAR/SAR_Mapped.csv"
merged_df.to_csv(output_file, index=False)

print(f"Merged data saved to: {output_file}")


Merged data saved to: Karnataka_Datasets/Across/SAR/SAR_Mapped.csv


In [22]:
# Show how many rows of merged_df fall into each Structure_Numeric class
merged_structure_counts = merged_df['Structure_Numeric'].value_counts()
print(merged_structure_counts)


Structure_Numeric
3    46564
1    17055
2    10186
4      120
Name: count, dtype: int64


In [25]:
import pandas as pd
from sklearn.utils import resample

def balance_on_structure_numeric(csv_path, output_path=None, exclude_classes=(4,), random_state=42):
    """Undersample Structure_Numeric classes to equal size within each Crop_Name.

    The CSV is loaded, rows whose Structure_Numeric is in ``exclude_classes``
    are removed, and then, separately for every crop, each remaining class is
    sampled without replacement down to the size of that crop's smallest class.
    Crops left with fewer than two classes are omitted from the result
    entirely. The combined rows are shuffled and re-indexed from 0.

    Parameters
    ----------
    csv_path : str
        CSV file containing at least 'Crop_Name' and 'Structure_Numeric'.
    output_path : str, optional
        If given (and non-empty), the balanced table is written there as CSV
        without the index.
    exclude_classes : iterable, default (4,)
        Structure_Numeric values removed before balancing.
    random_state : int, default 42
        Seed used for both the per-class sampling and the final shuffle.

    Returns
    -------
    pandas.DataFrame
        The balanced, shuffled rows. When no crop has two or more classes the
        frame is empty but keeps the input's columns.
    """
    df = pd.read_csv(csv_path)

    # Step 1: remove the excluded classes
    df = df[~df['Structure_Numeric'].isin(list(exclude_classes))]

    # Step 2: balance each crop independently, collecting the pieces in a list
    # so the final table is built with a single concat
    sampled_parts = []
    for crop in df['Crop_Name'].unique():
        crop_df = df[df['Crop_Name'] == crop]

        # Class distribution for this crop
        class_counts = crop_df['Structure_Numeric'].value_counts()

        # Fewer than two classes: nothing to balance, the crop is left out
        if len(class_counts) < 2:
            continue

        # Every class is cut down to the size of the smallest one
        min_count = int(class_counts.min())

        for structure_value in class_counts.index:
            class_subset = crop_df[crop_df['Structure_Numeric'] == structure_value]
            # NOTE(review): DataFrame.sample stands in for
            # sklearn.utils.resample(replace=False); if exact row-for-row
            # agreement with earlier outputs matters, confirm the selections match.
            sampled_parts.append(
                class_subset.sample(n=min_count, replace=False, random_state=random_state)
            )

    if sampled_parts:
        # Shuffle so rows are not grouped by crop and class
        balanced_df = (
            pd.concat(sampled_parts)
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )
    else:
        # Nothing could be balanced: return an empty frame that keeps the schema
        balanced_df = df.iloc[0:0].reset_index(drop=True)

    # Save if output path is provided
    if output_path:
        balanced_df.to_csv(output_path, index=False)

    return balanced_df


In [27]:
import pandas as pd

# Reload the merged SAR table from disk and report its Structure_Numeric class balance
mapped_csv = "Karnataka_Datasets/Across/SAR/SAR_Mapped.csv"
df = pd.read_csv(mapped_csv)

mapped_structure_counts = df['Structure_Numeric'].value_counts()
print(mapped_structure_counts)


Structure_Numeric
3    46535
1    17079
2    10190
4      120
5        1
Name: count, dtype: int64


In [30]:
import pandas as pd
from sklearn.utils import resample

# Load CSV
# Load the merged SAR table
df = pd.read_csv("Karnataka_Datasets/Across/SAR/SAR_Mapped.csv")

# Remove Structure_Numeric == 4 or 5
df = df[~df['Structure_Numeric'].isin([4, 5])]

# Separate the three remaining classes
df_3 = df[df['Structure_Numeric'] == 3]
df_1 = df[df['Structure_Numeric'] == 1]
df_2 = df[df['Structure_Numeric'] == 2]

# Downsample class 3 to at most 13,000 rows. The target is clamped to the rows
# actually available, because sampling without replacement cannot return more
# rows than exist.
n_class_3 = min(13000, len(df_3))
df_3_downsampled = resample(df_3,
                            replace=False,
                            n_samples=n_class_3,
                            random_state=42)

# Combine all classes, then shuffle so rows are not grouped by class
balanced_df = pd.concat([df_3_downsampled, df_1, df_2])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Final check
print("Final counts:\n", balanced_df['Structure_Numeric'].value_counts())
print("\nFinal total rows:", len(balanced_df))

# Save the balanced table. This must be `balanced_df`: no `final_df` is defined
# in this cell, so on a fresh kernel that name raises NameError.
balanced_df.to_csv("Karnataka_Datasets/Across/SAR/Karnataka_SAR_Balanced.csv", index=False)


Final counts:
 Structure_Numeric
1    17079
3    13000
2    10190
Name: count, dtype: int64

Final total rows: 40269


In [31]:
from sklearn.model_selection import train_test_split

# Expects `balanced_df` to already exist in the kernel.

# 80/20 train/test split, stratified on Crop_Name
crop_labels = balanced_df['Crop_Name']
train_df, test_df = train_test_split(
    balanced_df,
    test_size=0.2,
    random_state=42,
    stratify=crop_labels,
)

# Per-crop proportions in each part, printed to confirm the split
train_share = train_df['Crop_Name'].value_counts(normalize=True)
test_share = test_df['Crop_Name'].value_counts(normalize=True)
print("Train Crop_Name distribution:\n", train_share)
print("\nTest Crop_Name distribution:\n", test_share)

# Write both parts to CSV
train_df.to_csv("Karnataka_Datasets/Across/SAR/Karnataka_SAR_train.csv", index=False)
test_df.to_csv("Karnataka_Datasets/Across/SAR/Karnataka_SAR_test.csv", index=False)


Train Crop_Name distribution:
 Crop_Name
Ragi          0.280304
Coconut       0.157504
Rose          0.135930
Mangoes       0.090486
Arecanut      0.079094
Avare         0.073817
Guava         0.046904
Redgram       0.043303
Sapota        0.024274
Banana        0.023778
Jowar         0.019867
Maize         0.018966
Paddy         0.003352
Lemon         0.001707
Eucalyptus    0.000341
Sugarcane     0.000217
Bajra         0.000155
Name: proportion, dtype: float64

Test Crop_Name distribution:
 Crop_Name
Ragi          0.280358
Coconut       0.157561
Rose          0.135957
Mangoes       0.090514
Arecanut      0.079091
Avare         0.073752
Guava         0.046933
Redgram       0.043333
Sapota        0.024212
Banana        0.023839
Jowar         0.019866
Maize         0.018873
Paddy         0.003352
Lemon         0.001738
Eucalyptus    0.000372
Sugarcane     0.000124
Bajra         0.000124
Name: proportion, dtype: float64
