In [1]:
import numpy as np
import pandas as pd
import glob
import os
import re

In [2]:
# Path to the directory with CSV files
input_dir = 'Karnataka_Datasets/Across/Kharif/2019-20/SAR_GCVI/AEZ_6/M/'
all_files = glob.glob(os.path.join(input_dir, "*.csv"))


def sort_by_index(cols, pattern):
    """Return ``cols`` sorted by the integer captured by group 1 of ``pattern``.

    Parameters
    ----------
    cols : iterable of str
        Column names to order.
    pattern : str
        Regular expression applied with ``re.match`` (anchored at the start
        of the name); its first group must capture the numeric index.

    Raises
    ------
    ValueError
        If a column does not match ``pattern``. Raising a named error here
        replaces the opaque ``'NoneType' object has no attribute 'group'``.
    """
    def index_of(col):
        found = re.match(pattern, col)
        if found is None:
            raise ValueError(
                f"Column {col!r} has no numeric index matching {pattern!r}"
            )
        return int(found.group(1))

    return sorted(cols, key=index_of)


def standardize_sar_gcvi_columns(df):
    """Return a copy of ``df`` with the standard column layout.

    Steps:
    1. Drop ``system:index`` if present.
    2. Rename ``<n>_gapfilled_NDVI_lsc`` columns to ``GCVI_1..GCVI_k`` in
       ascending ``n`` order.
       NOTE(review): the source columns are NDVI but the new label is GCVI;
       confirm this relabelling is intended.
    3. Keep only ``Crop_Name``, ``.geo`` (if present), the GCVI columns and the
       ``VH_<n>`` / ``VV_<n>`` columns (each sorted numerically). Every other
       column is discarded.

    The function is idempotent: a frame that already carries ``GCVI_<n>``
    columns (i.e. a file processed by a previous run) keeps them instead of
    silently losing them.

    Raises
    ------
    ValueError
        If an NDVI/VH/VV column has no numeric index, or if the frame has both
        raw NDVI columns and ``GCVI_<n>`` columns (the rename would create
        duplicate column names).
    """
    df = df.drop(columns=['system:index'], errors='ignore')

    # Get and sort NDVI columns (e.g., '0_gapfilled_NDVI_lsc')
    ndvi_cols = sort_by_index(
        [col for col in df.columns if col.endswith('_gapfilled_NDVI_lsc')],
        r"(\d+)_gapfilled_NDVI_lsc",
    )

    # GCVI columns left behind by an earlier run of this cell
    existing_gcvi = sort_by_index(
        [col for col in df.columns if re.match(r'GCVI_\d+', col)],
        r'GCVI_(\d+)',
    )
    if ndvi_cols and existing_gcvi:
        raise ValueError(
            "File has both raw NDVI columns and GCVI_<n> columns; "
            "renaming would create duplicate column names"
        )

    # Rename to GCVI_{i+1}
    ndvi_renamed = {col: f'GCVI_{i+1}' for i, col in enumerate(ndvi_cols)}
    df = df.rename(columns=ndvi_renamed)
    gcvi_cols = list(ndvi_renamed.values()) or existing_gcvi

    # Get and sort VH / VV columns (e.g., 'VH_1', 'VH_2')
    vh_cols = sort_by_index(
        [col for col in df.columns if col.startswith('VH_')], r'VH_(\d+)'
    )
    vv_cols = sort_by_index(
        [col for col in df.columns if col.startswith('VV_')], r'VV_(\d+)'
    )

    # Construct final column order
    final_columns = ['Crop_Name']
    if '.geo' in df.columns:
        final_columns.append('.geo')
    final_columns += gcvi_cols + vh_cols + vv_cols

    # Reorder columns safely (names absent from the frame are skipped)
    return df[[col for col in final_columns if col in df.columns]]


for file_path in all_files:
    try:
        df = standardize_sar_gcvi_columns(pd.read_csv(file_path))

        # Save in place; only reached when the whole transformation succeeded
        df.to_csv(file_path, index=False)
        print(f"Processed and saved: {file_path}")

    except Exception as e:
        # Best effort: report the failing file and continue with the rest
        print(f"Error processing {file_path}: {e}")

Processed and saved: Karnataka_Datasets/Across/Kharif/2019-20/SAR_GCVI/AEZ_6/M\bijapura_merged.csv


# Rename VV and VH Columns

In [5]:
# Base directory containing all CSVs.
# It is the root passed to os.walk below, so CSV files in its sub-directories
# are visited too, and each one is overwritten in place.
base_dir = 'Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2021_22/AEZ_6/'

# Pull the date portion out of a column name
def extract_date(col):
    """Return the first ``YYYY-MM-DD`` substring found in ``col``.

    Parameters
    ----------
    col : str
        Column name to scan.

    Returns
    -------
    str or None
        The matched date text, or ``None`` when ``col`` contains no such
        substring. The digits are not validated as a real calendar date.
    """
    found = re.search(r'\d{4}-\d{2}-\d{2}', col)
    if found is None:
        return None
    return found.group()

# Traverse all directories and renumber the dated VV_/VH_ columns of each CSV.
# Within each band the columns are ordered by the date embedded in their name
# and renamed to VV_1..VV_n / VH_1..VH_n. ISO-style YYYY-MM-DD strings sort
# chronologically as plain text, so no date parsing is needed.
for root, _, files in os.walk(base_dir):
    for file in files:
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(root, file)
        print(f"Processing: {file_path}")
        try:
            df = pd.read_csv(file_path)

            rename_map = {}
            for prefix in ('VV_', 'VH_'):
                band_cols = [col for col in df.columns if col.startswith(prefix)]
                dated_cols = [col for col in band_cols if extract_date(col) is not None]

                if not dated_cols:
                    # No dates in this band: nothing to do (e.g. the file was
                    # already renumbered by an earlier run). Sorting such
                    # columns by a None key would raise an opaque TypeError.
                    continue

                if len(dated_cols) != len(band_cols):
                    # A mix of dated and undated names could rename a dated
                    # column onto an existing 'VV_1'-style name.
                    undated = [col for col in band_cols if col not in dated_cols]
                    raise ValueError(
                        f"{prefix} columns mix dated and undated names: {undated}"
                    )

                for i, col in enumerate(sorted(dated_cols, key=extract_date)):
                    rename_map[col] = f'{prefix}{i+1}'

            if not rename_map:
                # Leave the file untouched rather than rewriting it unchanged
                print(f"Skipped (no dated VV/VH columns to rename): {file_path}")
                continue

            # Apply renaming
            df = df.rename(columns=rename_map)

            # Save back to the same file
            df.to_csv(file_path, index=False)

        except Exception as e:
            # Best effort: report the failing file and continue with the rest
            print(f"Failed to process {file_path}: {e}")

Processing: Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2021_22/AEZ_6/bidar_merged.csv
Processing: Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2021_22/AEZ_6/dharwad_merged.csv
Processing: Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2021_22/AEZ_6/raichur_merged.csv


# Remove Duplicate VV, VH and GCVI Rows

In [9]:
# Directory containing CSV files.
# Rebinds `input_dir` and `all_files` from the first processing cell.
# The "*.csv" pattern is not recursive: only files directly inside
# `input_dir` are matched, and each one is overwritten in place below.
input_dir = 'Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2021_22/AEZ_6/'
all_files = glob.glob(os.path.join(input_dir, "*.csv"))

# Helper function to mark duplicates in a group
def mark_duplicates(df, cols):
    """Flag rows whose values across ``cols`` repeat an earlier row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    cols : list of str
        Columns that together form the sequence compared between rows.

    Returns
    -------
    pandas.Series of bool
        Indexed like ``df``. ``True`` marks every occurrence after the first
        of an identical sequence. With an empty ``cols`` nothing is flagged.
    """
    if len(cols) == 0:
        # No columns to compare: return an all-False mask aligned with df
        return pd.Series(False, index=df.index, dtype=bool)

    # Built-in vectorized comparison; avoids building a Python tuple per row
    return df.duplicated(subset=list(cols), keep='first')

for file_path in all_files:
    try:
        df = pd.read_csv(file_path)

        # Build one "repeats an earlier row" flag per row, OR-ing in the
        # result for each time-series group (VV, VH, GCVI) that has columns.
        repeated = pd.Series([False] * len(df))
        for prefix in ('VV_', 'VH_', 'GCVI_'):
            series_cols = [name for name in df.columns if name.startswith(prefix)]
            if series_cols:
                repeated = repeated | mark_duplicates(df, series_cols)

        # Keep only rows that are not a duplicate in any group
        mask = ~repeated
        filtered_df = df[mask]

        # Overwrite the source file with the filtered rows
        filtered_df.to_csv(file_path, index=False)
        print(f"✅ Filtered and saved: {file_path}")

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")

✅ Filtered and saved: Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2021_22/AEZ_6\bidar_merged.csv
✅ Filtered and saved: Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2021_22/AEZ_6\dharwad_merged.csv
✅ Filtered and saved: Karnataka_Datasets/Across/Kharif/Cropland_Masked/Cropland_Mask_2021_22/AEZ_6\raichur_merged.csv
