In [1]:
import glob
import os
import re

import numpy as np
import pandas as pd

In [None]:
# Merge every CSV in `input_dir` into one table, rename the NDVI time-series
# columns to GCVI_1..GCVI_n, order the columns, and save the result.

# Path to the directory with CSV files
input_dir = "Karnataka_Datasets/Across/Begalavi/SAR_NDVI/"
# The merged table is written back into the same directory
output_path = os.path.join(input_dir, "Begalavi_SAR_NDVI.csv")

# Collect the input files in a deterministic order (glob order depends on the
# file system). The merged output lives in the same directory, so it is
# excluded; otherwise re-running this cell would read its own previous result
# back in as an input.
all_files = sorted(
    path
    for path in glob.glob(os.path.join(input_dir, "*.csv"))
    if os.path.abspath(path) != os.path.abspath(output_path)
)
if not all_files:
    # pd.concat on an empty list only reports "No objects to concatenate"
    raise FileNotFoundError(f"No input CSV files found in {input_dir!r}")

# Read and concatenate all CSV files
df_list = [pd.read_csv(path) for path in all_files]
df = pd.concat(df_list, ignore_index=True)

# Drop 'system:index' when present
df = df.drop(columns=['system:index'], errors='ignore')

# Column names before any renaming
columns = df.columns


def _columns_by_index(names, pattern):
    """Return the names matching `pattern`, ordered by the integer in group 1.

    `pattern` is applied with `re.match`, i.e. anchored at the start of each
    name. Names that do not match (for example a 'VH_mean' column when the
    pattern is 'VH_(\\d+)') are left out instead of failing on `None.group`.
    """
    candidates = ((re.match(pattern, name), name) for name in names)
    indexed = [(int(found.group(1)), name) for found, name in candidates if found]
    # Sort on the number only so equal indices keep their original order
    return [name for _, name in sorted(indexed, key=lambda pair: pair[0])]


# NDVI columns (e.g., '0_gapfilled_NDVI_lsc') in numeric order
ndvi_cols = _columns_by_index(columns, r"(\d+)_gapfilled_NDVI_lsc$")

# Rename to GCVI_{i+1}
# NOTE(review): the source columns are NDVI but are renamed GCVI_*; the later
# de-duplication cell looks for GCVI_* columns - confirm the naming is intended.
ndvi_renamed = {col: f'GCVI_{i}' for i, col in enumerate(ndvi_cols, start=1)}

# VH columns (e.g., 'VH_1', 'VH_2') in numeric order
vh_cols = _columns_by_index(columns, r'VH_(\d+)')

# VV columns (e.g., 'VV_1', 'VV_2') in numeric order
vv_cols = _columns_by_index(columns, r'VV_(\d+)')

# Apply renaming
df = df.rename(columns=ndvi_renamed)

# Final columns in desired order; raises KeyError if 'Crop_Name' or '.geo' is absent
final_columns = ['Crop_Name', '.geo'] + list(ndvi_renamed.values()) + vh_cols + vv_cols

# Reorder columns (any column not listed above is dropped)
df = df[final_columns]

# Save the result
df.to_csv(output_path, index=False)

# Remove rows whose VV, VH, or GCVI time series duplicates an earlier row

In [None]:
# Read the table whose repeated time series are to be removed
df = pd.read_csv('your_file.csv')

# Build one column list per time series group, selected by name prefix
vv_cols, vh_cols, gcvi_cols = (
    [name for name in df.columns if name.startswith(prefix)]
    for prefix in ('VV_', 'VH_', 'GCVI_')
)

# Helper: mark all duplicates except the first
def mark_duplicates(df, cols):
    """Flag rows whose values in `cols` repeat those of an earlier row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect; it is not modified.
    cols : list of str
        Columns that together form the sequence compared between rows.

    Returns
    -------
    pandas.Series
        Boolean Series on `df`'s index: False for the first row carrying a
        given sequence, True for every later row that repeats it. All False
        when `cols` is empty, because there is nothing to compare.
    """
    if len(cols) == 0:
        # A group with no columns must not flag (and so drop) any row
        return pd.Series(False, index=df.index)
    # Vectorised comparison of the selected columns; avoids a Python-level
    # pass that turns every row into a tuple
    return df.duplicated(subset=cols, keep='first')

# Flag, group by group, the rows that repeat an earlier row's sequence
vv_dup, vh_dup, gcvi_dup = (
    mark_duplicates(df, group_cols)
    for group_cols in (vv_cols, vh_cols, gcvi_cols)
)

# A row is kept only when none of its three sequences is a repeat
mask = ~vv_dup & ~vh_dup & ~gcvi_dup

# Keep the surviving rows and write them to disk
filtered_df = df[mask]
filtered_df.to_csv('filtered_unique_timeseries.csv', index=False)

print("✅ Saved filtered file with first occurrences kept as 'filtered_unique_timeseries.csv'")
