# All Imports

In [None]:
from glob import glob
import numpy as np
import os
import pandas as pd

### Find the Max Number of Timesteps for any Point

In [21]:

# Flatten one sample chunk into a wide table (one row per point, one column per
# variable and time step), save it, and report how many time steps each point has.

# Input chunk and the file the flattened table is written to. to_csv() needs a
# file path: a bare directory such as 'Karnataka_Datasets/' cannot be opened for writing.
sample_csv_path = 'Karnataka_Datasets/Across/Sample/Karnataka_Chunk_10_90.csv'
flattened_csv_path = os.path.join('Karnataka_Datasets', 'Karnataka_Chunk_10_90_Flattened.csv')

# Load your CSV
df = pd.read_csv(sample_csv_path)

# Convert 'Date' to datetime (format: DD-MM-YYYY)
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Drop duplicates to avoid extra time steps
df = df.drop_duplicates(subset=['Latitude', 'Longitude', 'Date'])

# Sort by Latitude, Longitude, and Date, then renumber the rows
df = df.sort_values(by=['Latitude', 'Longitude', 'Date']).reset_index(drop=True)

# Columns of interest for vegetation indices and bands
columns_of_interest = ['NDVI', 'B11', 'B12', 'B3', 'B4', 'B8', 'B8A', 'GCVI']

# TimeIndex = 1, 2, 3, ... in date order within each (Latitude, Longitude) point
df['TimeIndex'] = df.groupby(['Latitude', 'Longitude'])['Date'].rank(method='first').astype(int)

# Melt the dataframe to long format for pivoting
df_melt = df.melt(id_vars=['Latitude', 'Longitude', 'TimeIndex'],
                  value_vars=columns_of_interest,
                  var_name='Variable',
                  value_name='Value')

# Every TimeIndex from 1 up to the longest series found in the file
max_time_index = df['TimeIndex'].max()
time_index_range = pd.DataFrame({'TimeIndex': range(1, max_time_index + 1)})

# Create a grid of every Latitude, Longitude and TimeIndex combination
lat_lon_combinations = df[['Latitude', 'Longitude']].drop_duplicates()
full_grid = pd.merge(lat_lon_combinations, time_index_range, how='cross')

# Left-merge the melted data onto the grid so every point carries every time index
df_full = pd.merge(full_grid, df_melt, on=['Latitude', 'Longitude', 'TimeIndex'], how='left')

# Pivot to get variables as columns (VI/Band_timeIndex)
df_pivot = df_full.pivot(index=['Latitude', 'Longitude'],
                         columns=['Variable', 'TimeIndex'],
                         values='Value')

# Reindex columns to ensure all (Variable, TimeIndex) combos exist, in a fixed order
full_column_index = pd.MultiIndex.from_product(
    [columns_of_interest, range(1, max_time_index + 1)],
    names=['Variable', 'TimeIndex']
)
df_pivot = df_pivot.reindex(columns=full_column_index)

# Flatten the multi-level columns to names such as 'NDVI_1'
df_pivot.columns = [f'{var}_{i}' for var, i in df_pivot.columns]

# Reset the index for final output
df_pivot = df_pivot.reset_index()

# Merge Crop_Name back to the dataset
crop_name_df = df[['Latitude', 'Longitude', 'Crop_Name']].drop_duplicates()
df_pivot = df_pivot.merge(crop_name_df, on=['Latitude', 'Longitude'], how='left')

# Save the final result
df_pivot.to_csv(flattened_csv_path, index=False)

# Optional: Print counts of available time steps per (Latitude, Longitude) pair
point_counts = df.groupby(['Latitude', 'Longitude']).size().reset_index(name='TimeSteps')
print("\n🔢 Time steps per point (Top 10):")
print(point_counts.head(10))

# Optional: Check max time steps
print(f"\n📈 Max time steps for any point: {point_counts['TimeSteps'].max()}")


🔢 Time steps per point (Top 10):
    Latitude  Longitude  TimeSteps
0  13.086502  77.475513         70
1  13.086503  77.475486         70
2  13.086532  77.473382         70
3  13.086533  77.473390         70
4  13.086534  77.473396         70
5  13.086914  77.472185         70
6  13.086940  77.472175         70
7  13.086987  77.475678         70
8  13.086998  77.472010         70
9  13.087005  77.471977         70

📈 Max time steps for any point: 70


### Function: Flatten and Concatenate a CSV Directory

In [2]:

def process_file(filepath):
    """Flatten one chunk CSV into a wide table with one row per point.

    The CSV must provide 'Latitude', 'Longitude', 'Date', 'Crop_Name' and the
    eight variable columns listed in ``columns_of_interest`` below.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'Latitude', 'Longitude', then '<variable>_<k>' for every
        variable and every k from 1 to the longest series in the file, then
        'Crop_Name'. k numbers a point's observations in date order. Points
        with fewer observations hold NaN in the trailing columns. A file with
        no usable rows yields an empty frame with only the three
        non-variable columns.
    """
    point_cols = ['Latitude', 'Longitude']
    columns_of_interest = ['NDVI', 'B11', 'B12', 'B3', 'B4', 'B8', 'B8A', 'GCVI']

    df = pd.read_csv(filepath)

    # Parse date; values that cannot be parsed become NaT.
    df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True, errors='coerce')

    # Rows without a usable date cannot be ordered: rank() yields NaN for them
    # and the integer cast below would raise, so drop them up front.
    df = df.dropna(subset=['Date'])

    # One row per point and date, in chronological order per point.
    df = df.drop_duplicates(subset=point_cols + ['Date'])
    df = df.sort_values(by=point_cols + ['Date']).reset_index(drop=True)

    if df.empty:
        # Nothing to pivot; keep the key columns so a later concat still lines up.
        return pd.DataFrame(columns=point_cols + ['Crop_Name'])

    # Assign TimeIndex per (Latitude, Longitude) using date rank
    df['TimeIndex'] = df.groupby(point_cols)['Date'].rank(method='first').astype(int)
    max_time_index = df['TimeIndex'].max()

    # Melt to long format: one row per point, time step and variable
    df_melt = df.melt(id_vars=point_cols + ['TimeIndex'],
                      value_vars=columns_of_interest,
                      var_name='Variable',
                      value_name='Value')

    # Pivot: one column per variable per time step. No explicit point x time
    # grid is needed: pivot leaves absent cells as NaN, and the reindex below
    # adds any (Variable, TimeIndex) column that never occurred.
    df_pivot = df_melt.pivot(index=point_cols,
                             columns=['Variable', 'TimeIndex'],
                             values='Value')

    full_column_index = pd.MultiIndex.from_product(
        [columns_of_interest, range(1, max_time_index + 1)],
        names=['Variable', 'TimeIndex']
    )
    df_pivot = df_pivot.reindex(columns=full_column_index)

    # Flatten column names, e.g. ('NDVI', 3) -> 'NDVI_3'
    df_pivot.columns = [f'{var}_{i}' for var, i in df_pivot.columns]
    df_pivot = df_pivot.reset_index()

    # Merge crop name back
    crop_name_df = df[point_cols + ['Crop_Name']].drop_duplicates()
    df_pivot = df_pivot.merge(crop_name_df, on=point_cols, how='left')

    return df_pivot


# === Main logic ===

input_dir = 'Karnataka_Datasets/Across_QA/90/'  # change this as needed
output_path = os.path.join(input_dir, 'Karnataka_Merged_S2.csv')

# List all CSV files. The merged result is written into this same folder and
# also matches '*.csv', so it is excluded here; otherwise a re-run would try to
# process the previous output as if it were an input chunk. Sorting makes the
# row order of the merged file independent of the file-system listing order.
csv_files = sorted(
    path for path in glob(os.path.join(input_dir, '*.csv'))
    if os.path.abspath(path) != os.path.abspath(output_path)
)

# List to hold all processed DataFrames
processed_list = []

# Process each CSV
for file in csv_files:
    print(f'📂 Processing: {os.path.basename(file)}')
    processed_df = process_file(file)
    processed_list.append(processed_df)

if processed_list:
    # Merge all processed files
    final_df = pd.concat(processed_list, ignore_index=True)

    # Save final merged result
    final_df.to_csv(output_path, index=False)
    print(f'\n✅ All files processed and saved to: {output_path}')
else:
    # pd.concat raises on an empty list, so report the empty folder instead.
    print("⚠️ No files were processed.")

📂 Processing: Karnataka_Chunk_10_QA.csv
📂 Processing: Karnataka_Chunk_1_QA.csv
📂 Processing: Karnataka_Chunk_2_QA.csv
📂 Processing: Karnataka_Chunk_3_QA.csv
📂 Processing: Karnataka_Chunk_4_QA.csv
📂 Processing: Karnataka_Chunk_5_QA.csv
📂 Processing: Karnataka_Chunk_6_QA.csv
📂 Processing: Karnataka_Chunk_7_QA.csv
📂 Processing: Karnataka_Chunk_8_QA.csv
📂 Processing: Karnataka_Chunk_9_QA.csv

✅ All files processed and saved to: Karnataka_Datasets/Across_QA/90/Karnataka_Merged_S2.csv


### Step: Flatten Cloud-Probability CSVs onto a 5-Day Date Grid

In [9]:

# === Config ===
# Folder that is scanned for chunk CSVs (replace with your folder path);
# the merged table is saved inside that same folder.
input_dir = 'Karnataka_Datasets/Across/Cloud_Prob/'
output_path = os.path.join(input_dir, 'Karnataka_Merged_S2.csv')

# === Parameters ===
# Columns carried through the flattening step.
features = [
    'NDVI', 'GCVI',
    'B11', 'B12',
    'B3', 'B4', 'B8', 'B8A',
]

# === Helper Function ===
def process_file(filepath, feature_cols=None):
    """Flatten one chunk CSV into a wide table on a regular 5-day date grid.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read. It must provide 'Latitude', 'Longitude',
        'Date' and every column named in ``feature_cols``.
    feature_cols : list of str, optional
        Columns to spread over the date grid. Defaults to the notebook-level
        ``features`` list.

    Returns
    -------
    pandas.DataFrame
        'Latitude', 'Longitude', then one '<feature>_<YYYY-MM-DD>' column for
        every feature and every grid date, sorted by column name. Grid dates
        without any observation are kept as all-NaN columns. Several rows for
        the same point and date are combined by pivot_table's default
        aggregation (mean).

    Raises
    ------
    ValueError
        If no row of the file has a parseable date.
    """
    if feature_cols is None:
        feature_cols = features
    feature_cols = list(feature_cols)
    point_cols = ['Latitude', 'Longitude']

    df = pd.read_csv(filepath)

    # Clean lat/lon
    for coord in point_cols:
        df[coord] = df[coord].round(6)

    # Parse date; rows whose date cannot be parsed are discarded
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date'])
    if df.empty:
        raise ValueError(f'No rows with a parseable Date in {filepath}')

    # Full date range with 5-day interval, anchored at the earliest date in the file
    full_dates = pd.date_range(start=df['Date'].min(), end=df['Date'].max(), freq='5D')

    # Full lat/lon-date grid
    latlon = df[point_cols].drop_duplicates()
    full_index = latlon.merge(pd.DataFrame({'Date': full_dates}), how='cross')

    # Attach the observations to the grid; rows dated off the grid are not carried over
    df_merged = pd.merge(full_index, df[point_cols + ['Date'] + feature_cols],
                         on=point_cols + ['Date'], how='left')

    df_long = df_merged.melt(id_vars=point_cols + ['Date'],
                             value_vars=feature_cols, var_name='Variable', value_name='Value')

    # Pivot to wide format
    df_pivot = df_long.pivot_table(index=point_cols,
                                   columns=['Variable', 'Date'],
                                   values='Value')

    # pivot_table silently drops columns that are entirely NaN, which would
    # remove grid dates with no observation. Put every (feature, date) column
    # back so the output always spans the whole grid.
    all_columns = pd.MultiIndex.from_product([feature_cols, full_dates],
                                             names=['Variable', 'Date'])
    df_pivot = df_pivot.reindex(columns=all_columns)

    # Flatten column names
    df_pivot.columns = [f"{var}_{date.strftime('%Y-%m-%d')}" for var, date in df_pivot.columns]
    df_pivot = df_pivot.reindex(sorted(df_pivot.columns), axis=1)

    return df_pivot.reset_index()

# === Main logic ===
# The merged result is saved into input_dir and also matches '*.csv'; leave it
# out so a re-run does not report a spurious failure for the previous output.
# Sorting keeps the row order of the merged file reproducible.
csv_files = sorted(
    path for path in glob(os.path.join(input_dir, '*.csv'))
    if os.path.abspath(path) != os.path.abspath(output_path)
)
all_processed = []

for file in csv_files:
    print(f'📂 Processing: {os.path.basename(file)}')
    try:
        processed_df = process_file(file)
        all_processed.append(processed_df)
    except Exception as e:
        # Best effort: report the file that failed and carry on with the rest.
        print(f'❌ Failed to process {file}: {e}')

# Merge and save
if all_processed:
    final_df = pd.concat(all_processed, ignore_index=True)
    final_df.to_csv(output_path, index=False)
    print(f'\n✅ All files processed and saved to: {output_path}')
else:
    print("⚠️ No files were processed.")

📂 Processing: chunk_1_QA_CloudProb.csv

✅ All files processed and saved to: Karnataka_Datasets/Across/Cloud_Prob/Karnataka_Merged_S2.csv


### Step: List the Unique Dates in One Chunk

In [14]:

# Read one cloud-probability chunk
df = pd.read_csv('Karnataka_Datasets/Across/Cloud_Prob/chunk_1_QA_CloudProb.csv')

# Coerce the date column to datetime; values that cannot be parsed become NaT
# and the rows holding them are discarded.
df = (
    df.assign(Date=pd.to_datetime(df['Date'], errors='coerce'))
      .dropna(subset=['Date'])
)

# Show the distinct calendar dates in ascending order
unique_dates = df['Date'].dt.date.unique()
print(sorted(unique_dates))

[datetime.date(2021, 4, 1), datetime.date(2021, 4, 6), datetime.date(2021, 4, 11), datetime.date(2021, 4, 16), datetime.date(2021, 4, 21), datetime.date(2021, 4, 26), datetime.date(2021, 5, 1), datetime.date(2021, 5, 6), datetime.date(2021, 5, 11), datetime.date(2021, 5, 16), datetime.date(2021, 5, 21), datetime.date(2021, 5, 26), datetime.date(2021, 5, 31), datetime.date(2021, 6, 5), datetime.date(2021, 6, 10), datetime.date(2021, 6, 15), datetime.date(2021, 6, 20), datetime.date(2021, 6, 25), datetime.date(2021, 6, 30), datetime.date(2021, 7, 5), datetime.date(2021, 7, 10), datetime.date(2021, 7, 15), datetime.date(2021, 7, 20), datetime.date(2021, 7, 25), datetime.date(2021, 7, 30), datetime.date(2021, 8, 4), datetime.date(2021, 8, 9), datetime.date(2021, 8, 14), datetime.date(2021, 8, 19), datetime.date(2021, 8, 24), datetime.date(2021, 8, 29), datetime.date(2021, 9, 3), datetime.date(2021, 9, 8), datetime.date(2021, 9, 13), datetime.date(2021, 9, 18), datetime.date(2021, 9, 23), d

### Step: Build a Wide NDVI Time Series for a Single Chunk

In [3]:
# Read the chunk and parse its dates
df = pd.read_csv('Karnataka_Datasets/Across/Cloud_Prob/90/chunk_1_QA_CloudProb.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Snap coordinates to 6 decimals so the same point always compares equal
for coord in ('Latitude', 'Longitude'):
    df[coord] = df[coord].round(6)

# Every 5th day from 2021-04-01 through 2022-03-27
date_range = pd.date_range(start='2021-04-01', end='2022-03-27', freq='5D')

# One row per distinct point
unique_lat_lon = df[['Latitude', 'Longitude']].drop_duplicates()

# Pair every point with every grid date (point-major order)
full_df = unique_lat_lon.merge(
    pd.DataFrame({'Date': date_range}), how='cross'
)[['Date', 'Latitude', 'Longitude']]

# Attach the observed NDVI; grid cells without an observation stay NaN
merged_df = full_df.merge(
    df[['Latitude', 'Longitude', 'Date', 'NDVI']],
    how='left',
    on=['Latitude', 'Longitude', 'Date'],
)

# Wide format: one row per point, one column per date. The reindex puts back
# any grid date the pivot left out, filled with NaN.
pivot_df = merged_df.pivot_table(
    index=['Latitude', 'Longitude'],
    columns='Date',
    values='NDVI',
    aggfunc='first',
).reindex(columns=date_range, fill_value=np.nan)

# Column labels in NDVI_YYYY-MM-DD form
pivot_df.columns = [f'NDVI_{stamp.strftime("%Y-%m-%d")}' for stamp in pivot_df.columns]

# Turn the point index back into ordinary columns and write the CSV
pivot_df = pivot_df.reset_index()
pivot_df.to_csv('Karnataka_Datasets/Across/Cloud_Prob/90/Karnataka_Merged_S2.csv', index=False)

### Function: `process_csv`

In [5]:

def process_csv(file_path, date_range):
    """Turn one CSV file into a wide NDVI table with one row per point.

    Reads 'Latitude', 'Longitude', 'Date' and 'NDVI' from ``file_path`` and
    returns a frame with 'Latitude', 'Longitude' and one 'NDVI_YYYY-MM-DD'
    column for each entry of ``date_range``; dates without a value hold NaN.
    """
    key_cols = ['Latitude', 'Longitude']

    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])

    # Snap coordinates to 6 decimals so the same point always compares equal
    for coord in key_cols:
        df[coord] = df[coord].round(6)

    # Pair every distinct point with every requested date
    points = df[key_cols].drop_duplicates()
    grid = points.merge(pd.DataFrame({'Date': date_range}), how='cross')
    grid = grid[['Date'] + key_cols]

    # Attach the observed NDVI; grid cells without an observation stay NaN
    observed = df[key_cols + ['Date', 'NDVI']]
    filled = grid.merge(observed, how='left', on=key_cols + ['Date'])

    # Wide format, then put back any requested date the pivot left out
    wide = filled.pivot_table(
        index=key_cols,
        columns='Date',
        values='NDVI',
        aggfunc='first',
    )
    wide = wide.reindex(columns=date_range, fill_value=np.nan)

    # Column labels in NDVI_YYYY-MM-DD form
    wide.columns = [f'NDVI_{stamp.strftime("%Y-%m-%d")}' for stamp in wide.columns]
    return wide.reset_index()

# Configuration
INPUT_DIR = 'Karnataka_Datasets/Across/Cloud_Prob/90/'  # Directory containing all CSV files
OUTPUT_DIR = 'Karnataka_Datasets/Across/Cloud_Prob/90/'          # Directory to save merged result
MERGED_FILENAME = 'Karnataka_Merged_S2.csv'
merged_path = os.path.join(OUTPUT_DIR, MERGED_FILENAME)

# Generate full date range
date_range = pd.date_range(start='2021-04-01', end='2022-03-27', freq='5D')

# Input and output share a folder, so the merged file itself matches '*.csv'.
# It has no 'Date' column and must not be fed back into process_csv. Sorting
# fixes the processing order, which decides which row keep='first' retains below.
input_files = sorted(
    path for path in glob(os.path.join(INPUT_DIR, '*.csv'))
    if os.path.abspath(path) != os.path.abspath(merged_path)
)

# Process all CSV files
all_dfs = []
for csv_file in input_files:
    print(f"Processing {os.path.basename(csv_file)}...")
    df = process_csv(csv_file, date_range)
    all_dfs.append(df)

if all_dfs:
    # Combine all DataFrames
    merged_df = pd.concat(all_dfs, ignore_index=True)

    # Remove duplicates (if any): a point present in several chunks is kept once
    merged_df = merged_df.drop_duplicates(subset=['Latitude', 'Longitude'], keep='first')

    # Save merged result
    merged_df.to_csv(merged_path, index=False)
    print(f"Merged file saved to {merged_path}")
else:
    # pd.concat raises on an empty list, so report the empty folder instead.
    print(f"No CSV files found in {INPUT_DIR}")

Processing chunk_10_QA_CloudProb.csv...
Processing chunk_11_QA_CloudProb.csv...
Processing chunk_12_QA_CloudProb.csv...
Processing chunk_13_QA_CloudProb.csv...
Processing chunk_14_QA_CloudProb.csv...
Processing chunk_15_QA_CloudProb.csv...
Processing chunk_16_QA_CloudProb.csv...
Processing chunk_1_QA_CloudProb.csv...
Processing chunk_2_QA_CloudProb.csv...
Merged file saved to Karnataka_Datasets/Across/Cloud_Prob/90/Karnataka_Merged_S2.csv
