- **Originally created by: Samuel Hobbs on 12/8/2024**
- **Last edited by: Ben Harris 1/9/2024**

## The script’s primary goal is to:

- Read hyperspectral .tif files from designated directories.

- Check that each .tif file matches a valid resolution range (e.g., 4.0 - 7.0 currently).

- Verify that each .tif file has a label in a corresponding CSV (via sample number).

- Convert each .tif file into a pandas DataFrame where each row represents one pixel, and columns represent the spectral bands plus additional metadata (e.g., filename, label, resolution shape).

- Concatenate all the individual DataFrames into a single large DataFrame of pixel-level data for all valid .tif files.

- Save the resulting DataFrame (and an accompanying file-UID map) as a CSV file.

- Once that main CSV is saved, we add pixel-level coordinates within each image; these are used for displaying results and for post-processing morphology.



## Setup


### Installs

In [None]:
# rasterio is the library used further down to open the .tif files (see tif_to_arr).
!pip install rasterio



### Imports

In [None]:
import rasterio
from rasterio.plot import show
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join, basename, normpath
from google.colab import drive
from ast import literal_eval

### Connect Google Drive

In [None]:
# Mount Google Drive so that the paths under /content/drive/ used below are accessible.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### File Paths
NOTE: Please update these paths as needed!

In [None]:
# Input directories (all located under the mounted Google Drive)
main_dir    = '/content/drive/Shareddrives/Land_Classification_Training_shared/Land_Classification_training_work/'
samples_dir = join(main_dir, 'Samples/')  # Directory where the .tif files & resolution CSV are

tif_dir      = join(samples_dir, 'ANG_L2A_v2_sample_subset')     # Directory containing all .tif files
res_csv_path = join(samples_dir, 'ANG_L2A_v2_sample_subset_resolutions.csv')  # CSV with columns: file_name, x_res, y_res
labels_csv   = join(main_dir, 'labels.csv')  # CSV containing labels for each sample_num

# Output CSV file names and paths:
#   samples.csv -> pixel-level DataFrame, files.csv -> file/UID map
csv_dir                 = join(main_dir, 'Updated/')
csv_file_name_samples   = 'samples.csv'
csv_file_name_uid       = 'files.csv'
path_to_save_sample_csv = join(csv_dir, csv_file_name_samples)
path_to_save_uid_csv    = join(csv_dir, csv_file_name_uid)

# Label columns in labels.csv (passed to get_all_data_single_dir as
# label_col_1 / label_col_2 in the main run)
label_col_sample_num = 'Sample_num'
label_col_label      = 'Class'

## Functions


### General Functions

In [None]:
def get_labels(
  file_path,
  col1='Sample_num',
  col2='Class',
  name_col_id='Sample_num',
  name_col_label='Label'
):
  """
  Load the label table from a CSV and return it as a two-column DataFrame.

  Only `col1` and `col2` are kept from the CSV; they are renamed to
  `name_col_id` and `name_col_label`, and the rows are ordered by ascending ID.

  Parameters:
    file_path: Path to the labels CSV.
    col1: Name of the CSV column holding the sample number.
    col2: Name of the CSV column holding the label/class.
    name_col_id: Name given to the sample-number column in the result.
    name_col_label: Name given to the label column in the result.

  Returns:
    A pandas DataFrame with columns [name_col_id, name_col_label], sorted by
    name_col_id in ascending order.
  """
  raw_table   = pd.read_csv(file_path)
  two_columns = raw_table[[col1, col2]]
  renamed     = two_columns.rename(columns={col1: name_col_id, col2: name_col_label})
  return renamed.sort_values(by=name_col_id, ascending=True)

In [None]:
def trim_data_files(
  filenames,
  labels,
  sample_num_col_name='Sample_num',
  verbose=True
):
  """
  Keeps only the filenames whose leading integer matches a label's sample number.

  Both inputs must be sorted ascending by sample number: the function walks the
  two sequences in lockstep (like a merge), so several files may share one
  sample number, and files without a label are discarded.

  When `verbose` is True (the default, matching the historical behaviour) a
  debug line is printed for every comparison:
    - filename_num   : the leading integer from the filename
    - label_num      : the label number from the labels DataFrame
    - count_label    : index tracking which label we're on
    - count_filenames: index tracking which file we're on
  Pass `verbose=False` to silence this output on large directories.

  Parameters:
    filenames: Sorted list or array of filename strings (ex: '2_ang20231028t095542_004.tif').
    labels: Pandas DataFrame of labels, sorted by sample_num_col_name.
    sample_num_col_name: Column in `labels` that matches the file's leading integer.
    verbose: Whether to print the debug trace.

  Returns:
    A numpy array of matching filenames, in their original order.

  Raises:
    AssertionError: If no filename matches any label.
    ValueError: If a filename prefix or label value cannot be converted to int.
  """
  trimmed_filenames = []

  size_label      = labels.shape[0]
  size_filenames  = len(filenames)

  count_label     = 0
  count_filenames = 0

  if verbose:
    # Show how many total files and labels we have
    print("[DEBUG] Starting trim_data_files:")
    print(f"        Total Filenames: {size_filenames}")
    print(f"        Total Labels   : {size_label}")

  while count_label < size_label and count_filenames < size_filenames:
    current_file = filenames[count_filenames]

    # Sample number is the integer prefix before the first '_'
    filename_num = int(current_file.split('_')[0])

    # Current label number
    label_num = int(labels[sample_num_col_name].iloc[count_label])

    if verbose:
      print(f"[DEBUG] Comparing filename: '{current_file}' "
            f"(filename_num={filename_num}) with label_num={label_num} "
            f"[count_label={count_label}, count_filenames={count_filenames}]")

    if filename_num == label_num:
      # Match: keep the file and advance only the file index, in case more
      # files share this sample number.
      trimmed_filenames.append(current_file)
      count_filenames += 1

    elif filename_num < label_num:
      # The file's sample number is behind the current label, so no label
      # exists for it. Discard the file and move on.
      count_filenames += 1

    else:  # filename_num > label_num
      # The file has overshot this label; move to the next label.
      count_label += 1

  # Raise explicitly instead of using `assert`, which is stripped under
  # `python -O`. AssertionError is kept so existing callers see the same type.
  if not trimmed_filenames:
    raise AssertionError(
      "No filenames match the given labels. Debug Info:\n"
      f"  # Filenames: {size_filenames}\n"
      f"  # Labels: {size_label}\n"
      "  Possibly the file naming convention or sample_num in CSV do not align."
    )

  return np.array(trimmed_filenames)


In [None]:
def check_res(
  filename,
  res_dict,
  min_res_bound=4.0,
  max_res_bound=7.0
):
  """
  Checks if the x/y resolution of a given filename is within the min/max bound.
  The resolution is looked up in a dictionary (filename -> (xres, yres)) that
  was loaded from a CSV file, rather than read from the .tif via Rasterio.

  Parameters:
    filename: The .tif filename (string).
    res_dict: Dictionary with structure {filename: (xres, yres)}.
    min_res_bound: Minimum allowed resolution (inclusive).
    max_res_bound: Maximum allowed resolution (inclusive).

  Returns:
    True if both xres and yres lie within [min_res_bound, max_res_bound];
    False if either is out of range, is NaN, or the filename is not in res_dict.
  """
  if filename not in res_dict:
    return False

  xres, yres = res_dict[filename]

  # Written as positive range checks so that NaN fails: every comparison with
  # NaN is False, which the previous "out of bounds" form treated as in range.
  x_ok = min_res_bound <= xres <= max_res_bound
  y_ok = min_res_bound <= yres <= max_res_bound
  return bool(x_ok and y_ok)


In [None]:
def tif_to_arr(filepath):
  """
  Reads every band of a .tif file through Rasterio.

  Parameters:
    filepath: Full path to the .tif file.

  Returns:
    A tuple (data_3D, shape_str):
      data_3D   -> numpy array of shape (bands, rows, cols)
      shape_str -> str() of the dataset's shape, e.g. '(10, 10)'
  """
  # The context manager closes the dataset once both values are read.
  with rasterio.open(filepath) as src:
    pixel_cube = src.read()
    shape_text = str(src.shape)
  return pixel_cube, shape_text


In [None]:
def convert_3D_to_1D(data_3D):
  """
  Turns a (bands, rows, cols) cube into a per-pixel table of band values.

  Parameters:
    data_3D: NumPy array of shape (num_bands, num_rows, num_cols).

  Returns:
    NumPy array of shape (num_rows * num_cols, num_bands); pixel i of the
    row-major flattened image is row i of the result.
  """
  band_count = data_3D.shape[0]
  # Collapse the two spatial axes into one: (bands, rows*cols)
  bands_by_pixel = data_3D.reshape(band_count, -1)
  # Swap axes so each row is one pixel: (rows*cols, bands)
  return bands_by_pixel.T


In [None]:
def get_filenames(directory_path):
  """
  Lists the .tif files directly inside a directory (no subfolders), sorted by
  the integer prefix before the first '_' in the filename.

  Files sharing the same integer prefix are further ordered by filename, so
  the result (and any IDs assigned from it downstream) is reproducible
  regardless of the order in which the filesystem lists the directory.

  Parameters:
    directory_path: Path to the directory containing .tif files.

  Returns:
    A sorted np.array of filenames (names only, not full paths). The '.tif'
    extension is matched case-insensitively.

  Raises:
    ValueError: If a .tif filename does not start with an integer prefix.
  """
  tif_files = [
    name for name in listdir(directory_path)
    if isfile(join(directory_path, name)) and name.lower().endswith('.tif')
  ]
  # Primary key: numeric sample prefix; secondary key: full name (tie-break).
  tif_files.sort(key=lambda name: (int(name.split('_')[0]), name))
  return np.array(tif_files)


In [None]:
def make_pandas_dataframe(
  dir_path,
  filename,
  sample_num,
  label=pd.NA,
  uid=0
):
  """
  Converts one .tif file (with known valid resolution) to a pandas DataFrame
  with one row per pixel.

  The number of spectral bands is taken from the file itself. Each row holds:
    - All frequency columns [frq0, frq1, ...],
    - Sample_num (as passed in),
    - Label,
    - Shape of the overall image (string form, e.g. '(11, 11)'),
    - Unique file ID (uid),
    - The filename itself,
    - The flattened pixel index and its (row, col) coordinates.

  Parameters:
    dir_path: Directory where the .tif file resides.
    filename: The .tif file name.
    sample_num: The integer sample number extracted from the filename (leading prefix).
    label: The label/class for all pixels in this image (default: pd.NA).
    uid: A unique integer ID for the file (default: 0).

  Returns:
    A pandas DataFrame of shape (num_pixels, num_bands + 9) with columns, in order:
      [frq0, frq1, ..., 'Sample_num', 'Label', 'Shape', 'File_UID_Num', 'File',
      'img_pxl_index', 'row_coord', 'col_coord', 'img_pos'].
  """
  filepath_full = join(dir_path, filename)

  # data_3D has shape (bands, rows, cols); shape_str is the '(rows, cols)' text
  data_3D, shape_str = tif_to_arr(filepath_full)

  # Take the dimensions straight from the array instead of parsing shape_str
  num_bands, _, cols = data_3D.shape

  # Flatten to (rows*cols, bands)
  data_2D = convert_3D_to_1D(data_3D)

  # Create DataFrame with dynamically named band columns
  freq_columns = [f"frq{i}" for i in range(num_bands)]
  df = pd.DataFrame(data_2D, columns=freq_columns)

  # Basic metadata (same value for every pixel of this file)
  df['Sample_num']   = sample_num
  df['Label']        = label
  df['Shape']        = shape_str
  df['File_UID_Num'] = uid
  df['File']         = filename

  # The default RangeIndex of df is the row-major flattened pixel index
  df['img_pxl_index'] = df.index

  # Recover (row, col) from the flattened index
  df['row_coord'] = df['img_pxl_index'] // cols
  df['col_coord'] = df['img_pxl_index'] %  cols

  # Combined (row, col) tuple per pixel
  df['img_pos']   = list(zip(df['row_coord'], df['col_coord']))

  return df

### Main Function Call

In [None]:
def get_all_data_single_dir(
  tif_directory,
  label_csv_path,
  res_dict,
  min_res=4.0,
  max_res=7.0,
  label_col_1='Sample_num',
  label_col_2='Class'
):
  """
  Builds the pixel-level dataset for every labelled, in-range .tif file in one directory.

  Steps: load the labels CSV, list the .tif files, keep the files whose sample
  number has a label, drop the files whose resolution (from `res_dict`) is
  outside [min_res, max_res], convert each remaining file to a per-pixel
  DataFrame, and concatenate the results.

  Parameters:
    tif_directory: Directory containing the .tif files.
    label_csv_path: Path to the labels CSV.
    res_dict: Dictionary {filename: (xres, yres)} used for the resolution check.
    min_res: Minimum allowed resolution, passed to check_res.
    max_res: Maximum allowed resolution, passed to check_res.
    label_col_1: Name of the sample-number column in the labels CSV.
    label_col_2: Name of the label/class column in the labels CSV.

  Returns:
    (df_all, df_files):
      df_all   -> all per-file pixel DataFrames concatenated with a fresh index.
      df_files -> one row per included file, columns ['Label', 'UID', 'Filename'].

  Raises:
    ValueError: If no file passes the resolution check.
  """
  print("=== Starting Data Aggregation ===")

  # Labels, normalised to the column names 'Sample_num' / 'Label'
  labels = get_labels(
    file_path=label_csv_path,
    col1=label_col_1,
    col2=label_col_2,
    name_col_id='Sample_num',
    name_col_label='Label'
  )

  # Every .tif in the directory, sorted by sample number
  filenames = get_filenames(tif_directory)

  print("Trimming filenames to match labels...")
  trimmed_filenames = trim_data_files(filenames, labels, 'Sample_num')
  print(f"Number of files with matching labels: {len(trimmed_filenames)} / {len(filenames)}")

  pixel_frames = []
  file_records = []
  next_uid     = 1  # UIDs are assigned only to files that are actually included

  for fname in trimmed_filenames:
    # Sample number is the integer prefix of the filename
    sample_num = int(fname.split('_')[0])

    # First label row whose sample number equals this file's
    is_this_sample = labels['Sample_num'] == sample_num
    label_value    = labels.loc[is_this_sample, 'Label'].values[0]

    # Skip files whose CSV-recorded resolution is missing or out of range
    if not check_res(fname, res_dict, min_res_bound=min_res, max_res_bound=max_res):
      continue

    pixel_frames.append(
      make_pandas_dataframe(
        dir_path=tif_directory,
        filename=fname,
        sample_num=sample_num,
        label=label_value,
        uid=next_uid
      )
    )
    file_records.append((label_value, next_uid, fname))
    next_uid += 1

  if len(pixel_frames) == 0:
    raise ValueError("No valid .tif files passed the resolution check.")

  # One big pixel-level table plus the file-level UID map
  df_all   = pd.concat(pixel_frames, ignore_index=True)
  df_files = pd.DataFrame(file_records, columns=['Label','UID','Filename'])

  print("=== Data Aggregation Complete ===")
  return df_all, df_files

## Main Run



In [None]:
#############################################
#               LOAD RESOLUTIONS
#############################################
# Resolutions are read from a pre-computed CSV instead of opening each .tif
# with Rasterio (a bit quicker, given that the metadata is already available).
#
# The CSV (res_csv_path) must have these columns: file_name, x_res, y_res
# and file_name must match the .tif filename exactly.

df_resolutions = pd.read_csv(res_csv_path)

# res_dict: {filename: (xres, yres)}; a later duplicate row overrides an earlier one
res_dict = {
  name: (float(x_val), float(y_val))
  for name, x_val, y_val in zip(
    df_resolutions['file_name'],
    df_resolutions['x_res'],
    df_resolutions['y_res']
  )
}



In [None]:
# Build the pixel-level DataFrame (`df`) and the file/UID map (`valid_files`)
# from the .tif files in `tif_dir`, using the single-directory approach.
# CAUTION: slow - this took 22 to 41 minutes to run; the variation may be due
# to Google Drive/Colab, but that is unconfirmed.
df, valid_files = get_all_data_single_dir(
    tif_directory   = tif_dir,
    label_csv_path  = labels_csv,
    res_dict        = res_dict,
    min_res         = 4.0,
    max_res         = 7.0,
    label_col_1     = label_col_sample_num,
    label_col_2     = label_col_label
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[DEBUG] Comparing filename: '5579_ang20231110t081307_010.tif' (filename_num=5579) with label_num=5577 [count_label=844, count_filenames=1607]
[DEBUG] Comparing filename: '5579_ang20231110t081307_010.tif' (filename_num=5579) with label_num=5579 [count_label=845, count_filenames=1607]
[DEBUG] Comparing filename: '5579_ang20231110t082850_005.tif' (filename_num=5579) with label_num=5579 [count_label=845, count_filenames=1608]
[DEBUG] Comparing filename: '5579_ang20231122t080745_003.tif' (filename_num=5579) with label_num=5579 [count_label=845, count_filenames=1609]
[DEBUG] Comparing filename: '5580_ang20231110t081307_010.tif' (filename_num=5580) with label_num=5579 [count_label=845, count_filenames=1610]
[DEBUG] Comparing filename: '5580_ang20231110t081307_010.tif' (filename_num=5580) with label_num=5580 [count_label=846, count_filenames=1610]
[DEBUG] Comparing filename: '5580_ang20231122t080745_003.tif' (filename_num=5580) w

### View Data Frames

In [None]:
df

Unnamed: 0,frq0,frq1,frq2,frq3,frq4,frq5,frq6,frq7,frq8,frq9,...,frq368,Sample_num,Label,Shape,File_UID_Num,File,img_pxl_index,row_coord,col_coord,img_pos
0,0.029069,0.033152,0.035469,0.041350,0.043226,0.045428,0.048409,0.049431,0.052631,0.054922,...,0.171577,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,0,0,0,"(0, 0)"
1,0.030087,0.034019,0.035511,0.041382,0.042764,0.045505,0.049077,0.049998,0.052073,0.055954,...,0.193503,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,1,0,1,"(0, 1)"
2,0.037750,0.037187,0.037512,0.041911,0.046044,0.045247,0.046976,0.050844,0.054013,0.057165,...,0.170450,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,2,0,2,"(0, 2)"
3,0.037750,0.037187,0.037512,0.041911,0.046044,0.045247,0.046976,0.050844,0.054013,0.057165,...,0.170449,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,3,0,3,"(0, 3)"
4,0.031720,0.040508,0.042270,0.042718,0.043992,0.049516,0.051002,0.052987,0.057484,0.058246,...,0.180372,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,4,0,4,"(0, 4)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448853,0.008262,0.007249,0.010430,0.013172,0.014590,0.016147,0.017221,0.016435,0.020677,0.020636,...,0.070380,28499,Natural Wooded Land,"(8, 8)",4652,28499_ang20231109t071216_006.tif,59,7,3,"(7, 3)"
448854,0.002546,0.009940,0.010392,0.014686,0.015467,0.015819,0.018658,0.021157,0.020507,0.019907,...,0.067679,28499,Natural Wooded Land,"(8, 8)",4652,28499_ang20231109t071216_006.tif,60,7,4,"(7, 4)"
448855,0.003634,0.012806,0.014233,0.018610,0.017898,0.017720,0.022039,0.023810,0.023159,0.022642,...,0.102634,28499,Natural Wooded Land,"(8, 8)",4652,28499_ang20231109t071216_006.tif,61,7,5,"(7, 5)"
448856,0.002516,0.013354,0.012534,0.015690,0.016293,0.016206,0.020137,0.022669,0.020269,0.020936,...,0.089147,28499,Natural Wooded Land,"(8, 8)",4652,28499_ang20231109t071216_006.tif,62,7,6,"(7, 6)"


In [None]:
valid_files

Unnamed: 0,Label,UID,Filename
0,Unconsolidated Barren,1,1_ang20231028t101421_005.tif
1,Unconsolidated Barren,2,2_ang20231028t100428_000.tif
2,Unconsolidated Barren,3,2_ang20231028t095542_004.tif
3,Unconsolidated Barren,4,3_ang20231028t100428_000.tif
4,Unconsolidated Barren,5,3_ang20231028t095542_004.tif
...,...,...,...
4647,Wetlands,4648,28496_ang20231109t065855_003.tif
4648,Mixed or Not Classified,4649,28497_ang20231109t071216_006.tif
4649,Mixed or Not Classified,4650,28497_ang20231031t085208_003.tif
4650,Natural Wooded Land,4651,28499_ang20231031t085208_002.tif


In [None]:
# row_coord / col_coord are already captured as a tuple in img_pos, so drop them.
df.drop(columns=['row_coord', 'col_coord'], inplace=True)

In [None]:
df

Unnamed: 0,frq0,frq1,frq2,frq3,frq4,frq5,frq6,frq7,frq8,frq9,...,frq366,frq367,frq368,Sample_num,Label,Shape,File_UID_Num,File,img_pxl_index,img_pos
0,0.029069,0.033152,0.035469,0.041350,0.043226,0.045428,0.048409,0.049431,0.052631,0.054922,...,0.156015,0.173310,0.171577,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,0,"(0, 0)"
1,0.030087,0.034019,0.035511,0.041382,0.042764,0.045505,0.049077,0.049998,0.052073,0.055954,...,0.161190,0.170324,0.193503,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,1,"(0, 1)"
2,0.037750,0.037187,0.037512,0.041911,0.046044,0.045247,0.046976,0.050844,0.054013,0.057165,...,0.177692,0.174815,0.170450,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,2,"(0, 2)"
3,0.037750,0.037187,0.037512,0.041911,0.046044,0.045247,0.046976,0.050844,0.054013,0.057165,...,0.177692,0.174815,0.170449,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,3,"(0, 3)"
4,0.031720,0.040508,0.042270,0.042718,0.043992,0.049516,0.051002,0.052987,0.057484,0.058246,...,0.180998,0.189454,0.180372,1,Unconsolidated Barren,"(11, 11)",1,1_ang20231028t101421_005.tif,4,"(0, 4)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448853,0.008262,0.007249,0.010430,0.013172,0.014590,0.016147,0.017221,0.016435,0.020677,0.020636,...,0.063471,0.067343,0.070380,28499,Natural Wooded Land,"(8, 8)",4652,28499_ang20231109t071216_006.tif,59,"(7, 3)"
448854,0.002546,0.009940,0.010392,0.014686,0.015467,0.015819,0.018658,0.021157,0.020507,0.019907,...,0.062565,0.065229,0.067679,28499,Natural Wooded Land,"(8, 8)",4652,28499_ang20231109t071216_006.tif,60,"(7, 4)"
448855,0.003634,0.012806,0.014233,0.018610,0.017898,0.017720,0.022039,0.023810,0.023159,0.022642,...,0.090310,0.096255,0.102634,28499,Natural Wooded Land,"(8, 8)",4652,28499_ang20231109t071216_006.tif,61,"(7, 5)"
448856,0.002516,0.013354,0.012534,0.015690,0.016293,0.016206,0.020137,0.022669,0.020269,0.020936,...,0.079839,0.085134,0.089147,28499,Natural Wooded Land,"(8, 8)",4652,28499_ang20231109t071216_006.tif,62,"(7, 6)"


### Save To CSV

In [None]:
import os

# DataFrame.to_csv does not create missing folders, so make sure each output
# directory exists first; otherwise a missing folder would fail the save
# after the long aggregation run.
for out_path in (path_to_save_sample_csv, path_to_save_uid_csv):
  out_dir = os.path.dirname(out_path)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# index=False: the row index carries no information beyond row order
df.to_csv(path_to_save_sample_csv, index=False)
valid_files.to_csv(path_to_save_uid_csv, index=False)

print(f"Saved pixel-level data to: {path_to_save_sample_csv}")
print(f"Saved file-UID map to:     {path_to_save_uid_csv}")

Saved pixel-level data to: /content/drive/Shareddrives/Land_Classification_Training_shared/Land_Classification_training_work/Updated/samples.csv
Saved file-UID map to:     /content/drive/Shareddrives/Land_Classification_Training_shared/Land_Classification_training_work/Updated/files.csv
