### Description
- This notebook is used to take many .tiff files that hold multispectral data and then convert those .tiff files into one pandas dataframe that can be saved as a csv file.

Last Edited: 12/08/2024 By Samuel Hobbs (**To whom all credit is due for this notebook**)

## Setup 

### Installs

In [58]:
!pip install rasterio



### Imports

In [59]:
import rasterio
from rasterio.plot import show
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join, basename, normpath
import multiprocessing
from ast import literal_eval

### Connect Google Drive

In [60]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### File Paths; CHANGE SAVE LOCATIONS



In [61]:
# Directories
main_dir = '/content/drive/Shareddrives/Land_Classification_Training_shared/Land_Classification_training_work/' # The main directory you are working in
samples_dir = join(main_dir, 'Samples/')
samples_1 = 'avirisng_sample_subset/'
samples_2 = 'avirisng_sample_extra/'
samples_3 = 'avirisng_sample_extra2/'
labels_dir = join(main_dir, 'Sample_Boxes_Labels/uneditted_sample_csv/')
csv_dir = join(main_dir,'valid_samples_1d/Samples_12_--_24/')                           # CHANGE NAME TO NOT OVERWRITE

# Files
labels_file_1 = 'Majority_Labeling.xlsx_-_Sheet1.csv'
labels_file_2 = 'Extras_Majority_Labeling.xlsx - Sheet1.csv'
labels_file_3 = 'Extras_2_Majority_Labeling.xlsx - Sheet1.csv'
csv_file_name_samples = 'samples.csv'                                                   # CHANGE NAME TO NOT OVERWRITE
csv_file_name_uid     = 'files.csv'                                                     # CHANGE NAME TO NOT OVERWRITE

## Labels
path_labels_1 = join(labels_dir, labels_file_1) # file
path_labels_2 = join(labels_dir, labels_file_2) # file
path_labels_3 = join(labels_dir, labels_file_3) # file

### All the label files that will be passed; each index corresponds to the same index in 'all_samples'
all_labels = [path_labels_1, path_labels_2, path_labels_3] # files


## Samples
path_samples_1 = join(samples_dir, samples_1) # directory
path_samples_2 = join(samples_dir, samples_2) # directory
path_samples_3 = join(samples_dir, samples_3) # directory

### All the sample directories that will be passed; each index corresponds to the same index in 'all_labels'
all_samples = [path_samples_1, path_samples_2, path_samples_3] # directories


## Save CSV
path_to_save_sample_csv = join(csv_dir, csv_file_name_samples) # file
path_to_save_uid_filename_csv = join(csv_dir, csv_file_name_uid) # file


# The (sample-number column, label column) pair to read from each original label CSV;
# each index corresponds to the same index in 'all_labels'.
label_cols = [('Sample_num','Class.4'), ('Sample_num','Class.3'), ('Sample_num','Class.4')]

### General Functions

In [62]:
def get_labels(file_path, col1 = 'Sample_num', col2 = 'Class.4', name_col_id = 'Sample_num', name_col_label = 'Label'):
  '''
  Description:
    Reads a csv of labels from the provided file path, keeps only the sample
    number column (or equivalent id that connects a label to a sample) and the
    label column, renames them, and returns them sorted by sample number.
  Input:
    file_path         : string, filename of the csv of labels.
    col1              : string, the column name of sample numbers in the csv.
                        Assumes it is integers. Default is 'Sample_num'.
    col2              : string, the column name of labels in the csv. Assumes it
                        is strings. Default is 'Class.4'.
    name_col_id       : string, the name of the column for sample numbers in
                        the returned pandas dataframe. Default is 'Sample_num'.
    name_col_label    : string, the name of the column for labels in the
                        returned pandas dataframe. Default is 'Label'.
  Output:
    Pandas DataFrame  : A pandas dataframe of labels with exactly the two
                        columns [name_col_id, name_col_label], sorted by sample
                        number (i.e. 1,2,3,4,...) and re-indexed 0..n-1 so that
                        row i (by index label OR by position) is the i-th
                        smallest sample number.
  '''
  labels = pd.read_csv(file_path)[[col1, col2]]
  labels = labels.rename(columns={col1: name_col_id, col2: name_col_label})

  # mergesort is stable: rows sharing a sample number keep their csv order, so
  # "first matching label" lookups downstream are deterministic.
  labels = labels.sort_values(by=name_col_id, ascending=True, kind='mergesort')

  # sort_values keeps the csv's original row index. Without the reset, code that
  # walks the frame with labels[col][i] would read the UNSORTED row i.
  return labels.reset_index(drop=True)

In [63]:
def trim_data_files(filenames, labels, sample_num_col_name = 'Sample_num'):
  '''
  Description:
    Compares the provided labels with the files and trims the list of files
    down to only those with a corresponding label. A file without a label is
    dropped from the list.
    Things to take note of: this function works under the assumption that there
    can be multiple files connected to the same label. It also assumes both the
    file names and the labels are sorted in the same manner as each other
    (Ex. sample numbers [0,1,2,3,...]). When used together with get_labels the
    labels are sorted least to greatest by sample number, and the file names
    should be sorted the same way.
  Input:
    filenames           : Numpy Array (or list), file name strings sorted by
                          sample number (i.e. 0,1,2,3,...). Filenames must be
                          in the form '#_...' where # is the sample_num, the id
                          connected to the labels.
    labels              : Pandas DataFrame, with a column for sample_num or
                          equivalent that matches the file name identifier.
                          Rows are read by POSITION, so the frame's index does
                          not need to be 0..n-1.
    sample_num_col_name : String, the name of the column in labels that holds
                          the sample numbers. If you used the defaults of
                          get_labels you don't need to change this.
  Output:
    trimmed_filenames   : Numpy array of strings; the file names that have a
                          corresponding label, in their original order.
  Raises:
    AssertionError      : if no file matches any label.
  '''
  trimmed_filenames = []

  # Positional view of the sample numbers. The previous labels[col][i] lookup
  # was index-label based, so a sorted frame that kept its original csv index
  # was silently walked in unsorted order.
  label_nums = labels[sample_num_col_name].to_numpy()
  size_label = len(label_nums)
  size_filenames = len(filenames)
  count_label = 0
  count_filenames = 0

  # Two-pointer walk over the two sorted sequences; stops when either runs out.
  while count_label != size_label and count_filenames != size_filenames:
    filename_num = int(filenames[count_filenames].split('_')[0])                # leading '#' of '#_...'
    label_num = int(label_nums[count_label])
    if filename_num == label_num:
      # File has a label: keep it, and stay on this label because several files
      # may share the same sample number.
      trimmed_filenames.append(filenames[count_filenames])
      count_filenames += 1
    elif filename_num > label_num:
      # No more files can belong to this label (sorted input): next label.
      count_label += 1
    else:
      # File's sample number has no label: skip the file.
      count_filenames += 1

  # check to see if there are some matching samples and labels
  assert len(trimmed_filenames) > 0, "None of the labels corrispond to the to the given samples."

  return np.array(trimmed_filenames)

In [64]:
def check_res(res, min_res_bound=4.5, max_res_bound=6.5):
  '''
  Description:
    Tells whether both the x and the y resolution of a pixel lie inside the
    closed interval [min_res_bound, max_res_bound].
  Input:
    res           : tuple, the x resolution at index 0 and the y resolution at
                    index 1.
    min_res_bound : float, smallest accepted resolution. Default is 4.5.
    max_res_bound : float, largest accepted resolution. Default is 6.5.
  Output:
    Bool          : False as soon as one axis is strictly below the minimum or
                    strictly above the maximum, otherwise True.
  '''
  for axis in (0, 1):                                                           # x first, then y
    axis_res = res[axis]
    if axis_res < min_res_bound or axis_res > max_res_bound:
      return False
  return True

In [65]:
def tiff_to_arr(filepath, min_res=4.5, max_res=6.5):
  '''
  Description:
    Opens the .tiff file at filepath with rasterio and, only when its pixel
    resolution passes check_res, reads its contents.
  Input:
    filepath  : The file path to the .tiff file, starting from /content/...
    min_res   : float, minimum accepted pixel resolution, forwarded to
                check_res. Default is 4.5.
    max_res   : float, maximum accepted pixel resolution, forwarded to
                check_res. Default is 6.5.
  Output:
    numpy arr : The result of dataset.read() (presumably a 3 dimensional array
                of shape (bands, rows, cols) - confirm against rasterio), or
                None when the resolution check fails.
    Bool      : True when the resolution check passed, otherwise False.
    string    : str(dataset.shape) when the check passed, otherwise "".
  '''
  with rasterio.open(filepath) as dataset:
    resolution_ok = check_res(
        dataset.res,
        min_res_bound = min_res,
        max_res_bound = max_res
        )
    if not resolution_ok:
      # Rejected files are never read; callers get the explicit "nothing" triple.
      return None, False, ""
    return dataset.read(), True, str(dataset.shape)

In [66]:
def convert_3D_to_1D(data_3D):
  '''
  Description:
    Re-arranges a (bands, rows, cols) array so that every pixel becomes one row
    holding all of that pixel's band values. Pixels are ordered row by row
    (row-major), i.e. output row k is pixel (k // cols, k % cols).
  Input:
    data_3D         : Numpy Array with 3 dimensions of shape (num_band, num_row, num_col)
  Output:
    bands_per_pixel : Numpy Array of shape (num_row * num_col, num_band)
  '''
  # Collapse the two spatial axes into one, giving (num_band, num_row*num_col),
  # then swap the axes so pixels are rows and bands are columns. This replaces
  # an older explicit double loop over every pixel.
  num_band = data_3D.shape[0]
  bands_by_pixel = data_3D.reshape(num_band, -1)
  return bands_by_pixel.T

In [67]:
def get_filenames(directory_path):
  '''
  Description:
    Lists directory_path, keeps only the entries that are regular files (sub
    directories are dropped), and orders them by the integer that precedes the
    first '_' in each name ('#_...').
  Input:
    directory_path   : a string that is the file path to the directory
                       you want a filename list from.
  Output:
    Sorted Numpy Array of Filenames, array of strings
  '''
  only_files = []
  for entry in listdir(directory_path):
    if isfile(join(directory_path, entry)):
      only_files.append(entry)

  # Numeric sort on the leading sample number, so '10_...' comes after '2_...'.
  only_files.sort(key=lambda name: int(name.split("_")[0]))
  return np.array(only_files)

In [68]:
#This function could be rewritten to have only one return but it was written this way for readability.
def make_pandas_dataframe(dir_path, filename, col_labels, label=pd.NA, uid = 0, min_res=4.5, max_res=6.5):
  '''
  Description:
    Loads one tiff file through tiff_to_arr and, when it passes the resolution
    filter, turns it into a pandas dataframe with one row per pixel. The band
    values fill the columns named by col_labels, followed by four metadata
    columns that are constant for the whole file: 'Label', 'Shape',
    'File_UID_Num' and 'File'.
  Input:
    dir_path        : String; the directory path to where the .tiff files are
                      stored. (ex: '/content/drive/.../files/')
    filename        : String; the name of the .tiff file to convert.
    col_labels      : List; the column names for the bands of the .tiff file
                      (ex: ['frq1', ..., 'frqN']). Passed straight to
                      pd.DataFrame(columns=...), so its length has to match the
                      number of bands in the file.
    label           : String; the label of the image, assigned to every pixel.
                      Defaults to pd.NA.
    uid             : Int; an alternative unique identifier for the tiff file
                      that is not the string filename. Defaults to 0.
    min_res         : Float; the minimum accepted pixel resolution, forwarded
                      to tiff_to_arr. Defaults to 4.5
    max_res         : Float; the maximum accepted pixel resolution, forwarded
                      to tiff_to_arr. Defaults to 6.5
  Output:
    Pandas DataFrame: The dataframe for the file, or None when the file fails
                      the resolution check.
    Bool            : True when the file passed the resolution check, False
                      otherwise.
  '''
  bands, passed_res, shape = tiff_to_arr(join(dir_path, filename), min_res=min_res, max_res=max_res)

  # Explicit "nothing" pair for rejected files, kept separate for readability.
  if not passed_res:
    return None, False

  pixel_frame = pd.DataFrame(convert_3D_to_1D(bands), columns=col_labels)

  # Per-file metadata, broadcast to every pixel row; insertion order sets the
  # column order.
  file_metadata = {
      'Label': label,
      'Shape': shape,
      'File_UID_Num': uid,
      'File': filename,
  }
  for column_name, value in file_metadata.items():
    pixel_frame[column_name] = value

  return pixel_frame, True

### Main Function Call

In [69]:
def get_all_data(sample_directory_paths, label_paths, label_cols, min_res=4.5, max_res=6.5, num_frqs=373):
  '''
  Description:
    Walks every sample directory together with its label csv and builds one
    pandas dataframe of pixels out of all the tiff files that (a) have a
    matching label and (b) pass the resolution filter. Each row is one pixel
    with all of its frequency bands, the label and shape of the image it came
    from, the unique id of the file it came from, and that file's name. There
    is no unique identifier for a single row, only for the file a set of rows
    originates from. Progress is printed as it goes.
  Input:
    sample_directory_paths  :   List of strings; directory paths to sample
                                files of type '.tiff'.
    label_paths             :   List of strings; file paths to the label csv
                                for the samples at the same index in
                                sample_directory_paths.
    label_cols              :   List of tuples; index 0 and 1 of each tuple are
                                the two column names get_labels reads from the
                                csv at the same index in label_paths.
    min_res                 :   float, minimum acceptable resolution. Default
                                is 4.5.
    max_res                 :   float, maximum acceptable resolution. Default
                                is 6.5.
    num_frqs                :   int, number of 'frq#' column names to create
                                for the frequency bands. Default is 373.
  Output:
    return                  :   Pandas DataFrame of all samples. The per file
                                frames are concatenated as they are, so the
                                index restarts at 0 for every file.
    return                  :   Pandas DataFrame with columns 'Label', 'UID'
                                and 'Filename'; one row per file that is in the
                                first dataframe.
  '''
  print('--Start--')

  # The three input lists are matched up by index, so they must line up.
  assert len(sample_directory_paths) == len(label_paths), "Number of Sample Paths does not equal the Number of Label Paths provided."
  assert len(label_paths) == len(label_cols), "Number of Labels does not equal the Number of csv Column names to read from."

  # Column names for the frequency bands: frq0, frq1, ...
  band_columns = ["frq" + str(band) for band in range(num_frqs)]

  next_uid = 1                # integer id handed to the next accepted file
  per_file_frames = []        # one dataframe per accepted file
  kept_files = []             # (label, uid, filename) per accepted file

  for set_index in range(len(sample_directory_paths)):
    print("\n-New Sample Set-")

    samples_path = sample_directory_paths[set_index]
    labels_path = label_paths[set_index]
    set_name = basename(normpath(samples_path))

    print('Samples: ', set_name)
    filenames = get_filenames(samples_path)

    print('Labels: ', basename(normpath(labels_path)))
    id_col = label_cols[set_index][0]
    class_col = label_cols[set_index][1]
    labels = get_labels(labels_path, col1=id_col, col2=class_col)

    print('Trimming Samples')
    labelled_files = trim_data_files(filenames, labels)
    total_labelled = len(labelled_files)

    # METRICS; the estimate assumes 0.4 to 0.7 seconds per file.
    low_estimate = round((total_labelled*0.4)/60, 0)
    high_estimate = round((total_labelled*0.7)/60, 0)
    print('Number of Files w/ Labels: ', total_labelled, '/', len(filenames))
    print('Time Estimate: ', low_estimate, '-', high_estimate, 'Minutes')
    print('Start Compiling Files, Total: ', total_labelled)

    valid_res_count = 0       # ONLY FOR METRICS

    for tiff_name in labelled_files:
      # The sample number is the leading '#' of '#_...'.
      sample_num = int(tiff_name.split('_')[0])

      # First label row for that sample number.
      matching_rows = labels[labels['Sample_num'] == sample_num]
      label = matching_rows['Label'].values[0]

      frame, passed_res = make_pandas_dataframe(
          samples_path,
          tiff_name,
          band_columns,
          label,
          uid = next_uid,
          min_res = min_res,
          max_res = max_res
          )

      # Files outside the resolution bounds are dropped and consume no uid.
      if not passed_res:
        continue

      per_file_frames.append(frame)
      kept_files.append((label, next_uid, tiff_name))
      next_uid += 1
      valid_res_count += 1

    print('Number of Valid Resolutions: ', valid_res_count, '/', len(labelled_files))
    print('Sample Set: ', set_name,' - Complete')
    print("-End Sample Set-")

  # Deliberately no ignore_index: the repeating per file index is the pixel's
  # position inside its own image.
  all_pixels = pd.concat(per_file_frames)

  # Mostly redundant with the main dataframe; kept for legacy purposes and
  # convenience.
  files_frame = pd.DataFrame(kept_files, columns=['Label','UID', 'Filename'])

  print('\n--End--')

  return all_pixels, files_frame

In [71]:
#this is redone for multithreading.
def mt_make_pandas_dataframe(dir_path, filename, col_labels, label=pd.NA, uid = 0, min_res=4.5, max_res=6.5):
  '''
  Description:
    Single return value variant of make_pandas_dataframe. Loads one tiff file
    through tiff_to_arr and, when it passes the resolution filter, returns a
    dataframe with one row per pixel: the band columns named by col_labels
    followed by 'Label', 'Shape', 'File_UID_Num' and 'File'.
  Input:
    dir_path        : String; directory that holds the .tiff file.
    filename        : String; name of the .tiff file.
    col_labels      : List; column names for the bands of the .tiff file.
    label           : String; label assigned to every pixel. Defaults to pd.NA.
    uid             : Int; identifier assigned to every pixel. Defaults to 0.
    min_res         : Float; minimum accepted pixel resolution. Defaults to 4.5
    max_res         : Float; maximum accepted pixel resolution. Defaults to 6.5
  Output:
    Pandas DataFrame: The dataframe for the file, or None when the file fails
                      the resolution check.
  '''
  bands, passed_res, shape = tiff_to_arr(join(dir_path, filename), min_res=min_res, max_res=max_res)

  # Rejected files yield None instead of a dataframe.
  if not passed_res:
    return None

  pixel_frame = pd.DataFrame(convert_3D_to_1D(bands), columns=col_labels)
  pixel_frame['Label'] = label
  pixel_frame['Shape'] = shape
  pixel_frame['File_UID_Num'] = uid
  pixel_frame['File'] = filename
  return pixel_frame

#### Create the Pandas Data Frame

In [73]:
# Build the per-pixel samples dataframe (df) and the per-file Label/UID/Filename
# dataframe (valid_files) from every configured sample set.
df, valid_files = get_all_data(all_samples, all_labels, label_cols) 

--Start--

-New Sample Set-
Samples:  avirisng_sample_subset
Labels:  Majority_Labeling.xlsx_-_Sheet1.csv
Trimming Samples
Number of Files w/ Labels:  3647 / 3650
Time Estimate:  24.0 - 43.0 Minutes
Start Compiling Files, Total:  3647
Number of Valid Resolutions:  3203 / 3647
Sample Set:  avirisng_sample_subset  - Complete
-End Sample Set-

-New Sample Set-
Samples:  avirisng_sample_extra
Labels:  Extras_Majority_Labeling.xlsx - Sheet1.csv
Trimming Samples
Number of Files w/ Labels:  209 / 209
Time Estimate:  1.0 - 2.0 Minutes
Start Compiling Files, Total:  209
Number of Valid Resolutions:  186 / 209
Sample Set:  avirisng_sample_extra  - Complete
-End Sample Set-

-New Sample Set-
Samples:  avirisng_sample_extra2
Labels:  Extras_2_Majority_Labeling.xlsx - Sheet1.csv
Trimming Samples
Number of Files w/ Labels:  929 / 929
Time Estimate:  6.0 - 11.0 Minutes
Start Compiling Files, Total:  929
Number of Valid Resolutions:  788 / 929
Sample Set:  avirisng_sample_extra2  - Complete
-End Sampl

### View Data Frames

In [None]:
# Lets check it out; Pandas Data Frame
df

Unnamed: 0,frq0,frq1,frq2,frq3,frq4,frq5,frq6,frq7,frq8,frq9,...,frq367,frq368,frq369,frq370,frq371,frq372,Label,Shape,File_UID_Num,File
0,0.018910,0.025453,0.026801,0.029931,0.025996,0.030246,0.033545,0.035334,0.039591,0.044717,...,0.154073,0.158080,0.160214,0.164343,0.173604,0.190160,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
1,0.013469,0.015442,0.018657,0.021459,0.028462,0.029567,0.035124,0.038447,0.039005,0.042359,...,0.146481,0.146843,0.155816,0.159489,0.157601,0.169750,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
2,0.013482,0.013153,0.018295,0.021435,0.028463,0.031020,0.033894,0.037844,0.039349,0.043371,...,0.146007,0.148808,0.149374,0.162757,0.157468,0.167964,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
3,0.013058,0.024265,0.024345,0.023020,0.027349,0.033271,0.032494,0.036257,0.040206,0.042641,...,0.150941,0.147344,0.155273,0.154464,0.140058,0.165022,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
4,0.015231,0.024812,0.023221,0.024547,0.028005,0.033780,0.032661,0.035639,0.040920,0.043962,...,0.157301,0.157921,0.164166,0.162148,0.151324,0.178857,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.002211,0.011384,0.009327,0.016608,0.016334,0.017891,0.016527,0.018290,0.018646,0.019283,...,0.053987,0.070935,0.070580,0.070435,0.070154,0.069898,Natural Wooded Land,"(8, 8)",4177,28499_ang20231109t071216_015_L2A_OE_main_27577...
60,0.004726,0.014338,0.009449,0.016151,0.017545,0.018127,0.016673,0.019221,0.018761,0.020615,...,0.058488,0.070654,0.070295,0.070197,0.069947,0.069710,Natural Wooded Land,"(8, 8)",4177,28499_ang20231109t071216_015_L2A_OE_main_27577...
61,0.004768,0.011690,0.010392,0.015953,0.014548,0.016499,0.014772,0.017514,0.017669,0.018417,...,0.047195,0.079708,0.079118,0.079025,0.078705,0.078389,Natural Wooded Land,"(8, 8)",4177,28499_ang20231109t071216_015_L2A_OE_main_27577...
62,0.002905,0.012905,0.007541,0.014662,0.014470,0.017955,0.015650,0.016833,0.016918,0.018893,...,0.044651,0.072319,0.072294,0.072256,0.072160,0.072073,Natural Wooded Land,"(8, 8)",4177,28499_ang20231109t071216_015_L2A_OE_main_27577...


In [None]:
# Check out Files and UID; Pandas DataFrame with columns Label, UID and Filename
valid_files

Unnamed: 0,Label,UID,Filename
0,Unconsolidated Barren,1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
1,Unconsolidated Barren,2,2_ang20231028t100428_001_L2A_OE_main_27577724_...
2,Unconsolidated Barren,3,2_ang20231028t095542_011_L2A_OE_main_27577724_...
3,"Consolidated Barren (rocks, salt pans)",4,3_ang20231028t095542_011_L2A_OE_main_27577724_...
4,"Consolidated Barren (rocks, salt pans)",5,3_ang20231028t100428_001_L2A_OE_main_27577724_...
...,...,...,...
4172,"Consolidated Barren (rocks, salt pans)",4173,28496_ang20231109t065855_008_L2A_OE_main_27577...
4173,Shrubs,4174,28497_ang20231109t071216_015_L2A_OE_main_27577...
4174,Shrubs,4175,28497_ang20231031t085208_007_L2A_OE_main_27577...
4175,Natural Wooded Land,4176,28499_ang20231031t085208_007_L2A_OE_main_27577...


### Save To CSV

In [None]:
# Saves both data frames to CSV (the data frame index is written as the first, unnamed column)
df.to_csv(path_to_save_sample_csv)
valid_files.to_csv(path_to_save_uid_filename_csv)
# You can go down to the "Adding metadata" section to add pixel level locations
# to the csv. Because of how it constructs the columns, it needs to load the
# csv saved above: it relies on the per-file index carried over from concat,
# which is written out as the first column. You will need to change the save
# samples csv file name if you want to save the dataframe created there
# without overwriting this one.

##  Adding metadata
- This code is used to add pixel level locations with respect to their location in the image.
- For this you need to load from a csv. of the form saved above.

In [None]:
def load_data(sample_path):
  '''
  Description:
    Reads a samples csv of the form saved by this notebook and adds an
    'img_pos' column holding each pixel's (row, col) position in its image.
    THIS FUNCTION IS RAM INTENSIVE.
  Input:
    sample_path       : string, file path of the samples csv to load.
  Output:
    Pandas DataFrame  : the loaded data, with the first csv column renamed to
                        'img_pxl_index', 'Shape' converted from strings to
                        tuples, and the new 'img_pos' column.
  '''
  loaded = pd.read_csv(sample_path)

  # The first csv column is the unnamed, saved index; give it a real name.
  first_column = loaded.columns[0]
  loaded = loaded.rename(columns={first_column: 'img_pxl_index'})

  # 'Shape' comes back from the csv as text such as "(10, 10)".
  loaded['Shape'] = loaded['Shape'].apply(literal_eval)

  # Row-wise apply; there might be a better way to do this that is less RAM heavy.
  loaded['img_pos'] = loaded.apply(add_img_lvl_pixel_loc, axis=1)
  return loaded

In [None]:
def add_img_lvl_pixel_loc(row):
  '''
  Description:
    Converts a pixel's flat index inside its image into a (row, col) position,
    using the image width stored at index 1 of the row's 'Shape'.
  Input:
    row   : mapping/Series with a 'Shape' entry of length 2 and an
            'img_pxl_index' entry.
  Output:
    tuple : (img_pxl_index // width, img_pxl_index % width)
  '''
  image_shape = row['Shape']
  flat_index = row['img_pxl_index']
  assert(len(image_shape) == 2)
  width = image_shape[1]
  pixel_row, pixel_col = divmod(flat_index, width)
  return (pixel_row, pixel_col)

### Run

In [None]:
# Reload the samples csv saved above and attach per-pixel image positions
# (this replaces the in-memory df built earlier).
df = load_data(path_to_save_sample_csv)

### Visualize

In [None]:
df

Unnamed: 0,img_pxl_index,frq0,frq1,frq2,frq3,frq4,frq5,frq6,frq7,frq8,...,frq368,frq369,frq370,frq371,frq372,Label,Shape,File_UID_Num,File,img_pos
0,0,0.018910,0.025453,0.026801,0.029931,0.025996,0.030246,0.033545,0.035334,0.039591,...,0.158080,0.160214,0.164343,0.173604,0.190160,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...,"(0, 0)"
1,1,0.013469,0.015442,0.018657,0.021459,0.028462,0.029567,0.035124,0.038447,0.039005,...,0.146843,0.155816,0.159489,0.157601,0.169750,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...,"(0, 1)"
2,2,0.013482,0.013153,0.018295,0.021435,0.028463,0.031020,0.033894,0.037844,0.039349,...,0.148808,0.149374,0.162757,0.157468,0.167964,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...,"(0, 2)"
3,3,0.013058,0.024265,0.024345,0.023020,0.027349,0.033271,0.032494,0.036257,0.040206,...,0.147344,0.155273,0.154464,0.140058,0.165022,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...,"(0, 3)"
4,4,0.015231,0.024812,0.023221,0.024547,0.028005,0.033780,0.032661,0.035639,0.040920,...,0.157921,0.164166,0.162148,0.151324,0.178857,Unconsolidated Barren,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...,"(0, 4)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398279,59,0.002211,0.011384,0.009327,0.016608,0.016334,0.017891,0.016527,0.018290,0.018646,...,0.070935,0.070580,0.070435,0.070154,0.069898,Natural Wooded Land,"(8, 8)",4177,28499_ang20231109t071216_015_L2A_OE_main_27577...,"(7, 3)"
398280,60,0.004726,0.014338,0.009449,0.016151,0.017545,0.018127,0.016673,0.019221,0.018761,...,0.070654,0.070295,0.070197,0.069947,0.069710,Natural Wooded Land,"(8, 8)",4177,28499_ang20231109t071216_015_L2A_OE_main_27577...,"(7, 4)"
398281,61,0.004768,0.011690,0.010392,0.015953,0.014548,0.016499,0.014772,0.017514,0.017669,...,0.079708,0.079118,0.079025,0.078705,0.078389,Natural Wooded Land,"(8, 8)",4177,28499_ang20231109t071216_015_L2A_OE_main_27577...,"(7, 5)"
398282,62,0.002905,0.012905,0.007541,0.014662,0.014470,0.017955,0.015650,0.016833,0.016918,...,0.072319,0.072294,0.072256,0.072160,0.072073,Natural Wooded Land,"(8, 8)",4177,28499_ang20231109t071216_015_L2A_OE_main_27577...,"(7, 6)"
