##Setup

###Installs

In [96]:
!pip install rasterio



###Imports

In [116]:
import rasterio
from rasterio.plot import show
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join, basename, normpath
import multiprocessing
from ast import literal_eval

###Connect Google Drive

In [98]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### File Paths

In [99]:
#Directories
main_dir = '/content/drive/Shareddrives/Land_Classification_Training_shared/Land_Classification_training_work/' # The Main directory you are working in
samples_dir = join(main_dir, 'Samples/')
samples_1 = 'avirisng_sample_subset/'
samples_2 = 'avirisng_sample_extra/'
samples_3 = 'avirisng_sample_extra2/'
labels_dir = join(main_dir, 'Sample_Boxes_Labels/uneditted_sample_csv/')
csv_dir = join(main_dir,'valid_samples_1d/Samples_12_01_24/')                           # CHANGE NAME TO NOT OVERWRITE

#Files
labels_file_1 = 'Majority_Labeling.xlsx_-_Sheet1.csv'
labels_file_2 = 'Extras_Majority_Labeling.xlsx - Sheet1.csv'
labels_file_3 = 'Extras_2_Majority_Labeling.xlsx - Sheet1.csv'
csv_file_name_samples = 'samples.csv'                                                   # CHANGE NAME TO NOT OVERWRITE
csv_file_name_uid     = 'files.csv'                                                     # CHANGE NAME TO NOT OVERWRITE


#Paths

##Labels
path_labels_1 = join(labels_dir, labels_file_1) #file
path_labels_2 = join(labels_dir, labels_file_2) #file
path_labels_3 = join(labels_dir, labels_file_3) #file

### All the Labels that will be passed, index corrisponds to samples index in 'all_samples'
all_labels = [path_labels_1, path_labels_2, path_labels_3] #files


##Samples
path_samples_1 = join(samples_dir, samples_1) #directory
path_samples_2 = join(samples_dir, samples_2) #directory
path_samples_3 = join(samples_dir, samples_3) #directory

### All the Samples that will be passed, index corrisponds to labels index in 'all_labels'
all_samples = [path_samples_1, path_samples_2, path_samples_3] #directories


##Save CSV
path_to_save_sample_csv = join(csv_dir, csv_file_name_samples) #file
path_to_save_uid_filename_csv = join(csv_dir, csv_file_name_uid) #file


# The Columns to read the Labels from their original CSVs, index corrisponds to label number.
label_cols = [('Sample_num','Class.4'), ('Sample_num','Class.3'), ('Sample_num','Class.4')]

##Functions


### General Functions

In [5]:
def get_labels(file_path, col1 = 'Sample_num', col2 = 'Class.4'):
  '''
  TBD
  '''
  return pd.read_csv(file_path)[[col1, col2]].rename(columns={col1: "Sample_num", col2: "Label"})

In [6]:
def trim_data_files(filenames, labels):
  '''
  TBD
  '''
  # Init's
  trimmed_filenames = []
  size_label = labels.shape[0]
  size_filenames = filenames.size
  count_label = 0
  count_filenames = 0

  while(count_label != size_label and count_filenames != size_filenames):       # While we have not seen all labels and we have not seen all filenames
    filename_num = int(filenames[count_filenames].split('_')[0])                ## this gets the first value of the file name (filename_num) seperated at '_'
    label_num = labels['Sample_num'][count_label].astype(int)                   ## gets the current sample number in the label array at count_label
    if (filename_num == label_num):                                             ## Checks to see if the current file is the label 'count_label'
      trimmed_filenames.append(filenames[count_filenames])                      ### if it is then append the file name to the valid, labeled, filenames list
      count_filenames += 1                                                      ### increment count_filenames +1
    elif (filename_num > label_num):                                            ## Check if filename_num is greater than the sample number in labels at count_label
      count_label += 1                                                          ### if it is then count_label needs to increment +1 as there are no more files that start with that value (sorted lists)
    elif (filename_num < label_num):                                            ## Check if he filename_num is less than the current label number at count_label
      count_filenames += 1                                                      ### if it is then increment count_filenames += 1 as there is no label for it.
    else:                                                                       ## Something has gone horribly wrong
      raise Exception("Something is very wrong in trim_data_files.\nValues: \nS_L:" + str(size_label) + ", S_F: " + str(size_filenames) + "\nC_L: " + str(count_label) + ", C_F: " + str(count_filenames) + "\nL_N: " + str(label_num) + ", F_N: " + str(filename_num))

  #check to see if there are some matching samples and labels
  assert len(trimmed_filenames) > 0, "None of the labels corrispond to the to the given samples."

  return np.array(trimmed_filenames) # this is a list of files names

In [7]:
def check_res(res, min_res_bound=4.5, max_res_bound=6.5):
  '''
  Description:
    Checks to see if the x/y resolution of a pixel is within the min/max bound.
  Input:
    res     : tuple of the x and y resolution of a pixel value
  Output:
    Bool    : True if with in min/max resolution bounds
  '''
  if (res[0] < min_res_bound) or (res[0] > max_res_bound) or (res[1] < min_res_bound) or (res[1] > max_res_bound):
    return False
  return True

In [8]:
def tiff_to_arr(filepath, min_res=4.5, max_res=6.5):
  '''
  Description:
    This function takes a filepath to a .tiff file, opens it, and reads it as a
    numpy arr. Then returns said array.
  Input:
    filepath  : The file path to the .tiff file, starting from /content/...
  Output:
    numpy arr : A 3 dimensional array of frequency bands for the pixels of an
                image. Or None.
    Bool      : is this an image with a valid resolution
    string    : the shape of the dataset.
  '''
  with rasterio.open(filepath) as dataset:
    if (                                                                        # you could really just put the res check if here... - sam to sam
        check_res(
            dataset.res,
            min_res_bound = min_res,
            max_res_bound = max_res
            )
        ):
      return dataset.read(), True, str(dataset.shape)
  return None, False, ""

In [9]:
def convert_3D_to_1D(data_3D):
  '''
  Description:
    This function takes a 3 dimensional array of frequence bands when each
    individual frequence reading is a NxN 2D array. So this 3D array is BxNxN
    where B is the number of frequence bands. This function will return a N*NxB
    array. Where every individual frequence corresponding to a pixel is in the
    returned 1D array for each of the N*N pixels.
  Input:
    data_3D         : Numpy Array with 3 dimensions of shape (num_band, num_row, num_col)
  Output:
    bands_per_pixel : Numpy Array of shape (num_row * numcol, num_band)
  '''
  #temp_list_1D_arr = []

  # Access the depth (third dimension) and create 1D arrays
  #for i in range(data_3D.shape[1]):                 # 10 for both data_3D.shape[1] & data_3D.shape[2] to make the 10x10
  #  for j in range(data_3D.shape[2]):
  #    data_1D = data_3D[:, i, j].flatten()          # EX. this will take the [0,0] for every bands then flatten that into a 1D array. For all the bands corresponding to pixel [0,0]
  #    temp_list_1D_arr.append(data_1D)              # append to the temp list
  #return np.array(temp_list_1D_arr)      # convert the list to a numpy array... because I want to.     NOTE: MAYBE HAVE THIS RETURN A PANDA'S DATAFRAME? -SAM

  return data_3D.reshape(data_3D.shape[0], -1).T

In [10]:
def get_filenames(directory_path):
  '''
   * Description:
   *   gets the name of both files and directories at path_samples
   *   joins the path and the file names then checks if that is a file
   *   if it is a file and not a directory it is added to the list filenames
   *   once the list of file names is created, it is then sorted by number: '#_...'
   *   the list is then converted to numpy array just cause.
   *
   *   convert_to_numpy_array(sort(get_list_of_only_filenames, sort_by_first_number_in_name))
   * Input:
   *   directory_path   : a string that is the file path to the directory
   *                      you want a filename list from.
   * Output:
   *   Sorted Numpy Array of Filenames, array of strings
  '''
  return np.array(sorted([f for f in listdir(directory_path) if isfile(join(directory_path, f))], key=lambda x: int(x.split("_")[0])))

In [11]:
#This function could be rewritten to have only one return but it was written this way for readability.
def make_pandas_dataframe(dir_path, filename, col_labels, label=pd.NA, uid = 0, min_res=4.5, max_res=6.5):
  '''
  TBD
  '''
  arr, res_check, shape = tiff_to_arr(join(dir_path, filename), min_res=min_res, max_res=max_res)
  #if filename is in label data for labeling.
  if (res_check):
    ds = convert_3D_to_1D(arr)
    df = pd.DataFrame(ds, columns=col_labels)
    df['Label'] = label
    df['Shape'] = shape
    df['File_UID_Num'] = uid
    df['File'] = filename
    return df, True
  return None, False #None could be arr here but for readability it is None. Also to make sure nothing weird happens with this function it's good to keep it explicit.

### Main Function Call

In [12]:
def get_all_data(sample_directory_paths, label_paths, label_cols, min_res=4.5, max_res=6.5):
  '''
  Description:
    This function takes samples and returns a pandas data frame of those samples
    While removing samples that do not have corrisponding labels or do not meet
    the minimum/maximum resolution standards. The Pandas data frame returned is
    of the following form: Each row is a pixel with all of it's frequencies,
    label, shape of the entire image it came from [i.e. (10, 10)], it's unque
    identifier that corrisponds to the file it came from, allong with it's
    file name. Let it be known that their is no unque identifier for a singlar
    row. Only for individual files the a set of rows originate from.
  Input:
    sample_directory_paths  :   List of strings where the strings are directory
                                paths to sample files of type '.tiff'.
    label_paths             :   List of strings where the strings are file paths
                                to labels corrisponding to the samples in the
                                same index in sample_directory_paths, of file
                                type '.csv'.
    label_cols              :   List of tuples, that hold two strings at index
                                0 & 1. These tuples represent column 1 and
                                column 2 that the get_labels function will
                                read from when getting the labels from the CSV.
                                The Index of the list corrisponds to the labels
                                index in label_paths.
    min_res                 :   int, minimum acceptible resolution.
    max_res                 :   int, maximum acceptible resolution.
  Output:
    return                  :   Pandas Data Frame, of all samples
    return                  :   Numpy Array, of all filenames that are in the
                                returned Pandas DataFrame with their UID.
  '''
  print('--Start--')

  # check to make sure that the samples paths have corrisponding label paths
  assert len(sample_directory_paths) == len(label_paths), "Number of Sample Paths does not equal the Number of Label Paths provided."

  # Check to make sure that the number of columns to read the label csv from equal the number of provided CSVs.
  assert len(label_paths) == len(label_cols), "Number of Labels does not equal the Number of csv Column names to read from."

  #unque file identifier, that is an int... thats all.
  uid_count = 1

  # this is the counter for what file path we are currently working through from
  # the passed lists of sample_directory_paths & label_paths
  filecounts = 0

  # A list of files that are valid and used to make the final dataframe, along
  # with their corrisponding UID.
  included_files = []

  # Creates the frequency labels
  columns_of_frequencies = []
  for i in range(0,373,1):
    columns_of_frequencies.append("frq" + str(i))
  #END FOR

  # loop through and add to pandas dataframe
  list_df = []

  # This while loop will work through all sample paths with corrisponding label paths
  while filecounts < len(sample_directory_paths):
    #print progress outputs
    print("\n-New Sample Set-")

    #paths
    curr_samples_path = sample_directory_paths[filecounts]
    curr_labels_path = label_paths[filecounts]

    #print progress outputs
    print('Samples: ', basename(normpath(curr_samples_path)))

    # get an array of the sample file names
    filenames = get_filenames(curr_samples_path)

    #print progress outputs
    print('Labels: ', basename(normpath(curr_labels_path)))

    # get label data frame, reading from cols 1 and cols 2
    labels = get_labels(curr_labels_path,
                        col1=label_cols[filecounts][0],
                        col2=label_cols[filecounts][1]
                        )

    #print progress outputs
    print('Trimming Samples')

    # trim the files to the ones with labels
    trim_filenames = trim_data_files(filenames, labels)
    trim_filenames_length = len(trim_filenames)

    #print progress outputs
    print('Number of Files w/ Labels: ', trim_filenames_length, '/', len(filenames))
    print('Time Estimate: ', round((trim_filenames_length*0.4)/60, 0), '-', round((trim_filenames_length*0.7)/60, 0), 'Minutes')
    print('Start Compiling Files, Total: ', trim_filenames_length)

    # a counter to keep track of the number of files with valid resolutions
    valid_res_count = 0

    for i in range(0, trim_filenames_length):

      #get the sample number for the label
      sample_num = int(trim_filenames[i].split('_')[0])

      #gets the string label with the sample number
      label = labels[labels['Sample_num'] == sample_num]['Label'].values[0]

      # get the dataframe for this specific file
      df, res_check = make_pandas_dataframe(curr_samples_path,
                                            trim_filenames[i],
                                            columns_of_frequencies,
                                            label,
                                            uid = uid_count,
                                            min_res = min_res,
                                            max_res = max_res
                                            )

      #check to see if the file has a valid resolution
      if(res_check):
        list_df.append(df)
        included_files.append((label, uid_count, trim_filenames[i]))

        # Increment UID counter
        uid_count+=1

        # Increment counter
        valid_res_count+=1
      #END IF
    #END FOR
    # Increment filecounter counter
    filecounts+=1

    #print progress outputs
    print('Number of Valid Resolutions: ', valid_res_count, '/', len(trim_filenames))
    print('Sample Set: ', basename(normpath(curr_samples_path)),' - Complete')
    print("-End Sample Set-")

  #END WHILE

  #concat all dataframes
  df = pd.concat(list_df)

  #convert python list of tuples (uid, filename) into pandas dataframe
  #This data frame is mostly redundent. You can extract this info from the main
  #dataframe. This is here more for legacy purposes and convenience.
  df_if = pd.DataFrame(included_files, columns=['Label','UID', 'Filename'])

  print('\n--End--')

  # Returns pandas dataframe of sample and pandas dataframe of included file names w/ uid.
  return df, df_if

###MultiThreaded

####Genral Functions

In [13]:
def get_num_workers():
  num = multiprocessing.cpu_count()
  assert num >= 2
  return num

In [14]:
#this is redone for multithreading.
def mt_make_pandas_dataframe(dir_path, filename, col_labels, label=pd.NA, uid = 0, min_res=4.5, max_res=6.5):
  '''
  TBD
  '''
  arr, res_check, shape = tiff_to_arr(join(dir_path, filename), min_res=min_res, max_res=max_res)
  #if filename is in label data for labeling.
  if (res_check):
    ds = convert_3D_to_1D(arr)
    df = pd.DataFrame(ds, columns=col_labels)
    df['Label'] = label
    df['Shape'] = shape
    df['File_UID_Num'] = uid
    df['File'] = filename
    return df

#### Main MultiThread Function Call

In [15]:
def mt_get_all_data(sample_directory_paths, label_paths, label_cols, min_res=4.5, max_res=6.5):
  '''
  Description:
    This is the same as get_all_data except it implements multithreading.
    This function takes samples and returns a pandas data frame of those samples
    While removing samples that do not have corrisponding labels or do not meet
    the minimum/maximum resolution standards. The Pandas data frame returned is
    of the following form: Each row is a pixel with all of it's frequencies,
    label, shape of the entire image it came from [i.e. (10, 10)], it's unque
    identifier that corrisponds to the file it came from, allong with it's
    file name. Let it be known that their is no unque identifier for a singlar
    row. Only for individual files the a set of rows originate from.
  Input:
    sample_directory_paths  :   List of strings where the strings are directory
                                paths to sample files of type '.tiff'.
    label_paths             :   List of strings where the strings are file paths
                                to labels corrisponding to the samples in the
                                same index in sample_directory_paths, of file
                                type '.csv'.
    label_cols              :   List of tuples, that hold two strings at index
                                0 & 1. These tuples represent column 1 and
                                column 2 that the get_labels function will
                                read from when getting the labels from the CSV.
                                The Index of the list corrisponds to the labels
                                index in label_paths.
    min_res                 :   int, minimum acceptible resolution.
    max_res                 :   int, maximum acceptible resolution.
  Output:
    return                  :   Pandas Data Frame, of all samples
    return                  :   Numpy Array, of all filenames that are in the
                                returned Pandas DataFrame with their UID.
  '''
  print('--Start--')

  # check to make sure that the samples paths have corrisponding label paths
  assert len(sample_directory_paths) == len(label_paths), "Number of Sample Paths does not equal the Number of Label Paths provided."

  # Check to make sure that the number of columns to read the label csv from equal the number of provided CSVs.
  assert len(label_paths) == len(label_cols), "Number of Labels does not equal the Number of csv Column names to read from."

  #unque file identifier, that is an int... thats all.
  uid_count = 1

  # this is the counter for what file path we are currently working through from
  # the passed lists of sample_directory_paths & label_paths
  filecounts = 0

  # A list of files that are valid and used to make the final dataframe, along
  # with their corrisponding UID.
  included_files = []

  # Creates the frequency labels
  columns_of_frequencies = []
  for i in range(0,373,1):
    columns_of_frequencies.append("frq" + str(i))
  #END FOR

  # loop through and add to pandas dataframe
  list_df = []

  # This while loop will work through all sample paths with corrisponding label paths
  while filecounts < len(sample_directory_paths):
    #print progress outputs
    print("\n-New Sample Set-")

    #paths
    curr_samples_path = sample_directory_paths[filecounts]
    curr_labels_path = label_paths[filecounts]

    #print progress outputs
    print('Samples: ', basename(normpath(curr_samples_path)))

    # get an array of the sample file names
    filenames = get_filenames(curr_samples_path)

    #print progress outputs
    print('Labels: ', basename(normpath(curr_labels_path)))

    # get label data frame, reading from cols 1 and cols 2
    labels = get_labels(curr_labels_path,
                        col1=label_cols[filecounts][0],
                        col2=label_cols[filecounts][1]
                        )

    #print progress outputs
    print('Trimming Samples')

    # trim the files to the ones with labels
    trim_filenames = trim_data_files(filenames, labels)
    trim_filenames_length = len(trim_filenames)

    #print progress outputs
    print('Number of Files w/ Labels: ', trim_filenames_length, '/', len(filenames))
    print('Time Estimate: ', round((trim_filenames_length*0.4)/60, 0), '-', round((trim_filenames_length*0.7)/60, 0), 'Minutes')
    print('Start Compiling Files, Total: ', trim_filenames_length)

    # a counter to keep track of the number of files with valid resolutions
    # valid_res_count = 0
    inputs = []
    for i in range(0, trim_filenames_length):

      #get the sample number for the label
      sample_num = int(trim_filenames[i].split('_')[0])

      #gets the string label with the sample number
      label = labels[labels['Sample_num'] == sample_num]['Label'].values[0]

      inputs.append((curr_samples_path,
                     trim_filenames[i],
                     columns_of_frequencies,
                     label,
                     uid_count,
                     min_res,
                     max_res
                     ))

      uid_count+=1
    #END FOR

    num_workers = get_num_workers()


    #processes=num_workers
    with multiprocessing.Pool() as pool:
      results = pool.starmap(mt_make_pandas_dataframe, inputs)
    list_df += results

    #print progress outputs
    #print('Number of Valid Resolutions: ', valid_res_count, '/', len(trim_filenames))
    print('Sample Set: ', basename(normpath(curr_samples_path)),' - Complete')
    print("-End Sample Set-")

    filecounts+=1

  #END WHILE

  #concat all dataframes
  df = pd.concat(list_df)

  #convert python list of tuples (uid, filename) into pandas dataframe
  #This data frame is mostly redundent. You can extract this info from the main
  #dataframe. This is here more for legacy purposes and convenience.
  #df_if = pd.DataFrame(included_files, columns=['UID', 'Filename'])

  print('\n--End--')

  # Returns pandas dataframe of sample and pandas dataframe of included file names w/ uid.
  return df

##Run

In [65]:
#So Slow...
df, valid_files = get_all_data([all_samples[1]], [all_labels[1]], [label_cols[1]]) #this took 22 to 41 minutes to run. Depends on Google. I think.

--Start--

-New Sample Set-
Samples:  avirisng_sample_extra
Labels:  Extras_Majority_Labeling.xlsx - Sheet1.csv
Trimming Samples
Number of Files w/ Labels:  209 / 209
Time Estimate:  1.0 - 2.0 Minutes
Start Compiling Files, Total:  209
Number of Valid Resolutions:  186 / 209
Sample Set:  avirisng_sample_extra  - Complete
-End Sample Set-

--End--


In [None]:
#Multithreaded
#somethings wrong with the threading. I see little to no time improvement...
df_1 = mt_get_all_data([all_samples[2]], [all_labels[2]], [label_cols[2]])

--Start--

-New Sample Set-
Samples:  avirisng_sample_extra2
Labels:  Extras_2_Majority_Labeling.xlsx - Sheet1.csv
Trimming Samples
Number of Files w/ Labels:  929 / 929
Time Estimate:  6.0 - 11.0 Minutes
Start Compiling Files, Total:  929


KeyboardInterrupt: 

### View Data Frames

In [91]:
# Lets check it out; Pandas Data Frame
df.iloc[0]

Unnamed: 0,0
frq0,0.04083
frq1,0.039262
frq2,0.046123
frq3,0.049103
frq4,0.05269
...,...
frq372,0.258372
Label,Annual Crops (e.g wheat)
Shape,"(8, 9)"
File_UID_Num,1


In [89]:
df

Unnamed: 0,frq0,frq1,frq2,frq3,frq4,frq5,frq6,frq7,frq8,frq9,...,frq367,frq368,frq369,frq370,frq371,frq372,Label,Shape,File_UID_Num,File
0,0.040830,0.039262,0.046123,0.049103,0.052690,0.060306,0.063361,0.069629,0.074473,0.081587,...,0.239576,0.246793,0.250012,0.252107,0.255135,0.258372,Annual Crops (e.g wheat),"(8, 9)",1,2408_ang20231029t110711_012_L2A_OE_main_275777...
1,0.035390,0.035799,0.041245,0.044327,0.048731,0.051793,0.057151,0.062138,0.067823,0.071327,...,0.211063,0.220510,0.221435,0.222255,0.222488,0.224936,Annual Crops (e.g wheat),"(8, 9)",1,2408_ang20231029t110711_012_L2A_OE_main_275777...
2,0.033294,0.044195,0.046968,0.044567,0.052458,0.058418,0.058755,0.062721,0.069245,0.076854,...,0.218350,0.221983,0.225136,0.228283,0.232234,0.236200,Annual Crops (e.g wheat),"(8, 9)",1,2408_ang20231029t110711_012_L2A_OE_main_275777...
3,0.034774,0.036127,0.041421,0.046593,0.050773,0.054462,0.060933,0.065428,0.069914,0.076345,...,0.215630,0.223428,0.225797,0.229850,0.230700,0.234052,Annual Crops (e.g wheat),"(8, 9)",1,2408_ang20231029t110711_012_L2A_OE_main_275777...
4,0.034614,0.042376,0.046474,0.046373,0.054111,0.056686,0.060972,0.064187,0.070231,0.075515,...,0.227952,0.235214,0.237425,0.239867,0.242431,0.245720,Annual Crops (e.g wheat),"(8, 9)",1,2408_ang20231029t110711_012_L2A_OE_main_275777...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.005915,0.014154,0.018535,0.017707,0.016621,0.019260,0.018152,0.019787,0.021316,0.020557,...,0.074164,0.073345,0.076666,0.078877,0.081096,0.083624,Planted Forest,"(8, 8)",186,8458_ang20231022t105533_037_L2A_OE_main_275777...
60,0.011605,0.009552,0.008141,0.017908,0.019734,0.019101,0.018609,0.021183,0.021963,0.024310,...,0.091963,0.099958,0.102536,0.106039,0.109046,0.112166,Planted Forest,"(8, 8)",186,8458_ang20231022t105533_037_L2A_OE_main_275777...
61,0.014822,0.014847,0.026756,0.016748,0.020747,0.019792,0.022259,0.021147,0.024520,0.025483,...,0.084392,0.087913,0.090791,0.093396,0.096506,0.099245,Planted Forest,"(8, 8)",186,8458_ang20231022t105533_037_L2A_OE_main_275777...
62,0.009923,0.011122,0.006631,0.014264,0.013304,0.016036,0.014566,0.017388,0.018668,0.020201,...,0.069181,0.073434,0.075557,0.079137,0.082291,0.084815,Planted Forest,"(8, 8)",186,8458_ang20231022t105533_037_L2A_OE_main_275777...


In [None]:
# Check out Files and UID; Numpy Array
valid_files

Unnamed: 0,Label,UID,Filename
0,Unconsolidated Barren,1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
1,Unconsolidated Barren,2,2_ang20231028t100428_001_L2A_OE_main_27577724_...
2,Unconsolidated Barren,3,2_ang20231028t095542_011_L2A_OE_main_27577724_...
3,"Consolidated Barren (rocks, salt pans)",4,3_ang20231028t095542_011_L2A_OE_main_27577724_...
4,"Consolidated Barren (rocks, salt pans)",5,3_ang20231028t100428_001_L2A_OE_main_27577724_...
...,...,...,...
4172,"Consolidated Barren (rocks, salt pans)",4173,28496_ang20231109t065855_008_L2A_OE_main_27577...
4173,Shrubs,4174,28497_ang20231109t071216_015_L2A_OE_main_27577...
4174,Shrubs,4175,28497_ang20231031t085208_007_L2A_OE_main_27577...
4175,Natural Wooded Land,4176,28499_ang20231031t085208_007_L2A_OE_main_27577...


### Save To CSV

In [None]:
#Saves Data Frame to CSV
df.to_csv(path_to_save_sample_csv)
valid_files.to_csv(path_to_save_uid_filename_csv)

## Spot checks
(Trash)


In [16]:
# For checking specific samples
with rasterio.open(join(path_samples_1, '1_ang20231028t101421_014_L2A_OE_main_27577724_RFL_ORT.tif')) as ds:
    print("resolution: ", ds.res)
    print("Shape: ", ds.shape)
    ds = ds.read()

resolution:  (4.9, 4.9)
Shape:  (10, 10)


In [23]:
ds[2]

array([[0.02680097, 0.01865721, 0.01829539, 0.02434518, 0.02322113,
        0.02357993, 0.02639832, 0.02707991, 0.02978392, 0.02765991],
       [0.01714127, 0.02413935, 0.02514076, 0.02434518, 0.02604952,
        0.0243514 , 0.02485638, 0.02700877, 0.02978392, 0.02455018],
       [0.01499409, 0.02413935, 0.02533175, 0.0226331 , 0.02528977,
        0.02666448, 0.02430369, 0.02487298, 0.02746968, 0.02711123],
       [0.02398286, 0.02495304, 0.02439311, 0.0226331 , 0.02475102,
        0.02666448, 0.02450481, 0.0321118 , 0.02959609, 0.03352023],
       [0.02109963, 0.02194157, 0.02439311, 0.02458489, 0.02570568,
        0.0254536 , 0.03141659, 0.0321118 , 0.03170523, 0.03352023],
       [0.02109963, 0.02320733, 0.02253373, 0.02421281, 0.02944723,
        0.0284076 , 0.0335991 , 0.03242277, 0.03151374, 0.0277345 ],
       [0.02384901, 0.02378245, 0.02253373, 0.02601866, 0.02944723,
        0.03000867, 0.03077957, 0.03274279, 0.02859406, 0.0277345 ],
       [0.02210709, 0.02378245, 0.0258838

In [19]:
ds[:,0,1]

array([0.01346918, 0.01544228, 0.01865721, 0.02145907, 0.02846207,
       0.02956712, 0.03512356, 0.03844717, 0.03900475, 0.04235928,
       0.04693261, 0.05035537, 0.05463213, 0.05803908, 0.05855815,
       0.05996166, 0.06250343, 0.06436159, 0.06627702, 0.06837496,
       0.07147294, 0.07538021, 0.07706936, 0.07938214, 0.08369323,
       0.08486518, 0.08713029, 0.09118302, 0.09441531, 0.09671456,
       0.09954859, 0.10348529, 0.10670978, 0.10919802, 0.11225521,
       0.11746091, 0.12207057, 0.1249306 , 0.12650824, 0.12878957,
       0.13254014, 0.13563102, 0.13891137, 0.1403915 , 0.14224128,
       0.14517064, 0.1461697 , 0.14920712, 0.14974831, 0.1511704 ,
       0.1518508 , 0.15486827, 0.15564705, 0.15629905, 0.15868089,
       0.15867057, 0.15962036, 0.16033167, 0.16275533, 0.16362947,
       0.16458546, 0.16633454, 0.1704909 , 0.17363927, 0.17401737,
       0.17447199, 0.17552736, 0.17753565, 0.18219285, 0.18306653,
       0.18067567, 0.18018858, 0.1816293 , 0.18313213, 0.18389

In [24]:
arr, res_check, shape = tiff_to_arr(join(path_samples_1, '1_ang20231028t101421_014_L2A_OE_main_27577724_RFL_ORT.tif'), min_res=4.5, max_res=5.5)

In [26]:
arr[1]

array([[0.02545341, 0.01544228, 0.01315289, 0.02426513, 0.02481231,
        0.01259124, 0.016026  , 0.02379542, 0.02572341, 0.02041227],
       [0.01314393, 0.02362236, 0.02386183, 0.02426513, 0.01412701,
        0.0188639 , 0.02014118, 0.02294535, 0.02572341, 0.01742779],
       [0.01366109, 0.02362236, 0.01425563, 0.02078871, 0.01966477,
        0.02169595, 0.02104024, 0.0201913 , 0.02098918, 0.0252691 ],
       [0.01401854, 0.01519072, 0.02062858, 0.02078871, 0.02069796,
        0.02169595, 0.01877324, 0.02584995, 0.02414699, 0.02508363],
       [0.01997662, 0.01848179, 0.02062858, 0.02275201, 0.01975328,
        0.01931756, 0.02566899, 0.02584995, 0.02519147, 0.02508363],
       [0.01997662, 0.02065543, 0.01635433, 0.01752375, 0.02255032,
        0.02576256, 0.02576471, 0.02615343, 0.0252151 , 0.02634802],
       [0.02006628, 0.01995645, 0.01635433, 0.02071472, 0.02255032,
        0.02199314, 0.02280192, 0.02158698, 0.02769558, 0.02634802],
       [0.02018143, 0.01995645, 0.0213696

In [27]:
columns_of_frequencies = []
for i in range(0,373,1):
  columns_of_frequencies.append("frq" + str(i))

In [28]:
df, res_check = make_pandas_dataframe(path_samples_1, '1_ang20231028t101421_014_L2A_OE_main_27577724_RFL_ORT.tif', columns_of_frequencies, "shit", 1, 4.5, 6.5)

In [29]:
df

Unnamed: 0,frq0,frq1,frq2,frq3,frq4,frq5,frq6,frq7,frq8,frq9,...,frq367,frq368,frq369,frq370,frq371,frq372,Label,Shape,File_UID_Num,File
0,0.018910,0.025453,0.026801,0.029931,0.025996,0.030246,0.033545,0.035334,0.039591,0.044717,...,0.154073,0.158080,0.160214,0.164343,0.173604,0.190160,shit,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
1,0.013469,0.015442,0.018657,0.021459,0.028462,0.029567,0.035124,0.038447,0.039005,0.042359,...,0.146481,0.146843,0.155816,0.159489,0.157601,0.169750,shit,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
2,0.013482,0.013153,0.018295,0.021435,0.028463,0.031020,0.033894,0.037844,0.039349,0.043371,...,0.146007,0.148808,0.149374,0.162757,0.157468,0.167964,shit,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
3,0.013058,0.024265,0.024345,0.023020,0.027349,0.033271,0.032494,0.036257,0.040206,0.042641,...,0.150941,0.147344,0.155273,0.154464,0.140058,0.165022,shit,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
4,0.015231,0.024812,0.023221,0.024547,0.028005,0.033780,0.032661,0.035639,0.040920,0.043962,...,0.157301,0.157921,0.164166,0.162148,0.151324,0.178857,shit,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.021151,0.024380,0.035239,0.033584,0.035645,0.041016,0.043252,0.047873,0.051074,0.052111,...,0.138150,0.141279,0.152104,0.160080,0.172503,0.170343,shit,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
96,0.017616,0.025192,0.034934,0.032443,0.034407,0.042204,0.044833,0.047170,0.051684,0.052533,...,0.137173,0.142190,0.151477,0.156132,0.159610,0.163451,shit,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
97,0.034693,0.031002,0.033019,0.039256,0.047065,0.042934,0.044410,0.048026,0.050822,0.058837,...,0.131137,0.131562,0.131597,0.130821,0.129872,0.157986,shit,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...
98,0.033191,0.030726,0.032654,0.039991,0.045592,0.042009,0.044954,0.046063,0.050875,0.058095,...,0.124726,0.126619,0.135199,0.133600,0.138676,0.151029,shit,"(10, 10)",1,1_ang20231028t101421_014_L2A_OE_main_27577724_...


In [61]:
temp_arr = arr[:,1,1]

In [62]:
test_arr = convert_3D_to_1D(arr)

In [63]:
test_arr.shape

(100, 373)

In [64]:
np.array_equal(temp_arr, test_arr[11])

True

In [None]:
temp = get_labels(all_labels[1], col1=label_cols[1][0], col2=label_cols[1][1])
temp

Unnamed: 0,Sample_num,Label
0,2408,Annual Crops (e.g wheat)
1,2409,Annual Crops (e.g wheat)
2,2829,"Permanent Crops (e.g., vineyard)"
3,3408,Shrubs
4,3469,Shrubs
...,...,...
107,7985,"Permanent Crops (e.g., vineyard)"
108,7986,"Permanent Crops (e.g., vineyard)"
109,8192,Shrubs
110,8226,Planted Forest


In [None]:
label_cols = [('Sample_num','Class.4'), ('Sample_num','Class.3'),('Sample_num','Class.4')]

In [None]:
label_cols[1][1]

'Class.3'

In [None]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None
print(df.columns)

Index(['frq0', 'frq1', 'frq2', 'frq3', 'frq4', 'frq5', 'frq6', 'frq7', 'frq8',
       'frq9',
       ...
       'frq367', 'frq368', 'frq369', 'frq370', 'frq371', 'frq372', 'Label',
       'Shape', 'File_UID_Num', 'File'],
      dtype='object', length=377)


In [None]:
multiprocessing.cpu_count()

2

## Code for others

In [148]:
def load_data(sample_path):
  df = pd.read_csv(sample_path) # read in the data from a csv
  df.rename(columns={df.columns[0]: 'img_pxl_index'}, inplace=True) # names an unnamed column
  df['Shape'] = df['Shape'].apply(literal_eval) # convert the string tuples to actual tuples.
  df['img_pos'] = df[['Shape','img_pxl_index']].apply(add_img_lvl_pixel_loc)
  return df

In [145]:
def add_img_lvl_pixel_loc(df):
  shape = df.iloc[1]
  index = df.iloc[0]
  print(df)
  assert(len(shape) == 2)
  return (index // shape[1], index % shape[1])

In [128]:
df = load_data(path_to_save_sample_csv)

In [136]:
add_img_lvl_pixel_loc((10,10), 0)

(0, 0)

In [149]:
df['img_pos'] = df['Shape','img_pxl_index'].apply(add_img_lvl_pixel_loc)

KeyError: ('Shape', 'img_pxl_index')

In [154]:
df[['Shape','img_pxl_index']].iloc[0].iloc[0]

(10, 10)