## Setup

### Installs

In [53]:
!pip install rasterio
!pip install raster2xyz



### Imports

In [54]:
import rasterio
from rasterio.plot import show
from raster2xyz.raster2xyz import Raster2xyz
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

### Connect Google Drive

In [55]:
# Mount Google Drive at /content/drive/ (Colab only); every path configured
# in this notebook lives underneath that mount point.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### File Paths

In [None]:
# Directories (sub-directories are relative to main_dir)
main_dir = '/content/drive/Shareddrives/Land_Classification_Training_shared/Land_Classification_1D/' # Root directory that every path below is joined onto
samples_dir = 'avirisng_sample_subset/'  # holds the sample .tif files
csv_dir = 'valid_samples_1d/'  # where the output CSV is written

# Files
labels_file = 'Sample_Boxes_Labels/Majority_Labeling.xlsx_-_Sheet1.csv'  # label table, relative to main_dir
csv_file_name = 'samples_3.csv'  # output CSV name -- CHANGE IT so an earlier export is not overwritten

# Full paths built from the pieces above
path_labels = join(main_dir, labels_file) # file
path_samples = join(main_dir, samples_dir) # directory
path_to_save_csv = join(main_dir, csv_dir) # directory

## Functions


In [76]:
def get_labels(file_path):
  '''
  Description:
    Reads the label CSV and returns one row per labelled sample.
  Input:
    file_path : path to the label CSV. The file must contain the columns
                'Sample_num' and 'Class.4'.
  Output:
    DataFrame with the columns 'Sample_num' and 'Label', where 'Label' is the
    renamed 'Class.4' column.
  '''
  # Read from the path that was passed in rather than a module-level global.
  labels = pd.read_csv(file_path)[['Sample_num', 'Class.4']]
  return labels.rename(columns={'Class.4': 'Label'})

In [77]:
def trim_data_files(filenames, labels):
  '''
  Description:
    Keeps only the filenames whose leading sample number ('<num>_...') also
    appears in labels['Sample_num']. Both inputs are walked once, merge-style,
    so both must already be sorted ascending by sample number.
  Input:
    filenames : sequence (list or numpy array) of strings shaped like
                '<sample_num>_...'
    labels    : DataFrame with a 'Sample_num' column
  Output:
    Numpy Array of the filenames that have a matching label, in input order.
  '''
  # Positional values: works for any DataFrame index, not only 0..n-1.
  label_nums = labels['Sample_num'].to_numpy()
  size_label = len(label_nums)
  size_filenames = len(filenames)

  trimmed_filenames = []
  count_label = 0
  count_filenames = 0

  # Stop as soon as either the labels or the filenames are exhausted.
  while count_label < size_label and count_filenames < size_filenames:
    raw_label = label_nums[count_label]
    if pd.isna(raw_label):
      # A missing sample number can never match a file; skip it explicitly
      # instead of relying on an undefined NaN -> int cast.
      count_label += 1
      continue

    label_num = int(raw_label)
    filename_num = int(filenames[count_filenames].split('_')[0])

    if filename_num == label_num:
      trimmed_filenames.append(filenames[count_filenames])
      # Only the file pointer advances, so further files carrying the same
      # sample number are matched against this label as well.
      count_filenames += 1
    elif filename_num > label_num:
      # Sorted input: no remaining file can belong to this label.
      count_label += 1
    else:
      # filename_num < label_num: this file has no label.
      count_filenames += 1
  return np.array(trimmed_filenames)

In [78]:
def check_res(res, min_res_bound=4.5, max_res_bound=6.5):
  '''
  Description:
    Tells whether both the x and the y pixel resolution lie inside the
    inclusive range [min_res_bound, max_res_bound].
  Input:
    res            : indexable pair, res[0] is the x and res[1] the y resolution
    min_res_bound  : smallest accepted resolution
    max_res_bound  : largest accepted resolution
  Output:
    Bool    : False as soon as one axis falls outside the bounds, else True
  '''
  for axis in (0, 1):
    axis_res = res[axis]
    if axis_res < min_res_bound or axis_res > max_res_bound:
      return False
  return True

In [79]:
def tiff_to_arr(filepath, min_res=4.5, max_res=6.5):
  '''
  Description:
    Opens a .tiff file with rasterio and, when its pixel resolution passes
    check_res, reads it into a numpy array.
  Input:
    filepath  : The file path to the .tiff file, starting from /content/...
    min_res   : lower resolution bound handed to check_res
    max_res   : upper resolution bound handed to check_res
  Output:
    data_3D   : whatever dataset.read() returns for a valid file, otherwise
                an empty numpy array
    Bool      : True when the resolution check passed
    shape     : str(dataset.shape) for a valid file, otherwise ""
  '''
  with rasterio.open(filepath) as dataset:
    resolution_ok = check_res(
        dataset.res,
        min_res_bound=min_res,
        max_res_bound=max_res,
    )
    if resolution_ok:
      bands = dataset.read()
      return bands, True, str(dataset.shape)
  # Resolution out of bounds: hand back placeholders the caller can test for.
  return np.array([]), False, ""

In [80]:
def convert_3D_to_1D(data_3D):
  '''
  Description:
    Rearranges a band-first raster of shape (num_band, num_row, num_col) into
    one row per pixel. Pixels are emitted row by row (pixel [i, j] becomes
    row i * num_col + j) and each row holds that pixel's value in every band.
    Done with a single vectorized reshape instead of a Python loop per pixel.
  Input:
    data_3D         : Numpy Array with 3 dimensions of shape (num_band, num_row, num_col)
  Output:
    bands_per_pixel : Numpy Array of shape (num_row * num_col, num_band). A
                      raster with zero pixels gives shape (0, num_band), so
                      the column count is always preserved.
  '''
  cube = np.asarray(data_3D)
  num_row, num_col = cube.shape[1], cube.shape[2]

  # Move the two pixel axes to the front: (band, row, col) -> (row, col, band).
  pixels_first = np.moveaxis(cube, (1, 2), (0, 1))

  # Values per pixel, computed explicitly because reshape(-1) cannot infer a
  # dimension when the array is empty.
  values_per_pixel = int(np.prod(pixels_first.shape[2:], dtype=np.int64))

  # np.array(...) forces a copy so the result never aliases the input raster.
  return np.array(pixels_first.reshape(num_row * num_col, values_per_pixel))

In [81]:
def get_filenames(directory_path):
  '''
   * Description:
   *   Lists directory_path, keeps the entries that are regular files
   *   (sub-directories are dropped), and orders them by the integer in front
   *   of the first '_' of each name ('#_...').
   * Input:
   *   directory_path   : a string that is the file path to the directory
   *                      you want a filename list from.
   * Output:
   *   Sorted Numpy Array of Filenames, array of strings
  '''
  def leading_number(name):
    # '12_abc.tif' -> 12
    return int(name.split("_")[0])

  only_files = []
  for entry in listdir(directory_path):
    if isfile(join(directory_path, entry)):
      only_files.append(entry)

  only_files.sort(key=leading_number)
  return np.array(only_files)

In [82]:
def make_pandas_dataframe(dir_path, filename, col_labels, label=pd.NA, min_res=4.5, max_res=6.5):
  '''
  Description:
    Loads one sample file through tiff_to_arr and, when its resolution is
    valid, turns it into a per-pixel DataFrame tagged with label, shape and
    source filename.
  Input:
    dir_path    : directory that contains the file
    filename    : name of the file inside dir_path
    col_labels  : column names for the band values
    label       : value written to the 'Label' column of every row
    min_res     : lower resolution bound passed to tiff_to_arr
    max_res     : upper resolution bound passed to tiff_to_arr
  Output:
    (DataFrame, True)    when the resolution check passed
    (empty array, False) otherwise -- the first item is then the empty numpy
                         array returned by tiff_to_arr
  '''
  tiff_path = join(dir_path, filename)
  arr, res_check, shape = tiff_to_arr(tiff_path, min_res=min_res, max_res=max_res)

  if not res_check:
    return arr, False

  pixel_table = pd.DataFrame(convert_3D_to_1D(arr), columns=col_labels)
  pixel_table['Label'] = label
  pixel_table['Shape'] = shape
  pixel_table['File'] = filename
  return pixel_table, True

In [83]:
def get_all_data(sample_directory_path, min_res=4.5, max_res=6.5, num_bands=373):
  '''
  Description:
    Builds one DataFrame holding the per-pixel band values of every sample
    file that has a label and a valid resolution.
    The label table is read from the module-level `path_labels`.
  Input:
    sample_directory_path : directory containing the sample files
    min_res               : lower resolution bound for a file to be included
    max_res               : upper resolution bound for a file to be included
    num_bands             : number of band columns ('frq0' .. 'frq<num_bands-1>');
                            must match the band count of the sample files
  Output:
    DataFrame   : concatenation of the per-file DataFrames (columns: the band
                  columns, then 'Label', 'Shape', 'File'). Each file keeps its
                  own row index, so index values repeat across files.
    Numpy Array : names of the files that were included
  Raises:
    ValueError  : when no file passes the label and resolution filters
  '''
  # Band column names: frq0, frq1, ...
  columns_of_frequencies = ["frq" + str(i) for i in range(num_bands)]

  # Sorted sample filenames, then only those that have a label.
  filenames = get_filenames(sample_directory_path)
  labels = get_labels(path_labels)
  trim_filenames = trim_data_files(filenames, labels)

  list_df = []          # one DataFrame per included file
  included_files = []   # names of the files that made it into list_df
  for filename in trim_filenames:
    sample_num = int(filename.split('_')[0])                                   # sample number encoded in the filename
    label = labels[labels['Sample_num'] == sample_num]['Label'].values[0]     # first label registered for that sample

    df, res_check = make_pandas_dataframe(
        sample_directory_path,
        filename,
        columns_of_frequencies,
        label,
        min_res = min_res,
        max_res = max_res
        )

    if res_check:                                                              # skip files whose resolution is out of bounds
      list_df.append(df)
      included_files.append(filename)

  if not list_df:
    # pd.concat([]) raises a bare "No objects to concatenate"; say why instead.
    raise ValueError(
        "No labelled sample file with a resolution between "
        + str(min_res) + " and " + str(max_res)
        + " was found in " + str(sample_directory_path)
    )

  return pd.concat(list_df), np.array(included_files)

## Run

In [84]:
# Build the labelled per-pixel table from the sample directory.
# This is the slow step of the notebook.
df, valid_files = get_all_data(path_samples) # took 31 to 41 minutes in earlier runs

In [86]:
# Let's check it out: display the combined DataFrame.
df

Unnamed: 0,frq0,frq1,frq2,frq3,frq4,frq5,frq6,frq7,frq8,frq9,...,frq366,frq367,frq368,frq369,frq370,frq371,frq372,Label,Shape,File
0,0.018910,0.025453,0.026801,0.029931,0.025996,0.030246,0.033545,0.035334,0.039591,0.044717,...,0.154068,0.154073,0.158080,0.160214,0.164343,0.173604,0.190160,Unconsolidated Barren,"(10, 10)",1_ang20231028t101421_014_L2A_OE_main_27577724_...
1,0.013469,0.015442,0.018657,0.021459,0.028462,0.029567,0.035124,0.038447,0.039005,0.042359,...,0.143804,0.146481,0.146843,0.155816,0.159489,0.157601,0.169750,Unconsolidated Barren,"(10, 10)",1_ang20231028t101421_014_L2A_OE_main_27577724_...
2,0.013482,0.013153,0.018295,0.021435,0.028463,0.031020,0.033894,0.037844,0.039349,0.043371,...,0.145602,0.146007,0.148808,0.149374,0.162757,0.157468,0.167964,Unconsolidated Barren,"(10, 10)",1_ang20231028t101421_014_L2A_OE_main_27577724_...
3,0.013058,0.024265,0.024345,0.023020,0.027349,0.033271,0.032494,0.036257,0.040206,0.042641,...,0.149051,0.150941,0.147344,0.155273,0.154464,0.140058,0.165022,Unconsolidated Barren,"(10, 10)",1_ang20231028t101421_014_L2A_OE_main_27577724_...
4,0.015231,0.024812,0.023221,0.024547,0.028005,0.033780,0.032661,0.035639,0.040920,0.043962,...,0.158802,0.157301,0.157921,0.164166,0.162148,0.151324,0.178857,Unconsolidated Barren,"(10, 10)",1_ang20231028t101421_014_L2A_OE_main_27577724_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.046806,0.051411,0.059857,0.063429,0.066798,0.071768,0.072838,0.076341,0.080804,0.084892,...,0.012441,0.014959,0.019577,0.011943,0.015404,0.021072,0.027756,Waterbodies,"(8, 8)",12128_ang20231031t092656_025_L2A_OE_main_27577...
60,0.046862,0.052788,0.061928,0.065137,0.069281,0.072774,0.073955,0.078714,0.082927,0.088494,...,0.016511,0.012616,0.015370,0.018824,0.028074,0.047163,0.031902,Waterbodies,"(8, 8)",12128_ang20231031t092656_025_L2A_OE_main_27577...
61,0.044689,0.049277,0.055507,0.061436,0.064873,0.068337,0.069823,0.074879,0.078751,0.082332,...,0.012599,0.012702,0.017404,0.018361,0.018821,0.030842,0.078079,Waterbodies,"(8, 8)",12128_ang20231031t092656_025_L2A_OE_main_27577...
62,0.047958,0.051963,0.058101,0.065782,0.067146,0.070770,0.073709,0.077327,0.081112,0.085983,...,0.014440,0.018656,0.015339,0.018846,0.030422,0.048251,0.034640,Waterbodies,"(8, 8)",12128_ang20231031t092656_025_L2A_OE_main_27577...


In [None]:
# Check the number of NaNs across all band columns.
# The band columns are taken from the DataFrame itself, so the band count
# does not have to be hard-coded a second time here.
columns_of_frequencies = [col for col in df.columns if str(col).startswith("frq")]

nan_counts = df[columns_of_frequencies].isna().sum().sum()
nan_counts

5723685

In [87]:

# Write the table to <path_to_save_csv>/<csv_file_name>. The DataFrame index
# is written as the first column because to_csv's index option is left at its default.
df.to_csv(join(path_to_save_csv, csv_file_name)) # Saves to CSV

In [75]:
# Spot-check a single sample file: print its pixel resolution and raster shape.
with rasterio.open(join(path_samples, '1_ang20231028t101421_014_L2A_OE_main_27577724_RFL_ORT.tif')) as ds:
    print("resolution: ", ds.res)
    print("Shape: ", ds.shape)

resolution:  (4.9, 4.9)
Shape:  (10, 10)
