In [1]:
#| default_exp imgs_stats

# Statistics of images

In [2]:
#|hide
from nbdev.showdoc import *

In [3]:
#|export

import rasterio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from tqdm import tqdm
import itertools as it

from flood_exercise import utils_func
from flood_exercise import const_vals as CONST


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
#|export

class ImgsStatistics():

  def __init__(self,
      path_to_imgs : str , # path to the folder that contains the images
      path_to_split_file : str , # path to the folder that contains the split data
      ):
      
      # get the path to the split files
      self.path_to_split_file = utils_func.load_list_paths(path_to_split_file,filter_file=False)

      # get the tiles paths
      self.list_of_files = utils_func.load_list_paths(path_to_imgs,filter_file = True)
      self.img_stats_df = self._iterate_tiles_()

      #get the split data 
      self.split_dfs = self._get_split_data_()

      # join between the split data and the images stats
      self.results = self.img_stats_df.merge(self.split_dfs, on=CONST.JOIN_COL_NAME,
                                        how = 'left').drop_duplicates(subset = CONST.PATH_STR)
      self.results.drop(CONST.DF_ID_COL_NAME,axis=1,inplace=True)
      self.results.reset_index(inplace=True)

      self.count_regions = pd.DataFrame(self.results[CONST.REGION_STR].value_counts())
      self.count_split = pd.DataFrame(self.results[CONST.SPLIT_COL_NAME].value_counts())
      

  def _get_split_data_(self):
      """ 
     Takes list of paths to images from the type  and extract the split group.
     
     Parameters:
     self
     Returns:
     string: pd.DataFrame contains all the paths, image id and the split group.
      """

      self.collect_dfs = []
      for split_path in self.path_to_split_file:
         # get the split name
         split_type_str = split_path.split(CONST.SPLIT_TILES_NAMES_STR1)[-1].split(CONST.SPLIT_TILES_NAMES_STR2)[1]

         # open the csv file and add column with the split group
         df = pd.read_csv(split_path,header=None)
         # to be caution - take the two columns and make one column with all the files names
         df =pd.DataFrame(sorted(it.chain(*df.values)))

         df.columns = [CONST.DF_ID_COL_NAME]

         df[CONST.SPLIT_COL_NAME] = split_type_str
         self.collect_dfs.append(df)

         split_dfs = pd.concat(self.collect_dfs)

         split_dfs[CONST.JOIN_COL_NAME]= split_dfs[CONST.DF_ID_COL_NAME].str.split(CONST.SPLIT_TILES_NAMES_STR2).str[:2].str.join(CONST.SPLIT_TILES_NAMES_STR2)
         

      return split_dfs


  def _get_region_name_(self,
                        tile_name : str , # path of image , assuming that the first word in the file name is the region name
                        ):
     """
     Takes a string of images from the type "region_id_label.tif and extract the region name.
     for example, for "Bolivia_23014_S2Hand.tif" it will return Bolivia
     Parameters:
     folder_path (str): The path to the folder containing TIFF files.

     Returns:
     string: Name of the region , extracted from the tile name 
     """
     region = tile_name.split(CONST.SPLIT_TILES_NAMES_STR1)[-1].split(CONST.SPLIT_TILES_NAMES_STR2)[0]
     return region
  
  def _img_statistics_(self,
                       path : str , # path to image (tif file)
                       ):

     """
     recieves path of image, returns statistics per image 
     Parameters:
     path (str): The path to the TIFF file.

     Returns:
     pd.DataFrame  with statistics for the given image 
     """                      
     

     with rasterio.open(path) as src:
      
      src_arr = src.read()

      #convert 0 to nan , assuming 0 is no value and we don't want it to interrupt the staitistics
      src_arr  = np.where(src_arr==0, np.nan, src_arr )
      band_names = list(src.descriptions)

      collect_bands_stats = {}

      for band_name , index in zip(band_names,range(0,src.read().shape[0])):
         self.bands_stats_imgs = {CONST.STR_BAND_NAME : [] , CONST.STR_MEAN : [] , CONST.STR_STD : [] }
         arr = src_arr[index,:,:]

         #calculate mean
         mean = np.nanmean(arr)
         
         #calculate std
         std = np.nanstd(arr)

         collect_bands_stats[band_name +'_' + CONST.STR_MEAN] = mean
         collect_bands_stats[band_name +'_' + CONST.STR_STD] = std
         
   
     df_img_stats = pd.DataFrame.from_dict([collect_bands_stats])

     return df_img_stats


  def _iterate_tiles_(self):

   self.collect_info = {CONST.PATH_STR : [] , CONST.REGION_STR : []}
   self.collect_stats = []

   for path in tqdm(self.list_of_files):
      #get the region name using the get_region_name function
      region = self._get_region_name_(path)
      
      #collect the regn name and the path for the final table
      self.collect_info[CONST.PATH_STR].append(path)
      self.collect_info[CONST.REGION_STR].append(region)
         
      #get the image statistics
      df_img_stats = self._img_statistics_(path)
      self.collect_stats.append(df_img_stats)

   #organize datasets
   #organize the region info
   df1 = pd.DataFrame(self.collect_info)
   #ordanize the stats
   df2 = pd.concat(self.collect_stats)
   df2.reset_index(inplace=True)
   #cocatenate
   results = pd.concat([df1,df2],axis=1)
   
   #add names column so we can join between the split type and the image name 
   results[CONST.JOIN_COL_NAME] = results[CONST.PATH_STR].str.split(CONST.SPLIT_TILES_NAMES_STR1).str[-1].str.split(CONST.SPLIT_TILES_NAMES_STR2).str[:2].str.join(CONST.SPLIT_TILES_NAMES_STR2)

   return results



In [5]:
instance = ImgsStatistics(
  path_to_imgs = r'D:\git\flood_exercise\S2',
  path_to_split_file= r'D:\git\flood_exercise\split\flood_handlabeled' )

instance.results

  mean = np.nanmean(arr)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
100%|██████████| 399/399 [01:05<00:00,  6.08it/s]


Unnamed: 0,level_0,path,region,index,B1_mean,B1_std,B2_mean,B2_std,B3_mean,B3_std,...,B9_mean,B9_std,B10_mean,B10_std,B11_mean,B11_std,B12_mean,B12_std,name_for_join,split
0,0,D:\git\flood_exercise\S2\Nigeria_417184_S2Hand...,Nigeria,0,2045.121361,366.960304,1839.157126,391.249278,1811.085734,379.863595,...,953.190210,387.732510,159.752287,74.063205,2718.732305,552.134430,2072.346645,544.698418,Nigeria_417184,test
1,2,D:\git\flood_exercise\S2\Mekong_1396181_S2Hand...,Mekong,0,1572.750298,238.176893,1345.034756,289.717047,1332.743843,317.775426,...,243.451187,109.609274,37.857872,34.112022,1040.482773,586.654512,626.226967,345.350598,Mekong_1396181,train
2,4,D:\git\flood_exercise\S2\Mekong_1191208_S2Hand...,Mekong,0,2013.734406,1151.946099,1772.196365,1256.893949,1716.215126,1234.393483,...,371.368935,187.097016,9.684616,2.510644,2307.633781,1232.076664,1301.098717,1043.084131,Mekong_1191208,train
3,6,D:\git\flood_exercise\S2\Mekong_1248200_S2Hand...,Mekong,0,1442.364147,138.868240,1294.289673,217.404070,1417.947430,285.272892,...,108.150059,67.019372,5.705055,1.241582,596.679100,549.332932,323.786411,281.757341,Mekong_1248200,train
4,8,D:\git\flood_exercise\S2\India_900498_S2Hand.tif,India,0,1489.600925,118.475460,1286.906940,166.706857,1264.285282,187.111246,...,102.980587,43.379124,7.087765,1.656915,1049.212807,853.856596,665.331821,590.233659,India_900498,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,788,D:\git\flood_exercise\S2\USA_741178_S2Hand.tif,USA,0,1335.008938,242.002314,1156.158051,366.925566,1165.635044,378.320440,...,963.875763,149.946729,12.606197,1.407922,2408.852219,713.517056,1494.073376,675.028190,USA_741178,valid
395,790,D:\git\flood_exercise\S2\USA_761032_S2Hand.tif,USA,0,1211.460930,60.073873,957.013645,101.266215,960.577333,103.557143,...,945.488744,209.771066,10.903004,1.085587,1821.383298,473.622415,931.870198,354.322564,USA_761032,valid
396,792,D:\git\flood_exercise\S2\USA_955053_S2Hand.tif,USA,0,1277.548512,59.184389,1032.082386,102.587972,992.515831,108.127801,...,683.315823,231.303370,13.719379,5.158683,1724.935505,637.995502,954.246548,425.050670,USA_955053,train
397,794,D:\git\flood_exercise\S2\USA_986268_S2Hand.tif,USA,0,1244.628212,48.008136,994.853779,91.446819,996.123226,104.907798,...,808.566460,144.699138,10.382317,1.186718,1949.497345,432.971810,1000.108887,277.715240,USA_986268,valid


In [7]:
instance.split_dfs

Unnamed: 0,id,split,name_for_join
0,Ghana_1078550_LabelHand.tif,test,Ghana_1078550
1,Ghana_1078550_S1Hand.tif,test,Ghana_1078550
2,Ghana_141271_LabelHand.tif,test,Ghana_141271
3,Ghana_141271_S1Hand.tif,test,Ghana_141271
4,Ghana_167233_LabelHand.tif,test,Ghana_167233
...,...,...,...
173,USA_761032_S1Hand.tif,valid,USA_761032
174,USA_826217_LabelHand.tif,valid,USA_826217
175,USA_826217_S1Hand.tif,valid,USA_826217
176,USA_986268_LabelHand.tif,valid,USA_986268


In [None]:
instance.count_regions

In [None]:
instance.count_split

In [None]:
# instance.results.to_csv(r'D:\git\flood_exercise\RESULTS\imgs_stats.csv')
# instance.count_split.to_csv(r'D:\git\flood_exercise\RESULTS\count_split.csv')
# instance.count_regions.to_csv(r'D:\git\flood_exercise\RESULTS\count_regions.csv')