In [1]:
#| default_exp imgs_stats

# Statistics of images

In [2]:
#|hide
from nbdev.showdoc import *

In [9]:
#|export

import rasterio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from tqdm import tqdm
import itertools as it

from flood_exercise import utils_func
from flood_exercise import const_vals as CONST


In [13]:
#|export

class ImgsStatistics():

  def __init__(self,
      path_to_imgs : str , # path to the folder that contains the images
      path_to_split_file : str , # path to the folder that contains the split data
      ):
      
      # get the path to the split files
      self.path_to_split_file = utils_func.load_list_paths(path_to_split_file,filter_file=False)

      # get the tiles paths
      self.list_of_files = utils_func.load_list_paths(path_to_imgs,filter_file = True)
      self.results = self._iterate_tiles_()

      #get the split data 
      self.split_dfs = self._get_split_data_()

  def _get_split_data_(self):

      self.collect_dfs = []
      for split_path in self.path_to_split_file:
         # get the split name
         split_type_str = split_path.split(CONST.SPLIT_TILES_NAMES_STR1)[-1].split(CONST.SPLIT_TILES_NAMES_STR2)[1]

         # open the csv file and add column with the split group
         df = pd.read_csv(split_path,header=None)
         # to be caution - take the two columns and make one column with all the files names
         df =pd.DataFrame(sorted(it.chain(*df.values)))

         df.columns = [CONST.DF_ID_COL_NAME]

         df[CONST.SPLIT_COL_NAME] = split_type_str
         self.collect_dfs.append(df)

         split_dfs = pd.concat(self.collect_dfs)

      return split_dfs


  def _get_region_name_(self,
                        tile_name : str , # path of image , assuming that the first word in the file name is the region name
                        ):
     """
     Takes a string of images from the type "region_id_label.tif and extract the region name.
     for example, for "Bolivia_23014_S2Hand.tif" it will return Bolivia
     Parameters:
     folder_path (str): The path to the folder containing TIFF files.

     Returns:
     string: Name of the region , extracted from the tile name 
     """
     region = tile_name.split(CONST.SPLIT_TILES_NAMES_STR1)[-1].split(CONST.SPLIT_TILES_NAMES_STR2)[0]
     return region

  

  
  def _img_statistics_(self,
                       path : str , # path to image (tif file)
                       ):
     
     with rasterio.open(path) as src:
      
      src_arr = src.read()

      #convert 0 to nan , assuming 0 is no value and we don't want it to interrupt the staitistics
      src_arr  = np.where(src_arr==0, np.nan, src_arr )
      band_names = list(src.descriptions)

      collect_bands_stats = {}

      for band_name , index in zip(band_names,range(0,src.read().shape[0])):
         self.bands_stats_imgs = {CONST.STR_BAND_NAME : [] , CONST.STR_MEAN : [] , CONST.STR_STD : [] }
         arr = src_arr[index,:,:]

         #calculate mean
         mean = np.nanmean(arr)
         
         #calculate std
         std = np.nanstd(arr)

         collect_bands_stats[band_name +'_' + CONST.STR_MEAN] = mean
         collect_bands_stats[band_name +'_' + CONST.STR_STD] = std
         
   
     df_img_stats = pd.DataFrame.from_dict([collect_bands_stats])

     #add names column so we can join between the split type and the image name 
     df_img_stats[CONST.JOIN_COL_NAME] = df_img_stats[CONST.PATH_STR].str.split(CONST.SPLIT_TILES_NAMES_STR1).str[-1].str.split(CONST.SPLIT_TILES_NAMES_STR2).str[:2].str.join(CONST.SPLIT_TILES_NAMES_STR2)


     return df_img_stats


  def _iterate_tiles_(self):
   
   self.collect_info = {CONST.PATH_STR : [] , CONST.REGION_STR : []}
   self.collect_stats = []

   for path in tqdm(self.list_of_files):
      #get the region name using the get_region_name function
      region = self._get_region_name_(path)
      
      #collect the regn name and the path for the final table
      self.collect_info[CONST.PATH_STR].append(path)
      self.collect_info[CONST.REGION_STR].append(region)
         
      #get the image statistics
      df_img_stats = self._img_statistics_(path)
      self.collect_stats.append(df_img_stats)

   #organize datasets
   #organize the region info
   df1 = pd.DataFrame(self.collect_info)
   #ordanize the stats
   df2 = pd.concat(self.collect_stats)
   df2.reset_index(inplace=True)
   #cocatenate
   results = pd.concat([df1,df2],axis=1)

   return results


In [14]:
instance = ImgsStatistics(
  path_to_imgs = r'D:\git\flood_exercise\S2',
  path_to_split_file= r'D:\git\flood_exercise\split\flood_handlabeled' )


  mean = np.nanmean(arr)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
100%|██████████| 399/399 [00:29<00:00, 13.73it/s]


In [16]:
instance.split_dfs

Unnamed: 0,id,split
0,Ghana_1078550_LabelHand.tif,test
1,Ghana_1078550_S1Hand.tif,test
2,Ghana_141271_LabelHand.tif,test
3,Ghana_141271_S1Hand.tif,test
4,Ghana_167233_LabelHand.tif,test
...,...,...
173,USA_761032_S1Hand.tif,valid
174,USA_826217_LabelHand.tif,valid
175,USA_826217_S1Hand.tif,valid
176,USA_986268_LabelHand.tif,valid


In [22]:
instance.results

Unnamed: 0,path,region,index,B1_mean,B1_std,B2_mean,B2_std,B3_mean,B3_std,B4_mean,...,B8A_std,B9_mean,B9_std,B10_mean,B10_std,B11_mean,B11_std,B12_mean,B12_std,name_for_join
0,D:\git\flood_exercise\S2\Nigeria_417184_S2Hand...,Nigeria,0,2045.121361,366.960304,1839.157126,391.249278,1811.085734,379.863595,1753.377412,...,548.933831,953.190210,387.732510,159.752287,74.063205,2718.732305,552.134430,2072.346645,544.698418,"[D:, git, flood_exercise, S2, Nigeria_417184_S..."
1,D:\git\flood_exercise\S2\Mekong_1396181_S2Hand...,Mekong,0,1572.750298,238.176893,1345.034756,289.717047,1332.743843,317.775426,1168.033314,...,966.515705,243.451187,109.609274,37.857872,34.112022,1040.482773,586.654512,626.226967,345.350598,"[D:, git, flood_exercise, S2, Mekong_1396181_S..."
2,D:\git\flood_exercise\S2\Mekong_1191208_S2Hand...,Mekong,0,2013.734406,1151.946099,1772.196365,1256.893949,1716.215126,1234.393483,1531.988964,...,1380.832087,371.368935,187.097016,9.684616,2.510644,2307.633781,1232.076664,1301.098717,1043.084131,"[D:, git, flood_exercise, S2, Mekong_1191208_S..."
3,D:\git\flood_exercise\S2\Mekong_1248200_S2Hand...,Mekong,0,1442.364147,138.868240,1294.289673,217.404070,1417.947430,285.272892,1388.561562,...,885.005315,108.150059,67.019372,5.705055,1.241582,596.679100,549.332932,323.786411,281.757341,"[D:, git, flood_exercise, S2, Mekong_1248200_S..."
4,D:\git\flood_exercise\S2\India_900498_S2Hand.tif,India,0,1489.600925,118.475460,1286.906940,166.706857,1264.285282,187.111246,1204.147636,...,547.865291,102.980587,43.379124,7.087765,1.656915,1049.212807,853.856596,665.331821,590.233659,"[D:, git, flood_exercise, S2, India_900498_S2H..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,D:\git\flood_exercise\S2\USA_741178_S2Hand.tif,USA,0,1335.008938,242.002314,1156.158051,366.925566,1165.635044,378.320440,1097.614441,...,702.106304,963.875763,149.946729,12.606197,1.407922,2408.852219,713.517056,1494.073376,675.028190,"[D:, git, flood_exercise, S2, USA_741178_S2Han..."
395,D:\git\flood_exercise\S2\USA_761032_S2Hand.tif,USA,0,1211.460930,60.073873,957.013645,101.266215,960.577333,103.557143,685.778290,...,879.109970,945.488744,209.771066,10.903004,1.085587,1821.383298,473.622415,931.870198,354.322564,"[D:, git, flood_exercise, S2, USA_761032_S2Han..."
396,D:\git\flood_exercise\S2\USA_955053_S2Hand.tif,USA,0,1277.548512,59.184389,1032.082386,102.587972,992.515831,108.127801,855.138050,...,921.178796,683.315823,231.303370,13.719379,5.158683,1724.935505,637.995502,954.246548,425.050670,"[D:, git, flood_exercise, S2, USA_955053_S2Han..."
397,D:\git\flood_exercise\S2\USA_986268_S2Hand.tif,USA,0,1244.628212,48.008136,994.853779,91.446819,996.123226,104.907798,765.239330,...,622.396975,808.566460,144.699138,10.382317,1.186718,1949.497345,432.971810,1000.108887,277.715240,"[D:, git, flood_exercise, S2, USA_986268_S2Han..."


In [27]:
instance.results['name_for_join'] = instance.results['path'].str.split('\\').str[-1].str.split('_').str[:2].str.join('_')
instance.results

Unnamed: 0,path,region,index,B1_mean,B1_std,B2_mean,B2_std,B3_mean,B3_std,B4_mean,...,B8A_std,B9_mean,B9_std,B10_mean,B10_std,B11_mean,B11_std,B12_mean,B12_std,name_for_join
0,D:\git\flood_exercise\S2\Nigeria_417184_S2Hand...,Nigeria,0,2045.121361,366.960304,1839.157126,391.249278,1811.085734,379.863595,1753.377412,...,548.933831,953.190210,387.732510,159.752287,74.063205,2718.732305,552.134430,2072.346645,544.698418,Nigeria_417184
1,D:\git\flood_exercise\S2\Mekong_1396181_S2Hand...,Mekong,0,1572.750298,238.176893,1345.034756,289.717047,1332.743843,317.775426,1168.033314,...,966.515705,243.451187,109.609274,37.857872,34.112022,1040.482773,586.654512,626.226967,345.350598,Mekong_1396181
2,D:\git\flood_exercise\S2\Mekong_1191208_S2Hand...,Mekong,0,2013.734406,1151.946099,1772.196365,1256.893949,1716.215126,1234.393483,1531.988964,...,1380.832087,371.368935,187.097016,9.684616,2.510644,2307.633781,1232.076664,1301.098717,1043.084131,Mekong_1191208
3,D:\git\flood_exercise\S2\Mekong_1248200_S2Hand...,Mekong,0,1442.364147,138.868240,1294.289673,217.404070,1417.947430,285.272892,1388.561562,...,885.005315,108.150059,67.019372,5.705055,1.241582,596.679100,549.332932,323.786411,281.757341,Mekong_1248200
4,D:\git\flood_exercise\S2\India_900498_S2Hand.tif,India,0,1489.600925,118.475460,1286.906940,166.706857,1264.285282,187.111246,1204.147636,...,547.865291,102.980587,43.379124,7.087765,1.656915,1049.212807,853.856596,665.331821,590.233659,India_900498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,D:\git\flood_exercise\S2\USA_741178_S2Hand.tif,USA,0,1335.008938,242.002314,1156.158051,366.925566,1165.635044,378.320440,1097.614441,...,702.106304,963.875763,149.946729,12.606197,1.407922,2408.852219,713.517056,1494.073376,675.028190,USA_741178
395,D:\git\flood_exercise\S2\USA_761032_S2Hand.tif,USA,0,1211.460930,60.073873,957.013645,101.266215,960.577333,103.557143,685.778290,...,879.109970,945.488744,209.771066,10.903004,1.085587,1821.383298,473.622415,931.870198,354.322564,USA_761032
396,D:\git\flood_exercise\S2\USA_955053_S2Hand.tif,USA,0,1277.548512,59.184389,1032.082386,102.587972,992.515831,108.127801,855.138050,...,921.178796,683.315823,231.303370,13.719379,5.158683,1724.935505,637.995502,954.246548,425.050670,USA_955053
397,D:\git\flood_exercise\S2\USA_986268_S2Hand.tif,USA,0,1244.628212,48.008136,994.853779,91.446819,996.123226,104.907798,765.239330,...,622.396975,808.566460,144.699138,10.382317,1.186718,1949.497345,432.971810,1000.108887,277.715240,USA_986268


In [None]:
instance.results

In [6]:
test = pd.read_csv(r'D:\git\flood_exercise\split\flood_handlabeled\flood_test_data.csv',header=None)
test

Unnamed: 0,0,1
0,Ghana_313799_S1Hand.tif,Ghana_313799_LabelHand.tif
1,Ghana_1078550_S1Hand.tif,Ghana_1078550_LabelHand.tif
2,Ghana_97059_S1Hand.tif,Ghana_97059_LabelHand.tif
3,Ghana_359826_S1Hand.tif,Ghana_359826_LabelHand.tif
4,Ghana_319168_S1Hand.tif,Ghana_319168_LabelHand.tif
...,...,...
85,USA_1049586_S1Hand.tif,USA_1049586_LabelHand.tif
86,USA_595451_S1Hand.tif,USA_595451_LabelHand.tif
87,USA_670826_S1Hand.tif,USA_670826_LabelHand.tif
88,USA_504150_S1Hand.tif,USA_504150_LabelHand.tif


In [8]:
import itertools as it
pd.DataFrame(sorted(it.chain(*test.values)))

Unnamed: 0,0
0,Ghana_1078550_LabelHand.tif
1,Ghana_1078550_S1Hand.tif
2,Ghana_141271_LabelHand.tif
3,Ghana_141271_S1Hand.tif
4,Ghana_167233_LabelHand.tif
...,...
175,USA_778194_S1Hand.tif
176,USA_905409_LabelHand.tif
177,USA_905409_S1Hand.tif
178,USA_933610_LabelHand.tif


In [None]:
instance.split_dfs

In [None]:
instance.split_type_str 

In [None]:
test_list=utils_func.load_list_paths(r'D:\git\flood_exercise\split\flood_handlabeled',
                           filter_file = False)
test_list

In [None]:
for split_path in test_list:
  print(split_path)

In [None]:
split_path.split('\\')[-1].split('_')[1]

In [None]:
def _get_split_data_(self):
     for split_path in self.list_of_files:
      split_type_str = split_path.split(CONST.SPLIT_TILES_NAMES_STR1)[-1].split(CONST.SPLIT_TILES_NAMES_STR2)[1]
      print(split_type_str) 

In [None]:
instance.results