<a href="https://colab.research.google.com/github/sergioGarcia91/ML_Carolina_Bays/blob/main/02b_Dataset_creation_64x64.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, the TIFF images of each area of interest (AOI) are loaded and split into fixed-size tiles (64×64 pixels here). The resulting tiles are stored as a structured dataset for training and testing machine learning models.

# Start

In [None]:
# Install required libraries
# rasterio is used further down to open and read the AOI rasters and masks.
!pip install rasterio



In [None]:
# Install geopandas (imported as gpd in the imports cell).
!pip install geopandas



In [None]:
# Install pyproj (its CRS class is imported in the imports cell).
!pip install pyproj



In [None]:
import numpy as np
import rasterio
import os
import time
import geopandas as gpd
import tarfile
import tempfile
import shutil
import matplotlib.pyplot as plt
import h5py

from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.mask import mask
from rasterio.transform import from_bounds
from rasterio.merge import merge
from pyproj import CRS
from IPython.display import clear_output

In [None]:
# Connect to Google Drive
# The input and output folders used in this notebook live under
# /content/drive/MyDrive, so Drive has to be mounted before they are read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Get folder

In [None]:
# Folder with the TIF files of the 4 AOIs
path_AOI_folder = '/content/drive/MyDrive/UIS/Doctorado_UIS2198589/1_semestre/TopicosAvanzadosGeofisica/FC_CarolinaBais/Dataset_AOI'

# Names of all entries of the folder that contain 'merge' (all 4 AOIs).
# os.listdir returns names in arbitrary order, so they are sorted to make the
# file order, and with it the order of the generated samples, reproducible.
files_AOI_all = sorted(file_AOI for file_AOI in os.listdir(path_AOI_folder) if 'merge' in file_AOI)

len(files_AOI_all)

420

## For loop AOI

In [None]:
# Build the tiles of one AOI: every file whose name contains AOI_filter is
# read together with its mask and cut into box_pixels x box_pixels windows.
AOI_filter = 'AOI_04'

files_AOI = [file_AOI for file_AOI in files_AOI_all if AOI_filter in file_AOI]

# One X / y / FC-percentage list per AOI. Only the lists whose key starts with
# the first 6 characters of the processed file names are filled.
dict_AOI = {'AOI_01_X': [],
            'AOI_01_y': [],
            'AOI_01_FC_percentaje': [],
            'AOI_02_X': [],
            'AOI_02_y': [],
            'AOI_02_FC_percentaje': [],
            'AOI_03_X': [],
            'AOI_03_y': [],
            'AOI_03_FC_percentaje': [],
            'AOI_04_X': [],
            'AOI_04_y': [],
            'AOI_04_FC_percentaje': [],}

plot_img = False  # True: show every tile next to its mask (one figure per tile)

box_pixels = 64  # side length of a tile, in pixels
int_steps = 2  # the window advances box_pixels / int_steps pixels per step
delta_pixels = int(box_pixels/int_steps)

total_files = len(files_AOI)
total_files_processed = 1

for file_AOI_name in files_AOI[:]:
  # Wipe the accumulated progress lines before every tenth file. This runs once
  # per file instead of once per tile; the visible output is the same.
  if total_files_processed % 10 == 0:
    clear_output(wait=True)

  print(f'Processing file {total_files_processed} of {total_files}')
  total_files_processed += 1

  # The mask file shares the first 16 characters of the X file name.
  path_file_X = os.path.join(path_AOI_folder, file_AOI_name)
  y_file = file_AOI_name[0:16] + 'Y_mask.TIF'
  path_file_y = os.path.join(path_AOI_folder, y_file)

  # Read the raster of THIS iteration. Opening files_AOI[0] here would pair the
  # tiles of the first raster with the mask of every other file.
  with rasterio.open(path_file_X) as src:
    data_X = src.read()

  with rasterio.open(path_file_y) as src:
    data_y = src.read()

  rows_ = data_X.shape[1]
  cols_ = data_X.shape[2]

  # A mask smaller than the raster would yield truncated y tiles at the border.
  if data_y.shape[1] < rows_ or data_y.shape[2] < cols_:
    raise ValueError(f'Mask {y_file} with shape {data_y.shape} does not cover '
                     f'{file_AOI_name} with shape {data_X.shape}')

  aoi_key = file_AOI_name[0:6]

  # "+ 1" keeps the last window that still fits completely inside the raster
  # (start index rows_-box_pixels / cols_-box_pixels).
  for i in range(0, rows_-box_pixels+1, delta_pixels):
    for j in range(0, cols_-box_pixels+1, delta_pixels):
      tile_X = data_X[:, i:i+box_pixels, j:j+box_pixels]
      tile_y = data_y[0, i:i+box_pixels, j:j+box_pixels]

      # Share of the tile covered by the mask, in percent.
      y_size = tile_y.size
      y_fc = tile_y.sum()
      percentaje = y_fc * 100 / y_size

      dict_AOI[aoi_key+'_X'].append(tile_X)
      dict_AOI[aoi_key+'_y'].append(tile_y)
      dict_AOI[aoi_key+'_FC_percentaje'].append(percentaje)

      if plot_img:
        # Bands at index 3, 2 and 1 are stacked as the three image channels.
        rgb_image = np.dstack((tile_X[3], tile_X[2], tile_X[1]))

        fig, ax = plt.subplots(figsize=(10, 5), nrows=1, ncols=2)
        ax[0].imshow(rgb_image, vmin=0, vmax=0.4)
        ax[1].imshow(tile_y)

        ax[0].set_title(f'Size: {y_size}')
        ax[1].set_title(f'FC: {y_fc} - {percentaje:.2f}%')
        plt.show()
        plt.close(fig)  # release the figure; one is created per tile



Processing file 100 of 105
Processing file 101 of 105
Processing file 102 of 105
Processing file 103 of 105
Processing file 104 of 105
Processing file 105 of 105


## Dict to arrays

In [None]:
# Replace every list stored in dict_AOI by a NumPy array, printing the type
# of each entry before and after the conversion.
print(AOI_filter)
print('\n')

for name, samples in list(dict_AOI.items()):
  print(type(samples))
  converted = np.array(samples)
  dict_AOI[name] = converted
  print(type(converted))


AOI_04


<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>


In [None]:
# Show the keys held in dict_AOI.
dict_AOI.keys()

dict_keys(['AOI_01_X', 'AOI_01_y', 'AOI_01_FC_percentaje', 'AOI_02_X', 'AOI_02_y', 'AOI_02_FC_percentaje', 'AOI_03_X', 'AOI_03_y', 'AOI_03_FC_percentaje', 'AOI_04_X', 'AOI_04_y', 'AOI_04_FC_percentaje'])

In [None]:
# Shape of the X array of the AOI selected by AOI_filter.
dict_AOI[AOI_filter+'_X'].shape

(94500, 7, 64, 64)

In [None]:
# Shape of the first X sample of the selected AOI.
dict_AOI[AOI_filter+'_X'][0].shape

(7, 64, 64)

In [None]:
# Shape of the first y sample of the selected AOI.
dict_AOI[AOI_filter+'_y'][0].shape

(64, 64)

In [None]:
# Values of the first y sample of the selected AOI.
dict_AOI[AOI_filter+'_y'][0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Sum over the first y sample (the quantity the tiling loop computes as y_fc).
dict_AOI[AOI_filter+'_y'][0].sum()

0.0

In [None]:
# FC percentage stored for the first sample of the selected AOI.
dict_AOI[AOI_filter+'_FC_percentaje'][0]

0.0

# Convert to h5

In [None]:
path_save_h5 = '/content/drive/MyDrive/UIS/Doctorado_UIS2198589/1_semestre/TopicosAvanzadosGeofisica/FC_CarolinaBais/Dataset_h5'

# Create the output folder if it does not exist yet, so opening the file in
# write mode cannot fail on a missing directory.
os.makedirs(path_save_h5, exist_ok=True)

# The tile size in the file name is taken from box_pixels, so the name always
# matches the tiles that were generated (box_pixels = 64 gives "..._64x64.h5").
path_h5_file = f"{path_save_h5}/dataset_{AOI_filter}_{box_pixels}x{box_pixels}.h5"

# Save to an HDF5 file: only the entries whose key contains AOI_filter are
# written; "w" replaces an existing file of the same name.
with h5py.File(path_h5_file, "w") as h5f:
  for key, value in dict_AOI.items():
    if AOI_filter in key:
      if isinstance(value, str):  # If it's a string, store it as an attribute
        h5f.attrs[key] = value
      else:  # If it's an array, store it as a dataset
        h5f.create_dataset(key, data=value)

print("HDF5 file saved successfully.")

HDF5 file saved successfully.


In [None]:
# List the entries of the HDF5 output folder.
os.listdir(path_save_h5)

['dataset_AOI_01_32x32.h5',
 'dataset_AOI_02_32x32.h5',
 'dataset_AOI_03_32x32.h5',
 'dataset_AOI_04_32x32.h5',
 'dataset_AOI_02_64x64.h5',
 'dataset_AOI_01_64x64.h5',
 'dataset_AOI_03_64x64.h5',
 'dataset_AOI_04_64x64.h5']

In [None]:
# Print every file of the HDF5 folder together with its size
for file_ in os.listdir(path_save_h5):
    candidate_path = os.path.join(path_save_h5, file_)
    if not os.path.isfile(candidate_path):
        continue  # only regular files are reported
    size_bytes = os.path.getsize(candidate_path)
    size_kb = size_bytes / 1024
    size_mb = size_kb / 1024
    print(f"{file_}: {size_bytes} bytes ({size_kb:.2f} KB, {size_mb:.2f} MB)")


dataset_AOI_01_32x32.h5: 3307264328 bytes (3229750.32 KB, 3154.05 MB)
dataset_AOI_02_32x32.h5: 3307264328 bytes (3229750.32 KB, 3154.05 MB)
dataset_AOI_03_32x32.h5: 3307264328 bytes (3229750.32 KB, 3154.05 MB)
dataset_AOI_04_32x32.h5: 3307264328 bytes (3229750.32 KB, 3154.05 MB)
dataset_AOI_02_64x64.h5: 12387062048 bytes (12096740.28 KB, 11813.22 MB)
dataset_AOI_01_64x64.h5: 12387062048 bytes (12096740.28 KB, 11813.22 MB)
dataset_AOI_03_64x64.h5: 12387062048 bytes (12096740.28 KB, 11813.22 MB)
dataset_AOI_04_64x64.h5: 12387062048 bytes (12096740.28 KB, 11813.22 MB)


# End