<a href="https://colab.research.google.com/github/sergioGarcia91/ML_Carolina_Bays/blob/main/02c_Dataset_creation_256x256.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, images of the AOI in TIFF format are loaded and processed by dividing them into fragments of a specific size in pixels. The goal is to generate a structured dataset for training and testing in machine learning models.

# Start

In [None]:
# Install required libraries
!pip install rasterio

Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1 cligj-0.7.2 rasterio-1.4.3


In [None]:
!pip install geopandas

Collecting geopandas
  Downloading geopandas-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Downloading pyogrio-0.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (5.3 kB)
Collecting pyproj>=3.5.0 (from geopandas)
  Downloading pyproj-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)
Collecting shapely>=2.0.0 (from geopandas)
  Downloading shapely-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading geopandas-1.1.0-py3-none-any.whl (338 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m338.0/338.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyogrio-0.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (27.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.8/27.8 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyproj-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━

In [None]:
!pip install pyproj



In [None]:
import numpy as np
import rasterio
import os
import time
import geopandas as gpd
import tarfile
import tempfile
import shutil
import matplotlib.pyplot as plt
import h5py

from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.mask import mask
from rasterio.transform import from_bounds
from rasterio.merge import merge
from pyproj import CRS
from IPython.display import clear_output

In [None]:
# Connect to Google Drive
# Mounts Drive under /content/drive; the dataset paths used in this notebook
# all live below /content/drive/MyDrive. Only available when running on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Get folder

In [None]:
# Folder with the TIF files of the 4 AOIs
path_AOI_folder = '/content/drive/MyDrive/UIS/Doctorado_UIS2198589/1_semestre/TopicosAvanzadosGeofisica/FC_CarolinaBais/Dataset_AOI'

# All TIF files of the 4 AOIs whose name contains 'merge'.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the order of the generated samples reproducible between runs.
files_AOI_all = sorted(file_AOI for file_AOI in os.listdir(path_AOI_folder) if 'merge' in file_AOI)

# Number of files found (displayed as the cell output)
len(files_AOI_all)

420

## For loop AOI

In [None]:
# Only the files whose name contains AOI_filter are tiled in this run.
AOI_filter = 'AOI_01'

files_AOI = [file_AOI for file_AOI in files_AOI_all if AOI_filter in file_AOI]

# Containers for every AOI: image windows (X), mask windows (y) and, for each
# window, the sum of the mask values expressed as a percentage of the window
# pixel count. Only the keys matching the processed files get filled.
dict_AOI = {'AOI_01_X': [],
            'AOI_01_y': [],
            'AOI_01_FC_percentaje': [],
            'AOI_02_X': [],
            'AOI_02_y': [],
            'AOI_02_FC_percentaje': [],
            'AOI_03_X': [],
            'AOI_03_y': [],
            'AOI_03_FC_percentaje': [],
            'AOI_04_X': [],
            'AOI_04_y': [],
            'AOI_04_FC_percentaje': [],}

# Set to True to display every window next to its mask (debugging aid).
plot_img = False

box_pixels = 256                          # window side length, in pixels
int_steps = 4                             # strides per window side
delta_pixels = int(box_pixels/int_steps)  # offset between consecutive windows

total_files = len(files_AOI)
total_files_processed = 1

for file_AOI_name in files_AOI[:]:
  print(f'Processing file {total_files_processed} of {total_files}')
  total_files_processed += 1

  # Keep the progress log short. The condition does not depend on the window
  # indices, so it is evaluated once per file instead of once per window.
  if total_files_processed % 10 == 0:
    clear_output(wait=True)

  path_file_X = os.path.join(path_AOI_folder, file_AOI_name)
  # The mask file name is built from the first 16 characters of the image name.
  y_file = file_AOI_name[0:16] + 'Y_mask.TIF'
  path_file_y = os.path.join(path_AOI_folder, y_file)

  # Read the image that belongs to THIS iteration. The previous code always
  # opened files_AOI[0], pairing every mask with the first image.
  with rasterio.open(path_file_X) as src:
    data_X = src.read()

  with rasterio.open(path_file_y) as src:
    data_y = src.read()

  rows_ = data_X.shape[1]
  cols_ = data_X.shape[2]

  # The first 6 characters of the file name select the dictionary keys.
  key_prefix = file_AOI_name[0:6]

  # NOTE(review): both ranges stop before `size - box_pixels`, so a window that
  # starts exactly at that offset (flush with the bottom/right edge) is never
  # produced. Confirm whether that is intended before changing the bound,
  # because it alters the number of samples.
  for i in range(0, rows_-box_pixels, delta_pixels):
    for j in range(0, cols_-box_pixels, delta_pixels):
      # Slice once and reuse; basic slicing returns views, not copies.
      window_X = data_X[:, i:i+box_pixels, j:j+box_pixels]
      window_y = data_y[0, i:i+box_pixels, j:j+box_pixels]

      y_size = window_y.shape[0] * window_y.shape[1]
      y_fc = window_y.sum()
      percentaje = y_fc * 100 / y_size

      dict_AOI[key_prefix+'_X'].append(window_X)
      dict_AOI[key_prefix+'_y'].append(window_y)
      dict_AOI[key_prefix+'_FC_percentaje'].append(percentaje)

      if plot_img:
        # Bands at index 3, 2, 1 are stacked as the three display channels
        # (presumably red, green, blue; verify against the band order).
        rgb_image = np.dstack((window_X[3], window_X[2], window_X[1]))

        fig, ax = plt.subplots(figsize=(10, 5), nrows=1, ncols=2)
        ax[0].imshow(rgb_image, vmin=0, vmax=0.4)
        ax[1].imshow(window_y)

        ax[0].set_title(f'Size: {y_size}')
        ax[1].set_title(f'FC: {y_fc} - {percentaje:.2f}%')
        plt.show()



Processing file 100 of 105
Processing file 101 of 105
Processing file 102 of 105
Processing file 103 of 105
Processing file 104 of 105
Processing file 105 of 105


## Dict to arrays

In [None]:
# Report which AOI is being converted, followed by a blank separator.
print(AOI_filter)
print('\n')

# Replace every entry of the dictionary by its NumPy array version, printing
# the type before and after the conversion.
for name in list(dict_AOI):
  samples = dict_AOI[name]
  print(type(samples))
  dict_AOI[name] = np.array(samples)
  print(type(dict_AOI[name]))


AOI_01


<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'numpy.ndarray'>


In [None]:
# Show every key held by the dictionary.
dict_AOI.keys()

dict_keys(['AOI_01_X', 'AOI_01_y', 'AOI_01_FC_percentaje', 'AOI_02_X', 'AOI_02_y', 'AOI_02_FC_percentaje', 'AOI_03_X', 'AOI_03_y', 'AOI_03_FC_percentaje', 'AOI_04_X', 'AOI_04_y', 'AOI_04_FC_percentaje'])

In [None]:
# Shape of the array holding all image windows of the selected AOI.
dict_AOI[AOI_filter+'_X'].shape

(15120, 7, 256, 256)

In [None]:
# Shape of the first image window.
dict_AOI[AOI_filter+'_X'][0].shape

(7, 256, 256)

In [None]:
# Shape of the first mask window.
dict_AOI[AOI_filter+'_y'][0].shape

(256, 256)

In [None]:
# Display the first mask window.
dict_AOI[AOI_filter+'_y'][0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Sum of the mask values in the first window.
dict_AOI[AOI_filter+'_y'][0].sum()

np.float32(11853.0)

In [None]:
# Percentage stored for the first window (mask sum * 100 / window pixel count).
dict_AOI[AOI_filter+'_FC_percentaje'][0]

np.float32(18.086243)

# Convert to h5

In [None]:
path_save_h5 = '/content/drive/MyDrive/UIS/Doctorado_UIS2198589/1_semestre/TopicosAvanzadosGeofisica/FC_CarolinaBais/Dataset_h5'

# Create the destination folder when it does not exist yet, so opening the
# file for writing cannot fail on a missing directory.
os.makedirs(path_save_h5, exist_ok=True)

# The window size in the file name comes from box_pixels, so the name cannot
# drift from the size actually used to cut the windows.
h5_file_name = f"dataset_{AOI_filter}_{box_pixels}x{box_pixels}.h5"

# Save to an HDF5 file ("w" overwrites an existing file with the same name)
with h5py.File(os.path.join(path_save_h5, h5_file_name), "w") as h5f:
  for key, value in dict_AOI.items():
    # Only the entries of the AOI processed in this run are written.
    if AOI_filter in key:
      if isinstance(value, str):  # If it's a string, store it as an attribute
        h5f.attrs[key] = value
      else:  # If it's an array, store it as a dataset
        h5f.create_dataset(key, data=value)

print("HDF5 file saved successfully.")

HDF5 file saved successfully.


In [None]:
# List the entries of the HDF5 output folder.
os.listdir(path_save_h5)

['dataset_AOI_01_32x32.h5',
 'dataset_AOI_02_32x32.h5',
 'dataset_AOI_03_32x32.h5',
 'dataset_AOI_04_32x32.h5',
 'dataset_AOI_02_64x64.h5',
 'dataset_AOI_01_64x64.h5',
 'dataset_AOI_03_64x64.h5',
 'dataset_AOI_04_64x64.h5',
 'dataset_AOI_04_256x256.h5',
 'dataset_AOI_03_256x256.h5',
 'dataset_AOI_02_256x256.h5',
 'dataset_AOI_01_256x256.h5']

In [None]:
# List the files of the output folder together with their size
for file_ in os.listdir(path_save_h5):
    file_path = os.path.join(path_save_h5, file_)
    if os.path.isfile(file_path):  # Skip anything that is not a regular file
        size_bytes = os.path.getsize(file_path)
        size_kb = size_bytes / 1024
        size_mb = size_kb / 1024
        size_gb = size_mb / 1024
        # The closing parenthesis now wraps all three converted units.
        print(f"{file_}: {size_bytes} bytes ({size_kb:.2f} KB, {size_mb:.2f} MB, {size_gb:.2f} GB)")


dataset_AOI_01_32x32.h5: 3307264328 bytes (3229750.32 KB, 3154.05 MB), 3.08 GB)
dataset_AOI_02_32x32.h5: 3307264328 bytes (3229750.32 KB, 3154.05 MB), 3.08 GB)
dataset_AOI_03_32x32.h5: 3307264328 bytes (3229750.32 KB, 3154.05 MB), 3.08 GB)
dataset_AOI_04_32x32.h5: 3307264328 bytes (3229750.32 KB, 3154.05 MB), 3.08 GB)
dataset_AOI_02_64x64.h5: 12387062048 bytes (12096740.28 KB, 11813.22 MB), 11.54 GB)
dataset_AOI_01_64x64.h5: 12387062048 bytes (12096740.28 KB, 11813.22 MB), 11.54 GB)
dataset_AOI_03_64x64.h5: 12387062048 bytes (12096740.28 KB, 11813.22 MB), 11.54 GB)
dataset_AOI_04_64x64.h5: 12387062048 bytes (12096740.28 KB, 11813.22 MB), 11.54 GB)
dataset_AOI_04_256x256.h5: 31709000768 bytes (30965821.06 KB, 30240.06 MB), 29.53 GB)
dataset_AOI_03_256x256.h5: 31709000768 bytes (30965821.06 KB, 30240.06 MB), 29.53 GB)
dataset_AOI_02_256x256.h5: 31709000768 bytes (30965821.06 KB, 30240.06 MB), 29.53 GB)
dataset_AOI_01_256x256.h5: 31709000768 bytes (30965821.06 KB, 30240.06 MB), 29.53 GB)


# End