<a href="https://colab.research.google.com/github/stenoe/BDOA/blob/main/Notebooks/BDOA_notebook_Lecture10_HDF5_Data_Format.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## HDF5 with Python

In this lesson we will learn to:

1. Create, Save and Read HDF5 Files
2. Create and Read Groups and Subgroups in HDF5 Files
3. Data Compression in HDF5 Files
4. How to Set and Read Attributes
5. How to Create and Read HDF5 Files using Pandas


### How to Create and Save HDF5 Files

In [None]:
import numpy as np
import h5py
import os

In [None]:
# Two random float matrices with different shapes, used as the payload for the datasets
matrix1 = np.random.random((1000, 1000))
matrix2 = np.random.random((10000, 100))

In [None]:
# Write both matrices into a fresh file; the context manager closes the file on exit
with h5py.File('hdf5_data.h5', 'w') as hdf:
    for dataset_name, values in (('dataset1', matrix1), ('dataset2', matrix2)):
        hdf.create_dataset(dataset_name, data=values)

In [None]:
# Report the on-disk size of the HDF5 file
file_name = "hdf5_data.h5"
file_stats = os.stat(file_name)

# Full stat record first, then st_size (bytes) converted to megabytes (10^6 bytes).
# Divide by (1024 * 1024) instead to get mebibytes.
print(file_stats)
size_in_megabytes = file_stats.st_size / (1000000)
print(f'File Size in MegaBytes is {size_in_megabytes}')

os.stat_result(st_mode=33188, st_ino=4849745, st_dev=47, st_nlink=1, st_uid=0, st_gid=0, st_size=16002048, st_atime=1679824371, st_mtime=1679824371, st_ctime=1679824371)
File Size in MegaBytes is 16.002048


### How to Read HDF5 Files

In [None]:
with h5py.File('hdf5_data.h5', 'r') as hdf:
    # Names stored at the top level of the file
    ls = list(hdf.keys())
    print('List of datasets in this file: \n', ls)

    # Turn dataset1 into a NumPy array while the file is still open,
    # so it remains usable after the `with` block ends
    data = hdf.get('dataset1')
    dataset1 = np.array(data)
    loaded_shape = dataset1.shape
    print('Shape of dataset1: \n', loaded_shape)

List of datasets in this file: 
 ['dataset1', 'dataset2']
Shape of dataset1: 
 (1000, 1000)


In [None]:
# Bare expression: the notebook displays the array that was read from the file
dataset1

array([[0.7419199 , 0.98390043, 0.97278465, ..., 0.42512917, 0.51185969,
        0.05326737],
       [0.27741415, 0.83188397, 0.87653408, ..., 0.30416553, 0.11332442,
        0.61557232],
       [0.62510573, 0.40796712, 0.98848012, ..., 0.3125824 , 0.77586073,
        0.3732902 ],
       ...,
       [0.94960031, 0.56842416, 0.00994517, ..., 0.87602408, 0.72702729,
        0.60330952],
       [0.95238931, 0.21812644, 0.16540855, ..., 0.25717691, 0.06620064,
        0.99601184],
       [0.99778476, 0.03742449, 0.01505733, ..., 0.1536549 , 0.36698687,
        0.82162643]])

In [None]:
# Open read-only inside a context manager so the file handle is released even
# if listing the keys raises. `ls` is a plain list, so it stays usable after
# the file has been closed.
with h5py.File('hdf5_data.h5', 'r') as f:
    ls = list(f.keys())

In [None]:
# Show the key list that was captured before the file was closed
ls

['dataset1', 'dataset2']

### How to Create Groups and Subgroups in HDF5 Files

In [None]:
# Four independent random matrices of the same shape, one per dataset in the grouped file
matrix_shape = (1000, 1000)
matrix1, matrix2, matrix3, matrix4 = (
    np.random.random(size=matrix_shape) for _ in range(4)
)

In [None]:
with h5py.File('hdf5_data_groups.h5', 'w') as hdf:
    # Group1 holds two datasets directly
    G1 = hdf.create_group('Group1')
    for ds_name, ds_values in (('dataset1', matrix1), ('dataset4', matrix4)):
        G1.create_dataset(ds_name, data=ds_values)

    # Subgroups are created by path; 'Group2' itself is never created explicitly
    G21 = hdf.create_group('Group2/SubGroup1')
    G21.create_dataset('dataset3', data=matrix3)

    G22 = hdf.create_group('Group2/SubGroup2')
    G22.create_dataset('dataset2', data=matrix2)

In [None]:
# Display the in-memory matrix that was written as Group1/dataset4
matrix4

array([[0.69459015, 0.77989308, 0.20010063, ..., 0.07965557, 0.22458386,
        0.41731707],
       [0.01529281, 0.38522045, 0.47311903, ..., 0.25826161, 0.72932435,
        0.34225346],
       [0.40110928, 0.19024882, 0.28945091, ..., 0.98931425, 0.02295069,
        0.65087653],
       ...,
       [0.97586871, 0.87121515, 0.61825057, ..., 0.44213861, 0.47217253,
        0.49580442],
       [0.0466027 , 0.729239  , 0.71909629, ..., 0.89688126, 0.80847145,
        0.24989373],
       [0.37651964, 0.973857  , 0.98352964, ..., 0.2375898 , 0.62141366,
        0.31876375]])

### How to Read Groups and Subgroups from HDF5 Files

In [None]:
with h5py.File('hdf5_data_groups.h5', 'r') as hdf:
    # (name, object) pairs found at the root of the file
    base_items = [*hdf.items()]
    print('Items in the base directory:', base_items)

    # Step into Group1 and list its members
    G1 = hdf.get('Group1')
    G1_items = [*G1.items()]
    print('Items in Group1:', G1_items)

    # Turn dataset4 into a NumPy array while the file is still open
    dataset4_handle = G1.get('dataset4')
    dataset4 = np.array(dataset4_handle)
    print(dataset4.shape)
    

Items in the base directory: [('Group1', <HDF5 group "/Group1" (2 members)>), ('Group2', <HDF5 group "/Group2" (2 members)>)]
Items in Group1: [('dataset1', <HDF5 dataset "dataset1": shape (1000, 1000), type "<f8">), ('dataset4', <HDF5 dataset "dataset4": shape (1000, 1000), type "<f8">)]
(1000, 1000)


In [None]:
with h5py.File('hdf5_data_groups.h5', 'r') as hdf:
    # (name, object) pairs found at the root of the file
    base_items = list(hdf.items())
    print('Items in the base directory:', base_items)

    # Step into Group2 and list its subgroups
    G2 = hdf.get('Group2')
    G2_items = list(G2.items())
    print('Items in Group2:', G2_items)

    # Look the subgroup up by its name relative to G2. An absolute path such
    # as '/Group2/SubGroup1' starts with '/', so it is resolved from the file
    # root and the G2 handle would play no part in the lookup.
    G21 = G2.get('SubGroup1')
    G21_items = list(G21.items())
    print('Items in Group21:', G21_items)

    # Turn dataset3 into a NumPy array while the file is still open
    dataset3 = np.array(G21.get('dataset3'))
    print(dataset3.shape)

Items in the base directory: [('Group1', <HDF5 group "/Group1" (2 members)>), ('Group2', <HDF5 group "/Group2" (2 members)>)]
Items in Group2: [('SubGroup1', <HDF5 group "/Group2/SubGroup1" (1 members)>), ('SubGroup2', <HDF5 group "/Group2/SubGroup2" (1 members)>)]
Items in Group21: [('dataset3', <HDF5 dataset "dataset3": shape (1000, 1000), type "<f8">)]
(1000, 1000)


### Data Compression in HDF5 Files

In [None]:
# Same layout as the uncompressed grouped file, but every dataset is written
# with gzip compression and compression_opts=9
gzip_options = dict(compression="gzip", compression_opts=9)

with h5py.File('hdf5_data_groups_compressed.h5', 'w') as hdf:
    G1 = hdf.create_group('Group1')
    G1.create_dataset('dataset1', data=matrix1, **gzip_options)
    G1.create_dataset('dataset4', data=matrix4, **gzip_options)

    G21 = hdf.create_group('Group2/SubGroup1')
    G21.create_dataset('dataset3', data=matrix3, **gzip_options)

    G22 = hdf.create_group('Group2/SubGroup2')
    G22.create_dataset('dataset2', data=matrix2, **gzip_options)

In [None]:
# Look at more information on how to compress it
# help(hdf.create_dataset)

In [None]:
# Compare the on-disk sizes: uncompressed file first, gzip-compressed file second
file_name1, file_name2 = "hdf5_data_groups.h5", "hdf5_data_groups_compressed.h5"

file_stats1 = os.stat(file_name1)
uncompressed_mb = file_stats1.st_size / (1000000)
print(f'File Size in MegaBytes is {uncompressed_mb}')

file_stats2 = os.stat(file_name2)
compressed_mb = file_stats2.st_size / (1000000)
print(f'File Size in MegaBytes is {compressed_mb}')

File Size in MegaBytes is 32.006528
File Size in MegaBytes is 30.265102


### How to Set and Read Attributes

In [None]:
# Fresh random matrices for the attribute example (matrix1 and matrix2 are rebound here)
matrix1 = np.random.random((1000, 1000))
matrix2 = np.random.random((10000, 100))

In [None]:
# Create the HDF5 file. The context manager closes it on exit, including when
# one of the writes below raises, so no explicit hdf.close() is needed.
with h5py.File('test.h5', 'w') as hdf:
    # Create the datasets
    dataset1 = hdf.create_dataset('dataset1', data=matrix1)
    dataset2 = hdf.create_dataset('dataset2', data=matrix2)

    # Set attributes on dataset1 (key/value metadata attached to the dataset)
    dataset1.attrs['CLASS'] = 'DATA MATRIX'
    dataset1.attrs['VERSION'] = '1.1'

In [None]:
# Read the HDF5 file. The context manager closes it on exit, including when
# one of the lookups below raises, so no explicit hdf.close() is needed.
with h5py.File('test.h5', 'r') as hdf:
    ls = list(hdf.keys())
    print('List of datasets in this file: \n', ls)
    data = hdf.get('dataset1')
    dataset1 = np.array(data)
    print('Shape of dataset1: \n', dataset1.shape)

    # Read the attributes: print the first attribute name, the first value by
    # position, and the value looked up through that name
    k = list(data.attrs.keys())
    v = list(data.attrs.values())
    print(k[0])
    print(v[0])
    print(data.attrs[k[0]])


List of datasets in this file: 
 ['dataset1', 'dataset2']
Shape of dataset1: 
 (1000, 1000)
CLASS
DATA MATRIX
DATA MATRIX


### How to Create HDF5 Files using Pandas

In [None]:
import pandas as pd

In [None]:
# Create an HDF5 store, or open it in append mode if the file already exists
file_name1 = "hdf5_pandas.h5"
hdf = pd.HDFStore(file_name1)

# Size of the store file on disk right after opening it
file_stats1 = os.stat(file_name1)
size_in_kilobytes = file_stats1.st_size / (1000)
print(f'File Size in KiloBytes is {size_in_kilobytes}')

File Size in KiloBytes is 669.912


#### Look at the sample_data folder on the left

* There is a file called: `california_housing_test.csv`

* Open it and inspect its contents

In [None]:
# Load the sample CSV from the sample_data folder into a DataFrame
# (evaluate `df1` on its own line to display it)
housing_csv_path = 'sample_data/california_housing_test.csv'
df1 = pd.read_csv(housing_csv_path)

In [None]:
# it is possible to store it in hdf
#help(hdf.put)

In [None]:
# Write df1 into the open store under the key "DF1", using the 'table' format
hdf.put("DF1", df1, format='table', data_columns=True)

In [None]:
# `file_stats1` is a snapshot taken when the store was opened, so its st_size
# never reflects later writes. Flush the still-open store and stat the file
# again to report the current size.
hdf.flush()
print(f'File Size in Bytes is {os.stat(file_name1).st_size}')

File Size in Bytes is 96


In [None]:
# Small hand-written table: one list per column, all of the same length
data = dict(
    city=["Tripoli", "Sydney", "Rome"],
    rank=["1st", "2nd", "3rd"],
    score=[24, 15, 62],
)

In [None]:
# Show the Python type of `data` before it is converted to a DataFrame
type(data)

dict

In [None]:
# Build a DataFrame from the dict, fixing the column order explicitly
column_order = ["city", "rank", "score"]
df2 = pd.DataFrame(data, columns=column_order)

# Bare expression: the notebook renders the frame as a table
df2

Unnamed: 0,city,rank,score
0,Tripoli,1st,24
1,Sydney,2nd,15
2,Rome,3rd,62


In [None]:
# Write df2 into the same store under the key "DF2key", using the 'table' format
hdf.put("DF2key", df2, format='table', data_columns=True)
# Close the store now that both frames have been written
hdf.close()

In [None]:
# `file_stats1` is a snapshot taken when the store was first opened, so it
# would still report the old size. Stat the file again now that the store has
# been closed to report the final size.
print(f'File Size in Bytes is {os.stat(file_name1).st_size}')

File Size in Bytes is 96


In [None]:
import shutil

# Duplicate the finished store under a new name. copyfile returns the
# destination path, which the notebook shows as the cell output.
original, target = 'hdf5_pandas.h5', 'hdf5_pandas_copy.h5'

shutil.copyfile(original, target)

'hdf5_pandas_copy.h5'

### Read the pandas HDF 

In [None]:
# Open the copied store read-only
# NOTE(review): no hdf_new.close() is visible in this notebook — confirm the store is closed once reading is done
hdf_new = pd.HDFStore('hdf5_pandas_copy.h5', mode='r')

In [None]:
# List the keys stored in the copied file
hdf_new.keys()

['/DF1', '/DF2', '/DF2key']

In [None]:
# Read the object stored under the key '/DF1' back from the store
df1_n = hdf_new.get('/DF1')

In [None]:
# Preview only the first rows instead of displaying the whole frame
df1_n.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


## Open your HDF5 file in a web browser visualizer

https://myhdf5.hdfgroup.org/


## Homework Activity

1. Visit the NEON website and read about the project

https://www.neonscience.org/


2. Watch the NEON explanation video

https://www.youtube.com/watch?v=39YrzpxVRF8

3. Do the Tutorial (Open HDF5 files with Python Sample Code)

https://www.neonscience.org/resources/learning-hub/tutorials/hdf5-intro-python


### Extra material (course about HDF5)


**Learn how to use HDF5 (HDF Group)**

https://www.youtube.com/playlist?list=PLPyhR4PdEeGYWHRhzmCP5stzfIha8bqVg

**Another short course on YouTube:**

https://www.youtube.com/playlist?list=PLea0WJq13cnB_ORdGzEkPlZEN20TSt6Lx