In [1]:
import zipfile
import multiprocessing as mp
import tqdm
import os

import pandas as pd
import matplotlib.pyplot as plt
import h5py
import cv2
import numpy as np 


## Extract data from zipfile

In [2]:
import zipfile
import os

def extract_zip(zip_file_path: str, extract_to_directory: str):
    """
    Extracts a ZIP file to a specified directory if the files do not already exist.

    Parameters:
        zip_file_path (str): Path to the ZIP file.
        extract_to_directory (str): Directory where the ZIP file will be extracted.
    """
    # Check if extraction directory already contains files from the ZIP
    extracted_files = [os.path.join(extract_to_directory, name) for name in zipfile.ZipFile(zip_file_path, 'r').namelist()]
    files_already_exist = all(os.path.exists(file) for file in extracted_files)

    if not files_already_exist:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to_directory)
        print(f"Extracted {zip_file_path} to {extract_to_directory}")
    else:
        print(f"Files already extracted to {extract_to_directory}. Skipping extraction.")

def remove_file(file_path: str):
    """
    Removes a file from the filesystem.

    Parameters:
        file_path (str): Path to the file to be removed.
    """
    os.remove(file_path)
    print(f"Removed {file_path}")

# Paths
zip_file_path = "../data/raw/train-metadata.zip"
extract_to_directory = "../data/raw/"

# Extract the ZIP file if needed
extract_zip(zip_file_path, extract_to_directory)

# Optionally, remove the ZIP file after extraction
remove_file(zip_file_path)


FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/train-metadata.zip'

## Load Metadata

In [None]:
df_metadata = pd.read_csv("../data/raw/train-metadata.csv")
df_metadata.head(10)

In [None]:
def df_stats(df: pd.DataFrame, include_all: bool = False):
    """
    Print statistics and null value counts for a pandas DataFrame.

    Parameters:
        df (pd.DataFrame): The DataFrame to analyze.
        include_all (bool): If True, include all columns in the descriptive statistics; otherwise, include only numeric columns.

    Returns:
        None
    """
    if df.empty:
        print("The DataFrame is empty.")
        return

    # Print descriptive statistics
    print("Descriptive Statistics:")
    if include_all:
        print(df.describe(include='all'))
    else:
        print(df.describe(include=[np.number]))
    print("\n" + "-"*50 + "\n")  # Separator for clarity

    # Print the number of null values per column
    print("Null Value Counts:")
    print(df.isnull().sum())
    print("\n" + "-"*50 + "\n")  # Separator for clarity

    # Additional information: Percentage of null values per column
    print("Percentage of Null Values:")
    print(df.isnull().mean() * 100)
    print("\n" + "-"*50 + "\n")  # Separator for clarity

    # Number of rows and columns
    print(f"Number of rows: {df.shape[0]}")
    print(f"Number of columns: {df.shape[1]}")
    print("\n" + "-"*50 + "\n")  # Separator for clarity

In [None]:
df_stats(df_metadata)

 ## Load Image Byte String

In [None]:
import h5py

def load_image_from_hdf5(isic_id: str,
                         file_path: str = "../data/raw/train-image.hdf5",
                         n_channels: int = 3):
    # Handle the case where the isic_id is passed incorrectly
    if not isic_id.lower().startswith("isic"):
        isic_id = f"ISIC_{int(str(isic_id).split('_', 1)[-1]):>07}"
        
    # Open the HDF5 file in read mode
    with h5py.File(file_path, 'r') as hf:
        
        # Retrieve the image data from the HDF5 dataset using the provided ISIC ID
        try:
            image_data = hf[isic_id][()]
        except KeyError:
            raise KeyError(f"ISIC ID {isic_id} not found in HDF5 file.")

        # Convert the binary data to a numpy array
        image_array = np.frombuffer(image_data, np.uint8)

        # Decode the image from the numpy array
        if n_channels == 3:
            # Load the image as a color image (BGR) and convert to RGB
            image = cv2.cvtColor(cv2.imdecode(image_array, cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
        else:
            # Load the image as a grayscale image
            image = cv2.imdecode(image_array, cv2.IMREAD_GRAYSCALE)

        # If the image failed to load for some reason (problems decoding) ...
        if image is None:
            raise ValueError(f"Could not decode image for ISIC ID: {isic_id}")
        
        return image


In [None]:
plt.figure(figsize=(6,6))
plt.title("ISIC_0015670", fontweight="bold")
plt.imshow(load_image_from_hdf5("ISIC_0015670"))
plt.show()