In [1]:
import os
import subprocess
from tqdm.notebook import tqdm
from typing import Dict, List, Tuple

In [None]:

def get_file_info(file_path: str) -> Tuple[int, int, str]:
    """
    Retrieve information about a file including the number of lines, size on disk, and the header line.
    Args:
        file_path (str): The path to the file.
    Returns:
        Tuple[int, int, str]: A tuple containing:
            - The number of lines in the file (int).
            - The size of the file on disk in bytes (int).
            - The header line of the file (str).
    """
    try:
        # Get number of lines in file with linux command wc (word count)
        result = subprocess.run(['wc', '-l', file_path], stdout=subprocess.PIPE, text=True)
        num_lines = int(result.stdout.split()[0])
    except FileNotFoundError:
        num_lines = -1
    
    # Get header of file
    with open(file_path, 'r') as file:
        header = file.readline().strip()
    
    # Get size of file on disk
    size_on_disk = os.path.getsize(file_path)
    
    return num_lines, size_on_disk, header

def parse_folder(folder_path: str) -> Dict[str, List[Dict[str, str]]]:
    """
    Parses a given folder and collects information about files in its subfolders.
    Args:
        folder_path (str): The path to the folder to be parsed.
    Returns:
        Dict[str, List[Dict[str, str]]]: A dictionary where each key is a subfolder name and the value is a list of dictionaries,
                                         each containing information about a file in that subfolder. The file information includes:
                                         - 'file_name': The name of the file.
                                         - 'num_lines': The number of lines in the file.
                                         - 'size_on_disk': The size of the file on disk.
                                         - 'header': The header of the file.
    """
    folder_info: Dict[str, List[Dict[str, str]]] = {}
    
    for root, dirs, files in os.walk(folder_path):
        for subfolder in tqdm(dirs, desc="Processing subfolders", leave=False):
            subfolder_path = os.path.join(root, subfolder)
            folder_info[subfolder] = []
            for file_name in (pbar := tqdm(os.listdir(subfolder_path), desc=f"Processing files in {subfolder}", leave=False)):
                if file_name == '.gitkeep':
                    continue
                
                pbar.set_postfix_str(f"file: '{file_name}'", refresh=True)
                
                file_path = os.path.join(subfolder_path, file_name)
                if os.path.isfile(file_path):
                    num_lines, size_on_disk, header = get_file_info(file_path)
                    folder_info[subfolder].append({
                        'file_name': file_name,
                        'num_lines': num_lines,
                        'size_on_disk': size_on_disk,
                        'header': header
                    })
    return folder_info

def print_file_info(folder_info: Dict[str, List[Dict[str, str]]], folder_path: str, print_header: bool = False) -> None:
    """
    Prints information about files in a given folder structure.

    Args:
        folder_info (Dict[str, List[Dict[str, str]]]): A dictionary where keys are subfolder names and values are lists of dictionaries containing file information.
            Each file information dictionary should have the following keys:
                - 'file_name': The name of the file (str).
                - 'num_lines': The number of lines in the file (str).
                - 'size_on_disk': The size of the file on disk in bytes (str).
                - 'header': The header of the file (str).
        folder_path (str): The path to the main folder containing the subfolders.
        print_header (bool, optional): If True, prints the header of each file. Defaults to False.

    Returns:
        None
    """
    for subfolder, files in folder_info.items():
        if files:  # Only print subfolders that contain files
            subfolder_path = os.path.join(folder_path, subfolder)
            print(f"Path: {subfolder_path}")
            for file_info in files:
                num_lines_formatted = f"{file_info['num_lines']:,}"
                size_in_mb = file_info['size_on_disk'] / (1024 * 1024)
                size_in_mb_formatted = f"{size_in_mb:,.2f} MB"
                print(f"  File: {file_info['file_name']}")
                print(f"    Lines: {num_lines_formatted} ({size_in_mb_formatted})")
                if print_header:
                    print(f"    Header: {file_info['header']}")

In [3]:
# Collect data
folder_path = "../data"
folder_info = parse_folder(folder_path)

Processing subfolders:   0%|          | 0/3 [00:00<?, ?it/s]

Processing files in interim:   0%|          | 0/2 [00:00<?, ?it/s]

Processing files in processed:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in raw:   0%|          | 0/4 [00:00<?, ?it/s]

Processing subfolders: 0it [00:00, ?it/s]

Processing subfolders: 0it [00:00, ?it/s]

Processing subfolders:   0%|          | 0/3 [00:00<?, ?it/s]

Processing files in covid19-dataset:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in data.cdc.gov:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in arashnic_covid19-case-surveillance-public-use-dataset:   0%|          | 0/2 [00:00<?, ?it/…

Processing subfolders: 0it [00:00, ?it/s]

Processing subfolders: 0it [00:00, ?it/s]

Processing subfolders:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in Covid-19_Ver2:   0%|          | 0/1 [00:00<?, ?it/s]

Processing subfolders: 0it [00:00, ?it/s]

In [6]:
print_file_info(folder_info, folder_path, print_header=False)

Path: ../data/interim
  File: covid-data-clean.csv
    Lines: 1,048,576 (108.18 MB)
Path: ../data/covid19-dataset
  File: Covid Data.csv
    Lines: 1,048,576 (55.74 MB)
Path: ../data/data.cdc.gov
  File: COVID-19_Case_Surveillance_Public_Use_Data_with_Geography_20241023.csv
    Lines: 106,219,501 (14,344.08 MB)
Path: ../data/arashnic_covid19-case-surveillance-public-use-dataset
  File: COVID-19_Case_Surveillance_Public_Use_Data.csv
    Lines: 8,405,080 (872.05 MB)
Path: ../data/Covid-19_Ver2
  File: Covid-19_Ver2
    Lines: 8,405,080 (1,671.83 MB)
