In [1]:
# Install dependencies
!pip install pandas sk-video

import os
import pandas as pd
import skvideo.io
from PIL import Image
import zipfile
import scipy.io



In [3]:
# Define reusable functions
def extract_zip_files(zip_files, extraction_folder):
    """
    Unpack each ZIP archive in ``zip_files`` into ``extraction_folder``.

    The destination folder is created first if it does not exist. Archives
    are unpacked one after another into the same folder, and a progress line
    is printed for each archive once it has been opened.

    Args:
        zip_files: Iterable of paths to ZIP archives.
        extraction_folder: Directory that receives the contents of every
            archive.

    Returns:
        None. Errors raised while opening or extracting an archive are not
        caught here.
    """

    def _unpack(archive_path):
        # Open first, then announce: a path that cannot be opened raises
        # before any "Extracting" line is printed for it.
        with zipfile.ZipFile(archive_path, mode='r') as archive:
            print(f"Extracting {archive_path}...")
            archive.extractall(path=extraction_folder)

    os.makedirs(extraction_folder, exist_ok=True)
    for archive_path in zip_files:
        _unpack(archive_path)
    print(f"All files extracted to: {extraction_folder}")


def process_csv_files(folder_path, delimiter=';'):
    """
    Load every CSV file found under ``folder_path`` into a pandas DataFrame.

    The folder is searched recursively with ``os.walk``. A file counts as a
    CSV when its name ends in ``.csv``; the comparison is case-insensitive so
    that names such as ``DATA.CSV`` are not silently skipped.

    Args:
        folder_path: Root directory to search.
        delimiter: Field separator handed to ``pandas.read_csv``. Defaults to
            ``';'``.

    Returns:
        A list of ``(file_name, DataFrame)`` tuples, one per CSV file, in the
        order the files are encountered. ``file_name`` is the bare file name,
        not the full path.

    Raises:
        Any exception raised by ``pandas.read_csv`` propagates to the caller;
        no file is skipped on error.
    """
    csv_data = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            # Compare on the lower-cased name so upper/mixed-case extensions
            # are picked up as well.
            if not file.lower().endswith('.csv'):
                continue
            file_path = os.path.join(root, file)
            print(f"Processing CSV file: {file_path}")
            data = pd.read_csv(file_path, delimiter=delimiter)
            csv_data.append((file, data))
    return csv_data


def process_image_files(folder_path):
    """
    Load all valid image files found under ``folder_path``.

    The folder is searched recursively with ``os.walk``. Files whose names
    end in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are opened with
    Pillow. Each image is fully decoded inside a ``with`` block so that the
    underlying file handle is released before the next file is opened;
    otherwise every returned image would keep its file open. Decoding
    eagerly also means a file with damaged pixel data is reported and skipped
    here rather than failing later on first use.

    NOTE: every image is held decoded in memory, so memory use grows with the
    number and size of the images found.

    Args:
        folder_path: Root directory to search.

    Returns:
        A list of ``(file_name, image)`` tuples for the images that loaded
        successfully. ``file_name`` is the bare file name, not the full path.
        Files that fail to open or decode are skipped with a printed message.
    """
    image_data = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            # Lower-case before matching so "IMG_0001.JPG" is not skipped.
            if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            file_path = os.path.join(root, file)
            try:
                print(f"Processing image: {file_path}")
                with Image.open(file_path) as img:
                    # Force the pixel data to be read now; the image stays
                    # usable after the file is closed by the with-block.
                    img.load()
            except Exception as e:
                # Deliberately broad: this is a best-effort scan and one bad
                # file must not abort the whole folder.
                print(f"Skipping invalid image file: {file_path}. Error: {e}")
            else:
                image_data.append((file, img))
    return image_data


def process_video_files(folder_path):
    """
    Collect metadata for every video file found under ``folder_path``.

    The folder is searched recursively with ``os.walk``. Files whose names
    end in ``.mp4`` or ``.avi`` are probed with ``skvideo.io.ffprobe``; the
    extension comparison is case-insensitive so names such as ``CLIP.MP4``
    are not silently skipped.

    Args:
        folder_path: Root directory to search.

    Returns:
        A list of ``(file_name, metadata)`` tuples, where ``metadata`` is
        whatever ``skvideo.io.ffprobe`` returned for that file and
        ``file_name`` is the bare file name, not the full path.

    Raises:
        Any exception raised by ``skvideo.io.ffprobe`` propagates to the
        caller.
    """
    video_metadata = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            # Match on the lower-cased name to accept upper/mixed-case
            # extensions.
            if not file.lower().endswith(('.mp4', '.avi')):
                continue
            file_path = os.path.join(root, file)
            print(f"Processing video: {file_path}")
            metadata = skvideo.io.ffprobe(file_path)
            video_metadata.append((file, metadata))
    return video_metadata


def process_mat_files(folder_path):
    """
    Load every MATLAB ``.mat`` file found under ``folder_path``.

    The folder is searched recursively with ``os.walk``. Files whose names
    end in ``.mat`` are read with ``scipy.io.loadmat``; the extension
    comparison is case-insensitive so names such as ``GT_IMG_1.MAT`` are not
    silently skipped.

    Args:
        folder_path: Root directory to search.

    Returns:
        A list of ``(file_name, contents)`` tuples, where ``contents`` is the
        full dictionary returned by ``scipy.io.loadmat`` (not just its keys)
        and ``file_name`` is the bare file name, not the full path.

    Raises:
        Any exception raised by ``scipy.io.loadmat`` propagates to the
        caller; no file is skipped on error.
    """
    mat_data = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            # Match on the lower-cased name to accept upper/mixed-case
            # extensions.
            if not file.lower().endswith('.mat'):
                continue
            file_path = os.path.join(root, file)
            print(f"Processing MAT file: {file_path}")
            mat_file_data = scipy.io.loadmat(file_path)
            mat_data.append((file, mat_file_data))
    return mat_data


# Define main processing pipeline
def main(zip_files=None, folders=None, extraction_folder=None):
    """
    Run the full pipeline: extract the ZIP archives, then process every folder.

    All three arguments are optional. When one is omitted, the original
    hard-coded location under the project directory is used, so calling
    ``main()`` with no arguments behaves as before.

    Args:
        zip_files: Paths of the ZIP archives to extract. Defaults to the four
            dataset archives in the project directory.
        folders: Additional, already-unpacked folders to process after the
            extracted files. Defaults to the two dataset folders in the
            project directory.
        extraction_folder: Directory the archives are extracted into and
            which is processed first. Defaults to ``extracted_files`` in the
            project directory.

    Returns:
        A dict with the keys ``"csv_data"``, ``"image_data"``,
        ``"video_metadata"`` and ``"mat_data"``. Each value is the list
        returned by the matching ``process_*`` function for the extraction
        folder, extended with the results for each additional folder in
        order.
    """
    # Default project directory; spelled exactly as the folder is named on
    # disk.
    base_dir = r"C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts"

    if zip_files is None:
        zip_files = [
            rf"{base_dir}\ShanghaiTech_Crowd_Counting_Dataset.zip",
            rf"{base_dir}\archive (2).zip",
            rf"{base_dir}\archive (3).zip",
            rf"{base_dir}\released-dataset (1).zip",
        ]
    if folders is None:
        folders = [
            rf"{base_dir}\crowd-11",
            rf"{base_dir}\ChinaVis 2019 Data Challenge-ICMTD Dataset",
        ]
    if extraction_folder is None:
        extraction_folder = rf"{base_dir}\extracted_files"

    # Result key -> function that produces the list stored under that key.
    # The order here is the order in which each folder is scanned.
    processors = (
        ("csv_data", process_csv_files),
        ("image_data", process_image_files),
        ("video_metadata", process_video_files),
        ("mat_data", process_mat_files),
    )
    results = {key: [] for key, _ in processors}

    def _collect(folder):
        # Run every processor on one folder and append to the totals.
        for key, processor in processors:
            results[key].extend(processor(folder))

    # Step 1: Extract all ZIP files
    extract_zip_files(zip_files, extraction_folder)

    # Step 2: Process extracted files
    print("Processing extracted files...")
    _collect(extraction_folder)

    # Step 3: Process additional folders
    for folder in folders:
        print(f"Processing folder: {folder}...")
        _collect(folder)

    # Return processed data for further use
    return results


# Run the main pipeline
if __name__ == "__main__":
    # Run the pipeline with its default paths and keep the result around so
    # it can be inspected afterwards.
    processed_data = main()

    # Summarise how many items were collected under each result key.
    counts = {key: len(items) for key, items in processed_data.items()}
    print(f"Processed {counts['csv_data']} CSV files.")
    print(f"Processed {counts['image_data']} image files.")
    print(f"Processed {counts['video_metadata']} video files.")
    print(f"Processed {counts['mat_data']} MATLAB files.")

Extracting C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts\ShanghaiTech_Crowd_Counting_Dataset.zip...
Extracting C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts\archive (2).zip...
Extracting C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts\archive (3).zip...
Extracting C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts\released-dataset (1).zip...
All files extracted to: C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts\extracted_files
Processing extracted files...
Processing CSV file: C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts\extracted_files\crowds_counting.csv
Processing CSV file: C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts\extracted_files\data.csv
Processing image: C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts\extracted_files\images\0-1000\0.jpg
Processing image: C:\Users\ryanc\OneDrive\Desktop\Managing Crowds at Large evemnts\extracted_files\i