In [None]:
import csv
import io
import os
import zipfile

#
# Examples from Google's AI Overview
#

# Example for a csv file
#with open('large_file.csv', 'r') as file:
#    csv_reader = csv.reader(file)
#    for row in csv_reader:
#        # Process each row as it is read
#        print(row)

# Example for csv file inside a zip file
def stream_csv_from_zip(zip_file_path, csv_file_name):
    """Lazily yield the rows of a CSV member stored inside a zip archive.

    The member is decompressed and decoded incrementally through a text
    wrapper, so the whole file is never held in memory at once.

    Args:
        zip_file_path: Path or binary file-like object accepted by
            ``zipfile.ZipFile``.
        csv_file_name: Name of the CSV member inside the archive.

    Yields:
        Each CSV row as a list of strings. The first row yielded is the
        header. An empty member yields nothing.

    Raises:
        KeyError: If ``csv_file_name`` is not a member of the archive
            (raised by ``ZipFile.open``).
    """
    with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
        with zip_file.open(csv_file_name, 'r') as csv_file:
            # newline='' hands raw line endings to the csv module so that
            # newlines embedded in quoted fields survive intact.
            text_stream = io.TextIOWrapper(csv_file, encoding='utf-8', newline='')
            # Header and data rows are yielded alike; delegating to the reader
            # also means an empty file simply ends the generator.
            yield from csv.reader(text_stream)


In [None]:
my_zip_file = '../../data/2013_vast_challenge/orig_files/VAST2013MC3_NetworkFlow.zip'
# Collect the archive's member names first, then print them one per line so
# the CSV of interest can be chosen by name below.
with zipfile.ZipFile(my_zip_file, mode='r') as zip_ref:
    member_names = zip_ref.namelist()
for file in member_names:
    print(file)
my_csv_file = 'nf/nf-chunk1.csv'

In [None]:
if my_csv_file is not None:
    # Count every row (header included) while previewing only the first two.
    row_counts = 0
    for row_counts, row in enumerate(stream_csv_from_zip(my_zip_file, my_csv_file), start=1):
        if row_counts <= 2:
            print(row)  # header and first data row
    print(row_counts)

In [None]:
import json
def processJSON(json_file, zip_ref):
    """Parse a JSON member of an open zip archive and print the result.

    Args:
        json_file: Name of the JSON member inside the archive.
        zip_ref: An open ``zipfile.ZipFile`` that contains ``json_file``.
    """
    with zip_ref.open(json_file, 'r') as member:
        parsed = json.load(member)
    print(f'json = {parsed}')

def processCSV(csv_file, zip_ref):
    """Print every data row of a CSV member in an open zip archive.

    The first row is treated as a header and is consumed without being
    printed. The member is decoded incrementally rather than read into
    memory in one piece.

    Args:
        csv_file: Name of the CSV member inside the archive.
        zip_ref: An open ``zipfile.ZipFile`` that contains ``csv_file``.
    """
    with zip_ref.open(csv_file, 'r') as f:
        # newline='' hands raw line endings to the csv module so that
        # newlines embedded in quoted fields survive intact.
        text_stream = io.TextIOWrapper(f, encoding='utf-8', newline='')
        csv_reader = csv.reader(text_stream)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration and aborting the surrounding archive walk.
        next(csv_reader, None)
        for row in csv_reader:
            print(row)

def processZipStream(zip_file, parent_zip=None, indent=0):
    """Walk a zip archive, printing member names and processing known types.

    ``.json`` members go to ``processJSON``, ``.csv`` members go to
    ``processCSV``, and ``.zip`` members are opened in place and walked
    recursively. Members with any other extension are only listed.

    Args:
        zip_file: Path or file-like object of the archive when
            ``parent_zip`` is None; otherwise the member name of a nested
            archive inside ``parent_zip``.
        parent_zip: Open ``zipfile.ZipFile`` containing ``zip_file``, or
            None for a top-level archive.
        indent: Number of spaces printed before member names of a nested
            archive; increased by one for each nesting level.
    """
    def _dispatch(member_name, archive):
        # Route one member to the handler that matches its extension.
        if member_name.endswith('.json'):
            processJSON(member_name, archive)
        elif member_name.endswith('.csv'):
            processCSV(member_name, archive)
        elif member_name.endswith('.zip'):
            processZipStream(member_name, archive, indent + 1)

    if parent_zip is not None:
        # Nested archive: open it straight out of the parent, without
        # extracting it to disk.
        with parent_zip.open(zip_file, 'r') as nested_stream:
            with zipfile.ZipFile(nested_stream, 'r') as inner_zip:
                for entry in inner_zip.infolist():
                    print(' ' * indent + entry.filename)
                    _dispatch(entry.filename, inner_zip)
        return

    # Top-level archive: each member gets a banner before it is processed.
    with zipfile.ZipFile(zip_file, 'r') as outer_zip:
        for member_name in outer_zip.namelist():
            print(f'**\n** {member_name}\n**')
            _dispatch(member_name, outer_zip)

# Walk the sample archive, starting at the top level (no parent archive).
processZipStream(zip_file='../../tmp/fourth.zip')