### Extracting 10-K Files from SEC Filings ZIP Archive

- This script extracts only 10-K files (excluding 10-K-A and other variations) from a ZIP archive containing SEC filings.   
- It specifically targets files inside the subfolders QTR1, QTR2, QTR3, and QTR4 within the 2024 directory, and saves them in a new folder called 2024_10_K.   
- The script ensures efficient extraction without unzipping unnecessary files. 🚀

#### for 2024 zip folder

In [None]:
import zipfile
import os
import re
from pathlib import Path

# Define root path
root_path = r"D:\Users\saayella\Downloads"  # Ensure this path is correct

# Define ZIP filename and construct full input path
zip_file = "10-X_C_2024.zip"  # Ensure the correct filename
input_path = os.path.join(root_path, zip_file)

# Define output directory (no subfolders like 2024/QTRx)
output_folder = "2024_10_K"
output_path = os.path.join(root_path, output_folder)

# Ensure output folder exists
os.makedirs(output_path, exist_ok=True)

# Define the subfolders inside "2024" to target
input_subfolders = ["QTR1", "QTR2", "QTR3", "QTR4"]

# Function to extract only 10-K files **without subfolders** and count them
def extract_10k_files(zip_path, subfolders, output_dir):
    file_count = 0  # Counter for extracted files

    with zipfile.ZipFile(zip_path, 'r') as z:
        for subfolder in subfolders:
            for file in z.namelist():
                # Check if the file is in a QTR folder and is a 10-K (not 10-K-A)
                if re.search(fr"2022/{subfolder}/.*_10-K_", file) and not re.search(r"10-K-[A-Za-z]", file):  # some folders didn't have {year} > QTR1
                    # Get only the filename (remove folder structure)
                    filename = os.path.basename(file)
                    output_file_path = os.path.join(output_dir, filename)

                    # Extract the file without keeping the subfolder structure
                    with z.open(file) as src, open(output_file_path, "wb") as dest:
                        dest.write(src.read())

                    file_count += 1  # Increment counter
                print(f'# of files extracted = {file_count}')
    print(f"✅ Extraction completed. {file_count} files were saved in '{output_path}'.")

# Run extraction
extract_10k_files(input_path, input_subfolders, output_path)


# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 1
# of files extracted = 2
# of files extracted = 3
# of files extracted = 3
# of files extracted = 3
# of files extracted = 3
# of files extracted = 3
# of files extracted = 3
# of files extracted = 3
# of files extracted = 4
# of files extracted = 5
# of files extracted = 6
# of files extracted = 6
# of files extracted = 6
# of files extracted = 6
# of files extracted = 6
# of files extracted = 6
# of files extracted = 6
# of files extracted = 6
# of files extracted = 6
# of files extracted = 6
# of files extracted = 7
# of files extracted = 7
# of files extracted = 7
# of files extracted = 7
# of files extracted = 7
# of files extracted = 7
# of files extracted = 7
# of files extracted = 7


BadZipFile: Bad magic number for file header

### for 2016-2020 folder

In [4]:
import zipfile
import os
import re
from pathlib import Path

# Define root path
root_path = r"D:\Backup from Dell\Edgar\10KQ from Bill McDonald"

# Define ZIP filename (corrected: all years are in the same ZIP file)
zip_file = "10-X_C_2016-2020.zip"  # Ensure the correct ZIP filename
input_path = os.path.join(root_path, zip_file)

# Define the target year to extract data from
year = "2019"

# Define output directory (no subfolders like 2016/QTRx)
output_folder = f"{year}_10_K"
output_path = os.path.join(root_path, output_folder)

# Ensure output folder exists
os.makedirs(output_path, exist_ok=True)

# Define the subfolders inside the selected year
input_subfolders = ["QTR1", "QTR2", "QTR3", "QTR4"]

# Function to extract only 10-K files **without subfolders** and count them
def extract_10k_files(zip_path, year, subfolders, output_dir):
    file_count = 0  # Counter for extracted files

    with zipfile.ZipFile(zip_path, 'r') as z:
        for subfolder in subfolders:
            for file in z.namelist():
                # Check if the file belongs to the correct year & QTR folder and contains only "10-K"
                if re.search(fr"{year}/{subfolder}/.*_10-K_", file) and not re.search(r"10-K-[A-Za-z]", file):
                    # Get only the filename (remove folder structure)
                    filename = os.path.basename(file)
                    output_file_path = os.path.join(output_dir, filename)

                    # Extract the file without keeping the subfolder structure
                    with z.open(file) as src, open(output_file_path, "wb") as dest:
                        dest.write(src.read())

                    file_count += 1  # Increment counter
                    print(f"# of files extracted = {file_count}")

    print(f"✅ Extraction completed. {file_count} files were saved in '{output_path}'.")

# Run extraction for 2016
extract_10k_files(input_path, year, input_subfolders, output_path)


# of files extracted = 1
# of files extracted = 2
# of files extracted = 3
# of files extracted = 4
# of files extracted = 5
# of files extracted = 6
# of files extracted = 7
# of files extracted = 8
# of files extracted = 9
# of files extracted = 10
# of files extracted = 11
# of files extracted = 12
# of files extracted = 13
# of files extracted = 14
# of files extracted = 15
# of files extracted = 16
# of files extracted = 17
# of files extracted = 18
# of files extracted = 19
# of files extracted = 20
# of files extracted = 21
# of files extracted = 22
# of files extracted = 23
# of files extracted = 24
# of files extracted = 25
# of files extracted = 26
# of files extracted = 27
# of files extracted = 28
# of files extracted = 29
# of files extracted = 30
# of files extracted = 31
# of files extracted = 32
# of files extracted = 33
# of files extracted = 34
# of files extracted = 35
# of files extracted = 36
# of files extracted = 37
# of files extracted = 38
# of files extracted 