In [32]:
from pptx import Presentation
import json

import os

FOLDER = '/Users/sahuguet/Downloads/PPTX'
pptx_files = [os.path.join(FOLDER, file) for file in os.listdir(FOLDER) if file.endswith('.pptx')]

In [49]:

def process_pptx_file(f):
    print(f'Processing {f} ...')
    # Creating folder
    folder_name = f'{f}_d'
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
        print(f'Folder created: {folder_name}')

    prs = Presentation(f)

    core_properties = prs.core_properties
    metadata = {
        'author': core_properties.author,
        'category': core_properties.category,
        'comments': core_properties.comments,
        'content_status': core_properties.content_status,
        'created': str(core_properties.created),
        'identifier': core_properties.identifier,
        'keywords': core_properties.keywords,
        'language': core_properties.language,
        'last_modified_by': str(core_properties.last_modified_by),
        'last_printed': str(core_properties.last_printed),
        'modified': str(core_properties.modified),
        'revision': str(core_properties.revision),
        'subject': core_properties.subject,
        'title': core_properties.title,
        'version': core_properties.version
    }
    print(metadata)
    
    with open(os.path.join(folder_name, 'metadata.json'), 'w') as file:
        json.dump(metadata, file, indent=True)

    # We process all text content into `content.txt`.
    all_text = []
    image_counter = 0
    for index, slide in enumerate(prs.slides):
        all_text.append(f'\n=== Slide {str(index+1).zfill(3)} ===')
        for shape in slide.shapes:
            if shape.shape_type == 13:  # shape_type 13 corresponds to picture
                image_counter += 1
                image = shape.image
                image_bytes = image.blob
                image_ext = image.ext
                image_filename = f'image{str(image_counter).zfill(3)}.{image_ext}'
                image_path = os.path.join(folder_name, image_filename)
                
                # Save the image
                with open(image_path, 'wb') as img_file:
                    img_file.write(image_bytes)
                continue

            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    all_text.append(run.text)
    with open(os.path.join(folder_name, 'content.txt'), 'w') as file:
        file.write('\n'.join(all_text))


In [51]:
for file in pptx_files:
    process_pptx_file(file)

Processing /Users/sahuguet/Downloads/PPTX/Untitled presentation.pptx ...
{'author': '', 'category': '', 'comments': '', 'content_status': '', 'created': 'None', 'identifier': '', 'keywords': '', 'language': '', 'last_modified_by': 'python-pptx', 'last_printed': 'None', 'modified': '2024-04-08 03:40:52', 'revision': '1', 'subject': '', 'title': 'PowerPoint Presentation', 'version': ''}
Processing /Users/sahuguet/Downloads/PPTX/Child Not Found 404.pptx ...
{'author': '', 'category': '', 'comments': '', 'content_status': '', 'created': 'None', 'identifier': '', 'keywords': '', 'language': '', 'last_modified_by': 'python-pptx', 'last_printed': 'None', 'modified': '2024-04-08 03:40:52', 'revision': '1', 'subject': '', 'title': 'PowerPoint Presentation', 'version': ''}


{'author': '', 'category': '', 'comments': '', 'content_status': '', 'created': 'None', 'identifier': '', 'keywords': '', 'language': '', 'last_modified_by': 'python-pptx', 'last_printed': 'None', 'modified': '2024-04-08 03:41:07', 'revision': '1', 'subject': '', 'title': 'PowerPoint Presentation', 'version': ''}
Processing /Users/sahuguet/Downloads/PPTX/Affordable Housing Tech.pptx ...
{'author': '', 'category': '', 'comments': '', 'content_status': '', 'created': 'None', 'identifier': '', 'keywords': '', 'language': '', 'last_modified_by': 'python-pptx', 'last_printed': 'None', 'modified': '2024-04-08 03:41:08', 'revision': '1', 'subject': '', 'title': 'PowerPoint Presentation', 'version': ''}
