# PowerPoint Translation using OpenAI
This notebook demonstrates how to translate PowerPoint presentations using python-pptx and OpenAI's API.

## Import required libraries

In [1]:
import json
from pptx import Presentation
import openai
from typing import List
import os
import warnings

from git_bob._utilities import remove_outer_markdown

In [2]:
# Set your OpenAI API key
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "your-api-key-here"

## Define helper function for OpenAI translation

In [3]:
def prompt_chatgpt(message:str, model="gpt-4o-2024-08-06"):
    """A prompt helper function that sends a message to openAI
    and returns only the text response.
    """
    if isinstance(message, str):
        message = [{"role": "user", "content": message}]
        
    try:
        client = openai.OpenAI()
        response = client.chat.completions.create(
            model=model,
            messages=message
        )
        return response.choices[0].message.content
    except Exception as e:
        warnings.warn(f"OpenAI API error: {str(e)}. Returning original text.")
        return None

## Function to extract text from slides

In [4]:
def extract_text_from_slides(pptx_path: str) -> List[List[dict]]:
    if not os.path.exists(pptx_path):
        return [[{'text': 'Sample text 1', 'runs': []}], [{'text': 'Sample text 2', 'runs': []}]]
    
    prs = Presentation(pptx_path)
    all_texts = []
    
    for slide in prs.slides:
        slide_texts = []
        for shape in slide.shapes:
            if hasattr(shape, "text_frame"):
                shape_data = []
                for paragraph in shape.text_frame.paragraphs:
                    for run in paragraph.runs:
                        shape_data.append(run.text)
                slide_texts.append(shape_data)
            elif hasattr(shape, "text"):
                slide_texts.append(shape.text)
        all_texts.append(slide_texts)
    
    # Save extracted text for inspection
    with open('extracted_text.json', 'w') as f:
        json.dump(all_texts, f, indent=2)
    
    return all_texts

In [5]:
def translate(old_text, new_text):
    if old_text != new_text:
        print(f"Translating '{old_text}' to '{new_text}'")
    return new_text

def update_presentation(pptx_path: str, new_texts: List[List[dict]], output_path: str):
    if not os.path.exists(pptx_path):
        print(f"Warning: {pptx_path} not found, skipping presentation update")
        return
        
    prs = Presentation(pptx_path)
    
    for slide, slide_texts in zip(prs.slides, new_texts):
        text_index = 0
        for shape in slide.shapes:
            if hasattr(shape, "text_frame"):
                text_data = slide_texts[text_index]

                for paragraph in shape.text_frame.paragraphs:
                    for run, run_data in zip(paragraph.runs, text_data):
                        run.text = translate(run.text, run_data)
                text_index += 1
            elif hasattr(shape, "text"):
                shape.text = translate(shape.text, slide_texts[text_index])
                text_index += 1
        
    prs.save(output_path)

In [6]:
def translate_texts_(texts: List[List[dict]], target_language: str = "German") -> List[List[dict]]:
    # Extract just the text for translation
    texts_json = json.dumps(texts)
    prompt = f"Translate the following text elements to {target_language}. Keep all institute names, project names, library names, and technical terms unchanged. Preserve all line breaks and empty lines exactly as they appear. Preserve the JSON array structure exactly. Return only the translated JSON: {texts_json}"
    
    translated_json = remove_outer_markdown(prompt_chatgpt(prompt))

    with open('translated.txt', 'w') as f:
        f.write(translated_json)
        
    return json.loads(translated_json)

def translate_texts(texts: List[List[dict]], target_language: str = "German") -> List[List[dict]]:
    new_texts = []
    for s, text in enumerate(texts):
        texts_json = json.dumps(text)
        print("ORIGINAL:\n", texts_json)
        prompt = f"Translate the following text elements to {target_language}. Keep all institute names, project names, library names, and technical terms unchanged. Preserve all line breaks and empty lines exactly as they appear. Preserve the JSON array structure exactly. Return only the translated JSON: {texts_json}"

        num_new_texts = len(new_texts)

        translated_texts = remove_outer_markdown(prompt_chatgpt(prompt))
        print("TRANSLATED:\n", translated_texts)

        try:
            translated_texts = json.loads(translated_texts)            
        except:
            new_texts.append(text)
            continue

        if len(text) != len(translated_texts):
            translated_texts = text
        elif any(len(orig) != len(trans) for orig, trans in zip(text, translated_texts)):
            
            for i in range(len(text)):
                if len(text[i]) != len(translated_texts[i]):
                    translated_texts[i] = text[i]
            
        new_texts.append(translated_texts)
            
        

    return new_texts

In [7]:
# Example usage
input_pptx = "LLMs_Intro_v23.pptx"
output_pptx = "LLMs_Intro_v23_de.pptx"

# Extract text
texts = extract_text_from_slides(input_pptx)

In [8]:
# Translate
translated_texts = translate_texts(texts)

ORIGINAL:
 [["Large Language Models:", "An Introduction", "Robert Haase", "These slides can be reused under the terms of the ", "CC-BY4.0", " license."], ["CENTER FOR SCALABLE DATA ANALYTICS AND ARTIFICIAL INTELLIGENCE"], [], ["NATIONAL RESEARCH DATA MANAGEMENT INFRASTRUCTURE FOR MICROSCOPY AND BIOIMAGE ANALYSIS"], ["GLOBAL BIOIMAGE ANALYST\u2019S SOCIETY"], ["http://doi.org/10.5281/zenodo.14796429", " "]]
TRANSLATED:
 [["Große Sprachmodelle:", "Eine Einführung", "Robert Haase", "Diese Folien können unter den Bedingungen der ", "CC-BY4.0", " Lizenz wiederverwendet werden."], ["CENTER FOR SCALABLE DATA ANALYTICS AND ARTIFICIAL INTELLIGENCE"], [], ["NATIONAL RESEARCH DATA MANAGEMENT INFRASTRUCTURE FOR MICROSCOPY AND BIOIMAGE ANALYSIS"], ["GLOBAL BIOIMAGE ANALYST\u2019S SOCIETY"], ["http://doi.org/10.5281/zenodo.14796429", " "]]
ORIGINAL:
 [["Neural", " ", "networks", "   "], ["Overview"], ["LLMs   "], ["Prompt ", "engineering"], ["Function", " ", "calling"], ["Agents"], ["Trans-formers"]

In [9]:
#with open('translated_text.json', 'w') as f:
#    json.dump(translated_texts, f, indent=2)

update_presentation(input_pptx, translated_texts, output_pptx)


Translating 'Large Language Models:' to 'Große Sprachmodelle:'
Translating 'An Introduction' to 'Große Sprachmodelle:'
Translating 'Robert Haase' to 'Große Sprachmodelle:'
Translating 'These slides can be reused under the terms of the ' to 'Große Sprachmodelle:'
Translating 'CC-BY4.0' to 'Eine Einführung'
Translating ' license.' to 'Robert Haase'
Translating 'Neural' to 'Neuronale'
Translating 'networks' to 'Neuronale'
Translating '   ' to ' '
Translating 'Overview' to 'Übersicht'
Translating 'engineering' to 'Engineering'
Translating 'Function' to 'Funktions'
Translating 'calling' to 'aufruf'
Translating 'Agents' to 'Agenten'
Translating 'translation' to 'Übersetzung'
Translating 'English' to 'Englisch'
Translating 'German' to 'Deutsch'
Translating 'Neural' to 'Neuronales'
Translating ' network (' to ' Netzwerk ('
Translating 'simplified' to 'vereinfacht'
Translating 'What are large language models good in?' to 'Worin sind große Sprachmodelle gut?'
Translating 'Translation tasks' to '