<a href="https://colab.research.google.com/github/skyemk/Batch-Word-PDF-Creator-With-Data-From-CSV/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
from docx import Document
import subprocess

print("Installing LibreOffice...")
# One argument list per apt-get step. Running the steps separately removes the
# need for a shell: `&&` is shell syntax and would be passed to apt-get as a
# literal argument if it appeared inside an argument list.
install_commands = [
    ['sudo', 'apt-get', 'update'],
    ['sudo', 'apt-get', 'install', '-y', 'libreoffice-writer', 'libreoffice-calc'],
]

try:
    # check=True raises on the first failing step, so the install step is
    # skipped when the update fails (the same semantics `&&` provided).
    completed_steps = [
        subprocess.run(step, capture_output=True, text=True, check=True)
        for step in install_commands
    ]
    print("LibreOffice installation output:")
    print("".join(done.stdout for done in completed_steps))
    install_stderr = "".join(done.stderr for done in completed_steps)
    if install_stderr:
        print(f"Installation errors (if any):\n{install_stderr}")
    print("LibreOffice installed successfully.")
except subprocess.CalledProcessError as e:
    # Raised when a step exits with a non-zero status.
    print(f"Error installing LibreOffice: {e}")
    print(f"  stdout: {e.stdout}")
    print(f"  stderr: {e.stderr}")
except Exception as e:
    # Best-effort setup: report anything else (e.g. `sudo` not being
    # available) and let the rest of the script continue.
    print(f"An unexpected error occurred during LibreOffice installation: {e}")

# Make sure there is a CSV to read: an existing file is left untouched,
# otherwise a small four-row sample is written in its place.
file_name = 'data.csv'
if os.path.exists(file_name):
    print(f"'{file_name}' already exists. Skipping creation.")
else:
    sample_data = {
        'NAME': ['Alice', 'Bob', 'Charlie', 'David'],
        'ADDRESS': ['123 Main St', '456 Oak Ave', '789 Pine Ln', '101 Elm Rd'],
        'AMOUNT': [100.50, 200.75, 150.00, 300.25],
        'PDF': [True, True, True, False],
    }
    sample_df = pd.DataFrame(sample_data)
    # index=False keeps the row index out of the file, so the CSV columns are
    # exactly the keys above.
    sample_df.to_csv(file_name, index=False)
    print(f"'{file_name}' created successfully.")

# Read the CSV (pre-existing or freshly written) into a DataFrame
df = pd.read_csv(file_name)

# Show a preview of the loaded data
print("First 5 rows of the DataFrame:")
display(df.head())

# Locations of the Word template and of the two output folders
template_path = 'template.docx'
docx_output_dir = 'output_documents_docx'
pdf_output_dir = 'output_documents_pdf'

# Reuse an existing template; otherwise write a small demonstration template
# whose {{...}} placeholders match the columns of the sample CSV.
if os.path.exists(template_path):
    print(f"Using existing template: '{template_path}'")
else:
    doc = Document()
    doc.add_heading('Personalized Document', level=1)
    for paragraph_text in (
        'Dear {{NAME}},',
        'This document is personalized for you at {{ADDRESS}}.',
        'The amount due is ${{AMOUNT}}.',
    ):
        doc.add_paragraph(paragraph_text)
    doc.save(template_path)
    print(f"Created a dummy template: '{template_path}'")

# Create whichever output folders are missing, reporting each one created
if not os.path.exists(docx_output_dir):
    os.makedirs(docx_output_dir)
    print(f"Created DOCX output directory: '{docx_output_dir}'")

if not os.path.exists(pdf_output_dir):
    os.makedirs(pdf_output_dir)
    print(f"Created PDF output directory: '{pdf_output_dir}'")

def _iter_paragraphs(container):
    """Yield every paragraph of *container*, descending into its tables.

    *container* is anything exposing ``paragraphs`` and ``tables`` (the
    document itself or a table cell), so tables nested inside cells are
    visited as well.
    """
    yield from container.paragraphs
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                yield from _iter_paragraphs(cell)


def _replace_in_paragraph(paragraph, placeholder, replacement):
    """Replace *placeholder* in one paragraph, keeping run formatting if possible."""
    total = paragraph.text.count(placeholder)
    if not total:
        return

    runs = list(paragraph.runs)
    if sum(run.text.count(placeholder) for run in runs) == total:
        # Every occurrence sits inside a single run: edit those runs in place
        # so their character formatting (bold, italic, font, ...) survives.
        for run in runs:
            if placeholder in run.text:
                run.text = run.text.replace(placeholder, replacement)
    else:
        # At least one occurrence is split across runs. Assigning the
        # paragraph text rewrites the paragraph as a single run, which loses
        # run-level formatting but is the only way to replace it here.
        paragraph.text = paragraph.text.replace(placeholder, replacement)


def replace_placeholders(doc_obj, data):
    """Replace ``{{KEY}}`` placeholders in a document object with data.

    Args:
        doc_obj: Document-like object exposing ``paragraphs`` and ``tables``.
            It is modified in place.
        data: Mapping of placeholder name to value. Names and values are
            converted with ``str()``; a key ``NAME`` fills ``{{NAME}}``.

    Body paragraphs are processed first, then paragraphs inside table cells
    (including tables nested in cells). Keys are applied in the mapping's
    iteration order. Returns ``None``.
    """
    replacements = [
        ('{{' + str(key) + '}}', str(value)) for key, value in data.items()
    ]
    for paragraph in _iter_paragraphs(doc_obj):
        for placeholder, replacement in replacements:
            _replace_in_paragraph(paragraph, placeholder, replacement)

# Iterate through each row of the DataFrame and generate one personalized
# document per row: a PDF when the row's 'PDF' value is set, a DOCX otherwise.
print("\nGenerating personalized documents...")
for index, row_data in df.iterrows():
    # Start from a fresh copy of the template for every row
    doc = Document(template_path)

    # Column name -> value for this row; the column names are the placeholder names
    row_dict = row_data.to_dict()

    # Replace placeholders
    replace_placeholders(doc, row_dict)

    # NAME becomes part of the file name. A path separator in it would move
    # the output into another directory (or make the save fail), so replace it.
    safe_name = str(row_dict['NAME']).replace('/', '_').replace('\\', '_')

    # A blank 'PDF' cell is read as NaN, which is truthy; treat it as "no PDF".
    pdf_flag = row_dict['PDF']
    wants_pdf = bool(pdf_flag) and not pd.isna(pdf_flag)

    if wants_pdf:
        # Save a temporary .docx, convert it to PDF, then delete the .docx
        temp_docx_filename = os.path.join(docx_output_dir, f"temp_document_{safe_name}.docx")
        pdf_filename = os.path.join(pdf_output_dir, f"personalized_document_{safe_name}.pdf")
        # LibreOffice names the converted file after the input file's base
        # name, so the PDF first appears under the temporary name and has to
        # be renamed to the final one afterwards.
        converted_pdf_filename = os.path.join(pdf_output_dir, f"temp_document_{safe_name}.pdf")

        doc.save(temp_docx_filename)

        try:
            # --convert-to selects the target format, --outdir the output directory.
            # Note: LibreOffice must be installed and accessible in the system's PATH
            command = [
                'libreoffice',
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', pdf_output_dir,
                temp_docx_filename
            ]
            result = subprocess.run(command, capture_output=True, text=True, check=True)
        except FileNotFoundError:
            # Only the subprocess call is inside the try body, so this can only
            # mean that the 'libreoffice' executable itself was not found.
            print(f"Error: LibreOffice command not found. Please ensure LibreOffice is installed and in your system's PATH to convert '{temp_docx_filename}' to PDF.")
        except subprocess.CalledProcessError as e:
            print(f"Error converting '{temp_docx_filename}' to PDF: {e}")
            print(f"  stdout: {(e.stdout or '').strip()}")
            print(f"  stderr: {(e.stderr or '').strip()}")
        except Exception as e:
            # Keep processing the remaining rows whatever went wrong here
            print(f"An unexpected error occurred during PDF conversion for '{temp_docx_filename}': {e}")
        else:
            if result.stdout: print(f"  stdout: {result.stdout.strip()}")
            if result.stderr: print(f"  stderr: {result.stderr.strip()}")

            if os.path.exists(converted_pdf_filename):
                try:
                    # Give the PDF its final name, then drop the temporary .docx
                    os.replace(converted_pdf_filename, pdf_filename)
                    print(f"Generated PDF: '{pdf_filename}'")
                    os.remove(temp_docx_filename)
                    print(f"Deleted temporary DOCX: '{temp_docx_filename}'")
                except OSError as e:
                    print(f"Error finalizing PDF '{pdf_filename}': {e}")
            else:
                # A zero exit status does not guarantee that a file was written;
                # the temporary .docx is kept so the row's document is not lost.
                print(f"Error: LibreOffice finished without creating '{converted_pdf_filename}'. Keeping '{temp_docx_filename}'.")

    else:
        # Save as .docx directly
        output_docx_filename = os.path.join(docx_output_dir, f"personalized_document_{safe_name}.docx")
        doc.save(output_docx_filename)
        print(f"Generated DOCX: '{output_docx_filename}'")

print("\nDocument generation complete.")