# Create Word from PDF

### Install Libraries

In [1]:
!pip install pdf2image
!pip install python-docx
!pip install poppler



### Utility Functions

In [20]:
from PIL import Image
from pdf2image import convert_from_path
import glob 
from pathlib import Path
import shutil, os
from docx import Document
import fnmatch
import re


def find_files_ignore_case(which, where='.'):
    '''List the entries of directory `where` whose names match the shell
       pattern `which`, ignoring letter case.

       Only the immediate contents of `where` are examined (no recursion).
       The returned values are bare entry names as produced by `os.listdir`
       (not joined with `where`), in the order `os.listdir` yields them.'''

    # TODO: recursive param with walk() filtering
    matcher = re.compile(fnmatch.translate(which), re.IGNORECASE)
    matched = []
    for entry in os.listdir(where):
        if matcher.match(entry) is not None:
            matched.append(entry)
    return matched


def crop_image_center(file, crop_left, crop_right, crop_top, crop_bottom):
    '''Crop the image stored at `file` and overwrite the file with the result.

       The box handed to `Image.crop` is
       (crop_left, crop_top,
        width - crop_left - crop_right, height - crop_top - crop_bottom).

       NOTE(review): Pillow documents the crop box as (left, upper, right,
       lower) coordinates, so the last two values here act as the right/lower
       edges even though they are computed like a remaining width/height.
       Callers in this notebook appear to compensate (e.g. a negative
       `crop_right`) - confirm the intended geometry before changing it.'''
    source = Image.open(file)
    width, height = source.size
    right_edge = width - crop_left - crop_right
    lower_edge = height - crop_top - crop_bottom
    cropped = source.crop((crop_left, crop_top, right_edge, lower_edge))
    cropped.save(file)

def create_empty_folder(path):
    '''Create folder `path` (including missing parents) and delete the files
       already inside it.

       Only regular files and symlinks directly inside `path` are removed;
       sub-directories are left untouched. Nothing outside `path` is deleted.
    '''
    folder = Path(path)
    folder.mkdir(parents=True, exist_ok=True)

    # Iterate the folder itself so that the entries being removed are always
    # resolved relative to `path`, never relative to the working directory.
    for entry in folder.iterdir():
        if entry.is_file() or entry.is_symlink():
            entry.unlink()

def convert_pdf_to_images(file):
    '''Render each page of the PDF `file` as a JPEG in a folder named after it.

       The folder path is `file` without its extension; it is prepared by
       `create_empty_folder` before any page is written. Pages are saved as
       Z00001.jpg, Z00002.jpg, ... in page order.

       Returns the folder path that holds the images.
    '''
    # The output directory shares the PDF's path, minus the extension
    target_dir = os.path.splitext(file)[0]
    create_empty_folder(target_dir)

    # convert_from_path returns one image object per page
    pages = convert_from_path(file)
    for page_number, page in enumerate(pages, start=1):
        page_path = os.path.join(target_dir, 'Z{:05}.jpg'.format(page_number))
        page.save(page_path, 'JPEG')

    return target_dir

def get_file_name_prefix(filename, prefix_file='file_name_prefixes.txt'):
    '''Return the first known prefix that `filename` starts with, else None.

       `prefix_file` is a text file with one prefix per line. Lines are
       checked in file order and compared case-insensitively against the
       start of `filename`; surrounding whitespace on each line is ignored.
       The prefix is returned as written in the file (stripped).

       Raises whatever `open` raises if `prefix_file` cannot be read.
    '''
    lowered_name = filename.lower()
    with open(prefix_file) as f:
        for raw_line in f:
            prefix = raw_line.strip()
            # Skip blank lines: the empty string is a prefix of every name,
            # so it would otherwise "match" anything.
            if prefix and lowered_name.startswith(prefix.lower()):
                return prefix
    return None

* Convert each PDF file to images.
* The images from each PDF file are saved in a folder with the same name as the PDF (without the extension).

Get the current folder.

In [2]:
# Absolute path of the directory the notebook is running in
cur_folder = os.getcwd()

### Convert PDFs to Images

In [3]:
# Render every PDF found in the working directory into its own image folder.
# Each PDF's absolute path is printed before it is converted.
for pdf_name in find_files_ignore_case('*.pdf'):
    pdf_path = os.path.join(cur_folder, pdf_name)
    print(pdf_path)
    folder = convert_pdf_to_images(pdf_path)

d:\GoogleDrive\Learn-Python\Typora to Word\O Level E Math Mid-Year Exam Paper 1_QP.pdf
d:\GoogleDrive\Learn-Python\Typora to Word\O Level E Math Mid-Year Exam Paper 2_QP.pdf


### Crop Images

In [6]:
# Crop every file inside each PDF's image folder.
# crop_image_center overwrites the file it is given, so running this cell
# again crops the already-cropped images a second time.
crop_margins = dict(crop_left=190, crop_right=-40, crop_top=100, crop_bottom=50)

for pdf_name in find_files_ignore_case('*.pdf'):
    folder = os.path.splitext(pdf_name)[0]
    print(folder)
    for entry in find_files_ignore_case('*', folder):
        image_file = os.path.join(folder, entry)
        print(image_file)
        crop_image_center(image_file, **crop_margins)


O Level E Math Mid-Year Exam Paper 1_QP
O Level E Math Mid-Year Exam Paper 1_QP\Z00001.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00002.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00003.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00004.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00005.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00006.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00007.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00008.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00009.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00010.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00011.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00012.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00013.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00014.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00015.jpg
O Level E Math Mid-Year Exam Paper 1_QP\Z00016.jpg
O Level E Math Mid-Year Exam Paper 2_QP
O Level E Math Mid-Year Exam Paper 2_QP\Z00001.jpg
O Level E Math Mid-Year Exam Paper 2_QP\Z00002.jpg
O 

### Copy Reference Images (`*.jpg`) into Each PDF's Folder

In [22]:
# Copy the matching *.jpg images from the Reference folder into each PDF's
# image folder. A reference image matches when its name starts with the
# PDF's prefix as returned by get_file_name_prefix.
files = find_files_ignore_case('*.pdf')
for file in files:
    print(file)
    folder = os.path.splitext(file)[0]
    file_prefix = get_file_name_prefix(file)
    print(file_prefix)

    if file_prefix is None:
        # No known prefix for this PDF. Without this guard the pattern below
        # would become the literal 'None*.jpg'.
        continue

    # glob.escape keeps '*', '?' and '[' inside the prefix literal, so only
    # the trailing '*' acts as a wildcard.
    pattern = '{}*.jpg'.format(glob.escape(file_prefix))
    for name in find_files_ignore_case(pattern, 'Reference'):
        shutil.copy(os.path.join('Reference', name), folder)

O Level E Math Mid-Year Exam Paper 1_QP.pdf
'o level e math mid-year exam paper 1_qp.pdf' 'o level e math'
O Level E Math
O Level E Math Mid-Year Exam Paper 2_QP.pdf
'o level e math mid-year exam paper 2_qp.pdf' 'o level e math'
O Level E Math


### Insert Images to Word

In [30]:
# Build one Word document per PDF: start from a matching template in the
# Reference folder when there is one, then append every file in the PDF's
# image folder as a picture. The result is saved as "<pdf name>.docx".
files = find_files_ignore_case('*.pdf')
for file in files:
    folder = os.path.splitext(file)[0]
    word_file = folder + ".docx"

    # Look for a template .docx whose name starts with this PDF's prefix.
    # glob.escape keeps wildcard characters inside the prefix literal.
    file_prefix = get_file_name_prefix(file)
    if file_prefix is None:
        template_files = []
    else:
        template_files = sorted(
            find_files_ignore_case(
                '{}*.docx'.format(glob.escape(file_prefix)), 'Reference'),
            key=str.lower)

    if template_files:
        document = Document(os.path.join('Reference', template_files[0]))
        # The pictures go into a new section appended after the template
        document.add_section()
    else:
        document = Document()

    # Scale each picture to the printable height of the first section
    # (page height minus top and bottom margins).
    section = document.sections[0]
    height = section.page_height - section.top_margin - section.bottom_margin

    # os.listdir gives no ordering guarantee, so sort by name to insert the
    # pictures in a stable order.
    # NOTE(review): presumably the 'Z#####.jpg' page names are meant to sort
    # after the copied reference images - confirm.
    images = sorted(find_files_ignore_case('*', folder), key=str.lower)
    for image_file in images:
        document.add_picture(os.path.join(folder, image_file), height=height)

    document.save(word_file)

'o level e math mid-year exam paper 1_qp.pdf' 'o level e math'
'o level e math mid-year exam paper 2_qp.pdf' 'o level e math'
