### Extracting Text From Pdfs

In [7]:
import PyPDF2 as pypdf2
pdf_file = open('meetingminutes.pdf', 'rb')
pdf_reader = pypdf2.PdfFileReader(pdf_file)
pdf_reader.numPages

19

In [8]:
page = pdf_reader.getPage(0)
page.extractText()

'OOFFFFIICCIIAALL  BBOOAARRDD  MMIINNUUTTEESS   Meeting of \nMarch 7\n, 2014\n        \n     The Board of Elementary and Secondary Education shall provide leadership and \ncreate policies for education that expand opportunities for children, empower \nfamilies and communities, and advance Louisiana in an increasingly \ncompetitive glob\nal market.\n BOARD \n of ELEMENTARY\n and \n SECONDARY\n EDUCATION\n  '

In [11]:
pdf_reader.getPage(18).extractText()

'BOARD MINUTES\n -18- MARCH 7, 2014\n    Amendments for BESE\n-Authorized Charter Schools\n; §1905. Non\n-Material \nAmendments for BESE\n-Authorized Charter\n Schools\n; §2301. State \nFunding\n; §2303. Federal Funding\n; §2713. At\n-Risk Students\n; §2907.  \nLeave of Absence\n; and §2909. Employee Benefits, as presented by the \nLDE.\n  Dr. Beebe and Ms. Hill were recorded as being opposed to the motion.\n  Agenda\n Item 6.\n Board A\ndvisory Council Reports\n   Agenda\n Item 6.1.\n Nonpublic School Council\n (Schedule 6)\n  On motion of Dr. Miranti, seconded by Mr. Guillot, the Board received the \nminutes of the Nonpublic School Council meeting held February 4, 2014, \nand approved the tentative a\ngenda for March 25, 2014.\n  Agenda\n Item 6.2.\n Superintendents™ Advisory Council\n (Schedule 7)\n  On motion of \nDr. Miranti,\n seconded by \nMr. Guillot,\n the Board received the \nminutes of the\n Superintendents™ Advisory Council meeting held \nFebruary\n 13, 2014, and appr\noved

### Decrypting PDFs

In [16]:
pdf_reader = pypdf2.PdfFileReader(open('encrypted.pdf', 'rb'))
pdf_reader.isEncrypted

True

In [17]:
pdf_reader.decrypt('rosebud')

1

In [20]:
page = pdf_reader.getPage(0)
page.extractText()

'OOFFFFIICCIIAALL  BBOOAARRDD  MMIINNUUTTEESS   Meeting of \nMarch 7\n, 2014\n        \n     The Board of Elementary and Secondary Education shall provide leadership and \ncreate policies for education that expand opportunities for children, empower \nfamilies and communities, and advance Louisiana in an increasingly \ncompetitive glob\nal market.\n BOARD \n of ELEMENTARY\n and \n SECONDARY\n EDUCATION\n  '

### Copying PDFs

In [21]:
pdf_1 = open('meetingminutes.pdf', 'rb')
pdf_2 = open('meetingminutes2.pdf', 'rb')
pdf_1_reader = pypdf2.PdfFileReader(pdf_1)
pdf_2_reader = pypdf2.PdfFileReader(pdf_2)
pdf_writer = pypdf2.PdfFileWriter()

for page_num in range(pdf_1_reader.numPages):
    page = pdf_1_reader.getPage(page_num)
    pdf_writer.addPage(page)
    
for page_num in range(pdf_2_reader.numPages):
    page = pdf_2_reader.getPage(page_num)
    pdf_writer.addPage(page)
    
pdf_output = open('combined_minutes.pdf', 'wb')
pdf_writer.write(pdf_output)
pdf_output.close
pdf_1.close()
pdf_2.close()

### Rotating Pages

In [24]:
minutesFile = open('meetingminutes.pdf', 'rb')
pdfReader = pypdf2.PdfFileReader(minutesFile)
page = pdfReader.getPage(0)
page.rotateClockwise(90)

pdfWriter = pypdf2.PdfFileWriter()
pdfWriter.addPage(page)
resultPdfFile = open('rotatedPage.pdf', 'wb')
pdfWriter.write(resultPdfFile)
resultPdfFile.close()
minutesFile.close()

## Project: Combining Select Pages from Many PDFs

In [6]:
# %load combine_pdfs.py
#!/usr/bin/env python3
"""
Created on Sat Aug 24 15:47:49 2019

@author: Soo Hyeon Kim
- Find all pdf files in the current working directory
- sort the filenames so the pdfs are added in order
- write an each page, excluding the first page, of each pdf to the output file.
"""

import PyPDF2 as pypdf2
import os

# Get all the PDF filenames
pdf_files = []
for filename in os.listdir('.'):
    if filename.endswith('.pdf'):
        pdf_files.append(filename)
        
# sort in alphabetical order
pdf_files.sort(key=str.lower)

# to write pdf that merged all pdfs
pdf_writer = pypdf2.PdfFileWriter()

# TODO: loop through all the pdf files
for filename in pdf_files:
    pdf = open(filename, 'rb')
    pdf_reader = pypdf2.PdfFileReader(pdf)

    # TODO: loop through all the pages (except the first) and add them.
    for page_num in range(1, pdf_reader.numPages):
        page = pdf_reader.getPage(page_num)
        pdf_writer.addPage(page)

# TODO: save the resulting PDF to a file.
save_name = 'allminutes.pdf'
with open(save_name, 'wb') as pdf_output:
    pdf_writer.write(pdf_output)

print("Done. {} is created for your sake".format(save_name))

Done. allminutes.pdf is created for your sake


# Word Documents
### Reading Word Documents

In [13]:
import docx
doc = docx.Document('demo.docx')
len(doc.paragraphs)

7

In [17]:
doc.paragraphs[0].text

'Document Title'

In [18]:
doc.paragraphs[1].text

'A plain paragraph with some bold and some italic'

In [19]:
len(doc.paragraphs[1].runs)

5

In [20]:
for run in doc.paragraphs[1].runs:
    print(run.text)

A plain paragraph with
 some 
bold
 and some 
italic


### Getting the full text from a .docx file

In [27]:
# %load read_docx.py
#!/usr/bin/env python3
"""
Created on Sat Aug 24 16:44:35 2019

@author: Soo Hyeon Kim
"""

import docx

def get_text(filename):
    doc = docx.Document(filename)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)

In [33]:
print(get_text('demo.docx'))

Document Title
A plain paragraph with some bold and some italic
Heading, level 1
Intense quote
first item in unordered list
first item in ordered list




### Creating Word Documents with Nondefault Styles

In [34]:
doc = docx.Document('demo.docx')
doc.paragraphs[0].style

_ParagraphStyle('Title') id: 140214893076264

In [35]:
doc.paragraphs[0].style = 'Normal'

In [36]:
doc.paragraphs[1].text

'A plain paragraph with some bold and some italic'

In [39]:
doc.paragraphs[1].runs[0].style = 'QuoteChar'
doc.paragraphs[1].runs[1].underline = True
doc.paragraphs[1].runs[3].underline = True
doc.save('restyled.docx')



### Writing word documents

In [40]:
doc = docx.Document()
doc.add_paragraph('Hello World!')

<docx.text.paragraph.Paragraph at 0x7f8648444828>

In [41]:
doc.save('helloworld.docx')

# Practice Projects

## PDF Paranoia

### pdf encrypter

In [56]:
# %load pdf_encrypter.py
#!/usr/bin/env python3
"""
Created on Sat Aug 24 22:22:21 2019

@author: Soo Hyeon Kim
Finds all pdfs in a folder and encrypt th epdfs using a password provided 
on the command line with '_encrypted.pdf' suffix added to the original file. 
Get rid of original files in the end. 
"""

import os, sys
import PyPDF2 as pypdf2

try:
    password = sys.arg[1].strip()
except:
    password = input("You haven't provided password. What is it? ").strip()


while True:
        root = input("What location do you want to search through?").strip()
        root = os.path.abspath(root)
        
        if os.path.exists(root): # make sure it is legitimate
            break
        else:
            print("Please input root directory in relative or absolute path")



encrypt_failed = [] # in case failure

for folder, subfolders, filenames in os.walk(root):
    
    for filename in filenames:
        if filename.endswith('.pdf'):
            path = os.path.join(folder, filename)
            path_ = open(path, 'rb')
            try:
                pdf_reader = pypdf2.PdfFileReader(path_)
            except:
                continue
            
            # file shouldn't be encrypted to be read
            if not pdf_reader.isEncrypted:
                pdf_writer = pypdf2.PdfFileWriter()
                for page_num in range(pdf_reader.numPages):
                    pdf_writer.addPage(pdf_reader.getPage(page_num))
                
                # Encrypt copy of pdf and save with _encrypted suffix
                pdf_writer.encrypt(password)
                encrypted_path = path[:-4] + '_encrypted.pdf'
                encrypted_version = open(encrypted_path, 'wb')
                
                pdf_writer.write(encrypted_version)
                encrypted_version.close()
                
                # check file was encrypted properly
                encrypted_path_ = open(encrypted_path, 'rb')
                pdf_reader2 = pypdf2.PdfFileReader(encrypted_path_)
                if (pdf_reader2.isEncrypted) \
                        and (pdf_reader2.decrypt(password)):
                    os.remove(path)
                else:
                    encrypt_failed.append(filename)
                
            path_.close()
            
if encrypt_failed:
    print("The following files failed their encryption checks and were"
          " not deleted: ")
    for filename in encrypt_failed:
        print(filename)
else:
    print("All PDF's in the folder tree have been successfully encrypted.")
    print("Original files are deleted")

You haven't provided password. What is it? peachtree123
What location do you want to search through?.
All PDF's in the folder tree have been successfully encrypted.
Original files are deleted


### Pdf decrypter

In [58]:
# %load pdf_decrypter.py
#!/usr/bin/env python3
"""
Created on Sat Aug 24 23:41:21 2019

@author: Soo Hyeon Kim
Finds all pdfs in a folder and deencrypt th epdfs using a password provided 
on the command line with '_decrypted.pdf' suffix removed to the original file. 
Get rid of original files in the end. 
"""

import os, sys
import PyPDF2 as pypdf2

try:
    password = sys.arg[1].strip()
except:
    password = input("You haven't provided password. What is it? ").strip()


while True:
        root = input("What location do you want to search through?").strip()
        root = os.path.abspath(root)
        
        if os.path.exists(root): # make sure it is legitimate
            break
        else:
            print("Please input root directory in relative or absolute path")



decrypt_failed = [] # in case failure

for folder, subfolders, filenames in os.walk(root):
    
    for filename in filenames:
        if filename.endswith('.pdf'):
            path = os.path.join(folder, filename)
            path_ = open(path, 'rb')
            try:
                pdf_reader = pypdf2.PdfFileReader(path_)
            except:
                continue

            if pdf_reader.isEncrypted is True:
                if not pdf_reader.decrypt(password):
                    print(filename + ' failed to decrypt.')
                    decrypt_failed.append(filename)
                else:
                    pdf_writer = pypdf2.PdfFileWriter()
                    for page_num in range(pdf_reader.numPages):
                        pdf_writer.addPage(pdf_reader.getPage(page_num))

                    # Encrypt copy of PDF and save with _encrypted suffix
                    decrypted_path = path[:-4] + '_decrpyted.pdf'
                    decrypted_version = open(decrypted_path, 'wb')
                    pdf_writer.write(decrypted_version)
                    decrypted_version.close()

if decrypt_failed != []:
    print("All encrypted PDF's, except those listed above, were "
          "decrypted successfully. All of the original files have been kept.")
else:
    print("All encrypted PDF's in the folder tree were decrypted successfully. "
          "The original files have been kept.")


You haven't provided password. What is it? peachtree123
What location do you want to search through?.
All encrypted PDF's in the folder tree were decrypted successfully. The original files have been kept.


## Custom Invitation as Word Documents

In [61]:
# %load custom_invitation_docx.py
#!/usr/bin/env python3
"""
Created on Sat Aug 24 23:57:11 2019

@author: Soo Hyeon Kim
Read guest names from text file and generate a Word document with custom 
invitations.
"""

import docx
from docx.enum.text import WD_ALIGN_PARAGRAPH

with open('guest.txt') as f:
    names = f.readlines()
    document = docx.Document()
    
    for name in names:
        name = name.strip()
        
        paragraph = document.add_paragraph('It would be a pleasure to have the' 
                               ' company of', style='Heading1')
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        
        paragraph = document.add_paragraph(name, style='Caption')
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        
        paragraph = document.add_paragraph('at 11010 Memory Lane on '
                                           'the Evening of', style='Heading1')
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        
        paragraph = document.add_paragraph('April 1st', style='BodyText')
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        
        paragraph = document.add_paragraph('at 7 o\'clock', style='Heading1')
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        
        document.add_page_break()
    
    document.save('invites.docx')
    
    print("'invites.docx' has been created")


'invites.docx' has been created




### Brute-Force PDF Password Breaker

In [63]:
# %load password_breaker.py
#!/usr/bin/env python3
"""
Created on Sun Aug 25 00:23:34 2019

@author: Soo Hyeon Kim
Finding a single english password. 
"""

import PyPDF2, os

print('Enter the directory of the PDF you wish to break:')
file = input()
file = os.path.abspath(file)

with open('dictionary.txt') as f:
    words = f.readlines()

    pdf_reader = PyPDF2.PdfFileReader(open(file, 'rb'))

    for word in words:
        word = word.strip()
        lower = word.lower()
        upper = word.upper()
        if pdf_reader.decrypt(lower) == 1:
            print('Password = ' + lower)
            break
        elif pdf_reader.decrypt(upper) == 1:
            print('Password = ' + upper)
            break

Enter the directory of the PDF you wish to break:
allminutes_encrypted.pdf
