# Unlocking Troublesome PDFs


# Importing files

We can use an import library specific to Colab

## *WARNING*: These are temporary uploads. When you restart, you need to reupload.

```from google.colab import files```

```myupload = files.upload()```

In [None]:
from google.colab import files
files.upload()

## Using a path

We can store our path structure to a variable.

Right-click on the folder in the left column and copy path:
```/content/sample_data```

This is the raw path. We are already in ```content``` so instead we want:
```sample_data``` plus what files we are looking for

In [None]:
## install PyPDF2 as part of our process
## It's not part of the standard library.
!pip install pypdf2

In [None]:
import PyPDF2
## in order to export our file to our computer drive, you need this only in Colab:
from google.colab import files

In [None]:
## import colab's file uploader
## upload 
files.upload()

In [None]:
## read and store document in an object
pdfFileObj = open("nixon-memo2.pdf", "rb")

In [None]:
type(pdfFileObj)

In [None]:
## use PyPDF2 to read that object
## PdfFileReader was removed in PyPDF2 3.0 (what an unpinned `pip install pypdf2` gives you);
## PdfReader is the current reader class and takes the same open binary file object.
pdfReader = PyPDF2.PdfReader(pdfFileObj)

In [None]:
type(pdfReader)

In [None]:
## let's see how many pages we have
## The reader exposes its pages as a list-like object, so the page count is its length.
## (The old .numPages attribute was removed in PyPDF2 3.0.)
number_of_pages = len(pdfReader.pages)
number_of_pages

In [None]:
## read a single page
## Pages are indexed from 0, so pages[0] is the first page of the PDF.
## (getPage()/extractText() were removed in PyPDF2 3.0 in favour of pages[i]/extract_text().)
pageObj = pdfReader.pages[0]
pageObj.extract_text()

In [None]:
## read all pages and store in a list
## Iterate over the reader's pages directly: one extracted-text string per page, in page order.
## (Replaces the getPage()/extractText() calls that PyPDF2 3.0 removed.)
all_pages = [page.extract_text() for page in pdfReader.pages]

all_pages

In [None]:
type(all_pages)

In [None]:
## print all pages so we can see them
for page in all_pages:
  print(page)

## An even more obnoxious PDF

In [None]:
## import colab's file uploader
## upload columbus_bank_trust.pdf
files.upload()

In [None]:
## place in an object
pdfFileObj2 = open('columbus_bank_trust.pdf', 'rb')

In [None]:
## read it using PyPDF2
## PdfReader replaces PdfFileReader, which was removed in PyPDF2 3.0.
pdfReader2 = PyPDF2.PdfReader(pdfFileObj2)

In [None]:
## how many pages?
## (len of the pages list; the .numPages attribute was removed in PyPDF2 3.0)
len(pdfReader2.pages)

In [None]:
## extract a page
## pages[1] is the SECOND page (indexing starts at 0).
## (pages[i]/extract_text() replace getPage()/extractText(), removed in PyPDF2 3.0.)
pageObj2 = pdfReader2.pages[1]
columbia_text = pageObj2.extract_text()

In [None]:
## read text
columbia_text

# Strategy to Vanquish Obnoxious PDFs


### The problem:
*   PDFs all have different encodings: UTF-8, ASCII, Unicode, etc
*   Therefore data can be lost during the conversion

### The solution:
*   Convert the PDF to an image
*   Use optical character recognition (OCR) to capture the text
*   Export to a text file





### Let's use Google's tesseract OCR (and all its dependencies)

In [None]:
## We need pytesseract (not part of standard library) to wrap around Google's tesseract-ocr
!pip install pytesseract

In [None]:
## need google tesseract-ocr

!apt install tesseract-ocr

In [None]:
## pdf2image relies on Poppler's command-line tools to render PDF pages as images
## Poppler is a system package (not a Python library), so we install it with apt-get, not pip

!apt-get install poppler-utils 

In [None]:
## Convert PDF to a PIL Image object
!pip install pdf2image

In [None]:
# Import libraries
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import glob
## in order to export our file to our computer drive, you need this only in Colab:
from google.colab import files

In [None]:
## import colab's file uploader
## upload 
files.upload()

In [None]:
PDF_file = "columbus_bank_trust.pdf"

In [None]:
## Take your PDF and convert each page to a JPEG
import os

# Announce the work BEFORE starting it: rendering every page is the slow step,
# so the "give me a second" message is only useful if it appears first.
print(f"Give me a second...converting '{PDF_file}' to a JPEG")

# Store all the pages of the PDF in a variable (a list of PIL images, one per page)
pages = convert_from_path(PDF_file, 500) ## 500 dpi. balance between hi-res and computation power
print(pages)

# Drop the file extension whatever its length, instead of assuming it is exactly 4 characters
base_name = os.path.splitext(PDF_file)[0]

# Iterate through all the pages stored above; enumerate numbers them starting at 1
for image_counter, page in enumerate(pages, start=1):
    ## Declaring filename for each page of PDF as JPG, e.g. <name>_page_1.jpg
    filename = f"{base_name}_page_{image_counter}.jpg"
    print(f"Here we go: {filename}")
    # Save the image of the page in system
    page.save(filename, 'JPEG')

In [None]:
!pwd

In [None]:
## path to the jpegs that were just produced

img_path = "columbus*.jpg"
myfiles = sorted(glob.glob(img_path))
myfiles

In [None]:
outfile = "columbusBank_digital_conversion.txt"

In [None]:
## Open each Jpeg, OCR and append text to output file
import re


def _page_order(path):
    """Sort key that orders '<name>_page_<n>.jpg' files by their numeric page number.

    Plain alphabetical sorting puts page_10 before page_2, which would write
    the pages of any PDF with ten or more pages out of order.
    Files that do not follow the naming scheme are ordered by their full name.
    """
    found = re.search(r"_page_(\d+)\.jpg$", path)
    if found is None:
        return (path, 0)
    return (path[:found.start()], int(found.group(1)))


## grab all jpegs, in true page order
img_files = sorted(glob.glob(img_path), key=_page_order)
# print(img_files)

# "a" appends: re-running this cell adds the text to the file again.
# utf-8 guarantees any character the OCR produces can be written.
with open(outfile, "a", encoding="utf-8") as my_conversion_file:
    for counter, img_file in enumerate(img_files, start=1):
        print(f"Converting image {counter} of {len(img_files)} to text")

        # The with-block closes each image file as soon as it has been OCR'd
        with Image.open(img_file) as page_image:
            ocr_text = pytesseract.image_to_string(page_image)

        # Header naming the source image, followed by the OCR'd text
        # (no stray "+" written into the output, matching the other language cells)
        text = f"\n\n\nFILE_Info: {img_file} \n\n\n  {ocr_text}"

        ## In many PDFs, at line ending, if a word can't
        ## be written fully, a 'hyphen' is added.
        ## To remove this, we replace every '-\n' to ''.
        text = text.replace('-\n', '')

        my_conversion_file.write(text)

print(f"Your converted PDFs are waiting in {outfile}!")

# OCRing languages other than English.

English is the default language for Tesseract. 

The installation process is confusing and cumbersome. I have tried to make it as clear as possible here in *Colab*.

Installation in Jupyter Notebooks is a little different and so is installation for a virtual environment.

[More here](https://askubuntu.com/questions/793634/how-do-i-install-a-new-language-pack-for-tesseract-on-16-04).



# Installations

In [None]:
## We need Tesseract-ocr, an optical character recognition engine (not part of standard library)
## We use apt install because it is distributed through the Ubuntu repositories, which are hosted by Canonical.

!apt install tesseract-ocr

In [None]:
## We need pytesseract (not part of standard library) to wrap around Google's tesseract-ocr
## pip install for packages on PyPi, hosted by the Python Software Foundation. 

!pip install pytesseract

In [None]:
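## NOTE(review): the OCR engine and its language packs are installed with apt in this notebook
## (tesseract-ocr, tesseract-ocr-spa, ...). Confirm the PyPI package named "tesseract" is really needed.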
!pip install tesseract

Convert PDF to a PIL Image object

In [None]:
!pip install pdf2image

In [None]:
## pdf2image relies on Poppler's command-line tools to render PDF pages as images
## Poppler is a system package (not a Python library), so we install it with apt-get, not pip

!apt-get install poppler-utils 


In [None]:
# Import libraries
from PIL import Image
import pytesseract
# import sys
from pdf2image import convert_from_path
import os
import glob
## in order to export our file to our computer drive, you need this only in Colab:
from google.colab import files

In [None]:
## What languages do we have installed so far?
!tesseract --list-langs

# Let's try Spanish

In [None]:
## Get OCR for Spanish
!apt-get install tesseract-ocr-spa

In [None]:
## What languages do we have installed now?
!tesseract --list-langs

The Image module provides a class with the same name which is used to represent a PIL image. The module also provides a number of factory functions, including functions to load images from files, and to create new images. (credit: Image Module)

In [None]:
## import colab's file uploader
## upload 
files.upload()


In [None]:
PDF_file = "spanish.pdf"


In [None]:
## Take your PDF and convert each page to a JPEG

# Announce the work BEFORE starting it: rendering every page is the slow step,
# so the "give me a second" message is only useful if it appears first.
print(f"Give me a second...converting '{PDF_file}' to a JPEG")

# Store all the pages of the PDF in a variable (a list of PIL images, one per page)
pages = convert_from_path(PDF_file, 500) ## 500 dpi. balance between hi-res and computation power
print(pages)

# Drop the file extension whatever its length, instead of assuming it is exactly 4 characters
base_name = os.path.splitext(PDF_file)[0]

# Iterate through all the pages stored above; enumerate numbers them starting at 1
for image_counter, page in enumerate(pages, start=1):
    ## Declaring filename for each page of PDF as JPG, e.g. <name>_page_1.jpg
    filename = f"{base_name}_page_{image_counter}.jpg"
    print(f"Here we go: {filename}")

    # Save the image of the page in system
    page.save(filename, 'JPEG')

In [None]:
## path to the jpegs that were just produced
img_path = "spanish*.jpg"
myfiles = sorted(glob.glob(img_path))
myfiles

In [None]:
## The text pulled from the jpegs will be appended to this file
outfile = "spanish_TEXT.txt"

In [None]:
## Open each Jpeg, OCR and append text to output file
import re

##REMEMBER TO CHANGE THE LANGUAGE
## (the lang= value passed to image_to_string below must match an installed language pack)


def _page_order(path):
    """Sort key that orders '<name>_page_<n>.jpg' files by their numeric page number.

    Plain alphabetical sorting puts page_10 before page_2, which would write
    the pages of any PDF with ten or more pages out of order.
    Files that do not follow the naming scheme are ordered by their full name.
    """
    found = re.search(r"_page_(\d+)\.jpg$", path)
    if found is None:
        return (path, 0)
    return (path[:found.start()], int(found.group(1)))


## grab all jpegs, in true page order
img_files = sorted(glob.glob(img_path), key=_page_order)

# "a" appends: re-running this cell adds the text to the file again.
# utf-8 guarantees accented characters (á, ñ, ¿ ...) are written correctly.
with open(outfile, "a", encoding="utf-8") as my_conversion_file:
    for counter, img_file in enumerate(img_files, start=1):
        print(f"Converting image {counter} of {len(img_files)} to text!")

        # The with-block closes each image file as soon as it has been OCR'd
        with Image.open(img_file) as page_image:
            ocr_text = pytesseract.image_to_string(page_image, lang='spa')

        # Header naming the source image, followed by the OCR'd text
        # (no stray "+" written into the output, matching the Chinese/Japanese cells)
        text = f"\n\n\nFILE_Info: {img_file} \n\n\n  {ocr_text}"
        print(text)

        ## In many PDFs, at line ending, if a word can't
        ## be written fully, a 'hyphen' is added.
        ## To remove this, we replace every '-\n' to ''.
        text = text.replace('-\n', '')

        my_conversion_file.write(text) ## write to my output file

# Let's try Chinese

In [None]:
## What languages do we have installed so far?
!tesseract --list-langs

In [None]:
!apt-get install tesseract-ocr-chi-tra

In [None]:
## import colab's file uploader
## upload 
files.upload()

In [None]:
PDF_file = "chinese.pdf"

In [None]:
## Take your PDF and convert each page to a JPEG

# Announce the work BEFORE starting it: rendering every page is the slow step,
# so the "give me a second" message is only useful if it appears first.
print(f"Give me a second...converting '{PDF_file}' to a JPEG")

# Store all the pages of the PDF in a variable (a list of PIL images, one per page)
pages = convert_from_path(PDF_file, 500) ## 500 dpi. balance between hi-res and computation power
print(pages)

# Drop the file extension whatever its length, instead of assuming it is exactly 4 characters
base_name = os.path.splitext(PDF_file)[0]

# Iterate through all the pages stored above; enumerate numbers them starting at 1
for image_counter, page in enumerate(pages, start=1):
    ## Declaring filename for each page of PDF as JPG, e.g. <name>_page_1.jpg
    filename = f"{base_name}_page_{image_counter}.jpg"
    print(f"Here we go: {filename}")

    # Save the image of the page in system
    page.save(filename, 'JPEG')

In [None]:
## path to the jpegs that were just produced
img_path = "chinese*.jpg"
myfiles = sorted(glob.glob(img_path))
myfiles

In [None]:
## The text pulled from the jpegs will be appended to this file
outfile = "chinese_TEXT.txt"

In [None]:
## Open each Jpeg, OCR and append text to output file
import re


def _page_order(path):
    """Sort key that orders '<name>_page_<n>.jpg' files by their numeric page number.

    Plain alphabetical sorting puts page_10 before page_2, which would write
    the pages of any PDF with ten or more pages out of order.
    Files that do not follow the naming scheme are ordered by their full name.
    """
    found = re.search(r"_page_(\d+)\.jpg$", path)
    if found is None:
        return (path, 0)
    return (path[:found.start()], int(found.group(1)))


## grab all jpegs, in true page order
img_files = sorted(glob.glob(img_path), key=_page_order)

# "a" appends: re-running this cell adds the text to the file again.
# utf-8 is stated explicitly so Chinese characters are always written correctly,
# whatever the platform's default encoding is.
with open(outfile, "a", encoding="utf-8") as my_conversion_file:
    for counter, img_file in enumerate(img_files, start=1):
        print(f"Converting image {counter} of {len(img_files)} to text!")

        # lang='chi_tra' selects the Traditional Chinese pack installed with apt.
        # The with-block closes each image file as soon as it has been OCR'd.
        with Image.open(img_file) as page_image:
            ocr_text = pytesseract.image_to_string(page_image, lang='chi_tra')

        # Header naming the source image, followed by the OCR'd text
        text = f"\n\n\nFILE_Info: {img_file} \n\n\n  {ocr_text}"
        print(text)

        ## In many PDFs, at line ending, if a word can't
        ## be written fully, a 'hyphen' is added.
        ## To remove this, we replace every '-\n' to ''.
        text = text.replace('-\n', '')

        my_conversion_file.write(text) ## write to my output file

# Let's try Japanese


In [None]:
## get Japanese OCR
!apt-get install tesseract-ocr-jpn

In [None]:
## What languages do we have installed so far?
!tesseract --list-langs

In [None]:
## import colab's file uploader
## upload 
files.upload()

In [None]:
PDF_file = "japanese.pdf"


In [None]:
## Take your PDF and convert each page to a JPEG

# Announce the work BEFORE starting it: rendering every page is the slow step,
# so the "give me a second" message is only useful if it appears first.
print(f"Give me a second...converting '{PDF_file}' to a JPEG")

# Store all the pages of the PDF in a variable (a list of PIL images, one per page)
pages = convert_from_path(PDF_file, 500) ## 500 dpi. balance between hi-res and computation power
print(pages)

# Drop the file extension whatever its length, instead of assuming it is exactly 4 characters
base_name = os.path.splitext(PDF_file)[0]

# Iterate through all the pages stored above; enumerate numbers them starting at 1
for image_counter, page in enumerate(pages, start=1):
    ## Declaring filename for each page of PDF as JPG, e.g. <name>_page_1.jpg
    filename = f"{base_name}_page_{image_counter}.jpg"
    print(f"Here we go: {filename}")

    # Save the image of the page in system
    page.save(filename, 'JPEG')

In [None]:
## path to the jpegs that were just produced
img_path = "japanese*.jpg"
myfiles = sorted(glob.glob(img_path))
myfiles

In [None]:
## The text pulled from the jpegs will be appended to this file
outfile = "japanese_TEXT.txt"

In [None]:
## Open each Jpeg, OCR and append text to output file
import re


def _page_order(path):
    """Sort key that orders '<name>_page_<n>.jpg' files by their numeric page number.

    Plain alphabetical sorting puts page_10 before page_2, which would write
    the pages of any PDF with ten or more pages out of order.
    Files that do not follow the naming scheme are ordered by their full name.
    """
    found = re.search(r"_page_(\d+)\.jpg$", path)
    if found is None:
        return (path, 0)
    return (path[:found.start()], int(found.group(1)))


## grab all jpegs, in true page order
img_files = sorted(glob.glob(img_path), key=_page_order)

# "a" appends: re-running this cell adds the text to the file again.
# utf-8 is stated explicitly so Japanese characters are always written correctly,
# whatever the platform's default encoding is.
with open(outfile, "a", encoding="utf-8") as my_conversion_file:
    for counter, img_file in enumerate(img_files, start=1):
        print(f"Converting image {counter} of {len(img_files)} to text!")

        # lang='jpn' selects the Japanese pack installed with apt.
        # The with-block closes each image file as soon as it has been OCR'd.
        with Image.open(img_file) as page_image:
            ocr_text = pytesseract.image_to_string(page_image, lang='jpn')

        # Header naming the source image, followed by the OCR'd text
        text = f"\n\n\nFILE_Info: {img_file} \n\n\n  {ocr_text}"
        print(text)

        ## In many PDFs, at line ending, if a word can't
        ## be written fully, a 'hyphen' is added.
        ## To remove this, we replace every '-\n' to ''.
        text = text.replace('-\n', '')

        my_conversion_file.write(text) ## write to my output file