In [None]:
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams

# Layout-analysis settings passed to pdfminer's extract_text below.
laparams = LAParams(line_margin = 0.2, boxes_flow = 0.6, detect_vertical = True, all_texts = True)

# Extract the text of the report (the path is relative to the working directory) and print all of it.
text = extract_text("2023-2024_Sustainability_Report_(ENG)_.pdf", laparams = laparams)
print(text)


# Poor result: the text is extracted line by line across the full width of each page.
# When paragraphs are laid out in side-by-side columns, the reader still goes left to right across the columns instead of reading one paragraph at a time.

In [30]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path,output_path, boxes_flow = 0.2):
    """Extract the text of a PDF with pdfminer, save it to a file and return it.

    Args:
        path: Path of the PDF file to read.
        output_path: Path of the UTF-8 text file that receives the extracted text.
        boxes_flow: Value forwarded to ``LAParams(boxes_flow=...)``.

    Returns:
        The extracted text of all pages as a single string.

    Any exception raised while opening or processing the PDF propagates to the
    caller; the input file, the converter device and the in-memory buffer are
    closed in that case as well, and the output file is not written.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams(boxes_flow=boxes_flow)

    # Context managers guarantee the buffer and the input file are released
    # even when a page fails to parse.
    with StringIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set is passed, as in the original call.
            for page in PDFPage.get_pages(fp, set(), password="", caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the with-block.
            text = retstr.getvalue()
        finally:
            device.close()

    with open(output_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

    return text

# Hard-coded local paths: the PDF to read and the text file the extraction is written to.
# NOTE(review): the output file name mentions "datadog" while the input is an LG report - confirm this is intended.
input_pdf_path = "/Users/jeremiahtay/DSA3101/LG_2023_ocr.pdf"
output_txt_path = "/Users/jeremiahtay/DSA3101/datadog_Sustainability Report.txt"

# Run the extraction (this also writes output_txt_path) and print the returned text in full.
converted_file = convert_pdf_to_txt(input_pdf_path, output_txt_path)
print(converted_file)


2023  —M—W—— LG Electronics Sustainability Report  ——————_  2024 

LIFE 
FOR ALL 

Lite’s 
Good.

2 

2023-2024 LG Electronics Sustainability Report 

Environmental 

Social 

Governance 

ESG Data 

Appendix 

oO  eG  <  >) 

Table of Contents 

Overview 

Environmental 

12  =  Social 

29 

Governance 

80   ESG Data 

100 

Appendix 

Message from the CEO 

Company Overview 

ESG Strategy 

3 

4 

8 

Environmental Management Policy  13 

Human Rights 

30  ~~ Corporate Governance 

81 

Sustainability Management Data 

101 

GRI Index 

Addressing Climate Change 

Resource Circulation 

15 

18 

Employee 

Supply Chain 

44 

—Jeong-Do Management 

86 

Membership-Awards & Recognition  124  = SASB Index 

56 

Compliance Management  88 

 SHEE-Quality Certification Status  125  = UNSDGs Index 

Sustainable Business 

10. 

Product Stewardship 

23 

Customer 

66  —  Risk Management 

Business Sites Operation 

26  — Local Community 

75 

Information Security 

92 

95. 

 Mat

### Pros and Cons (PDFMiner):

Pros:
- It preserves the document structure, processing PDFs at a granular level.
- It provides access to text positioning, fonts, and layout details.
- It works entirely in Python; no third-party OCR or GUI-based tools are needed.

Cons: 
- PDFMiner is not an OCR engine, so it cannot extract text from scanned pages or images of text.

## Using PyPDF2.PdfReader

In [2]:
from PyPDF2 import PdfReader

pdf_path = r"/Users/jeremiahtay/DSA3101/2023-2024_Sustainability_Report_(ENG)_.pdf"

# Parse the PDF once and reuse the same reader for both the metadata and the
# text (previously the file was opened and parsed twice).
pdfReader = PdfReader(pdf_path)
pdf = pdfReader  # alias kept so cells that refer to `pdf` keep working
information = pdfReader.metadata
number_of_pages = len(pdfReader.pages)
print(information)

# Extract text from every page and concatenate it without a separator, as
# before. `or ''` guards against a page whose extraction yields nothing.
text = ''.join(pageObj.extract_text() or '' for pageObj in pdfReader.pages)

print(text)


{'/CreationDate': "D:20240808102920+09'00'", '/Creator': 'Adobe InDesign 15.1 (Macintosh)', '/ModDate': "D:20240808103106+09'00'", '/Producer': 'Adobe PDF Library 15.0', '/Trapped': '/False'}
2023 2024 LG Electronics Sustainability Report 2 Overview Environmental Social Governance ESG Data Appendix 2023-2024 LG Electronics Sustainability Report 
Report Overview
LG Electronics has expressed its commitment to making changes that will help it to achieve sustainable growth and 
a better life for all, and has established and implemented goals and action plans to realize them. In the Sustainability 
Report, LG Electronics includes its management approach (MA), goals, activities, progress, and performance on tasks and 
objectives in the areas of environment, society, and governance. These are linked to the material issues derived from the 
materiality assessment with LG Electronics' six ESG strategic tasks. LG Electronics will continue to transparently disclose 
its sustainability management 

### Pros and Cons (PyPDF2):

Pros:
- It works well (fast and effective) if the document is single-column with no tables, but this is often not the case.
- You can rotate pages, split PDF into multiple pages, merge multiple PDFs.

Cons: 
- PyPDF2 is not an OCR engine, so it cannot extract text from scanned pages or images of text.
- Slower than pdfminer
- Inconsistent Extraction Accuracy

## Pytesseract

In [25]:
from pdf2image import convert_from_path
import cv2
import numpy as np
import pytesseract


# Path of the PDF to OCR (hard-coded local path; change it to point at your own file).
input_pdf_path = "/Users/jeremiahtay/DSA3101/2023-2024_Sustainability_Report_(ENG)_.pdf"
# Step 1: convert the PDF with pdf2image; `pages` is iterated one page at a time further down.
pages = convert_from_path(input_pdf_path)

def deskew(image):
    """Estimate the skew of an image and rotate the image to compensate.

    Args:
        image: NumPy image array, either with a channel axis (3-D) or already
            grayscale (2-D).

    Returns:
        The rotated image with the same height and width as the input. A page
        with no non-white pixels is returned unchanged.
    """
    # cv2.cvtColor(..., COLOR_BGR2GRAY) rejects single-channel input, so only
    # convert when a channel axis is present.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if image.ndim == 3 else image
    gray = cv2.bitwise_not(gray)
    coords = np.column_stack(np.where(gray > 0))
    if coords.size == 0:
        # Blank page: there are no points to fit a rectangle to.
        return image
    angle = cv2.minAreaRect(coords)[-1]

    # Fold the reported angle into a correction of at most 45 degrees. The
    # first branch covers angles reported in [-90, 0); the second covers
    # angles reported in (0, 90] (NOTE(review): the reported range depends on
    # the installed OpenCV version - confirm), which would otherwise rotate
    # an upright page by about 90 degrees.
    if angle < -45:
        angle = -(90 + angle)
    elif angle > 45:
        angle = 90 - angle
    else:
        angle = -angle

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated


def extract_text_from_image(image):
    """Return the result of ``pytesseract.image_to_string`` for ``image``."""
    return pytesseract.image_to_string(image)

# Create a list to store extracted text from all pages (one string per page, in page order)
extracted_text = []

for page in pages:
    # Step 2: Preprocess the image (deskew); the page is turned into a NumPy array first
    preprocessed_image = deskew(np.array(page))

    # Step 3: Extract text using OCR
    text = extract_text_from_image(preprocessed_image)
    extracted_text.append(text)


def process_page(page):
    """OCR one page image and return its mean confidence and body text.

    The page is deskewed, OCR'd once with ``pytesseract.image_to_data``, and
    the words of the first block (block 1) and of the last block are dropped
    as header and footer.

    Args:
        page: Page image convertible with ``np.array`` (e.g. an item of the
            list returned by ``convert_from_path``).

    Returns:
        ``(page_conf, text)``: the mean of the non-negative OCR confidences on
        the page (0 if there are none) and the remaining words joined by
        spaces. A page on which OCR finds no blocks yields an empty text. If
        anything fails, ``(-1, <error message>)`` is returned instead of
        raising.
    """
    try:
        # Transfer image of pdf_file into array
        page_arr = np.array(page)
        # Deskew first and convert to grayscale afterwards, so deskew receives
        # the array with its channel axis intact.
        page_deskew = deskew(page_arr)
        if page_deskew.ndim == 3:
            # NOTE(review): assumes the page array is in RGB channel order - confirm.
            page_deskew = cv2.cvtColor(page_deskew, cv2.COLOR_RGB2GRAY)
        # Run OCR once; confidence and text are both read from this result
        d = pytesseract.image_to_data(page_deskew, output_type=pytesseract.Output.DICT)

        # Cal confidence value: average the entries that parse as a
        # non-negative number (entries may be numbers or strings).
        confs = []
        for conf in d['conf']:
            try:
                value = float(conf)
            except (TypeError, ValueError):
                continue
            if value >= 0:
                confs.append(value)
        page_conf = sum(confs) / len(confs) if confs else 0

        # Get block number (level 2 rows describe blocks)
        block_ids = [b for lvl, b in zip(d['level'], d['block_num']) if lvl == 2]
        if not block_ids:
            return page_conf, ''
        # Drop header and footer: the first block and the last block
        dropped = {1, max(block_ids)}
        # Combine word-level (level 5) text, excluding header and footer regions
        text = ' '.join(
            word
            for lvl, b, word in zip(d['level'], d['block_num'], d['text'])
            if lvl == 5 and b not in dropped
        )
        return page_conf, text
    except Exception as e:
        # If can't extract then hand the error message back in place of the text
        return -1, getattr(e, 'message', str(e))


# Join the per-page strings with single spaces.
combined_text = " ".join(extracted_text)
output_file_path = "/Users/jeremiahtay/DSA3101/LG_Sustainability Report.txt"

# Open the file in write mode and save the text. The encoding is given
# explicitly (as the later save cells do) so the result does not depend on
# the platform's default encoding.
with open(output_file_path, "w", encoding="utf-8") as file:
    file.write(combined_text)

print(combined_text)



LG Electronics Sustainability Report

BETTER

LIFE

FOR ALL

Life’s
Good.
 cs Sustainability Report [overview Environmental Social Governance ESG Data AY
ts

Environmental 12 ~~ Social 29 Governance 80 ESG Data 100 = Appendi:

3 Environmental Management Policy 13 Human Rights 30 Corporate Governance 81 Sustainability Management Data 101 GRI Inde»

4 Addressing Climate Change 15 Employee 44 — Jeong-Do Management 86 = Membership-Awards & Recognition 124 — SASB Ind

8 Resource Circulation 18 Supply Chain 56 Compliance Management 88  SHEE-Quality Certification Status 125 UN SDGs

10 Product Stewardship 23. Customer 66 — Risk Management 92 Materiality Assessment 126 ~—TCFD Ref

Business Sites Operation 26 ~~ Local Community 75 Information Security 95 Stakeholder Communication 129 Greenhot

Statemer

Independ

its commitment to making changes that will help it to achieve sustainable growth and
tablished and implemented goals and action plans to realize them. In the Sustainability
its managem

### Pros and Cons (PyTesseract):

Pros:
- OCR scanner, which can extract text from scanned PDFs
- Best choice for images, scanned documents, and non-text PDFs.
- Can handle rotated or skewed texts

Cons: 
- OCR involves image processing, hence it is slower than pure text extraction
- It does not preserve formatting (text might be out of order)
- Table and multi-column layouts may be extracted incorrectly.
- Pytesseract must convert PDFs to image first using pdf2image


In [24]:
from pdf2image import convert_from_path
import cv2
import numpy as np
import pytesseract
import pandas as pd  # Ensure Pandas is imported

# Replace with your actual PDF path (both paths are hard-coded local paths)
input_pdf_path = "/Users/jeremiahtay/DSA3101/2023-2024_Sustainability_Report_(ENG)_.pdf"
output_file_path = "/Users/jeremiahtay/DSA3101/LG_Sustainability_Report.txt"

# Convert PDF pages to images; `pages` is iterated one page at a time by the extraction loop below
pages = convert_from_path(input_pdf_path)

def deskew(image):
    """Corrects skew in an image using OpenCV.

    Args:
        image: NumPy image array, either with a channel axis (3-D) or already
            grayscale (2-D).

    Returns:
        The rotated image with the same height and width as the input. A page
        with no non-white pixels is returned unchanged.
    """
    # cv2.cvtColor(..., COLOR_BGR2GRAY) rejects single-channel input, so only
    # convert when a channel axis is present.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if image.ndim == 3 else image
    gray = cv2.bitwise_not(gray)
    coords = np.column_stack(np.where(gray > 0))
    if coords.size == 0:
        # Blank page: there are no points to fit a rectangle to.
        return image
    angle = cv2.minAreaRect(coords)[-1]

    # Fold the reported angle into a correction of at most 45 degrees. The
    # first branch covers angles reported in [-90, 0); the second covers
    # angles reported in (0, 90] (NOTE(review): the reported range depends on
    # the installed OpenCV version - confirm), which would otherwise rotate
    # an upright page by about 90 degrees.
    if angle < -45:
        angle = -(90 + angle)
    elif angle > 45:
        angle = 90 - angle
    else:
        angle = -angle

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

def get_conf(image):
    """Calculates the OCR confidence level of the image.

    Runs ``pytesseract.image_to_data`` on ``image`` and returns the mean of
    the non-negative entries of its ``conf`` column, or 0 when there are none.
    Entries may be numbers or numeric strings; anything that does not parse
    as a number is ignored.
    """
    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    conf_values = []
    for conf in ocr_data["conf"]:
        # float() accepts ints, floats and numeric strings alike, whereas
        # str.isdigit() is not available on numeric entries.
        try:
            value = float(conf)
        except (TypeError, ValueError):
            continue
        # Negative entries (e.g. -1) are skipped, as they were before.
        if value >= 0:
            conf_values.append(value)
    return sum(conf_values) / len(conf_values) if conf_values else 0

def process_page(page):
    """Processes an image page: Deskews, extracts text, and removes headers/footers.

    The page is OCR'd once with ``pytesseract.image_to_data``; the words of
    the first block (block 1) and of the last block are dropped as header and
    footer.

    Args:
        page: Page image convertible with ``np.array`` (e.g. an item of the
            list returned by ``convert_from_path``).

    Returns:
        ``(page_conf, text)``: the mean of the non-negative OCR confidences on
        the page (0 if there are none) and the remaining words joined by
        spaces. A page on which OCR finds no blocks yields an empty text. If
        anything fails, ``(-1, <error message>)`` is returned instead of
        raising.
    """
    try:
        page_arr = np.array(page)
        # Deskew first and convert to grayscale afterwards, so deskew receives
        # the array with its channel axis intact.
        page_deskew = deskew(page_arr)
        if page_deskew.ndim == 3:
            # NOTE(review): assumes the page array is in RGB channel order - confirm.
            page_deskew = cv2.cvtColor(page_deskew, cv2.COLOR_RGB2GRAY)

        # Extract OCR data once; confidence and text are both read from it
        d = pytesseract.image_to_data(page_deskew, output_type=pytesseract.Output.DICT)

        # Average the confidence entries that parse as a non-negative number
        # (entries may be numbers or strings).
        confs = []
        for conf in d['conf']:
            try:
                value = float(conf)
            except (TypeError, ValueError):
                continue
            if value >= 0:
                confs.append(value)
        page_conf = sum(confs) / len(confs) if confs else 0

        # Get block number (level 2 rows describe blocks)
        block_ids = [b for lvl, b in zip(d['level'], d['block_num']) if lvl == 2]
        if not block_ids:
            return page_conf, ''

        # Drop header and footer: the first block and the last block
        dropped = {1, max(block_ids)}

        # Combine word-level (level 5) text, excluding header and footer regions
        text = ' '.join(
            word
            for lvl, b, word in zip(d['level'], d['block_num'], d['text'])
            if lvl == 5 and b not in dropped
        )
        return page_conf, text
    except Exception as e:
        return -1, str(e)

# Extract text from all pages using process_page
extracted_text = []
for page_number, page in enumerate(pages, start=1):
    conf, text = process_page(page)
    if conf == -1:
        # process_page reports a failure as (-1, <error message>). Report it
        # here rather than saving the error message as if it were page text;
        # an empty entry keeps one slot per page.
        print(f"Warning: page {page_number} could not be processed: {text}")
        text = ""
    extracted_text.append(text)

# Combine extracted text (one line break between pages)
combined_text = "\n".join(extracted_text)

# Save to file
with open(output_file_path, "w", encoding="utf-8") as file:
    file.write(combined_text)

print("Extraction complete. Text saved to:", output_file_path)


Extraction complete. Text saved to: /Users/jeremiahtay/DSA3101/LG_Sustainability_Report.txt


In [20]:
# Join the per-page strings with single spaces.
# `extracted_text` is not defined in this cell; it must already exist in the kernel.
combined_text = " ".join(extracted_text)
output_file_path = "/Users/jeremiahtay/DSA3101/LG_Sustainability Report.txt"

# Open the file in write mode and save the text
with open(output_file_path, "w", encoding="utf-8") as file:
    file.write(combined_text)

# Print the full combined text as the cell output.
print(combined_text)

LG Electronics Sustainability Report

BETTER

LIFE

FOR ALL

Life’s
Good.
 cs Sustainability Report [overview Environmental Social Governance ESG Data AY
ts

Environmental 12 ~~ Social 29 Governance 80 ESG Data 100 = Appendi:

3 Environmental Management Policy 13 Human Rights 30 Corporate Governance 81 Sustainability Management Data 101 GRI Inde»

4 Addressing Climate Change 15 Employee 44 — Jeong-Do Management 86 = Membership-Awards & Recognition 124 — SASB Ind

8 Resource Circulation 18 Supply Chain 56 Compliance Management 88  SHEE-Quality Certification Status 125 UN SDGs

10 Product Stewardship 23. Customer 66 — Risk Management 92 Materiality Assessment 126 ~—TCFD Ref

Business Sites Operation 26 ~~ Local Community 75 Information Security 95 Stakeholder Communication 129 Greenhot

Statemer

Independ

its commitment to making changes that will help it to achieve sustainable growth and
tablished and implemented goals and action plans to realize them. In the Sustainability
its managem

In [None]:
# Join the per-page strings with single spaces.
# `extracted_text` is not defined in this cell; it must already exist in the kernel.
combined_text = " ".join(extracted_text)
output_file_path = "/Users/jeremiahtay/DSA3101/LG_Sustainability Report.txt"

# Open the file in write mode and save the text
with open(output_file_path, "w", encoding="utf-8") as file:
    file.write(combined_text)

# Print the full combined text as the cell output.
print(combined_text)

In [15]:
# Bare expression: displays the current value of `combined_text`, which must already exist in the kernel.
combined_text

