In [36]:
import pdfplumber
import re

In [37]:
def extract_text_from_pdf(pdf_path, start_page=47, end_page=69, page_numbers=None):
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        if page_numbers:  # Extract text from specific page numbers
            for page_number in page_numbers:
                page = pdf.pages[page_number - 1]
                text += page.extract_text()
        elif start_page is not None and end_page is not None:  # Extract text from a range of pages
            for page_number in range(start_page, end_page + 1):
                page = pdf.pages[page_number - 1]
                text += page.extract_text()
        else:  # Extract text from all pages
            for page in pdf.pages:
                text += page.extract_text()
    return text


In [38]:
def extract_data(text):
    sinhala_pattern = r'[\u0D80-\u0DFF]+'  # Sinhala Unicode range
    tamil_pattern = r'[\u0B80-\u0BFF]+'  # Tamil Unicode range
    english_pattern = r'[A-Za-z]+'  # English alphabets

    sinhala_text = re.findall(sinhala_pattern, text)
    tamil_text = re.findall(tamil_pattern, text)
    english_text = re.findall(english_pattern, text)

    return sinhala_text, tamil_text, english_text

In [39]:
# Example usage
pdf_path = 'slnb/2020-2029/SLNB 2022-01-12/SLNB-2022-06 V.60 June.pdf'
text = extract_text_from_pdf(pdf_path, start_page=47, end_page=51)
sinhala_text, tamil_text, english_text = extract_data(text)

In [40]:
# Print extracted text
print("Sinhala Text:", sinhala_text)
print("Tamil Text:", tamil_text)
print("English Text:", english_text)

Sinhala Text: []
Tamil Text: []
English Text: ['SRI', 'LANKA', 'NATIONAL', 'BIBLIOGRAPHY', 'SUBJECT', 'SECTION', 'COMPUTER', 'SCIENCE', 'SOCIAL', 'SCIENCES', 'INFORMATION', 'GENERAL', 'WORKS', 'Communication', 'Mysteries', 'Critical', 'perspectives', 'on', 'open', 'development', 'empirical', 'interrogation', 'of', 'theory', 'Razeen', 'Muhammad', 'construction', 'ed', 'Arul', 'Chib', 'Caitlin', 'M', 'The', 'wonders', 'of', 'the', 'world', 'Muhammad', 'Bentley', 'and', 'Matthew', 'L', 'Smith', 'England', 'Razeen', 'tr', 'by', 'A', 'C', 'M', 'Wabanbey', 'The', 'MIT', 'Press', 'xi', 'p', 'cm', 'Kurunegala', 'Author', 'p', 'The', 'MIT', 'press', 'international', 'development', 'photos', 'cm', 'research', 'centre', 'series', 'Ad', 'Bc', 'Rs', 'NL', 'Ad', 'Bc', 'Unpriced', 'NL', 'ISBN', 'ISBN', 'Curiosities', 'and', 'wonders', 'Economic', 'development', 'Information', 'commons', 'Information', 'society', 'Information', 'technology', 'Social', 'aspects', 'RELIGION', 'Open', 'source', 'software

RuntimeError: Directory 'static/' does not exist

In [61]:
import fitz

def extract_text_from_page(pdf_path, page_number):
    try:
        # Open the PDF file
        doc = fitz.open(pdf_path)

        # Get the specified page
        page = doc[page_number - 1]  # Page numbers start from 0, so subtract 1

        # Extract text from the page
        text = page.get_text()

        # Close the PDF document
        doc.close()

        return text
    except Exception as e:
        print("Error:", e)
        return None

# Specify the PDF file path and the page number you want to extract text from
pdf_path = 'slnb/2020-2029/SLNB 2022-01-12/SLNB-2022-06 V.60 June.pdf'
page_number = 47  # Change this to the desired page number

# Extract text from the specified page
text_from_page = extract_text_from_page(pdf_path, page_number)

# Print the extracted text
if text_from_page:
    print(text_from_page)
else:
    print("Failed to extract text from the specified page.")


SRI  LANKA NATIONAL  BIBLIOGRAPHY
SUBJECT  SECTION  
0  0  0    COMPUTER  SCIENCE,  
                      INFORMATION, GENERAL 
                WORKS
001.94 - Mysteries
Razeen, Muhammad 
The wonders of the world / Muhammad 
Razeen ; tr. by A. C. M. Wabanbey. - 
Kurunegala : Author, 2019. - 56 p. : 
photos ; 21 cm. 
Ad-Bc : Rs. 350.00 
(1097524 NL)
ISBN 978-955-43511-7-2  
1. Curiosities and wonders 
2 0 0   RELIGION
2 9 0  OTHER RELIGIONS
294.3435095493 - Sri Lanka 
Somasundara, J. W. D. 
The most sacred Sri Pada / J. W. D. 
Somasundara and H. M. Jayantha 
Wijeratna ; tr. by Sunil Wijeyesinghe. - 
Colombo : S. Godage, 2022. - 128 p. : 
photos ; 22 cm. 
Ad-Bc : Rs. 950.00 
(1104779  NL)
ISBN 978-624-00-1311-6  
(477668 NA)
1. Adam's Peak (Sri Lanka)
2. Mountains-Sri Lanka 
3 0 0   SOCIAL SCIENCES
303.4833 - Communication
 
Critical perspectives on open development : 
empirical interrogation of theory 
construction / ed.  Arul Chib, Caitlin M. 
Bentley and Matthew L. Smith. - England : 

In [62]:
import re
from tabulate import tabulate

# Define regular expressions for extracting information
pattern = r"(\d{1,3}\.\d{2})\s-\s([\w\s,.'-]+)\s\/\s([\w\s,.'-]+)\s;\s([\w\s,.'-]+)\s:\s([\w\s,.'-]+)\s-\s(\d{4})\.\s-\s(\d+\s\w+)\s:\s(\w+\s;\s\d+\s\w+)[\s\n]+(?:\(.*?\))?\sISBN\s(\d{3}-\d{1,7}-\d{1,7}-\d{1,7}-\d{1,7})\s*(.*?)\s(\d+\s\w+)"
headers = ["Classification", "Title", "Author", "Publisher", "Publication Year", "Page Count", "Size", "Price", "ISBN", "Topics"]

# Extract information using regular expressions
matches = re.findall(pattern, text_from_page, re.DOTALL)

# Format extracted data into a tabulated form
table_data = []
for match in matches:
    table_data.append(list(match))

# Print the tabulated data
print(tabulate(table_data, headers=headers))


Classification    Title    Author    Publisher    Publication Year    Page Count    Size    Price    ISBN    Topics
----------------  -------  --------  -----------  ------------------  ------------  ------  -------  ------  --------
