In [2]:
!pip install tabula-py

Collecting tabula-py
  Downloading tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Collecting distro (from tabula-py)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Downloading tabula_py-2.10.0-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m107.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, tabula-py
Successfully installed distro-1.9.0 tabula-py-2.10.0


In [60]:
import tabula
import pandas as pd
import requests
import json
import tempfile
import re

def clean_table_data(tables):
    """
    Cleans the extracted tables by removing empty columns and rows, translating
    dot symbols into text labels, and blanking out remaining missing values.

    Args:
    tables (list of DataFrame): List of DataFrames representing extracted tables.

    Returns:
    list of DataFrame: Cleaned tables. Tables left with fewer than two columns
    are dropped from the result.
    """
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated. Check the class rather than an instance so a column
    # that happens to be called "map" cannot be mistaken for the method.
    elementwise_method = "map" if hasattr(pd.DataFrame, "map") else "applymap"

    cleaned_tables = []

    for table in tables:
        # Drop completely empty columns, then rows with all NaN values
        table = table.dropna(axis=1, how='all').dropna(how='all')

        # Translate dot symbols cell by cell (non-dot cells pass through)
        table = getattr(table, elementwise_method)(map_dots_to_yes_no)

        # Fill remaining NaN with empty strings
        table = table.fillna('')

        # Filter out tables with insufficient data (fewer than 2 columns)
        if len(table.columns) > 1:
            cleaned_tables.append(table)

    return cleaned_tables

def extract_table_from_url_to_json_with_auto_cleanup(pdf_url, pages='all'):
    """
    Downloads a PDF, extracts its tables with tabula, cleans them, and returns
    them as text, using a temporary file that is automatically deleted.

    Despite the function name, a successful result is NOT JSON: it is the
    DataFrame.to_string() rendering of each cleaned table, separated by blank
    lines. Only failures are reported as a JSON object with an "error" key.

    Args:
    pdf_url (str): URL to the PDF file.
    pages (str or int): Pages from which tables are extracted (passed through to
        tabula.read_pdf). 'all' for all pages or specific page numbers.

    Returns:
    str: Text rendering of the cleaned tables, or a JSON string of the form
    {"error": "..."} if the download, extraction, or cleaning fails.
    """
    try:
        # Download the PDF file from the URL. The timeout keeps a stalled server
        # from hanging the notebook; a timeout is reported by the except below.
        response = requests.get(pdf_url, timeout=30)
        if response.status_code != 200:
            return json.dumps({"error": f"Failed to download PDF, status code: {response.status_code}"}, indent=2)

        # Create a secure temporary file that is automatically deleted upon closing
        with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_pdf:
            temp_pdf.write(response.content)
            temp_pdf.flush()  # Ensure all data is written to the file before reading

            # Extract tables from the PDF using the temporary file
            tables = tabula.read_pdf(temp_pdf.name, pages=pages, multiple_tables=True, lattice=True)

        # The tables are in memory now, so the temporary file is no longer needed.
        if not tables:
            return json.dumps({"error": "No tables found in the PDF."}, indent=2)

        # Clean the extracted tables (this also drops tables that are too narrow)
        cleaned_tables = clean_table_data(tables)
        if not cleaned_tables:
            # Every extracted table was filtered out; say so instead of
            # returning an empty string.
            return json.dumps({"error": "No usable tables remained after cleaning."}, indent=2)

        # Convert the cleaned list of DataFrames to a string format
        return "\n\n".join(table.to_string(index=False) for table in cleaned_tables)

    except Exception as e:
        # Best-effort API: any failure is reported to the caller as a JSON error.
        return json.dumps({"error": str(e)}, indent=2)

# Example usage
# NOTE: despite the variable name, a successful call returns plain text
# (DataFrame.to_string output); only error results are JSON strings.
pdf_url = 'https://pirls2021.org/wp-content/uploads/2022/11/Exhibit-3-Official-Languages-and-Languages-of-Instruction.pdf'
json_output = extract_table_from_url_to_json_with_auto_cleanup(pdf_url, pages='all')  # Extract tables from all pages
print(json_output)

                                                     Country                                                         Unnamed: 0 Official Languages and Major Language Subgroups                                                                                                                                                                                 Unnamed: 1                                                 Languages of Instruction for Reading in the Fourth\rGrade                                                           Unnamed: 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

Exhibit 3 - Official Languages and Languages of Instruction: https://pirls2021.org/wp-content/uploads/2022/11/Exhibit-3-Official-Languages-and-Languages-of-Instruction.pdf
(good) Exhibit 4 - Status of the Fourth Grade Language/Reading Curriculum: https://pirls2021.org/wp-content/uploads/2022/11/Exhibit-4-Status-of-the-Fourth-Grade-Reading-Curriculum.pdf  
Exhibit 5 - Purposes for Reading Emphasized in the Intended Language/Reading Curriculum: https://pirls2021.org/wp-content/uploads/2022/11/Exhibit%205%20Purposes%20for%20Reading%20Emphasized%20in%20the%20Intended%20Language%20Reading%20Curriculum.pdf  
Exhibit 6 - Digital Reading Skills Emphasized in the Intended Language/Reading Curriculum: https://pirls2021.org/wp-content/uploads/2022/11/Exhibit%206%20Digital%20Reading%20Skills%20Emphasized%20in%20the%20Intended%20Reading%20Curriculum.pdf  
(good) Exhibit 7 - Policies/Statements about Digital Literacy in the Language/Reading Curriculum: https://pirls2021.org/wp-content/uploads/2022/11/Exhibit-7-Policies-About-Digital-Literacy-in-the-Reading-Curriculum.pdf

In [54]:
def map_dots_to_yes_no(value):
    """
    Maps a table cell that consists of a single dot symbol to 'yes', 'no',
    or 'some emphasis'.

    Only cells whose entire content (ignoring surrounding whitespace) is one of
    the recognised symbols are translated. Text that merely starts with such a
    symbol, e.g. a bulleted entry like '• English', is returned unchanged.

    Args:
    value: The table cell value to check. Non-string values pass through.

    Returns:
    'yes' for filled dots, 'no' for empty dots, 'some emphasis' for emphasis
    symbols, or the original value if it is not a lone dot symbol.
    """
    if not isinstance(value, str):
        return value

    symbol = value.strip()

    # Filled dots (e.g., '•', '●', '⚫') mean 'yes'
    if symbol in {'•', '●', '⚫', '\u26ab'}:
        return 'yes'
    # Empty dots (e.g., '◦', '·', '⚪') mean 'no'
    if symbol in {'◦', '·', '⚪', '\u26aa'}:
        return 'no'
    # Symbols with emphasis (e.g., '⨀', '◉', '⊚') mean 'some emphasis'
    if symbol in {'◉', '⊚', '⨀'}:
        return 'some emphasis'
    return value

In [40]:
pages = 'all'
# Bound the request so a stalled server cannot hang the notebook.
response = requests.get(pdf_url, timeout=30)
if response.status_code != 200:
    # Abort here; otherwise the error response body would be written to disk
    # and parsed as if it were a PDF.
    raise RuntimeError(f"Failed to download PDF, status code: {response.status_code}")

# Create a secure temporary file that is automatically deleted upon closing
with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_pdf:
    temp_pdf.write(response.content)
    temp_pdf.flush()  # Ensure all data is written to the file before reading

    # Extract tables from the PDF using the temporary file
    tables = tabula.read_pdf(temp_pdf.name, pages=pages, multiple_tables=True, lattice=True)
    # Clean the extracted tables (drops empty rows/columns and maps dot symbols)
    cleaned_tables = clean_table_data(tables)

In [47]:
# NOTE(review): extract_table_with_camelot is not defined in any cell visible
# in this notebook, so on a fresh kernel this call would raise NameError unless
# the defining cell is restored — confirm whether this cell should be kept.
extract_table_with_camelot(pdf_url, pages='all')

'Error: PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead.'

In [48]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdfium2, pdfminer.six, pdfplumber
  Attempting uninstall: pdfminer.six
    Found existing installation: pdfminer.six 20240706
    Uninstalling pdfmine

In [51]:
import pdfplumber
import pandas as pd
import requests
import tempfile

def extract_table_with_pdfplumber(pdf_url, pages='all'):
    """
    Downloads a PDF and extracts tables from it using pdfplumber.

    page.extract_table() is called once per selected page, so at most one table
    per page is returned. The first extracted row is used as the column header.

    Args:
    pdf_url (str): URL to the PDF file.
    pages (str, int, or iterable): 'all' for all pages, a single 1-based page
        number (1 or '1'), a comma-separated string of page numbers ('1,3'),
        or an iterable of 1-based page numbers.

    Returns:
    str: String representation of the extracted tables, or an error message
    starting with "Error:".
    """
    try:
        # Download the PDF. The timeout keeps a stalled server from hanging the
        # notebook; a timeout is reported through the except clause below.
        response = requests.get(pdf_url, timeout=30)
        if response.status_code != 200:
            return f"Error: Failed to download PDF, status code: {response.status_code}"

        # Create a secure temporary file to store the PDF
        with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_pdf:
            temp_pdf.write(response.content)
            temp_pdf.flush()

            # Open the PDF using pdfplumber
            with pdfplumber.open(temp_pdf.name) as pdf:
                page_count = len(pdf.pages)

                if pages == 'all':
                    page_indices = range(page_count)
                else:
                    if isinstance(pages, str):
                        requested = pages.split(',')
                    elif isinstance(pages, int):
                        requested = [pages]
                    else:
                        requested = list(pages)

                    page_indices = []
                    for page_number in requested:
                        index = int(page_number) - 1
                        # Reject out-of-range pages explicitly: page 0 would
                        # otherwise become index -1 and silently read the LAST page.
                        if not 0 <= index < page_count:
                            raise ValueError(
                                f"Page {page_number} is out of range (PDF has {page_count} page(s))."
                            )
                        page_indices.append(index)

                # Iterate through the selected pages and extract tables
                table_data = []
                for index in page_indices:
                    rows = pdf.pages[index].extract_table()

                    if rows:
                        # First row is the header, the rest are data rows
                        table_data.append(pd.DataFrame(rows[1:], columns=rows[0]))

                if not table_data:
                    return "Error: No tables found in the PDF."

                # Convert the tables to a single string for output
                tables_str = "\n\n".join(table.to_string(index=False) for table in table_data)

            return tables_str

    except Exception as e:
        # Best-effort API: any failure is reported to the caller as a string.
        return f"Error: {str(e)}"

# Example usage
# NOTE: this reassigns the notebook-level pdf_url that other cells also read.
pdf_url = 'https://pirls2021.org/wp-content/uploads/2022/11/Exhibit%205%20Purposes%20for%20Reading%20Emphasized%20in%20the%20Intended%20Language%20Reading%20Curriculum.pdf'
table_string = extract_table_with_pdfplumber(pdf_url, pages='1')  # You can specify 'all' or a specific (1-based) page number
print(table_string)


Country                                                     None None Reading to Improve\nReading Skills and\nComprehension None None Reading for Literary\nExperience None None Reading to Acquire\nInformation None None Reading for Enjoyment None None
                                                         Albania                                                                                                                                                                                      
                                                       Australia                                                                                                                                                                                      
                                                         Austria                                                                                                                                                                                   

In [53]:
import pdfplumber
import pandas as pd
import requests
import tempfile
import re

# Function to map different circle types
def map_dots_to_values(value):
    """
    Maps a table cell that consists of a single circle symbol (filled, empty,
    or semi-filled) to a specific value.

    Only cells whose entire content (ignoring surrounding whitespace) is one of
    the recognised symbols are translated. Text that merely starts with such a
    symbol is returned unchanged.

    Args:
    value: The table cell value to check. Non-string values pass through.

    Returns:
    'yes', 'no', 'maybe', or the original value if it's not a lone circle.
    """
    if not isinstance(value, str):
        return value

    symbol = value.strip()

    # Filled circles (e.g., '⚫', '●') mean 'yes'
    if symbol in {'⚫', '●'}:
        return 'yes'
    # Empty circles (e.g., '⚪', '◦') mean 'no'
    if symbol in {'⚪', '◦'}:
        return 'no'
    # Semi-filled or other circle types mean 'maybe'
    if symbol in {'⊙', '⦾'}:  # Add any semi-filled or other specific types here
        return 'maybe'
    return value

# Function to extract and process table data
def extract_table_with_pdfplumber_and_three_dots(pdf_url, pages='all'):
    """
    Downloads a PDF, extracts tables using pdfplumber, and maps circle symbols
    in every cell through map_dots_to_values.

    page.extract_table() is called once per selected page, so at most one table
    per page is returned. The first extracted row is used as the column header.

    Args:
    pdf_url (str): URL to the PDF file.
    pages (str, int, or iterable): 'all' for all pages, a single 1-based page
        number (1 or '1'), a comma-separated string of page numbers ('1,3'),
        or an iterable of 1-based page numbers.

    Returns:
    str: Processed table data with mapped circle values, or an error message
    starting with "Error:".
    """
    try:
        # Download the PDF. The timeout keeps a stalled server from hanging the
        # notebook; a timeout is reported through the except clause below.
        response = requests.get(pdf_url, timeout=30)
        if response.status_code != 200:
            return f"Error: Failed to download PDF, status code: {response.status_code}"

        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
        # old name is deprecated; check the class so a column named "map" cannot
        # be mistaken for the method.
        elementwise_method = "map" if hasattr(pd.DataFrame, "map") else "applymap"

        # Create a secure temporary file to store the PDF
        with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_pdf:
            temp_pdf.write(response.content)
            temp_pdf.flush()

            # Open the PDF using pdfplumber
            with pdfplumber.open(temp_pdf.name) as pdf:
                page_count = len(pdf.pages)

                if pages == 'all':
                    page_indices = range(page_count)
                else:
                    if isinstance(pages, str):
                        requested = pages.split(',')
                    elif isinstance(pages, int):
                        requested = [pages]
                    else:
                        requested = list(pages)

                    page_indices = []
                    for page_number in requested:
                        index = int(page_number) - 1
                        # Reject out-of-range pages explicitly: page 0 would
                        # otherwise become index -1 and silently read the LAST page.
                        if not 0 <= index < page_count:
                            raise ValueError(
                                f"Page {page_number} is out of range (PDF has {page_count} page(s))."
                            )
                        page_indices.append(index)

                # Iterate through the selected pages and extract tables
                table_data = []
                for index in page_indices:
                    rows = pdf.pages[index].extract_table()

                    if rows:
                        # First row is the header, the rest are data rows
                        df = pd.DataFrame(rows[1:], columns=rows[0])
                        # Apply the circle mapping to every cell
                        table_data.append(getattr(df, elementwise_method)(map_dots_to_values))

                if not table_data:
                    return "Error: No tables found in the PDF."

                # Convert the tables to a single string for output
                tables_str = "\n\n".join(table.to_string(index=False) for table in table_data)

            return tables_str

    except Exception as e:
        # Best-effort API: any failure is reported to the caller as a string.
        return f"Error: {str(e)}"

# Example usage
# NOTE: this reassigns the notebook-level pdf_url and table_string names.
pdf_url = 'https://pirls2021.org/wp-content/uploads/2022/11/Exhibit%205%20Purposes%20for%20Reading%20Emphasized%20in%20the%20Intended%20Language%20Reading%20Curriculum.pdf'
table_string = extract_table_with_pdfplumber_and_three_dots(pdf_url, pages='1')  # pages is 'all' or a 1-based page number
print(table_string)

Country                                                     None None Reading to Improve\nReading Skills and\nComprehension None None Reading for Literary\nExperience None None Reading to Acquire\nInformation None None Reading for Enjoyment None None
                                                         Albania                                                                                                                                                                                      
                                                       Australia                                                                                                                                                                                      
                                                         Austria                                                                                                                                                                                   