In [1]:
pip install PyPDF2 sentence-transformers faiss-cpu openai langchain pandas


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2, faiss-cpu
Successfully installed PyPDF2-3.0.1 faiss-cpu-1.9.0.post1


In [2]:
# Mount Google Drive
from google.colab import drive
import os
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Mount the drive
drive.mount('/content/drive')

# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            text += page.extract_text()
    return text

# Chunking the text into logical parts (e.g., sentences)
def chunk_text(text, chunk_size=500):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# Generate embeddings for chunks and store them
def generate_and_store_embeddings(files, embedding_model):
    chunk_metadata = []

    for file in files:
        text = extract_text_from_pdf(file)
        chunks = chunk_text(text)
        embeddings = embedding_model.encode(chunks)

        # Ensure embeddings are a 2D NumPy array
        embeddings = np.array(embeddings)
        if len(embeddings.shape) == 1:
            embeddings = embeddings.reshape(1, -1)

        # Debug: Check the shape of the embeddings
        print(f"Embedding shape for file {os.path.basename(file)}: {embeddings.shape}")

        # Initialize FAISS index after determining embedding dimension
        if not hasattr(generate_and_store_embeddings, "index"):
            embedding_dimension = embeddings.shape[1]
            generate_and_store_embeddings.index = faiss.IndexFlatL2(embedding_dimension)

        # Add embeddings to the FAISS index
        generate_and_store_embeddings.index.add(embeddings)

        # Store metadata (chunk text and file name)
        chunk_metadata.extend([(chunk, os.path.basename(file)) for chunk in chunks])

    return generate_and_store_embeddings.index, chunk_metadata


# Specify the folder in Google Drive containing the PDF files
pdf_folder = "/content/drive/MyDrive/Sithafal"  # Update with your folder path

# Check if folder exists and collect PDF files
if os.path.isdir(pdf_folder):
    pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
else:
    raise ValueError("Invalid folder path. Ensure the folder exists and contains PDF files.")

# Generate embeddings for the provided files
index, chunk_metadata = generate_and_store_embeddings(pdf_files, embedding_model)

# Output for verification
print(f"Processed {len(pdf_files)} PDF files.")
for metadata in chunk_metadata[:5]:  # Display a few chunks for verification
    print(f"File: {metadata[1]}, Chunk: {metadata[0][:100]}...")


  from tqdm.autonotebook import tqdm, trange


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding shape for file PDF Files.pdf: (1, 0)
Processed 1 PDF files.


In [None]:
!apt-get install tesseract-ocr
!pip install pytesseract


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 0s (9,881 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123633 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [None]:
import pytesseract
from PIL import Image
import pdfplumber
import os

# Function to extract text using OCR if text extraction fails
def extract_text_using_ocr(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            # Try to extract text first
            page_text = page.extract_text()
            if not page_text:  # If no text is found, use OCR
                print(f"Using OCR for page {pdf.pages.index(page) + 1}")
                # Convert the page to an image
                img = page.to_image()
                text += pytesseract.image_to_string(img.original)  # Apply OCR to image
            else:
                text += page_text
    return text

# Path to PDF directory
pdf_folder = "/content/drive/MyDrive/Sithafal"

# Extract text using OCR if necessary
file_path = os.path.join(pdf_folder, "PDF Files.pdf")
text = extract_text_using_ocr(file_path)
print("Extracted Text:", text[:500])  # Print the first 500 characters of the extracted text


Using OCR for page 1
Using OCR for page 2
Using OCR for page 3
Using OCR for page 4
Using OCR for page 5
Using OCR for page 6
Using OCR for page 7
Using OCR for page 8
Using OCR for page 9
Using OCR for page 10
Using OCR for page 11
Using OCR for page 12
Using OCR for page 13
Using OCR for page 14
Using OCR for page 15
Using OCR for page 16
Using OCR for page 17
Using OCR for page 18
Using OCR for page 19
Extracted Text:  

Tables, Charts, and
Graphs

with Examples from History, Economics,

Education, Psychology, Urban Affairs and
Everyday Life

REVISED: MICHAEL LOLKUS 2018
Earnings and Unemployment Rates by Educational Attainment

‘Unemployment rate in 2013 (%) Median weekly earnings in 2013 ($)

Doctoral degree
Professional degree
Master's degree
Bachelor's degree
Associate’s degree
Some college, no degree
High school diploma IC
Less than a high school diploma |Ea2Y

All workers: 6.1% All workers: $827
Source


In [None]:
!pip install camelot-py[cv]


Collecting camelot-py[cv]
  Downloading camelot_py-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting pypdf>=3.0.0 (from camelot-py[cv])
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting ghostscript>=0.7 (from camelot-py[cv])
  Downloading ghostscript-0.7-py2.py3-none-any.whl.metadata (4.4 kB)
INFO: pip is looking at multiple versions of camelot-py[cv] to determine which version is compatible with other requirements. This could take a while.
Collecting camelot-py[cv]
  Downloading camelot_py-0.10.1-py3-none-any.whl.metadata (8.3 kB)
  Downloading camelot_py-0.10.0-py3-none-any.whl.metadata (8.3 kB)
  Downloading camelot_py-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Downloading camelot_py-0.9.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: camelot-py
Successfully installed camelot-py-0.9.0


In [None]:
!pip install "pypdf2>3.0" # Downgrade PyPDF2 to a version less than 3.0.0



In [None]:
!pip install --upgrade PyPDF2
!pip install camelot-py[cv] --upgrade


Collecting camelot-py[cv]
  Using cached camelot_py-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting pypdf>=3.0.0 (from camelot-py[cv])
  Using cached pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting ghostscript>=0.7 (from camelot-py[cv])
  Using cached ghostscript-0.7-py2.py3-none-any.whl.metadata (4.4 kB)
INFO: pip is looking at multiple versions of camelot-py[cv] to determine which version is compatible with other requirements. This could take a while.
Collecting camelot-py[cv]
  Using cached camelot_py-0.10.1-py3-none-any.whl.metadata (8.3 kB)
  Using cached camelot_py-0.10.0-py3-none-any.whl.metadata (8.3 kB)


In [None]:
import pytesseract
from PIL import Image
import pdfplumber
import os

# Function to extract text using OCR if text extraction fails
def extract_text_using_ocr(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            # Try to extract text first
            page_text = page.extract_text()
            if not page_text:  # If no text is found, use OCR
                print(f"Using OCR for page {pdf.pages.index(page) + 1}")
                # Convert the page to an image
                img = page.to_image()
                text += pytesseract.image_to_string(img.original)  # Apply OCR to image
            else:
                text += page_text
    return text

# Path to PDF directory
pdf_folder = "/content/drive/MyDrive/Sithafal"

# Extract text using OCR if necessary
file_path = os.path.join(pdf_folder, "PDF Files.pdf")
text = extract_text_using_ocr(file_path)
print("Extracted Text:", text[:500])  # Print the first 500 characters of the extracted text


Using OCR for page 1
Using OCR for page 2
Using OCR for page 3
Using OCR for page 4
Using OCR for page 5
Using OCR for page 6
Using OCR for page 7
Using OCR for page 8
Using OCR for page 9
Using OCR for page 10
Using OCR for page 11
Using OCR for page 12
Using OCR for page 13
Using OCR for page 14
Using OCR for page 15
Using OCR for page 16
Using OCR for page 17
Using OCR for page 18
Using OCR for page 19
Extracted Text:  

Tables, Charts, and
Graphs

with Examples from History, Economics,

Education, Psychology, Urban Affairs and
Everyday Life

REVISED: MICHAEL LOLKUS 2018
Earnings and Unemployment Rates by Educational Attainment

‘Unemployment rate in 2013 (%) Median weekly earnings in 2013 ($)

Doctoral degree
Professional degree
Master's degree
Bachelor's degree
Associate’s degree
Some college, no degree
High school diploma IC
Less than a high school diploma |Ea2Y

All workers: 6.1% All workers: $827
Source


In [None]:
!pip uninstall PyPDF2 -y

Found existing installation: PyPDF2 3.0.1
Uninstalling PyPDF2-3.0.1:
  Successfully uninstalled PyPDF2-3.0.1


In [None]:
!pip install PyPDF2==2.11.0

Collecting PyPDF2==2.11.0
  Downloading PyPDF2-2.11.0-py3-none-any.whl.metadata (6.0 kB)
Downloading PyPDF2-2.11.0-py3-none-any.whl (220 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/220.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m153.6/220.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.2/220.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-2.11.0


In [None]:
import camelot
import PyPDF2
import os

# Extract tables using Camelot
def extract_tables_using_camelot(file_path):
    # Open the PDF using PyPDF2's PdfReader
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)  # Use PdfReader instead of PdfFileReader
        number_of_pages = len(reader.pages)

    # Now use camelot to read the PDF, specifying the number of pages
    tables = camelot.read_pdf(file_path, pages=f'1-{number_of_pages}', flavor='stream')
    print(f"Found {len(tables)} tables in the PDF.")
    return tables

# ... (rest of your code remains the same)
pdf_folder = "/content/drive/MyDrive/Sithafal"
# Path to the PDF file
file_path = os.path.join(pdf_folder, "PDF Files.pdf")
tables = extract_tables_using_camelot(file_path)
for i, table in enumerate(tables):
    print(f"Table {i+1}:\n", table.df)

# Path to the PDF file (repeating the same process)
file_path = os.path.join(pdf_folder, "PDF Files.pdf")
tables = extract_tables_using_camelot(file_path)
for i, table in enumerate(tables):
    print(f"Table {i+1}:\n", table.df)



Found 0 tables in the PDF.




Found 0 tables in the PDF.


