In [None]:
!pip install pdfplumber pandas pdf2image pytesseract pathlib langchain regex pypdf python-dotenv typing pinecone-client google-colab uuid glob PyMuPDF Pillow

In [None]:
# Redundant: pdfplumber is already listed in the first install cell.
!pip install pdfplumber

In [None]:
# Installs pdf2image, PyMuPDF (imported below as `fitz`), the `pinecone` SDK and pypdf.
!pip install pdf2image PyMuPDF pinecone pypdf

Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pinecone
  Downloading pinecone-7.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pypdf
  Using cached pypdf-5.6.0-py3-none-any.whl.metadata (7.2 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone-7.2.0-py3-none-any.whl (524 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pdfplumber
import pandas as pd
from pdf2image import convert_from_path # Not directly used, but imported
#import pytesseract # Not directly used, but imported
from pathlib import Path # Not directly used, but imported
from langchain.schema import Document
import re
import os # Not directly used, but imported
import platform # Not directly used, but imported
import fitz  # PyMuPDF
from typing import List, Dict, Any
from pinecone import Pinecone
from google.colab import userdata
from pypdf import PdfReader
import uuid
from langchain.text_splitter import RecursiveCharacterTextSplitter # Import text splitter
import time
import glob

def extract_section_header(text: str) -> str:
    """Return the numbered heading that opens ``text``, or "" when there is none.

    A heading is only recognised at the very start of ``text``: a run of digits
    and dots, whitespace, an upper-case ASCII letter, then the rest of that line.
    """
    header_match = re.match(r'^([\d\.]+\s+[A-Z][^\n]+)', text)
    if header_match is None:
        return ""
    return header_match.group(1)

def process_table(table: List[List[str]], page_num: int, section: str) -> "Document | None":
    """Turn one extracted table into a searchable Document.

    Args:
        table: Rows of cell values; the first row is used as the column headers.
        page_num: Page number stored in the metadata and used in error messages.
        section: Section heading recorded in the content and the metadata.

    Returns:
        A Document whose ``page_content`` describes the table (headers, shape,
        full dump and, when available, ``describe()`` statistics), or ``None``
        when the table is empty or any step fails (the error is printed).
    """
    try:
        # Nothing to index for a missing table or one without a header row.
        if not table or not table[0]:
            return None

        df = pd.DataFrame(table[1:], columns=table[0])

        # Create searchable content
        page_content = f"""
        Section: {section}
        Table Contents:
        Column Headers: {', '.join(str(col) for col in df.columns)}
        Summary: Table with {df.shape[0]} rows and {df.shape[1]} columns
        Data Preview: {df.to_string()}
        """

        # The statistical summary is best-effort: skip it if describe() fails,
        # but do not swallow KeyboardInterrupt/SystemExit as a bare except would.
        try:
            numerical_summary = df.describe().to_string()
            page_content += f"\nStatistical Summary: {numerical_summary}"
        except Exception:
            pass

        metadata = {
            'type': 'table',
            'page_number': page_num,
            'section': section,
            'column_headers': list(df.columns),
            'row_count': len(df),
            'raw_data': df.to_dict(),
            'content_type': 'structured_data'
        }

        return Document(page_content=page_content, metadata=metadata)

    except Exception as e:
        print(f"Error processing table on page {page_num}: {e}")
        return None

def extract_and_process_table_content(pdf_path: str) -> List[Document]:
    """Extract every table in a PDF as a Document, tagged with its section.

    Pages are visited in order. The text of each page is only used to detect a
    section heading (via ``extract_section_header``); the most recent heading
    seen is attached to every table found from that page onwards.

    Args:
        pdf_path: Path of the PDF opened with pdfplumber.

    Returns:
        The Documents produced by ``process_table``; tables for which it
        returns ``None`` are skipped.

    Raises:
        Exception: Any error raised while opening or walking the PDF is
            printed and then re-raised unchanged.
    """
    documents = []
    current_section = ""

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                print(f"Processing page {page_num}...")

                # extract_text() can return None; normalise to "" for the regex.
                text = page.extract_text() or ""

                # A new heading on this page replaces the running section.
                section_header = extract_section_header(text)
                if section_header:
                    current_section = section_header

                for table in page.extract_tables():
                    doc = process_table(table, page_num, current_section)
                    if doc:
                        documents.append(doc)

    except Exception as e:
        # Report under this function's real name so the log line is traceable.
        print(f"Error in extract_and_process_table_content: {e}")
        raise

    return documents

def is_bold(span):
    """Return True when the span's font name contains "bold" in any letter case."""
    font_name = span["font"]
    # A case-insensitive test already covers the exact-case "Bold" spelling.
    return "bold" in font_name.lower()

def _first_bold_figure_id(text_dict, figure_pattern):
    """Return the id of the first bold "Figure X-Y" caption in a page dict, else None."""
    for block in text_dict["blocks"]:
        # Blocks without a "lines" entry carry no text to inspect.
        for line in block.get("lines", []):
            spans = line["spans"]
            line_text = "".join(span["text"] for span in spans).strip()
            match_fig = figure_pattern.match(line_text)
            # The caption counts only when its first span is set in a bold font.
            if match_fig and is_bold(spans[0]):
                return f"figure_{match_fig.group(1)}_{match_fig.group(2)}"
    return None


def find_bold_figure_pages(pdf_path):
    """Locate pages that carry a bold "Figure X-Y" caption.

    Each page is scanned line by line; a line qualifies when it starts with
    ``Figure <digits>-<digits>`` (case-insensitive) and its first span is bold
    according to ``is_bold``. At most one caption (the first found) is recorded
    per page, so a page never appears twice in the results.

    Args:
        pdf_path: Path of the PDF opened with PyMuPDF.

    Returns:
        A tuple ``(figure_pages, figure_number, figure_dict)`` where
        ``figure_pages`` lists 0-based page indices, ``figure_number`` lists the
        matching ids (``"figure_X_Y"``) in the same order, and ``figure_dict``
        maps each page index to its id.
    """
    figure_pattern = re.compile(r'^Figure\s+(\d+)-(\d+)', re.IGNORECASE)
    figure_pages = []
    figure_number = []
    figure_dict = {}

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            text_dict = doc[page_num].get_text("dict")
            fig_id = _first_bold_figure_id(text_dict, figure_pattern)
            if fig_id is None:
                continue
            figure_pages.append(page_num)
            figure_number.append(fig_id)
            figure_dict[page_num] = fig_id
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()

    return figure_pages, figure_number, figure_dict



def split_single_page_pdf(input_pdf_path: str, page_number_to_split: int, output_pdf_path: str):
    """
    Copies one page of a PDF into a new single-page PDF.

    Args:
        input_pdf_path: Path to the input PDF file.
        page_number_to_split: Index of the page to copy. It is used directly as
            a 0-based PyMuPDF page index (no 1-based conversion is applied), so
            the first page is 0.
        output_pdf_path: Path to save the output PDF file (containing the single page).

    Returns:
        None. Success, an out-of-range index, and any error are reported with
        ``print``; no exception is propagated to the caller.
    """
    try:
        doc = fitz.open(input_pdf_path)
        try:
            page_index_to_split = page_number_to_split

            if not 0 <= page_index_to_split < doc.page_count:
                print(f"Error: Page number {page_number_to_split} is out of range for the PDF (total pages: {doc.page_count}).")
                return

            # fitz.open() without arguments creates a new, empty PDF.
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=page_index_to_split, to_page=page_index_to_split)
                new_doc.save(output_pdf_path)
            finally:
                # Close the output document even when insert/save fails.
                new_doc.close()

            print(f"Page {page_number_to_split} successfully split and saved to {output_pdf_path}")
        finally:
            # Always release the input document, including on error paths.
            doc.close()

    except FileNotFoundError:
        print(f"Error: Input PDF file not found at {input_pdf_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


def extract_figures(pdf_path, output_folder, dpi=200):
    """Export every page that carries a bold figure caption as a PDF and a PNG.

    For each page reported by ``find_bold_figure_pages`` two files are written
    to ``output_folder``, both named after the 1-based page number:
    ``<page>.pdf`` (the single page, via ``split_single_page_pdf``) and
    ``<page>.png`` (the page rendered at ``dpi``).

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory for the exported files; created if missing.
        dpi: Resolution of the rendered PNG.
    """
    os.makedirs(output_folder, exist_ok=True)

    # find_bold_figure_pages returns (pages, figure ids, {page: figure id});
    # the dict keys give each figure page exactly once.
    _figure_pages, _figure_ids, figure_dict = find_bold_figure_pages(pdf_path)

    # Scale factor relative to 72 dpi (assumed PyMuPDF base resolution).
    zoom = dpi / 72

    doc = fitz.open(pdf_path)
    try:
        for page_num in figure_dict:
            page_stem = str(page_num + 1)  # page_num is 0-based; file names are 1-based

            output_pdf_path = os.path.join(output_folder, f"{page_stem}.pdf")
            split_single_page_pdf(pdf_path, page_num, output_pdf_path)

            pix = doc[page_num].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            pix.save(os.path.join(output_folder, f"{page_stem}.png"))
    finally:
        doc.close()




In [None]:
from google.colab import userdata

In [None]:
# NOTE: load_links_from_csv is defined in a later cell of this notebook; that cell
# must be executed before this one, otherwise this line raises NameError.
links_dict = load_links_from_csv('/content/link_doc.csv')

In [None]:
# Pinecone client setup shared by the upsert and retrieval cells.
# Read the Pinecone API key from the Colab user-data entry 'PINECONE_API_SHIVAM'.
PINECONE_API_KEY = userdata.get('PINECONE_API_SHIVAM')
pc = Pinecone(api_key=PINECONE_API_KEY)
# Name of the index that pipecone_pdf_upsert creates (if missing) and writes to.
index_name = "multilinguale5"


def _read_pdf_text(pdf_path: str):
    """Return the concatenated text of all pages of ``pdf_path``, or None if reading fails."""
    try:
        reader = PdfReader(pdf_path)
        # Guard against pages that yield no text so the join never sees None.
        return "".join((page.extract_text() or "") for page in reader.pages)
    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
    except Exception as e:
        print(f"Error reading PDF: {e}")
    return None


def pipecone_pdf_upsert(pdf_path: str):
    """Chunk the text of one PDF and upsert the chunks into the Pinecone index.

    The file's base name (without ``.pdf``) prefixes the record ids, and its
    first space-separated word, lower-cased, is the key used to look up the
    document link in the notebook-level ``links_dict``. The index named by
    ``index_name`` is created on first use with the ``multilingual-e5-large``
    integrated embedding model. Each record carries the chunk text, the first
    500 characters of the document (``doc_detail``) and the link (``doc_link``).

    Args:
        pdf_path: Path of the PDF to ingest.

    Raises:
        KeyError: If ``links_dict`` has no entry for this file's key.

    Returns:
        None. Progress and errors are printed. If the PDF cannot be read or
        contains no extractable text, nothing is upserted.
    """
    file_name = os.path.basename(pdf_path).removesuffix('.pdf')
    print(file_name)

    simplified_key = file_name.split(' ')[0].lower()
    doc_link = links_dict[simplified_key]
    print(f"{file_name} = {doc_link}")

    # Create the index on first use; the "text" field of each record is embedded.
    if not pc.has_index(index_name):
        pc.create_index_for_model(
            name=index_name,
            cloud="aws",
            region="us-east-1",
            embed={
                "model": "multilingual-e5-large",
                "field_map": {"text": "text"}
            }
        )

    index = pc.Index(index_name)

    pdf_text = _read_pdf_text(pdf_path)
    if pdf_text is None:
        # The read error has already been reported; there is nothing to slice or chunk.
        return
    if not pdf_text:
        print("No data to upsert after processing PDF.")
        return

    doc_intro = pdf_text[:500]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,   # characters per chunk
        chunk_overlap=200  # characters shared between neighbouring chunks
    )
    chunks = text_splitter.split_text(pdf_text)

    # One record per chunk; ids are "<file_name>_<chunk index>".
    data_to_upsert = [
        {
            "id": f"{file_name}_{chunk_index}",
            "text": chunk,
            "doc_detail": doc_intro,
            "doc_link": doc_link,
        }
        for chunk_index, chunk in enumerate(chunks)
    ]

    if not data_to_upsert:
        print("No data to upsert after processing PDF.")
        return

    batch_size = 50
    total_batches = (len(data_to_upsert) + batch_size - 1) // batch_size
    for start in range(0, len(data_to_upsert), batch_size):
        index.upsert_records(
            namespace="example-namespace",
            records=data_to_upsert[start:start + batch_size]
        )
        print(f"Upserted batch {start // batch_size + 1}/{total_batches}")
        time.sleep(10)  # pause between requests (presumably to stay under rate limits)

    print(f"Finished upserting data from {pdf_path} to Pinecone index.")

    # --- End of new code ---


In [None]:
# List every PDF under /content together with the link resolved from links_dict.
# Figure extraction (extract_figures) is not invoked in this cell.
pdf_files = glob.glob('/content/*.pdf')
i = 0
for i, current_pdf_path in enumerate(pdf_files, start=1):
    file_name = os.path.basename(current_pdf_path).removesuffix('.pdf')
    # Lookup key: first space-separated word of the file name, lower-cased.
    simplified_key = file_name.split(' ')[0].lower()
    print(f"{i},{file_name} = {links_dict[simplified_key]}")

1,swru616b = https://www.ti.com/lit/pdf/SWRU616
2,awr2944p = https://www.ti.com/lit/pdf/SWRS318
3,drv8000-q1 = https://www.ti.com/lit/pdf/SLVSH22
4,drv8334 = https://www.ti.com/lit/pdf/SLVSHC7
5,iwrl6432w datasheet = https://www.ti.com/lit/pdf/SWRS311
6,slvaff1 = https://www.ti.com/lit/pdf/SLVAFF1
7,mmwave_sensors = https://www.ti.com/lit/pdf/SPRADM3
8,spradm3 = https://www.ti.com/lit/pdf/SPRADM3
9,awrl6844 = https://www.ti.com/lit/pdf/SWRS325
10,spradn9 = https://www.ti.com/lit/pdf/SPRADN9
11,slvucz2 = https://www.ti.com/lit/pdf/SLVUCZ2
12,f29h850tu = https://www.ti.com/lit/pdf/SPRSP93
13,am263p4_datasheet = https://www.ti.com/lit/pdf/SPRSP81
14,drv8462 = https://www.ti.com/lit/pdf/SLOSE79
15,spradd8 = https://www.ti.com/lit/pdf/SPRADD8
16,drv8889-q1 = https://www.ti.com/lit/ds/symlink/drv8889-q1.pdf
17,xWRLx432 Technical Ref Manusl = https://www.ti.com/lit/pdf/SWRU599
18,spru566t = https://www.ti.com/lit/pdf/SPRU566
19,awr2544 = https://www.ti.com/lit/pdf/SWRS314
20,drv8242-q1 = http

In [None]:
# Upsert every PDF under /content into Pinecone, printing a running file count.
i = 0
pdf_files = glob.glob('/content/*.pdf')
for i, current_pdf_path in enumerate(pdf_files, start=1):
    print(f"File no = {i}")
    print(current_pdf_path)
    pipecone_pdf_upsert(current_pdf_path)

File no = 1
/content/swru616b.pdf
swru616b
swru616b = https://www.ti.com/lit/pdf/SWRU616
Upserted batch 1/2
Upserted batch 2/2
Finished upserting data from /content/swru616b.pdf to Pinecone index.
File no = 2
/content/awr2944p.pdf
awr2944p
awr2944p = https://www.ti.com/lit/pdf/SWRS318
Upserted batch 1/4
Upserted batch 2/4
Upserted batch 3/4
Upserted batch 4/4
Finished upserting data from /content/awr2944p.pdf to Pinecone index.
File no = 3
/content/drv8000-q1.pdf
drv8000-q1
drv8000-q1 = https://www.ti.com/lit/pdf/SLVSH22
Upserted batch 1/8
Upserted batch 2/8
Upserted batch 3/8
Upserted batch 4/8
Upserted batch 5/8
Upserted batch 6/8
Upserted batch 7/8
Upserted batch 8/8
Finished upserting data from /content/drv8000-q1.pdf to Pinecone index.
File no = 4
/content/drv8334.pdf
drv8334
drv8334 = https://www.ti.com/lit/pdf/SLVSHC7
Upserted batch 1/4
Upserted batch 2/4
Upserted batch 3/4
Upserted batch 4/4
Finished upserting data from /content/drv8334.pdf to Pinecone index.
File no = 5
/conte

In [None]:
import os

def load_links_from_csv(csv_filepath: str) -> dict:
    """
    Build a ``{simplified name: link}`` dictionary from a two-column CSV file.

    The first line of the file is discarded as a header. Every other non-blank
    line is split at its first comma into a name and a link. The dictionary key
    is the name's first space-separated word in lower case; the value is the
    link with surrounding whitespace removed. Lines without a comma are
    reported and skipped. If the file is missing or any other error occurs, a
    message is printed and an empty dictionary is returned.
    """
    mapping = {}
    try:
        with open(csv_filepath, 'r') as handle:
            # Consume the header line (a no-op on an empty file).
            next(handle, None)

            for raw_line in handle:
                record = raw_line.strip()
                if not record:
                    continue

                name_part, comma, link_part = record.partition(',')
                if not comma:
                    print(f"Warning: Skipping line with unexpected format: {record}")
                    continue

                key = name_part.strip().split(' ')[0].lower()
                mapping[key] = link_part.strip()
    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_filepath}")
        return {}
    except Exception as e:
        print(f"An error occurred while reading the CSV: {e}")
        return {}
    return mapping

In [None]:
# Inspect the simplified-name -> link mapping loaded from the CSV.
print(links_dict)

{'am263p4_datasheet': 'https://www.ti.com/lit/pdf/SPRSP81', 'awr2544': 'https://www.ti.com/lit/pdf/SWRS314', 'awr2944p': 'https://www.ti.com/lit/pdf/SWRS318', 'awrl6844': 'https://www.ti.com/lit/pdf/SWRS325', 'drv8000-q1': 'https://www.ti.com/lit/pdf/SLVSH22', 'drv8242-q1': 'https://www.ti.com/lit/pdf/SLVSGY7', 'drv8334': 'https://www.ti.com/lit/pdf/SLVSHC7', 'drv8462': 'https://www.ti.com/lit/pdf/SLOSE79', 'drv8889-q1': 'https://www.ti.com/lit/ds/symlink/drv8889-q1.pdf', 'f29h850tu': 'https://www.ti.com/lit/pdf/SPRSP93', 'iwrl6432w': 'https://www.ti.com/lit/pdf/SWRS311', 'mmwave_sensors': 'https://www.ti.com/lit/pdf/SPRADM3', 'slvaei3': 'https://www.ti.com/lit/an/slvaei3/slvaei3.pdf', 'slvaff1': 'https://www.ti.com/lit/pdf/SLVAFF1', 'slvucz2': 'https://www.ti.com/lit/pdf/SLVUCZ2', 'spradd8': 'https://www.ti.com/lit/pdf/SPRADD8', 'spradm3': 'https://www.ti.com/lit/pdf/SPRADM3', 'spradn9': 'https://www.ti.com/lit/pdf/SPRADN9', 'spru566t': 'https://www.ti.com/lit/pdf/SPRU566', 'spruiy2a'

In [None]:
# Upsert one specific PDF instead of looping over every file in /content.
pipecone_pdf_upsert('/content/spradn9.pdf')

spradn9
spradn9 = https://www.ti.com/lit/pdf/SPRADN9
Upserted batch 1/1
Finished upserting data from /content/spradn9.pdf to Pinecone index.


In [None]:
# NOTE: pdf_text here is the notebook-level variable assigned by the standalone
# PDF-reading cell further down (the pdf_text inside pipecone_pdf_upsert is local);
# run that cell first, otherwise this line raises NameError.
print(pdf_text[:500])

Application Brief
Optimize EPS System with C2000 F29 MCU
Susmitha Bumadi
The automotive market is ever evolving and always looking for more innovations with reliable and space-saving 
designs in safety-critical market niches, such as electric power steering (EPS). In today’s new cars, electric 
power steering is a standard and advanced technology for highly automated driving. Power steering systems 
were introduced commercially in 1951 as a means to reduce driver’s effort in steering a vehicle. 


In [None]:
    # Standalone copy of the PDF-reading step of pipecone_pdf_upsert. Running it at
    # notebook level leaves pdf_text defined globally (the print(pdf_text[:500])
    # cell reads it).
    pdf_path = '/content/spradn9.pdf'
    file_name = os.path.basename(pdf_path)
    file_name = file_name.removesuffix('.pdf')
    print(file_name)
    # Read the PDF file
    try:
        reader = PdfReader(pdf_path)
        number_of_pages = len(reader.pages)
        pdf_text = ""
        # Concatenate the extracted text of every page, in page order.
        for page_num in range(number_of_pages):
            page = reader.pages[page_num]
            pdf_text += page.extract_text()
    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
        pdf_text = None  # marks failure; slicing pdf_text afterwards would raise TypeError
    except Exception as e:
        print(f"Error reading PDF: {e}")
        pdf_text = None  # marks failure, same as above

spradn9


In [None]:
#RETRIEVAL (NON langchain)
# Query the Pinecone index directly with raw query text, using the same namespace
# that pipecone_pdf_upsert writes to, and request the top 3 hits.
index = pc.Index(index_name)
query = "what is electric power steering?"

results = index.search(
    namespace="example-namespace",
    query={
        "inputs": {"text": query},
        "top_k": 3
    }
)

# Print the raw response (hit ids, scores and stored fields).
print(results)

{'result': {'hits': [{'_id': 'spradn9_0',
                      '_score': 0.52262943983078,
                      'fields': {'doc_detail': 'Application Brief\n'
                                               'Optimize EPS System with C2000 '
                                               'F29 MCU\n'
                                               'Susmitha Bumadi\n'
                                               'The automotive market is ever '
                                               'evolving and always looking '
                                               'for more innovations with '
                                               'reliable and space-saving \n'
                                               'designs in safety-critical '
                                               'market niches, such as '
                                               'electric power steering (EPS). '
                                               'In today’s new cars, '
                  