# USPTO MPEP RAG Example (Step 1 document processing)

First we need to verify that the raw PDFs exist locally, if not download and save them.

In [2]:
import os
import subprocess
from openai import OpenAI

# Define directories
faiss_directory = '../data/scratch/vectordb/'
# Directory to check for txt files exist
txt_directory = '../data/scratch/txt/'

# Check if the directory exists and contains any PDFs
def check_txt_files_exist(directory):
    if not os.path.exists(directory):
        return False
    for file_name in os.listdir(directory):
        if file_name.lower().endswith('.txt'):
            return True
    return False

In [3]:
# Path to the script to run if PDFs are not found
script_path = 'scrape_mpep_from_web.py'

# Check if PDFs exist, if not run the script
if check_txt_files_exist(txt_directory):
    print("Raw text files already exist in the directory.")
else:
    print("Raw text files not found. Running the download script...")
    result = subprocess.run(['python3', script_path], capture_output=True, text=True)
    if result.returncode == 0:
        print("Script executed successfully.")
    else:
        print("Error running the script.")
        print(result.stderr)

Raw text files already exist in the directory.


In [24]:
import os
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

# Check if FAISS vector store exists
if os.path.exists(faiss_directory) and os.listdir(faiss_directory):
    # Load FAISS vector store
    db = FAISS.load_local(faiss_directory, OpenAIEmbeddings(), allow_dangerous_deserialization=True)
    print("FAISS index loaded from", faiss_directory)
else:
    # Load documents
    loader = DirectoryLoader(txt_directory, glob="**/*.txt", loader_cls=TextLoader)
    documents = loader.load()
    
    # Split documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    
    # Initialize embeddings
    embeddings = OpenAIEmbeddings()
    
    # Create FAISS vector store from documents
    db = FAISS.from_documents(texts, embeddings)
    
    # Save the FAISS vector store
    os.makedirs(faiss_directory, exist_ok=True)
    db.save_local(faiss_directory)
    print("FAISS index created and saved to", faiss_directory)


FAISS index loaded from ../data/scratch/vectordb/


In [28]:
retriever = db.as_retriever(search_kwargs={"k": 4})
retrieved_docs = retriever.invoke("who can sumbit a patent?")
print(retrieved_docs[0])

#https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/contextual_compression/#embeddingsfilter

page_content='2203 Persons Who May Cite Prior Art or Written Statements [R-07.2015]\nThe patent owner, or any member of the public, may submit prior art patents or printed publications and/or written statements and additional information to the Office. 35 U.S.C. 301 states that “[a]ny person at any time may cite to the Office....”\n“Any person” may be a corporate or governmental entity as well as an individual. “Any person” includes patentees, licensees, reexamination requesters, real parties in interest to the patent owner or requester, persons without a real interest, and persons acting for real parties in interest without a need to identify the real party of interest. If a person citing prior art or written statements desires his or her identity to be kept confidential, such a person need not identify himself or herself.\nPersons other than reexamination requesters who desire to remain confidential are therefore advised to not identify themselves anywhere in their papers. For reexam

In [32]:
import os
from typing import List, Dict, Any
from langchain.schema import BaseMessage, ChatResult, ChatMessage
from langchain.chat_models.base import BaseChatModel
import openai

class CustomChatOpenAI(BaseChatModel):
    def __init__(self, base_url: str, model: str, temperature: float = 0.5, top_p: float = 1.0, max_tokens: int = 1024):
        api_key = os.getenv('NVCF_KEY')
        if not api_key:
            raise ValueError("API key not found. Please set the environment variable 'NVCF_KEY'.")
        openai.api_key = api_key
        openai.api_base = base_url
        self.model = model
        self.temperature = temperature
        self.top_p = top_p
        self.max_tokens = max_tokens

    def _call_openai(self, messages: List[Dict[str, Any]]) -> str:
        completion = openai.ChatCompletion.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            top_p=self.top_p,
            max_tokens=self.max_tokens,
            stream=True
        )
        response = ""
        for chunk in completion:
            if chunk.choices[0].delta.get("content"):
                response += chunk.choices[0].delta["content"]
        return response

    def _generate(self, messages: List[BaseMessage]) -> ChatResult:
        formatted_messages = [{"role": msg.role, "content": msg.content} for msg in messages]
        response = self._call_openai(formatted_messages)
        chat_message = ChatMessage(role="assistant", content=response)
        return ChatResult(messages=[chat_message])

    def _llm_type(self) -> str:
        return "custom_openai"

# Usage example
base_url = "https://integrate.api.nvidia.com/v1"
model = "mistralai/mistral-large"

custom_chat_openai = CustomChatOpenAI(base_url=base_url, model=model)

messages = [BaseMessage(role="user", content="Write a limerick about the wonders of GPU computing.")]
chat_result = custom_chat_openai._generate(messages)

for message in chat_result.messages:
    print(message.content)


ValueError: "CustomChatOpenAI" object has no field "model"