# USPTO MPEP RAG Example (Step 1 document processing)

First we need to verify that the raw PDFs exist locally, if not download and save them.

In [12]:
import os
import subprocess
from openai import OpenAI

# Directory to check for txt files exist
txt_directory = '../data/scratch/txt/'

# Check if the directory exists and contains any PDFs
def check_txt_files_exist(directory):
    if not os.path.exists(directory):
        return False
    for file_name in os.listdir(directory):
        if file_name.lower().endswith('.txt'):
            return True
    return False

def get_chat_response(prompt):
    """
    Sends a prompt to the chat model and returns the response.

    Parameters:
    prompt (str): The input prompt to send to the chat model.

    Returns:
    str: The response from the chat model.
    """
    api_key = os.getenv('NVCF_KEY')
    if not api_key:
        raise ValueError("API key not found. Please set the NVCF_KEY environment variable.")

    client = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key=api_key
    )

    completion = client.chat.completions.create(
        model="mistralai/mistral-large",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        top_p=1,
        max_tokens=1024,
        stream=True
    )

    response = ""
    for chunk in completion:
        if chunk.choices[0].delta.content is not None:
            response += chunk.choices[0].delta.content

    return response

def summarize_text(text)

    prompt = f"""
    You are a highly advanced language model. Your task is to summarize the following document while retaining all critical information and context. The summary should include key points, important details, and any relevant context to ensure the summarized content is as useful and informative as the original document. Please focus on the main ideas, significant data, and essential concepts. Avoid unnecessary details and redundancy.

    Document to Summarize:

    {text}

    Summary:
    """

    response = get_chat_response(prompt)
    return response

In [13]:
# Path to the script to run if PDFs are not found
script_path = 'scrape_mpep_from_web.py'

# Check if PDFs exist, if not run the script
if check_txt_files_exist(txt_directory):
    print("Raw text files already exist in the directory.")
else:
    print("Raw text files not found. Running the download script...")
    result = subprocess.run(['python3', script_path], capture_output=True, text=True)
    if result.returncode == 0:
        print("Script executed successfully.")
    else:
        print("Error running the script.")
        print(result.stderr)

Raw text files already exist in the directory.


In [18]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk

# Download the necessary NLTK data files
nltk.download('punkt')

# Directory containing the text files
txt_directory = '../data/scratch/txt/'

# Predefined token limit for chunking
token_limit = 1000  # You can adjust this limit as needed

def count_tokens(text):
    """
    Counts the number of tokens in the given text using NLTK's word_tokenize.
    
    Args:
    text (str): The text content to be tokenized.
    
    Returns:
    int: The number of tokens in the text.
    """
    tokens = word_tokenize(text)
    return len(tokens)

def should_chunk_file(file_path, token_limit):
    """
    Determines if the file should be chunked based on the token count.
    
    Args:
    file_path (str): Path to the text file.
    token_limit (int): The token limit for chunking.
    
    Returns:
    tuple: Number of tokens and boolean indicating if chunking is needed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        token_count = count_tokens(content)
        return token_count, token_count > token_limit

# List to store file data
file_data = []

# Iterate over each file in the directory and collect data
for file_name in os.listdir(txt_directory):
    file_path = os.path.join(txt_directory, file_name)
    if os.path.isfile(file_path):
        token_count, needs_chunking = should_chunk_file(file_path, token_limit)
        file_data.append({
            "filename": file_name,
            "num_tokens": token_count,
            "needs_chunking": needs_chunking
        })

# Create a DataFrame from the collected data
df = pd.DataFrame(file_data)


[nltk_data] Downloading package punkt to /home/workbench/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
df.head()

Unnamed: 0,filename,num_tokens,needs_chunking
0,s2263.txt,55,False
1,s2720.txt,2075,True
2,s2262.txt,956,False
3,s2272.txt,1535,True
4,s2612.txt,71,False


In [20]:
df['num_tokens'].max()

80897

In [None]:
prompt = f"Please correct the content provide, removing extra spaces, missing spaces, and other formatting errors. ONLY respond with the improved content, DO NOT include any additional text. Content: {pages[0]}"
response = get_chat_response(prompt)
print(response)

In [25]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

In [30]:
loader = DirectoryLoader('../data/scratch/txt/', glob="**/*.txt", loader_cls=TextLoader)
documents = loader.load()
len(documents)

800

In [31]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(texts, embeddings)

ModuleNotFoundError: No module named 'langchain_openai'