In [2]:
import os
import sys
import re
import json
import openai 
from os.path import join, dirname
from dotenv import load_dotenv

load_dotenv()

# Get the API key from environment variables
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    print("Please set the environment variable OPENAI_API_KEY.")
    sys.exit(1)

# Read the question from the command line
if len(sys.argv) != 2:
    print("Usage: python script.py '<question>'")
    sys.exit(1)

path = sys.argv[1]
question = sys.argv[2]

Usage: python script.py '<question>'


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [19]:
import sys
import fitz
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

def load_pdf(file_path):
    """Load text from a PDF file using PyMuPDF."""
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Chunk the text using semantic chunking. Modify gradient threshold and min_chunk_size to improve results."""
    text_splitter = SemanticChunker(
                OpenAIEmbeddings(), 
                breakpoint_threshold_type="gradient",
                breakpoint_threshold_amount=.94,
                min_chunk_size=1000
    )
    chunks = text_splitter.split_text(text)
    return chunks

path = 'ExampleCo - NDA - John Appleseed.pdf'

# Load the PDF text
text = load_pdf(path)

# Chunk the text
chunks = chunk_text(text)



In [None]:
#Print the chunks for testing
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\n{chunk}\n{'-'*80}\n")

In [None]:
#save the chunks to database: we want to be able to perform hybrid search for both vector similarity and BM25

import sqlite3
from sqlalchemy import create_engine, Column, Integer, String, LargeBinary, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the SentenceTransformer model for generating embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create a SQLite database engine
engine = create_engine('sqlite:///text_chunks.db', echo=True)
Base = declarative_base()

# Define the TextChunk model
class TextChunk(Base):
    __tablename__ = 'text_chunks'
    id = Column(Integer, primary_key=True)
    text = Column(String)
    embedding = Column(LargeBinary)

# Create the table
Base.metadata.create_all(engine)

# Create a session
Session = sessionmaker(bind=engine)
session = Session()

def save_chunks_to_db(chunks):
    """Save text chunks and their embeddings to the database."""
    for chunk in chunks:
        embedding = model.encode([chunk])[0]
        embedding_bytes = embedding.tobytes()
        text_chunk = TextChunk(text=chunk, embedding=embedding_bytes)
        session.add(text_chunk)
    session.commit()

def hybrid_search(query, top_k=5):
    """Perform hybrid search over both vector embeddings and keywords."""
    # Generate the query embedding
    query_embedding = model.encode([query])[0]
    
    # Fetch all chunks from the database
    chunks = session.query(TextChunk).all()
    
    # Calculate cosine similarity for vector search
    similarities = []
    for chunk in chunks:
        chunk_embedding = np.frombuffer(chunk.embedding, dtype=np.float32)
        similarity = cosine_similarity([query_embedding], [chunk_embedding])[0][0]
        similarities.append((chunk, similarity))
    
    # Sort by similarity (vector search)
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Perform keyword search
    keyword_matches = [chunk for chunk, _ in similarities if query.lower() in chunk.text.lower()]
    
    # Combine results (hybrid search)
    hybrid_results = keyword_matches + [chunk for chunk, _ in similarities if chunk not in keyword_matches]
    
    # Return top_k results
    return hybrid_results[:top_k]


# Save chunks to the database
save_chunks_to_db(chunks)



In [42]:

# Perform a hybrid search
query = "is this contract under california law"
results = hybrid_search(query)

# Print the results
for i, result in enumerate(results):
    print(f"Result {i+1}:\n{result.text}\n{'-'*80}\n")

2024-11-01 11:14:59,086 INFO sqlalchemy.engine.Engine SELECT text_chunks.id AS text_chunks_id, text_chunks.text AS text_chunks_text, text_chunks.embedding AS text_chunks_embedding 
FROM text_chunks
2024-11-01 11:14:59,088 INFO sqlalchemy.engine.Engine [cached since 1377s ago] ()
Result 1:
This Agreement shall be governed by and construed in accordance with the laws
of the State of California without regard to the conflicts of law principles thereof.
--------------------------------------------------------------------------------

Result 2:
15. The Recipient shall be responsible for any breach of the provisions of this
Agreement by the representatives of the Recipient (including, without limitation, any directors,
officers and employees to whom Confidential Information is disclosed pursuant to Section 3). 16. This Agreement shall be governed by and construed in accordance with the laws
of the State of California without regard to the conflicts of law principles thereof. This
Agreement h

In [43]:
#Now answer the user question with the provided context

from openai import OpenAI
import os

openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_response(base_prompt, user_question, context):
    # Combine the base prompt, context, and user question into a single prompt
    
    user_prompt = f"Context:\n{context}\n\nUser Question: {user_question}"
    
    client = OpenAI()

    response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": base_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=200,
            temperature=0.2
    )
    return response.choices[0].message #response.choices[0].text.strip()


# Define the base prompt
base_prompt = "You are a helpful assistant. Use the following context to answer the user's question."

# Define the user's question
user_question = "Is this contract under california law?"

# Define the context  (taking the top 1 here for simplicity, tune for pricing/precision)
context = results[0].text
print ("the context used is:", context)

# Generate the response
response = generate_response(base_prompt, user_question, context)

# Print the response
print("Generated Response:\n", response)

the context used is: This Agreement shall be governed by and construed in accordance with the laws
of the State of California without regard to the conflicts of law principles thereof.
Generated Response:
 ChatCompletionMessage(content='Yes, the contract is governed by California law. The agreement explicitly states that it shall be governed by and construed in accordance with the laws of the State of California.', role='assistant', function_call=None, tool_calls=None, refusal=None)
