In [2]:
import hashlib
import logging
import os
import re
from pathlib import Path
from textwrap import dedent

import chromadb
import openai
import pandas as pd
import PyPDF2
from bert_score import score as bert_score
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from openai import OpenAI
from tqdm import tqdm


# Set up logging: configure the root logger at INFO level and create the
# module-level `logger` that the functions below use to report errors.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Function to load PDF documents from a specified directory
def load_pdf_documents(data_dir="./data"):
    """
    Read every ``*.pdf`` file directly inside ``data_dir`` and extract its text.

    Args:
        data_dir: Directory to scan (not recursive).

    Returns:
        A list with one dict per successfully read PDF, holding the keys
        'filepath', 'filename', 'title', 'text' and 'source'. Files are
        processed in sorted path order so repeated runs give the same order.
        A PDF that cannot be read is logged and skipped.

    Raises:
        FileNotFoundError: If ``data_dir`` is not an existing directory or
            contains no PDF files.
    """
    data_path = Path(data_dir)

    # Check that the data directory exists (a regular file does not count)
    if not data_path.is_dir():
        raise FileNotFoundError(f"Data directory not found at {data_path.absolute()}")

    # glob() order depends on the file system, so sort for reproducible runs
    pdf_files = sorted(data_path.glob("*.pdf"))

    # Check if any PDF files were found
    if not pdf_files:
        raise FileNotFoundError(f"No PDF files found in {data_path.absolute()}")

    documents = []
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            with open(pdf_file, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may yield None or "" for a page without a
                # text layer (depends on the PyPDF2 version); treat that as
                # an empty page instead of failing the whole document.
                text = "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole load
            logger.error(f"Error processing {pdf_file.name}: {str(e)}")
            continue

        documents.append({
            'filepath': str(pdf_file.absolute()),
            'filename': pdf_file.name,
            'title': pdf_file.stem,
            'text': text,
            'source': 'CMU Official Documents'
        })

    return documents

# Function to clean and preprocess text
def clean_document_text(text):
    """
    Collapse whitespace and strip boilerplate fragments from extracted text.

    Removed (case-insensitively): "page N of M" markers, the standalone word
    "confidential", and copyright stamps such as "©2024" / "© 2024".

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace.
    """
    if not text:
        return ""

    # Normalise first so a marker split across a line break still matches
    text = ' '.join(text.split())

    # Word boundaries keep words such as "confidentiality" intact
    patterns = [r'\bpage \d+ of \d+\b', r'\bconfidential\b', r'©\s*\d+']
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Removal leaves doubled or dangling spaces behind; collapse them again
    return ' '.join(text.split())


In [3]:
# Function to chunk documents into smaller pieces
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split each document's text into overlapping windows of words.

    Args:
        documents: Iterable of dicts with the keys 'text', 'title', 'source'
            and 'filepath'.
        chunk_size: Maximum number of words per chunk.
        chunk_overlap: Number of words shared by consecutive chunks.

    Returns:
        A list of chunk dicts with the keys 'text', 'document_title',
        'document_source', 'chunk_id' and 'metadata'. A document whose text
        has no words produces no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of words")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must be >= 0 and smaller than chunk_size")

    step = chunk_size - chunk_overlap
    chunks = []
    for doc in tqdm(documents, desc="Chunking documents"):
        words = doc['text'].split()
        for chunk_index, start in enumerate(range(0, len(words), step)):
            chunk_text = ' '.join(words[start:start + chunk_size])

            # Built-in hash() of a str is salted per process, so it cannot be
            # used for IDs that must match across kernel restarts. A content
            # digest is stable; the chunk index keeps repeated identical
            # chunks within one document distinct.
            digest = hashlib.sha256(
                chunk_text.encode('utf-8', errors='replace')
            ).hexdigest()[:16]

            chunks.append({
                'text': chunk_text,
                'document_title': doc['title'],
                'document_source': doc['source'],
                'chunk_id': f"{doc['title']}_{chunk_index}_{digest}",
                'metadata': {
                    'source': doc['source'],
                    'title': doc['title'],
                    'filepath': doc['filepath']
                }
            })

            # This window already reached the end of the document; a further
            # window would only repeat words that are contained in it.
            if start + chunk_size >= len(words):
                break
    return chunks

In [4]:
# Function to setup the vector database using ChromaDB
def setup_vector_database(chunks, collection_name="cmu_student_guide",
                          persist_path="./chroma_db", batch_size=100,
                          embedding_model="all-MiniLM-L6-v2"):
    """
    Index chunks into a persistent ChromaDB collection and return it.

    Args:
        chunks: List of chunk dicts with the keys 'text', 'metadata' and
            'chunk_id'.
        collection_name: Name of the collection to create or reuse.
        persist_path: Directory where ChromaDB stores its data.
        batch_size: Number of chunks written per call.
        embedding_model: SentenceTransformer model name used for embeddings.

    Returns:
        The ChromaDB collection containing the indexed chunks.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Create a ChromaDB client
    chroma_client = chromadb.PersistentClient(
        path=persist_path,
        settings=Settings(anonymized_telemetry=False)
    )

    # Create an embedding function using SentenceTransformer
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_model
    )

    # Create collection or get existing one
    collection = chroma_client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_func
    )

    # IDs must be unique within one write; keep the last chunk seen per ID
    unique_chunks = list({chunk['chunk_id']: chunk for chunk in chunks}.values())

    # The store persists between runs, so write with upsert: re-running the
    # notebook refreshes existing IDs instead of colliding with them.
    for i in tqdm(range(0, len(unique_chunks), batch_size), desc="Indexing documents"):
        batch = unique_chunks[i:i + batch_size]
        collection.upsert(
            documents=[chunk['text'] for chunk in batch],
            metadatas=[chunk['metadata'] for chunk in batch],
            ids=[chunk['chunk_id'] for chunk in batch]
        )

    return collection


In [5]:
# Function to retrieve relevant chunks from the vector database for a given query
def retrieve_relevant_chunks(collection, query, top_k=3):
    """
    Query the collection for the ``top_k`` chunks closest to ``query``.

    Returns the raw query result (documents, metadatas and distances, each
    nested per query text) with an extra 'scores' list computed as
    ``1 - distance`` for the single query text.
    """
    requested_fields = ["documents", "metadatas", "distances"]
    results = collection.query(
        query_texts=[query],
        n_results=top_k,
        include=requested_fields
    )

    # Only one query text is sent, so its distances sit at index 0.
    # NOTE(review): `1 - distance` is a similarity only under a cosine
    # distance; no distance metric is configured for the collection in this
    # notebook -- confirm which one is in effect.
    query_distances = results['distances'][0]
    results['scores'] = list(map(lambda dist: 1 - dist, query_distances))
    return results

In [6]:
# Function to generate an answer using OpenAI API based on the retrieved chunks and the query
def generate_answer(openai_client, query, retrieved_chunks,
                    model="gpt-4o-mini", temperature=0.2):
    """
    Ask the chat model to answer ``query`` using only the retrieved chunks.

    Args:
        openai_client: Client exposing ``chat.completions.create``.
        query: The user's question.
        retrieved_chunks: Query result whose 'documents' and 'metadatas'
            entries hold, at index 0, the chunks for this query.
        model: Chat model name.
        temperature: Sampling temperature passed to the API.

    Returns:
        The model's answer text ("" if the API returns no content).
    """
    # A chunk may have been stored without metadata; fall back to a
    # placeholder title rather than failing on None.
    context = "\n\n".join([
        f"Source: {(meta or {}).get('title', 'Unknown source')}\n{doc}"
        for doc, meta in zip(retrieved_chunks['documents'][0],
                            retrieved_chunks['metadatas'][0])
    ])

    prompt = f"""
    You are a helpful CMU assistant. Answer based ONLY on this context:
    {context}
    Question: {query}
    Answer concisely and cite sources. If unsure, say you don't know.
    """

    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a factual CMU student assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )

    # Always hand a string back to callers that print or score the answer
    return response.choices[0].message.content or ""

In [7]:
# Function to query the CMU knowledge base
def query_cmu_knowledge(collection, openai_client, question, top_k=3):
    """
    Answer ``question`` by retrieving chunks and generating a reply.

    Args:
        collection: Vector collection passed to ``retrieve_relevant_chunks``.
        openai_client: Client passed to ``generate_answer``.
        question: The user's question.
        top_k: Number of chunks to retrieve.

    Returns:
        A dict with the keys 'question', 'answer' and 'sources' (the metadata
        of the retrieved chunks). If anything fails, 'answer' is a fixed
        apology message and 'sources' is an empty list.
    """
    try:
        retrieved_chunks = retrieve_relevant_chunks(collection, question, top_k)
        answer = generate_answer(openai_client, question, retrieved_chunks)

        return {
            "question": question,
            "answer": answer,
            "sources": retrieved_chunks['metadatas'][0]
        }
    except Exception as e:
        # Deliberate best effort: the caller always gets a well-formed result.
        # logger.exception records the traceback, so the cause of a failure is
        # not hidden behind the fallback answer.
        logger.exception("Query failed: %s", e)
        return {
            "question": question,
            "answer": "Sorry, I couldn't process your question. Please contact The HUB.",
            "sources": []
        }

In [8]:
# Function to evaluate the generated answer using BERTScore
def evaluate_response(generated_answer, reference_answer):
    """
    Score one generated answer against one reference answer with BERTScore.

    Returns a dict with the keys 'bertscore_precision', 'bertscore_recall'
    and 'bertscore_f1', each a plain Python number.
    """
    candidates = [generated_answer]
    references = [reference_answer]
    precision, recall, f1 = bert_score(candidates, references, lang='en')

    metric_names = ("bertscore_precision", "bertscore_recall", "bertscore_f1")
    return {
        name: values.mean().item()
        for name, values in zip(metric_names, (precision, recall, f1))
    }

# Function to run the evaluation on a set of test cases
def run_evaluation(collection, openai_client, test_cases, top_k=3):
    """
    Answer every test question and score the answer against its reference.

    Args:
        collection: Vector collection passed to ``query_cmu_knowledge``.
        openai_client: Client passed to ``query_cmu_knowledge``.
        test_cases: Iterable of dicts with the keys 'question' and 'answer'
            (the reference answer).
        top_k: Number of chunks retrieved per question.

    Returns:
        A pandas DataFrame with one row per test case and the columns
        'question', 'generated_answer', 'reference_answer',
        'bertscore_precision', 'bertscore_recall' and 'bertscore_f1'.
        The columns are present even when ``test_cases`` is empty.
    """
    columns = [
        "question",
        "generated_answer",
        "reference_answer",
        "bertscore_precision",
        "bertscore_recall",
        "bertscore_f1",
    ]

    results = []
    for case in test_cases:
        response = query_cmu_knowledge(
            collection, openai_client, case['question'], top_k=top_k
        )
        metrics = evaluate_response(response['answer'], case['answer'])
        results.append({
            "question": case['question'],
            "generated_answer": response['answer'],
            "reference_answer": case['answer'],
            **metrics
        })

    # Naming the columns keeps the frame's schema stable for an empty run
    return pd.DataFrame(results, columns=columns)

In [9]:
# Build the pipeline end to end: load the PDFs, chunk them, index the chunks,
# and create the OpenAI client.
# NOTE: the PDFs are read from "../data" here, not from the "./data" default
# of load_pdf_documents.
documents = load_pdf_documents("../data")
chunks = chunk_documents(documents)
collection = setup_vector_database(chunks)
# The API key is read from the OPENAI_API_KEY environment variable
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Ask one sample question, then print the answer and the titles of the
# chunks that were retrieved for it.
response = query_cmu_knowledge(
    collection,
    openai_client,
    "What is the deadline to add a course?"
)
print(f"Q: {response['question']}")
print(f"A: {response['answer']}")
print("Sources:")
for source in response['sources']:
    print(f"- {source['title']}")

# Sample evaluation: each test case pairs a question with a reference answer
# that the generated answer is scored against.
test_cases = [
    {
        "question": "How do I access library resources?",
        "answer": "Use your Andrew ID at the library website"
    }
]
evaluation_df = run_evaluation(collection, openai_client, test_cases)
print(evaluation_df)

Processing PDFs: 100%|██████████| 10/10 [00:00<00:00, 37.86it/s]
Chunking documents: 100%|██████████| 10/10 [00:00<00:00, 17239.23it/s]
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]/s]
Indexing documents: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.51it/s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Q: What is the deadline to add a course?
A: I don't know.
Sources:
- cds-2024-c-first-time-first-year-freshman-admission-21feb2025
- cds-2024-g-annual-expenses-21feb2025
- cds-2024-e-academic-offerings-and-policies-21feb2025


Batches: 100%|██████████| 1/1 [00:00<00:00, 26.14it/s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                             question generated_answer  \
0  How do I access library resources?    I don't know.   

                            reference_answer  bertscore_precision  \
0  Use your Andrew ID at the library website             0.810589   

   bertscore_recall  bertscore_f1  
0           0.84631      0.828065  


In [None]:
# Setup OpenAI API key: read it from the OPENAI_API_KEY environment variable.
# `client` is the module-level client that get_chat_response below calls; it
# is a separate object from the `openai_client` created in the earlier cell.
openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai.api_key)

# Function to get a chat response from the OpenAI API under the 'Andrew' CMU assistant persona
def get_chat_response(prompt, model_name="gpt-4o-mini"):
    """
    Send ``prompt`` to the chat model with a fixed CMU-assistant system prompt.

    Only the system prompt and ``prompt`` are sent: this function performs no
    retrieval and adds no document context to the request. It uses the
    module-level ``client``.

    Returns the model's reply text. If the request raises, a fallback message
    pointing to The HUB, including the error text, is returned instead.
    """
    system_message = {
        "role": "system",
        "content": """You are 'Andrew', the official CMU student assistant.
                    Provide accurate information about Carnegie Mellon University including:
                    - Academic policies
                    - Course registration
                    - Campus resources
                    - Important deadlines
                    Cite official sources when possible."""
    }
    user_message = {"role": "user", "content": prompt}

    try:
        chat_result = client.chat.completions.create(
            model=model_name,
            messages=[system_message, user_message],
            temperature=0.3
        )
        first_choice = chat_result.choices[0]
        return first_choice.message.content
    except Exception as err:
        # Best effort: report the failure as text instead of raising
        return f"""I couldn't access CMU information. Please:
        1. Visit thehub.cmu.edu
        2. Contact (412) 268-8186
        Error: {err!s}"""

# Ask the model directly. get_chat_response sends only its system prompt and
# this question -- no retrieved document context -- so the reply is not
# grounded in the indexed PDFs and should be checked against an official source.
# NOTE: this rebinds `response`, which held the dict returned by
# query_cmu_knowledge in the earlier cell; here it is a plain string.
response = get_chat_response("Who is Cathleen Kisak?")
print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Cathleen Kisak is a faculty member at Carnegie Mellon University, known for her role in the School of Computer Science. She has been involved in various educational and administrative capacities within the university. For specific information about her current role, research interests, or contributions, I recommend checking the official Carnegie Mellon University website or the School of Computer Science faculty directory for the most up-to-date information.
