In [60]:
# !pip install sentence-transformers chromadb pypdf openai tqdm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [61]:
# Import required libraries
import os
import re
import PyPDF2
import pandas as pd
import numpy as np
from tqdm import tqdm
import chromadb
from chromadb.utils import embedding_functions
from openai import OpenAI
import json
from bert_score import score as bert_score
import logging
from pathlib import Path

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Silence non-critical errors from PyPDF
logging.getLogger("PyPDF2").setLevel(logging.CRITICAL)

In [62]:
# Documents loading functions

def load_pdf_documents(data_dir="./data"):
    documents = []
    data_path = Path(data_dir)

    if not data_path.exists():
        raise FileNotFoundError(f"Data directory not found at {data_path.absolute()}")

    pdf_files = list(data_path.glob("*.pdf"))
    if not pdf_files:
        raise FileNotFoundError(f"No PDF files found in {data_path.absolute()}")

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            text = ""
            with open(pdf_file, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    text += page.extract_text() + "\n"

            documents.append({
                'filepath': str(pdf_file.absolute()),
                'filename': pdf_file.name,
                'title': pdf_file.stem,
                'text': text,
                'source': 'CMU Official Documents'
            })

        except Exception as e:
            logger.error(f"Error processing {pdf_file.name}: {str(e)}")
            continue

    return documents

def clean_document_text(text):
    text = ' '.join(text.split())
    patterns = [r'page \d+ of \d+', r'confidential', r'©\d+']
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    return text


In [63]:
# Chunking Functions

def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    chunks = []

    for doc in tqdm(documents, desc="Chunking documents"):
        text = doc['text']
        words = text.split()

        for i in range(0, len(words), chunk_size - chunk_overlap):
            chunk_words = words[i:i + chunk_size]
            chunk_text = ' '.join(chunk_words)

            chunks.append({
                'text': chunk_text,
                'document_title': doc['title'],
                'document_source': doc['source'],
                'chunk_id': f"{doc['title']}_{len(chunk_text)}_{hash(chunk_text)}",
                'metadata': {
                    'source': doc['source'],
                    'title': doc['title'],
                    'filepath': doc['filepath']
                }
            })

    return chunks

In [64]:
# Vector Database Functions

def setup_vector_database(chunks, collection_name="cmu_student_guide_2"):
    chroma_client = chromadb.PersistentClient(path="./chroma_db")
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )

    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=embedding_func
    )

    for i in tqdm(range(0, len(chunks), 100), desc="Indexing documents"):
        batch = chunks[i:i + 100]
        collection.add(
            documents=[chunk['text'] for chunk in batch],
            metadatas=[chunk['metadata'] for chunk in batch],
            ids=[chunk['chunk_id'] for chunk in batch]
        )

    return collection


In [65]:
# Retrival Functions

def retrieve_relevant_chunks(collection, query, top_k=3):
    """Retrieve relevant document chunks for a query"""
    results = collection.query(
        query_texts=[query],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

    # Convert distances to similarity scores
    results['scores'] = [1 - distance for distance in results['distances'][0]]
    return results

In [66]:
# Generation Functions

def generate_answer(openai_client, query, retrieved_chunks):
    context = "\n\n".join([
        f"Source: {meta['title']}\n{doc}"
        for doc, meta in zip(retrieved_chunks['documents'][0],
                            retrieved_chunks['metadatas'][0])
    ])

    prompt = f"""
    You are a helpful CMU assistant. Answer based ONLY on this context:

    {context}

    Question: {query}

    Answer concisely and cite sources. If unsure, say you don't know.
    """

    response = openai_client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a factual CMU student assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2
    )

    return response.choices[0].message.content

In [67]:
# Query Pipeline

def query_cmu_knowledge(collection, openai_client, question, top_k=3):
    try:
        retrieved_chunks = retrieve_relevant_chunks(collection, question, top_k)
        answer = generate_answer(openai_client, question, retrieved_chunks)

        return {
            "question": question,
            "answer": answer,
            "sources": retrieved_chunks['metadatas'][0]
        }
    except Exception as e:
        logger.error(f"Query failed: {str(e)}")
        return {
            "question": question,
            "answer": "Sorry, I couldn't process your question. Please contact The HUB.",
            "sources": []
        }

In [68]:
# Evaluation Functions

def evaluate_response(generated_answer, reference_answer):
    P, R, F1 = bert_score([generated_answer], [reference_answer], lang='en')
    return {
        "bertscore_precision": P.mean().item(),
        "bertscore_recall": R.mean().item(),
        "bertscore_f1": F1.mean().item()
    }

def run_evaluation(collection, openai_client, test_cases):
    results = []
    for case in test_cases:
        response = query_cmu_knowledge(collection, openai_client, case['question'])
        metrics = evaluate_response(response['answer'], case['answer'])
        results.append({
            "question": case['question'],
            "generated_answer": response['answer'],
            "reference_answer": case['answer'],
            **metrics
        })
    return pd.DataFrame(results)

In [69]:
# Usage


documents = load_pdf_documents("./data")
chunks = chunk_documents(documents)
collection = setup_vector_database(chunks)
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

response = query_cmu_knowledge(
    collection,
    openai_client,
    "What is the deadline to add a course?"
)
print(f"Q: {response['question']}")
print(f"A: {response['answer']}")
print("Sources:")
for source in response['sources']:
    print(f"- {source['title']}")

# Sample evaluation
test_cases = [
    {
        "question": "How do I access library resources?",
        "answer": "Use your Andrew ID at the library website"
    }
]
evaluation_df = run_evaluation(collection, openai_client, test_cases)
print(evaluation_df)

Processing PDFs: 100%|██████████| 10/10 [00:00<00:00, 20.52it/s]
Chunking documents: 100%|██████████| 10/10 [00:00<00:00, 16156.80it/s]
Indexing documents:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Indexing documents: 100%|██████████| 1/1 [00:00<00:00,  3.28it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 403 Forbidden"
ERROR:__main__:Query failed: Error code: 403 - {'error': {'message': 'Project `proj_oS3d7Rnjgr141tDhCN7hZohr` does not have access to model `gpt-4`', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}


Q: What is the deadline to add a course?
A: Sorry, I couldn't process your question. Please contact The HUB.
Sources:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 403 Forbidden"
ERROR:__main__:Query failed: Error code: 403 - {'error': {'message': 'Project `proj_oS3d7Rnjgr141tDhCN7hZohr` does not have access to model `gpt-4`', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                             question  \
0  How do I access library resources?   

                                    generated_answer  \
0  Sorry, I couldn't process your question. Pleas...   

                            reference_answer  bertscore_precision  \
0  Use your Andrew ID at the library website             0.836391   

   bertscore_recall  bertscore_f1  
0          0.845007      0.840677  


In [75]:
# !pip install --upgrade openai
from openai import OpenAI
import os

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

def cmu_prompt_model(prompt):
    """
    CMU-specific prompt model with:
    - Structured academic responses
    - CMU knowledge base context
    - Error handling for student queries
    """
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": """You are 'Andrew', the official CMU student assistant.
                    Provide accurate information about Carnegie Mellon University including:
                    - Academic policies
                    - Course registration
                    - Campus resources
                    - Important deadlines
                    Cite official sources when possible."""
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0.3
        )
        return completion.choices[0].message.content

    except Exception as e:
        return f"""I couldn't access CMU information. Please:
        1. Visit thehub.cmu.edu
        2. Contact (412) 268-8186
        Error: {str(e)}"""

response = cmu_prompt_model("Who is Cathleen Kisak?")
print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Cathleen Kisak is the Associate Vice President for Student Affairs at Carnegie Mellon University. She oversees various departments within Student Affairs, including the Center for Student Diversity and Inclusion, the Office of the Dean of Students, and the Office of Student Leadership, Involvement, and Civic Engagement. Her role involves supporting student success and well-being at the university. You can find more information about her on the Carnegie Mellon University website or the Student Affairs page.
