In [68]:
import hashlib
import os, re, logging
from pathlib import Path

import chromadb
import pandas as pd
import PyPDF2
from bert_score import score as bert_score
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

In [69]:
# Load environment variables from .env file (python-dotenv); the OpenAI key
# is read later with os.getenv("OPENAI_API_KEY").
load_dotenv()

# Logging setup: configure the root logger at INFO and create the logger
# that the functions in this notebook use for error reporting.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Raise the "PyPDF2" logger threshold so only CRITICAL records reach the output.
logging.getLogger("PyPDF2").setLevel(logging.CRITICAL)

In [70]:
# === PDF LOADING ===
def load_pdf_documents(data_dir="./data"):
    """Read every ``*.pdf`` directly inside ``data_dir`` and extract its text.

    Args:
        data_dir: Directory to scan (non-recursively) for PDF files.

    Returns:
        list[dict]: One record per successfully parsed PDF with the keys
        ``filepath``, ``filename``, ``title`` (the file stem), ``text`` and
        ``source``. Files that raise while being parsed are logged and skipped.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist or contains no PDFs.
    """
    data_path = Path(data_dir)
    # Validate the directory before globbing so the error names the real problem.
    if not data_path.exists():
        raise FileNotFoundError(f"No such directory: {data_path.absolute()}")

    # glob() order depends on the filesystem; sort so document (and therefore
    # chunk) order is reproducible from run to run.
    pdf_files = sorted(data_path.glob("*.pdf"))
    if not pdf_files:
        raise FileNotFoundError(f"No PDFs found in {data_path.absolute()}")

    documents = []
    for pdf_file in tqdm(pdf_files, desc="Reading PDFs"):
        try:
            with open(pdf_file, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for a page; treat that as empty.
                text = "\n".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            logger.error(f"Error in {pdf_file.name}: {e}")
            continue

        if not text.strip():
            # Still returned (same record shape), but it will produce no chunks.
            logger.warning(f"No extractable text in {pdf_file.name}")

        documents.append({
            "filepath": str(pdf_file),
            "filename": pdf_file.name,
            "title": pdf_file.stem,
            "text": text,
            "source": "CMU Official Documents"
        })
    return documents


In [71]:
# === CLEANING ===
def clean_document_text(text):
    """Normalise whitespace and strip common PDF boilerplate from ``text``.

    Removes (case-insensitively) "page N of M" footers, the standalone word
    "confidential", and "©<digits>" marks, then collapses whitespace.

    Args:
        text: Raw text extracted from a document.

    Returns:
        str: Cleaned text with single spaces between words and no
        leading/trailing whitespace.
    """
    # Collapse whitespace first so "page 1  of\n3" style footers match below.
    text = ' '.join(text.split())
    # \b anchors stop "confidential" from being cut out of longer words
    # such as "confidentiality".
    patterns = [r'\bpage \d+ of \d+\b', r'\bconfidential\b', r'©\d+']
    for p in patterns:
        text = re.sub(p, '', text, flags=re.IGNORECASE)
    # Removing a match leaves its surrounding spaces behind; collapse again.
    return ' '.join(text.split())

In [72]:
# === CHUNKING (more semantic) ===
def chunk_documents(documents, chunk_size=800, chunk_overlap=200):
    """Split each document into overlapping, word-based chunks.

    Each document's text is cleaned with ``clean_document_text`` and split on
    whitespace; windows of ``chunk_size`` words are taken every
    ``chunk_size - chunk_overlap`` words.

    Args:
        documents: Iterable of dicts with the keys ``text``, ``title``,
            ``source`` and ``filepath``.
        chunk_size: Maximum number of words per chunk.
        chunk_overlap: Number of words shared by consecutive chunks.

    Returns:
        list[dict]: Chunk records with ``text``, ``document_title``,
        ``document_source``, ``chunk_id`` and ``metadata``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            not in ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        # A non-positive step would either crash range() or silently yield nothing.
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
    step = chunk_size - chunk_overlap

    chunks = []
    for doc in tqdm(documents, desc="Chunking documents"):
        cleaned_text = clean_document_text(doc['text'])
        words = cleaned_text.split()
        for i in range(0, len(words), step):
            chunk_text = ' '.join(words[i:i + chunk_size])
            # Built-in hash() of a str is salted per process, so it yields a
            # different ID after every kernel restart. A content digest is stable.
            digest = hashlib.sha256(chunk_text.encode("utf-8")).hexdigest()[:16]
            chunks.append({
                'text': chunk_text,
                'document_title': doc['title'],
                'document_source': doc['source'],
                'chunk_id': f"{doc['title']}_{i}_{digest}",
                'metadata': {
                    'source': doc['source'],
                    'title': doc['title'],
                    'filepath': doc['filepath']
                }
            })
            # This window already reached the last word; any further window
            # would only repeat words this chunk contains.
            if i + chunk_size >= len(words):
                break
    return chunks


In [73]:
# === VECTOR DB SETUP ===
def setup_vector_database(chunks, collection_name="cmu_student_guide",
                          persist_path="../ipynb/chroma_db", batch_size=100):
    """Index ``chunks`` into a persistent Chroma collection and return it.

    Args:
        chunks: List of chunk dicts providing ``text``, ``metadata`` and
            ``chunk_id``.
        collection_name: Name of the collection to open or create.
        persist_path: Directory where Chroma stores its data on disk.
        batch_size: Number of chunks written per call.

    Returns:
        The Chroma collection, using the ``all-MiniLM-L6-v2``
        sentence-transformer embedding function.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    chroma_client = chromadb.PersistentClient(
        path=persist_path,
        settings=Settings(anonymized_telemetry=False)
    )
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
    collection = chroma_client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_func
    )

    for i in tqdm(range(0, len(chunks), batch_size), desc="Indexing documents"):
        batch = chunks[i:i + batch_size]
        # The collection persists on disk between runs. upsert overwrites
        # entries whose ID already exists instead of re-adding them, so
        # re-running this cell is safe as long as chunk IDs are stable.
        collection.upsert(
            documents=[chunk['text'] for chunk in batch],
            metadatas=[chunk['metadata'] for chunk in batch],
            ids=[chunk['chunk_id'] for chunk in batch]
        )
    return collection


In [74]:
# === RETRIEVAL ===
def retrieve_relevant_chunks(collection, query, top_k=5):
    """Fetch the ``top_k`` nearest chunks for a single query string.

    Returns the raw result of ``collection.query`` (documents, metadatas and
    distances) with one extra entry, ``'scores'``: a flat list holding
    ``1 - distance`` for every hit of the query.
    """
    hits = collection.query(
        include=["documents", "metadatas", "distances"],
        n_results=top_k,
        query_texts=[query],
    )
    # Only one query text was sent, so its distances sit at index 0.
    # NOTE(review): whether 1 - distance is a bounded similarity depends on the
    # collection's distance metric — confirm before interpreting these values.
    first_query_distances = hits['distances'][0]
    hits['scores'] = list(map(lambda dist: 1 - dist, first_query_distances))
    return hits

# === ANSWER GENERATION WITH CONTEXT ===
def generate_answer(openai_client, query, retrieved_chunks,
                    model_name="gpt-4o-mini", temperature=0.2):
    """Ask the chat model to answer ``query`` using only the retrieved chunks.

    Args:
        openai_client: Client exposing ``chat.completions.create``.
        query: The user's question.
        retrieved_chunks: Query result whose ``'documents'`` and
            ``'metadatas'`` entries hold, at index 0, the hits for the query.
        model_name: Chat model to call.
        temperature: Sampling temperature passed to the model.

    Returns:
        str: The model's answer; an empty string if the completion carries
        no text content.
    """
    sections = []
    for doc, meta in zip(retrieved_chunks['documents'][0],
                         retrieved_chunks['metadatas'][0]):
        # A hit may come back without metadata or without a title; label it
        # rather than failing the whole answer.
        title = (meta or {}).get('title', 'Unknown source')
        sections.append(f"Source: {title}\n{doc}")
    context = "\n\n".join(sections)

    prompt = f"""You are a helpful CMU assistant. Based only on the provided context, answer this question:

Context:
{context}

Question: {query}

Answer concisely. If you are unsure, say you don't know. Always cite source titles like (Source: [title]).
"""

    response = openai_client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful, factual CMU student assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )
    # message.content can be None; callers expect a string.
    return response.choices[0].message.content or ""

In [75]:
# === MASTER QUERY FUNCTION ===
def query_cmu_knowledge(collection, openai_client, question, top_k=5):
    """Retrieve context for ``question`` and generate an answer from it.

    Args:
        collection: Vector collection passed to ``retrieve_relevant_chunks``.
        openai_client: Client passed to ``generate_answer``.
        question: The user's question.
        top_k: Number of chunks to retrieve.

    Returns:
        dict: Always contains ``question``, ``answer``, ``sources`` and
        ``score``. If retrieval or generation raises, ``answer`` is a
        fallback message and ``sources`` / ``score`` are empty lists.
    """
    try:
        retrieved = retrieve_relevant_chunks(collection, question, top_k)
        answer = generate_answer(openai_client, question, retrieved)
        return {
            "question": question,
            "answer": answer,
            "sources": retrieved['metadatas'][0],
            "score": retrieved['scores']
        }
    except Exception as e:
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception(f"Query failed: {str(e)}")
        return {
            "question": question,
            "answer": "Sorry, I couldn't find the information. Please contact The HUB.",
            "sources": [],
            # Same keys as the success path so callers can read "score" safely.
            "score": []
        }

In [76]:
# === BERTSCORE EVALUATION ===
def evaluate_response(generated_answer, reference_answer):
    """Score one generated answer against one reference with BERTScore.

    Returns a dict with ``bertscore_precision``, ``bertscore_recall`` and
    ``bertscore_f1``, each the mean of the corresponding score as a float.
    """
    candidates = [generated_answer]
    references = [reference_answer]
    precision, recall, f1 = bert_score(candidates, references, lang='en')

    metrics = {}
    metrics["bertscore_precision"] = precision.mean().item()
    metrics["bertscore_recall"] = recall.mean().item()
    metrics["bertscore_f1"] = f1.mean().item()
    return metrics

In [77]:
# === BATCH EVALUATION ===
def run_evaluation(collection, openai_client, test_cases):
    """Answer every test question and score it against its reference answer.

    Each test case is a dict with ``question`` and ``answer`` (the reference).
    Returns a DataFrame with one row per case: the question, the generated
    and reference answers, and the metrics from ``evaluate_response``.
    """
    def _evaluate_case(case):
        # Query first, then score, one case at a time.
        generated = query_cmu_knowledge(collection, openai_client, case['question'])['answer']
        reference = case['answer']
        row = {
            "question": case['question'],
            "generated_answer": generated,
            "reference_answer": reference,
        }
        row.update(evaluate_response(generated, reference))
        return row

    return pd.DataFrame([_evaluate_case(case) for case in test_cases])

In [78]:
# === INSTANTIATE OPENAI CLIENT ===
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail early with a clear message instead of a later authentication error.
    raise RuntimeError("OPENAI_API_KEY is not set; add it to your environment or .env file.")
# Never print the key itself: cell output is saved inside the notebook file.
logger.info("OPENAI_API_KEY loaded from environment.")
openai_client = OpenAI(api_key=api_key)

# === EXECUTION ===
# Load PDFs, split them into chunks, and index the chunks in the vector store.
documents = load_pdf_documents("../ipynb/data")
chunks = chunk_documents(documents)
collection = setup_vector_database(chunks)

# === SAMPLE QUERY ===
sample_response = query_cmu_knowledge(collection, openai_client, "What is the deadline to add a course?")
print("Q:", sample_response["question"])
print("A:", sample_response["answer"])
print("Sources:")
for source in sample_response["sources"]:
    print(f"- {source['title']}")

# === EVALUATION ===
# Each case pairs a question with the reference answer used for BERTScore.
test_cases = [
    {
        "question": "How do I access library resources?",
        "answer": "Use your Andrew ID to log into the CMU library system at library.cmu.edu."
    }
]
evaluation_df = run_evaluation(collection, openai_client, test_cases)
print(evaluation_df)

# === CMU CHATBOT (FALLBACK RESPONSE FUNCTION) ===
def get_chat_response(prompt, model_name="gpt-4o-mini"):
    """Answer ``prompt`` with the chat model alone, without retrieved context.

    Uses the module-level ``openai_client``. If anything raises while calling
    the model or reading its reply, a fallback message that includes the
    error text is returned instead of propagating the exception.
    """
    system_message = {
        "role": "system",
        "content": """You are 'Andrew', the CMU student AI assistant. Provide factual information about:
                    - Academic policies
                    - Campus resources
                    - Course registration
                    - Key deadlines
                    - Try to make it concise and clear.
                    Cite CMU documents when applicable."""
    }
    user_message = {"role": "user", "content": prompt}

    try:
        reply = openai_client.chat.completions.create(
            temperature=0.3,
            messages=[system_message, user_message],
            model=model_name,
        )
        # Kept inside the try so a malformed reply also yields the fallback.
        first_choice = reply.choices[0]
        return first_choice.message.content
    except Exception as err:
        return f"I couldn't answer this. Try contacting The HUB at (412) 268-8186.\nError: {str(err)}"

[REDACTED: an OpenAI API key was printed in this cell output. Revoke and rotate that key.]


Reading PDFs: 100%|██████████| 10/10 [00:00<00:00, 33.15it/s]
Chunking documents: 100%|██████████| 10/10 [00:00<00:00, 6180.82it/s]
Indexing documents:   0%|          | 0/1 [00:00<?, ?it/s]
Batches:   0%|          | 0/1 [00:00<?, ?it/s][A
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.66it/s][A
Indexing documents: 100%|██████████| 1/1 [00:00<00:00,  3.42it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 73.41it/s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Q: What is the deadline to add a course?
A: I don't know. The provided context does not specify the deadline to add a course.
Sources:
- cds-2024-c-first-time-first-year-freshman-admission-21feb2025
- cds-2024-g-annual-expenses-21feb2025
- cds-2024-e-academic-offerings-and-policies-21feb2025
- cds-2024-b-enrollment-and-persistence-21feb2025
- cds-2024-c-first-time-first-year-freshman-admission-21feb2025


Batches: 100%|██████████| 1/1 [00:00<00:00, 67.32it/s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                             question  \
0  How do I access library resources?   

                                    generated_answer  \
0  I don't know. The provided context does not in...   

                                    reference_answer  bertscore_precision  \
0  Use your Andrew ID to log into the CMU library...             0.829536   

   bertscore_recall  bertscore_f1  
0          0.859465      0.844236  


In [79]:
# === EXAMPLE FALLBACK CALL ===
# get_chat_response sends the prompt straight to the chat model; no document
# chunks are retrieved, so the reply is not grounded in the indexed PDFs.
fallback_response = get_chat_response("What is the Admission phone number?")
print(fallback_response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The admission phone number for Carnegie Mellon University is (412) 268-2082. For more detailed information, you can also visit the official admissions website at [CMU Admissions](https://www.cmu.edu/admission/).
