In [11]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment, then
# read the OpenAI key from it (the value is None when the variable is unset).
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
import os
from pypdf import PdfReader

# --- CONFIGURATION AREA ---
# Folder holding the source PDFs. It is resolved from the current user's home
# directory so the notebook does not embed one machine's absolute path.
PDF_SOURCE_DIR = os.path.join(os.path.expanduser("~"), "Downloads")

PDF_FILEPATHS = [
    # Add the file paths of the PDFs to convert to text here; their text is
    # used as the context for RAG. Entries must be plain strings.
    os.path.join(PDF_SOURCE_DIR, "1210 Lab Syllabus - AU25.pdf"),
]
# ------------------------------------------

# --- Output Configuration ---
# File that receives the combined text of every converted PDF.
OUTPUT_TEXT_FILE = "context.txt"
# Marker written between the text of consecutive PDFs.
PDF_SEPARATOR = "\n\n--- DOCUMENT END ---\n\n"

# --- Functions ---
def extract_text_from_pdf(filepath: str, page_separator: str = "\n") -> str:
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        filepath: Local path of the PDF file to read.
        page_separator: String inserted between the text of consecutive
            pages. Without a separator the last word of one page would be
            fused with the first word of the next.

    Returns:
        The page texts joined by ``page_separator``. A page for which
        ``extract_text()`` returns nothing contributes an empty string.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever the PDF reader raises for an unreadable file is
            propagated unchanged.
    """
    with open(filepath, 'rb') as file:
        pdf_reader = PdfReader(file)
        # Extract while the file is still open; the reader pulls page data
        # from the stream it was given.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return page_separator.join(page_texts)

# --- Execution ---
# Convert every configured PDF to text and write the combined result to
# OUTPUT_TEXT_FILE. Problems with one file are reported and the remaining
# files are still processed.
print("Starting PDF conversion...")

# Drop blank entries and strip surrounding whitespace from the configured paths.
valid_paths = [p.strip() for p in PDF_FILEPATHS if p.strip()]

if not valid_paths:
    print("No valid file paths entered in the PDF_FILEPATHS list.")
else:
    all_text = []

    for i, filepath in enumerate(valid_paths):
        filename = os.path.basename(filepath)
        print(f"[{i + 1}/{len(valid_paths)}] Processing: {filename}...")

        # isfile (not exists) so that a directory path is reported here too.
        if not os.path.isfile(filepath):
            print(f"⚠️ Error: File not found at path: {filepath}")
            continue

        try:
            text = extract_text_from_pdf(filepath)
        except Exception as e:
            # Best effort: report the failure and carry on with the next PDF.
            print(f"Critical Error processing {filename}: {e}")
            continue

        # A PDF with no extractable text would otherwise be counted as a
        # success and could produce an empty context file.
        if not text.strip():
            print(f"⚠️ Warning: No extractable text found in {filename}; skipping.")
            continue

        all_text.append(text)

    if all_text:
        final_corpus = PDF_SEPARATOR.join(all_text)

        with open(OUTPUT_TEXT_FILE, 'w', encoding='utf-8') as f:
            f.write(final_corpus)

        print("\nConversion complete!")
        print(f"Text from {len(all_text)} PDF(s) saved to **{OUTPUT_TEXT_FILE}**.")
    else:
        print("\nFailed to extract text from any provided PDF.")

Starting PDF conversion...
[1/1] Processing: 1210 Lab Syllabus - AU25.pdf...

Conversion complete!
Text from 1 PDF(s) saved to **context.txt**.


In [13]:
from langchain_openai.chat_models import ChatOpenAI

# Chat model used by the chains built below; the API key is the
# OPENAI_API_KEY value read from the environment.
model = ChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=OPENAI_API_KEY,
)

In [14]:
from langchain_core.output_parsers import StrOutputParser

# Output parser placed at the end of each chain so that invoking the chain
# yields a plain string.
parser = StrOutputParser()

# Minimal pipeline: the model's output is piped straight into the parser.
chain = model | parser

In [15]:
from langchain.prompts import ChatPromptTemplate

# Prompt text sent to the model. {context} and {question} are placeholders
# filled in when the prompt is formatted. The string is spelled out piece by
# piece so that every newline (and the trailing space after "can't") is visible.
template = (
    "\n"
    "Answer the question based on the context below. If you can't \n"
    "answer the question, reply \"I don't know\".\n"
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)



<img src='images/chain2.png' width="1200">

In [19]:
from langchain_community.document_loaders import TextLoader

# Read the extracted corpus back in. UTF-8 is given explicitly so the read
# uses the same encoding context.txt was written with.
loader = TextLoader(file_path="context.txt", encoding="utf-8")
text_documents = loader.load()

text_documents

[Document(metadata={'source': 'context.txt'}, page_content=' \n \n1 \n \nCHEMISTRY 1210 LAB – AUTUMN 2025 \nGeneral Chemistry I Lab Syllabuson \n \nINTRODUCTION \n \nInstructional Team \nLab Supervisors: Dr. Bernice Opoku-Agyeman \nDr. Camila Fontes Neves da Silva \nLecturer: Refer to the lecture syllabus \nCourse Coordinator: Sophie White \nOffice & SLDS Contact: Holly Wheaton \nTA: (Varies with recitation/lab section) \n \nPlease see the “Contacts” page in Carmen for information on your instructors and who to contact for \nyour unique needs. \n   \nLab Supervisor \nEmail: \n \n \nLab Supervisor \nOffice Hours: \n \n \n \nOut of Class \nAssistance: \n \nchem1210labsupervisor@osu.edu \nAll emails should be sent to the email address above contacting the lab supervisors. Emails \nsent to personal accounts or via Carmen will not be responded to. \n \nMondays 2 PM – 3 PM; Wednesdays 10 AM – 11 AM; Thursdays 1 PM – 2 PM. \nNote: Lab Supervisor office hours are on Zoom (https://go.osu.edu/la

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters passed to the splitter: target chunk size and the
# overlap shared by neighbouring chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
documents = text_splitter.split_documents(text_documents)

# The input is the text loaded from context.txt, not a blog post.
print(f"Split {len(text_documents)} source document(s) into {len(documents)} chunks.")

Split blog post into 64 sub-documents.


In [22]:
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai.embeddings import OpenAIEmbeddings

def format_docs(docs):
    """Join the page_content of the retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute, as
            returned by a retriever.

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


embeddings = OpenAIEmbeddings()
vectorstore1 = DocArrayInMemorySearch.from_documents(documents, embeddings)

# The retriever output is flattened to text before it reaches the prompt;
# otherwise {context} would be filled with the repr of a list of Document
# objects (metadata and escaped newlines included).
chain = (
    {
        "context": vectorstore1.as_retriever() | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | parser
)
chain.invoke("When are post lab assignments due?")

'Post-Lab assignments are due one week after the completion of the experiment by the start time of your lab period. If the due date coincides with a University Holiday, an extension will be applied. However, all submissions will not be accepted after 11:59 PM on Tuesday, December 9th, 2025.'

In [26]:
import os
from dotenv import load_dotenv
# Load .env again before the Pinecone store is created.
# NOTE(review): presumably this makes the Pinecone credentials available — confirm.
load_dotenv()

from langchain_pinecone import PineconeVectorStore

index_name = "5234-rag-index"

# Build the Pinecone-backed vector store from the same chunks and embedding
# model used for the in-memory store.
pinecone = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=index_name,
)

In [27]:
# Ask the index for exactly the three closest chunks instead of fetching the
# default number of results and slicing the list afterwards.
pinecone.similarity_search("Where will the real lab books be given to students?", k=3)

[Document(metadata={'source': 'context.txt'}, page_content='are required to purchase access to the current version of the lab manual through the Van -Griner website or \nthe University Bookstore. Physical manuals will be distributed in lab. \n \nVAN-GRINER / LABRIGHT: Your purchase of the digital lab manual includes access to a digital lab manual, Pre-\nLabs and Post-Labs on the LabRight platform as well as a physical lab manual you will receive in lab. LabRight Post-\nLabs are used with each experiment to grade your data, calculations, and Post -Lab questions. Students who opted \nout of CarmenBooks will be required to complete the LabRight registration form at van-griner.com/osu-chem before \nSeptember 5th to ensure no lapse in access. Upon registering here, Van-Griner will send you a purchase link and you \nwill need to complete payment. On September 9th, any student who opted out of CarmenBooks and has not registered \nand paid will lose access to LabRight. To re- gain access (and 

In [28]:
# Same question-answering pipeline, now retrieving context from Pinecone.
chain = (
    {
        # Retrieved Document objects are flattened to their text so the prompt
        # receives plain context rather than the repr of a list of Documents.
        "context": pinecone.as_retriever()
        | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | parser
)

chain.invoke("What time are post lab assignments due?")

'Post-Lab assignments are due one week after the completion of the experiment by the start time of your lab period. Additionally, no Pre-Labs, Post-Labs, or notebook uploads will be accepted after 11:59 PM on Tuesday, December 9th, 2025.'