In [1]:
import os
import sys

# base_dir is the directory the notebook runs from; curr_dir (despite its name)
# is that directory's parent. The parent is put on sys.path once so modules that
# live next to the notebook folder can be imported by later cells.
base_dir = os.path.abspath('')
curr_dir = os.path.dirname(base_dir)
if curr_dir not in sys.path:
    sys.path.append(curr_dir)

In [2]:
from glob import glob
from langchain_core.runnables import RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from chains.utils import format_docs
from prompt import BASIC_CHAT_TEMPLATE
from langchain_core.runnables import RunnablePassthrough
from langchain_chroma import Chroma

In [3]:
# Collect the text files whose names match the stevens.edu off-campus-employment
# pages. glob() returns matches in arbitrary, OS-dependent order, so the result is
# sorted to keep the order of everything derived from it reproducible across machines.
files = sorted(glob("../../data/text files/*www.stevens.edu off-campus-employment*"))
files

['../../data/text files\\https_  www.stevens.edu off-campus-employment isss-f1-students cap-gap-extension.txt',
 '../../data/text files\\https_  www.stevens.edu off-campus-employment isss-f1-students cpt-work-authorization.txt',
 '../../data/text files\\https_  www.stevens.edu off-campus-employment isss-f1-students opt-work-authorization.txt',
 '../../data/text files\\https_  www.stevens.edu off-campus-employment isss-f1-students other-off-campus-employment.txt',
 '../../data/text files\\https_  www.stevens.edu off-campus-employment isss-f1-students stem-extension.txt',
 '../../data/text files\\https_  www.stevens.edu off-campus-employment-0.txt']

In [4]:
# Load the full contents of every matched file, one string per file, keeping
# the same order as `files`.
# NOTE(review): no encoding is passed to open(), so the platform default is used;
# confirm how the text files were written before pinning one.
out = []
for path in files:
    with open(path, "r") as handle:
        text = handle.read()
    out.append(text)

In [8]:
import pandas as pd

# Save the loaded page texts as a CSV with a single "text" column
# (pandas also writes its default integer index as the first column).
off_campus_frame = pd.DataFrame({"text": out})
off_campus_frame.to_csv("../data/off_campus.csv")

In [15]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

In [16]:
# Embedding model shared by indexing and retrieval.
# NOTE(review): trust_remote_code=True lets the model repo run its own code on
# load; confirm this model actually needs it.
# Add device="cpu" to model_kwargs to force CPU inference.
model_kwargs = dict(trust_remote_code=True)
# Embeddings are normalized at encode time.
encode_kwargs = dict(normalize_embeddings=True)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [8]:
# Wrap each loaded text in a Document. `docs` is only consumed by the one-off
# index build kept (commented out) below; the active line opens the named
# collection from the persist directory instead.
# NOTE(review): `Chroma` is imported from both langchain_chroma and
# langchain_community.vectorstores in this notebook; whichever import cell ran
# last is the class used here.
docs = [Document(page_content=page_text) for page_text in out]

# One-time index build:
# vector = Chroma.from_documents(docs, embeddings, collection_name="off_campus_employment", persist_directory="../data/chromadb")
vector = Chroma(
    collection_name="off_campus_employment",
    persist_directory="../data/chromadb",
    embedding_function=embeddings,
)

In [19]:
from langchain_openai import ChatOpenAI

# Chat model that writes the final answer; temperature=0 keeps sampling
# randomness to a minimum.
llm = ChatOpenAI(
    model="gpt-4-1106-preview",
    temperature=0,
)

In [24]:
# Retriever: similarity search over the vector store, top 5 hits per query.
retriever = vector.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=5),
)

# Answer step: replace the retrieved documents under "context" with the output
# of format_docs (project helper; presumably renders them as prompt text), then
# run prompt -> model -> plain-string parser.
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=(lambda inputs: format_docs(inputs["context"]))
    )
    | BASIC_CHAT_TEMPLATE
    | llm
    | StrOutputParser()
)

# Full chain: the incoming question goes to the retriever ("context") and is
# passed through unchanged ("question"); the generated text is then added under
# "answer" so the result keeps the source documents alongside it.
rag_chain_with_source = RunnableParallel(
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
).assign(answer=rag_chain_from_docs)



In [25]:
# Smoke-test the chain with a sample question; the cell displays the returned
# mapping, which includes the retrieved documents under "context".
cpt_response = rag_chain_with_source.invoke("What is CPT?")
cpt_response

{'context': [Document(page_content="Curricular Practical Training (CPT) is work authorization that is granted to eligible F-1 students so that they may complete the requirements of a course or the Cooperative (Co-Op) Education Program.\nStudents must meet the eligibility requirements listed below to apply for CPT work authorization:\nBe a full-time student in F-1 status for at least one academic year (one fall and one spring semester)\nBe in lawful F-1 status at the time of the CPT application\nGet approval from Academic Advisor\nBe enrolled in the practicum course necessitating practicum employment or the Cooperative Education Program (Co-op)\nHave a job offer that is directly related to your major\nHave a GPA of 3.0 or above (graduate students) or 2.0 or above (undergraduate students). Students should contact the Stevens Career Center for GPA requirements for the Cooperative Education Program.\nBe otherwise enrolled in a full course of study (9 credits for graduate students, 12 credi