In [1]:
from langchain.document_loaders import DirectoryLoader



# Ingest Data

In [2]:
# Directory scanned for the source PDFs. It is a relative path, so it is
# resolved against the current working directory of the kernel.
DATA_PATH = '../data/pdf'

In [3]:
def load_documents(data_path=None, glob="*.pdf"):
    """Load the files matching ``glob`` under a directory via ``DirectoryLoader``.

    Args:
        data_path: Directory to scan. When omitted, the notebook-level
            ``DATA_PATH`` is used; it is looked up at call time so that
            re-assigning the constant in a later cell is honoured.
        glob: Filename pattern passed to ``DirectoryLoader``. Defaults to
            ``"*.pdf"``, which matches the original behaviour.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    if data_path is None:
        data_path = DATA_PATH
    loader = DirectoryLoader(data_path, glob=glob)
    return loader.load()

In [4]:
docs = load_documents()
print(f"Number of documents: {len(docs)}")
# Fail fast with a readable message rather than an IndexError on docs[0]
# when nothing was loaded from the data directory.
assert docs, f"No PDF documents found under {DATA_PATH!r}"
print(docs[0].metadata)

Number of documents: 1
{'source': '../data/pdf/sweden-visa-extension-2024.pdf'}


# Create Vector Embeddings Database

In [5]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [6]:
# Chunking configuration: target chunk size of 500 with an overlap of 50
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)

In [7]:
# Split the loaded documents into chunks with the splitter configured above.
chunks = text_splitter.split_documents(docs)
# Bare expression: the chunk count is shown as the cell's output.
len(chunks)

18

In [8]:
# Embedding backend for the chunks: the "llama3" model served by Ollama,
# with its progress bar switched on.
chunk_embedder = OllamaEmbeddings(model="llama3", show_progress=True)

# Build the "local_rag" Chroma collection from the chunks.
# NOTE(review): re-running this cell in the same kernel may add the same chunks
# to the existing collection a second time — confirm against the Chroma docs.
vector_db = Chroma.from_documents(
    collection_name="local_rag",
    embedding=chunk_embedder,
    documents=chunks,
)

OllamaEmbeddings: 100%|██████████| 18/18 [00:50<00:00,  2.78s/it]


# Retrieval

In [24]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [25]:
# Name of the model handed to ChatOllama below.
local_model = "llama3"
# Chat model instance shared by the multi-query retriever and the RAG chain.
llm = ChatOllama(model=local_model)


In [26]:
# Prompt that asks the LLM to rewrite the user's question into several
# alternative phrasings, one per line.
query_prompt = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Wrap the vector store's own retriever in a multi-query retriever driven by
# the chat model and the rewriting prompt above.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    llm=llm,
    prompt=query_prompt,
    retriever=base_retriever,
)

# RAG prompt: the answer must be grounded in the supplied context only.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

In [27]:
def format_docs(documents):
    """Join the text of the retrieved documents into a single context string.

    Args:
        documents: Iterable of retrieved documents; only each item's
            ``page_content`` attribute is read.

    Returns:
        The page contents joined with blank lines between them.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# Pipe the retriever through format_docs so the {context} slot of the prompt
# receives plain text instead of the Python repr of the retrieved document
# list (which would also drag each document's metadata into the prompt).
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [28]:
# Broad sanity-check question; the returned answer string is the cell output.
chain.invoke("What is this document about?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.90it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.89it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]


"Based on the provided context, this document appears to be related to instructions and checklists for attestation of studies in Sweden, specifically for students who need to extend their residence permit for studies. The documents seem to focus on providing guidelines for universities and colleges in Sweden to attest to a student's progress in their studies, with the aim of supporting the student's application for a residence permit extension."

In [29]:
# More specific question to see how the chain answers from the retrieved context.
chain.invoke("How should a student apply for an extension?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.52it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.85it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.98it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.70it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.72it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  4.07it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [

'Based on the provided context, there is no direct information on how a student applies for an extension. However, it can be inferred that the student would need to attach relevant documents, such as their transcript of records from Ladok including a presentation of course modules (completed and partially completed), when applying for a residence permit. Additionally, the Swedish Migration Agency may grant residence permits for studies outside the regular programme length if the reason for delay is known but not confirmed by the higher education institution.'