# LangChain query pipeline for Riksdag data

In [None]:
# Install the notebook's dependencies into the kernel environment.
# The first three installs use compatible-release (~=) version specifiers.
%pip install python-dotenv~=1.0 docarray~=0.40.0 pypdf~=5.1 --upgrade --quiet
%pip install chromadb~=0.5.18 sentence-transformers~=3.3 --upgrade --quiet
%pip install langchain~=0.3.7 langchain_openai~=0.2.6 langchain_community~=0.3.5 langchain-chroma~=0.1.4 langchainhub~=0.1.21 --upgrade --quiet

# NOTE(review): unlike the installs above, these two packages carry no version specifier,
# so a fresh environment may resolve to different versions over time — consider pinning them.
%pip install --upgrade --quiet azure-search-documents azure-identity
# %pip install sentence-transformers # For HuggingFaceEmbeddings


In [None]:
from langchain_community.vectorstores.azuresearch import AzureSearch
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
)
from langchain_core.documents import Document
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
#from langchain_openai import ChatOpenAI
#from langchain_anthropic import ChatAnthropic
#from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
#from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
#from langchain_community.llms import GPT4All #pip install gpt4all
from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseChatModel
from langchain_openai import AzureChatOpenAI
import os


### Load environment

In [None]:
from dotenv import load_dotenv, find_dotenv
# find_dotenv() locates the .env file and load_dotenv() reads it. The result is bound to `_`
# so the cell does not echo the return value. Later cells read their Azure settings via os.getenv.
_ = load_dotenv(find_dotenv()) # read local .env file

## Setup embedding model and vector store

In [None]:
# Index selection. Earlier endpoint / index / credential alternatives are kept commented out for reference.
#vector_store_address: str = "https://gm-dev-ai-search-lab.search.windows.net"
#vector_store_address: str = "https://ai-search-labb-apr18.search.windows.net"
#index_name: str = "riksdagen_index_uae" # Hugging face (WhereIsAI/UAE-Large-V1)
# NOTE(review): no active code visible in this notebook reads index_name; the vector store cell takes
# its index name from the AZURE_SEARCH_INDEX environment variable instead — confirm which is intended.
index_name: str = "tokoy24-riksdagen" # Hugging face (intfloat/multilingual-e5-large-instruct)
#index_name: str = "riksdagen_index_openai-l"
#vector_store_password: str = os.environ["AZURE_AI_SEARCH_KEY"]
#vector_store: AzureSearch
#splitter: RecursiveCharacterTextSplitter

#### Option 1 - use Azure OpenAI embeddings

In [None]:
# Previous (non-Azure) OpenAI configuration, kept for reference:
# openai_embeddings_model: str = "text-embedding-3-small"
# embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model=openai_embeddings_model)

# Active configuration: Azure OpenAI embeddings using the "text-embedding-3-large" model name.
# `embeddings` is passed to the vector store as its embedding function and is also called once
# there (embed_query) to determine the vector dimension of the index.
openai_embeddings_model: str = "text-embedding-3-large"
embeddings: Embeddings = AzureOpenAIEmbeddings(model=openai_embeddings_model)

#### Option 2 - use HuggingFace embeddings (downloaded from HuggingFace model hub)

In [None]:
# See https://huggingface.co/spaces/mteb/leaderboard
# embedding_function: Embeddings = HuggingFaceEmbeddings(
#     model_name="intfloat/multilingual-e5-large-instruct" # 560M params, 2.09GB mem use, 1024 dim, 514 tokens, 63.61 avg score
#     #model_name="avsolatorio/GIST-all-MiniLM-L6-v2" # 23M params, 0.08GB mem use, 384 dim, 512 tokens, 59 avg score
#     #model_name="Salesforce/SFR-Embedding-2_R" # 7B params, 26GB mem use, 4096 dim, 32k tokens, 70.32 avg score
#     #model_name="nvidia/NV-Embed-v2" # 7B params, 29GB mem use, 4096 dim, 32k tokens, 72.31 avg score
# )
# print(f"Using HuggingFace model: {hf_embeddings_model}")

In [None]:
#embedding_function = embeddings.embed_query

#### Setup Vector Store (Azure AI Search) with custom fields

In [None]:
# fields = [
#     # Default fields
#     SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True),
#     SearchableField(name="content", type=SearchFieldDataType.String, searchable=True),
#     SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
#                 searchable=True,
#                 vector_search_dimensions=len(embedding_function("Text")),
#                 vector_search_profile_name="myHnswProfile",
#                 ),
#     SearchableField(name="metadata", type=SearchFieldDataType.String, searchable=True),
#     # Custom fields
#     SearchableField(name="type",type=SearchFieldDataType.String, searchable=True, filterable=True, sortable=True),
#     SearchableField(name="from_person",type=SearchFieldDataType.String, searchable=True, filterable=True, sortable=True),
#     SearchableField(name="from_party",type=SearchFieldDataType.String, searchable=True, filterable=True, sortable=True),
#     SearchableField(name="to_person",type=SearchFieldDataType.String, searchable=True, filterable=True, sortable=True),
#     SearchableField(name="to_party",type=SearchFieldDataType.String, searchable=True, filterable=True, sortable=True),
# ]

In [None]:
# Progress message only: the constructor call in this cell is commented out, so nothing is
# initialized here. The active vector store is created in the following cell.
print("Initializing Vector Store...")

# Earlier vector store construction, kept for reference. The names it relies on
# (vector_store_address, vector_store_password, fields) are themselves commented out
# elsewhere in this notebook, so it cannot be re-enabled as-is.
# vector_store = AzureSearch(
#     azure_search_endpoint=vector_store_address,
#     azure_search_key=vector_store_password,
#     index_name=index_name,
#     embedding_function=embeddings.embed_query,
#     #search_type="similarity", # Using hybrid, which is default
#     fields=fields,
# )

In [None]:
import os

from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    ComplexField,
)
from langchain_community.vectorstores import AzureSearch
from langchain_community.vectorstores.azuresearch import (
    FIELDS_ID,
    FIELDS_CONTENT,
    FIELDS_CONTENT_VECTOR,
    FIELDS_METADATA, AzureSearchVectorStoreRetriever,
)

# Analyzer name passed as `analyzer_name` to the language-analyzed text fields of the index schema.
# NOTE(review): presumably the Swedish Lucene analyzer of Azure AI Search — confirm against the service docs.
LANGUAGE = "sv.lucene"

# Names of the custom top-level document fields in the index schema.
DOCUMENT_ID: str = "document_id"
DOCUMENT_NAME: str = "document_name"
DOCUMENT_DATE: str = "document_date"
# Stakeholder complex collection field and the names of its sub-fields.
DOCUMENT_STAKEHOLDER: str = "document_stakeholder"
DOCUMENT_STAKEHOLDER_ROLE: str = "document_stakeholder_role"
DOCUMENT_STAKEHOLDER_NAME: str = "document_stakeholder_name"
DOCUMENT_STAKEHOLDER_PARTY: str = "document_stakeholder_party"
# Debate complex collection field and the names of its sub-fields.
DOCUMENT_DEBATE: str = "debate"
DOCUMENT_DEBATE_SPEAKING_PARTY: str = "debate_speaking_party"
DOCUMENT_DEBATE_SPEAKER: str = "debate_speaker"

# --- Index schema -----------------------------------------------------------
# The vector dimension is discovered by embedding a short sample string with the
# configured embedding model (this issues one embedding request).
embedding_dimensions = len(embeddings.embed_query("Text"))

# Standard vector-store fields: key, text content, embedding vector and metadata.
core_fields = [
    SimpleField(name=FIELDS_ID, type=SearchFieldDataType.String, key=True, filterable=True),
    SearchableField(name=FIELDS_CONTENT, type=SearchFieldDataType.String),
    SearchField(
        name=FIELDS_CONTENT_VECTOR,
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=embedding_dimensions,
        vector_search_profile_name="myHnswProfile",
    ),
    SearchableField(name=FIELDS_METADATA, type=SearchFieldDataType.String),
]

# Custom top-level document fields; only the document name uses the LANGUAGE analyzer.
document_fields = [
    SearchableField(name=DOCUMENT_ID, type=SearchFieldDataType.String, filterable=True),
    SearchableField(
        name=DOCUMENT_NAME,
        type=SearchFieldDataType.String,
        filterable=True,
        analyzer_name=LANGUAGE,
    ),
    SearchableField(name=DOCUMENT_DATE, type=SearchFieldDataType.String, filterable=True),
]

# Complex collection of stakeholders; every sub-field is a filterable, language-analyzed string.
stakeholder_field = ComplexField(
    name=DOCUMENT_STAKEHOLDER,
    collection=True,
    fields=[
        SearchableField(
            name=sub_field_name,
            type=SearchFieldDataType.String,
            filterable=True,
            analyzer_name=LANGUAGE,
        )
        for sub_field_name in (
            DOCUMENT_STAKEHOLDER_ROLE,
            DOCUMENT_STAKEHOLDER_NAME,
            DOCUMENT_STAKEHOLDER_PARTY,
        )
    ],
)

# Complex collection of debate entries, built the same way as the stakeholder collection.
debate_field = ComplexField(
    name=DOCUMENT_DEBATE,
    collection=True,
    fields=[
        SearchableField(
            name=sub_field_name,
            type=SearchFieldDataType.String,
            filterable=True,
            analyzer_name=LANGUAGE,
        )
        for sub_field_name in (
            DOCUMENT_DEBATE_SPEAKING_PARTY,
            DOCUMENT_DEBATE_SPEAKER,
        )
    ],
)

# --- Vector store -----------------------------------------------------------
# Endpoint, key and index name are read from the environment (os.getenv yields None when unset).
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
    azure_search_key=os.getenv("AZURE_SEARCH_KEY"),
    index_name=os.getenv("AZURE_SEARCH_INDEX"),
    embedding_function=embeddings,
    fields=[*core_fields, *document_fields, stakeholder_field, debate_field],
)

# Retriever over the vector store, configured with k=10.
# TODO: Evaluate if score threshold should be used
retriever: AzureSearchVectorStoreRetriever = vector_store.as_retriever(k=10)


### Setup the LLM to use

In [None]:
# Chat model selection. Exactly one provider is active (Azure OpenAI); the other
# providers are kept as commented-out alternatives.

# OpenAI
# llm = ChatOpenAI(
#     #model_name="gpt-3.5-turbo",
#     model_name="gpt-4-turbo",
#     temperature=0.7,
# )

# Azure OpenAI
# Active configuration: deployment "gpt-4o" with temperature 0.0 and an explicit API version.
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
api_version = "2024-10-01-preview"
llm = AzureChatOpenAI(deployment_name="gpt-4o", temperature=0.0, openai_api_version=api_version)

# Ollama
# llm = Ollama(model="llama3", verbose=True)

# Anthropic (Claude)
# llm = ChatAnthropic(
#     model='claude-3-opus-20240229',
#     temperature=0.0,
# )

# Models from HuggingFace
# llm = HuggingFacePipeline.from_model_id(
#     #model_id="microsoft/phi-2",
#     #model_id="macadeliccc/laser-dolphin-mixtral-2x7b-dpo",
#     model_id="mistralai/Mistral-7B-Instruct-v0.2",
#     #model_id="microsoft/Phi-3-mini-4k-instruct",
#     task="text-generation",
#     pipeline_kwargs={"max_new_tokens": 10},
# )

# GPT4All (if running locally, and GPT4All is installed)
# NOTE(review): the example below points at an absolute path on one developer's machine;
# it would need to be made configurable before being re-enabled.
# llm = GPT4All(
#     model="/Users/tobias/Library/Application Support/nomic.ai/GPT4All/Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf",
#     max_tokens=2048,
# )
print("Initialized LLM")

### Setup the prompt template for the QA chain

In [None]:
# Build prompt
# The template text is in Swedish and exposes two placeholders, {context} and {question},
# each wrapped in a triple-backtick fence. It instructs the model to answer from the context,
# to say so when it does not know, to be concise but complete, and to always answer in Swedish.
template = """Använd följande kontext för att svara på frågan i slutet. Om du inte vet svaret, säg bara att du inte vet, försök inte hitta på ett svar. Håll svaret så kortfattat som möjligt, men säkerställ att svaret innehåller all information som krävs för att besvara frågan. Svara alltid på svenska. 
    Kontext:
    ```    
    {context}
    ```
    
    Fråga:
    ``` 
    {question}
    ```
    
    Hjälpfullt svar:"""

# Wrap the raw string in a PromptTemplate so it can be composed into the chain.
prompt = PromptTemplate.from_template(template)

### Setup the QA retrieval chain

In [None]:
# NOTE(review): doc_count is only referenced from a commented-out line in the chain cell;
# the active retriever is created with a literal k=10 instead.
doc_count = 10

# Question sent to the QA chain; alternative test questions are kept commented out.
#query = "Jag är en liten kaffekanna som vill bli en stor kaffekanna. Hur blir jag en stor kaffekanna?"
query = "Vilka åtgärder föreslås för att förbättra cybersäkerheten?"
#query = "Finns det motsägelsefulla uttalanden kring NATO av Magdalena Andersson?"

#### Build the chain with LCEL (LangChain Expression Language)

In [None]:
def format_docs(docs):
    """Render retrieved documents as a single context string for the prompt.

    Each document contributes a "Dokumenttext:" label line followed by its
    ``page_content``; the per-document sections are joined with a newline.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The concatenated text, or an empty string when ``docs`` is empty.
    """
    # Richer variant that also includes metadata, kept for reference:
    # f"Titel: {doc.metadata['title']}\nFörfattare: {doc.metadata['from_person']}\nParti: {doc.metadata['from_party']}\nDokumenttext:\n{doc.page_content}"
    sections = []
    for document in docs:
        sections.append(f"Dokumenttext:\n{document.page_content}")
    return "\n".join(sections)

# Fan the incoming question out into the two variables the prompt expects:
#   context  - documents fetched by the retriever, rendered to text by format_docs
#   question - the original input, passed through unchanged
# Alternative context source, kept for reference:
#   vector_store.as_retriever(search_kwargs={"k": doc_count}) | format_docs
chain_inputs = RunnableParallel(
    context=retriever | format_docs,
    question=RunnablePassthrough(),
)

# Prompt -> chat model -> plain string output.
qa_chain = chain_inputs | prompt | llm | StrOutputParser()

In [None]:
# Run the chain on `query` and print the result.
print("Running QA chain...")
result = qa_chain.invoke(query)
print(result)