# RAG : MultiTypes Loads in Langchain

In [1]:
from langchain_community.llms import Ollama
llm = Ollama(model="llama3")

In [2]:
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

## GENERATION NO CONTEXT

In [3]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt
template_1 = """
Question: {question}
"""

prompt_1 = ChatPromptTemplate.from_template(template_1)

In [4]:
retrieval_rag_chain_1 = (
    {"question":RunnablePassthrough()}
    | prompt_1
    | llm
    | StrOutputParser()
)

In [5]:
# question = 'Which team has won the soccer world cup in 1950?'

question = 'Which team has won the football world cup in 2026?'
response = retrieval_rag_chain_1.invoke(question)
print(response)

I'm afraid I don't have that information yet! The 2026 FIFA World Cup hasn't taken place yet, so no team has won it. But I'll be happy to provide you with updates and results once the tournament is held!


## INDEXING

In [6]:
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, CSVLoader, TextLoader

loader_web = WebBaseLoader("http://localhost/football.html")
loader_pdf = PyPDFLoader("loaders/football.pdf")
loader_csv = CSVLoader('loaders/football.csv')
loader_txt = TextLoader('loaders/football.txt')

loader_all = MergedDataLoader(loaders=[loader_web, loader_pdf, loader_txt, loader_csv])

all_docs = loader_all.load()

len(all_docs)

5

In [7]:
# print(all_docs)
for doc in all_docs:
    print(doc)
    print()

page_content='\n\n\n\n\nFootball World Cup 2038\n\n\n    The 2038 football world cup winner is Bahamas.\n    The 2039 football world cup winner is Germany.\n\n' metadata={'source': 'http://localhost/football.html', 'title': 'Football World Cup 2038', 'language': 'en'}

page_content='Fiji won the football World Cup in 2034.France won the football World Cup in 2035.' metadata={'source': 'loaders/football.pdf', 'page': 0}

page_content='The football world cup winner in year 2030 was Samoa.\nThe football world cup winner in year 2031 was Spain.' metadata={'source': 'loaders/football.txt'}

page_content='Competition;Winner;Year: Football World Cup;Jamaica;2026' metadata={'source': 'loaders/football.csv', 'row': 0}

page_content='Competition;Winner;Year: Football World Cup;USA;2027' metadata={'source': 'loaders/football.csv', 'row': 1}



In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
documents = text_splitter.split_documents(all_docs)

In [9]:
vectorstore = Chroma.from_documents(documents, embedding=GPT4AllEmbeddings())

In [10]:
len(vectorstore)

5

## RETRIEVAL

In [11]:
retriever = vectorstore.as_retriever()

## GENERATION

In [24]:
question = f"Which team has won the football world cup in 2038 ?"

In [25]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt
template = """
Answer the question based only on the following context:
{context}
The answer has to specify the source document used for the answer.

if you can not answer based on the context you must \
always respond:
"I can't answer the question from the given context" 

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [26]:
retrieval_rag_chain = (
    {"context":retriever, "question":RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [27]:
response = retrieval_rag_chain.invoke(question)
print(response)

Based on the provided context, I can answer the question.

The winner of the football World Cup in 2038 is Bahamas, according to the document with metadata {'language': 'en', 'source': 'http://localhost/football.html', 'title': 'Football World Cup 2038'}.

Source: Document(page_content='Football World Cup 2038\n\n\n    The 2038 football world cup winner is Bahamas.\n    The 2039 football world cup winner is Germany.', metadata={'language': 'en', 'source': 'http://localhost/football.html', 'title': 'Football World Cup 2038'})
