In [None]:
import os

In [None]:
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv() into the process
# environment; the return value is bound to `_` so the cell displays nothing.
_ = load_dotenv(find_dotenv())

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain_community.vectorstores.docarray.in_memory import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain_openai import OpenAI

In [None]:
file = "../data/recipe_page_1.csv"
# The recipe CSVs carry their own header row (the pandas load further down
# selects columns by these same names), so the loader takes its field names
# from that header. Passing `fieldnames` explicitly makes csv.DictReader treat
# the header line as a data row, which yields a bogus first document.
# NOTE(review): if the file has columns beyond the four recipe fields, they
# will now show up in page_content under their own names — confirm that is fine.
loader = CSVLoader(file_path=file)

In [None]:
# loader.load()

In [None]:
from langchain.indexes import VectorstoreIndexCreator

In [None]:
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

# Embed with the default HuggingFace model and build an in-memory vector index
# over every document produced by `loader`.
embeddings = HuggingFaceEmbeddings()
index_creator = VectorstoreIndexCreator(
    embedding=embeddings,
    vectorstore_cls=DocArrayInMemorySearch,
)
index = index_creator.from_loaders([loader])

### Ollama instruction
- Install ollama: `curl -fsSL https://ollama.com/install.sh | sh`
- Then run `ollama run <model name>`, where the model name is one of those listed at `https://ollama.com/library`.

In [None]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain_community.llms import Ollama

# llm = HuggingFaceEndpoint(
#     repo_id="stabilityai/stablelm-3b-4e1t", #"meta-llama/Meta-Llama-3-8B",
#     task="text-generation",
#     max_new_tokens=512,
#     do_sample=False,
#     repetition_penalty=1.03,
#     timeout=600
# )

# Question answered end-to-end by the index: retrieval plus generation in one call.
q = "Provide all the recipe names, its recipe and all the ingrediants that uses besan. Do not provide or generate anything else."

# Ollama is already imported at the top of this cell, so it is not re-imported here.
llm = Ollama(model="llama3")
response = index.query(q, llm=llm)
display(Markdown(response))

## Detailed Step by Step

### Data Loader

In [None]:
import pandas as pd

# Read every recipe page first, then combine them with a single concat.
# Growing a DataFrame inside the loop re-copies all accumulated rows on each
# iteration, and concatenating onto an empty frame can disturb dtypes.
recipe_frames = [
    pd.read_csv(
        f"../data/recipe_page_{i}.csv",
        usecols=["recipe_name", "ingredients", "recipe", "tags"],
    )
    for i in range(1, 39)
]
csv_df = pd.concat(recipe_frames, ignore_index=True)
print(csv_df.shape)
csv_df.head()

In [None]:
# Shuffle rows. A fixed seed keeps the row order (and therefore `docs[0]` and
# the order in which documents reach the vector store) identical across
# kernel restarts, so the notebook's outputs are reproducible.
csv_df = csv_df.sample(frac=1, random_state=42).reset_index(drop=True)
csv_df.head()

In [None]:
# from https://github.com/langchain-ai/langchain/issues/12601
# Modified to create dictionary with column name and value
from typing import Any, Iterator, List, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class BaseDataFrameLoader(BaseLoader):
    """Turn each row of a DataFrame-like object into a `Document`.

    The configured content column(s) are rendered as ``name:value`` text joined
    by single spaces; every remaining column of the row becomes metadata.
    """

    def __init__(self, data_frame: Any, *, page_content_column: Union[str, List[str]] = "text"):
        """Initialize with dataframe object.

        Args:
            data_frame: DataFrame object.
            page_content_column: Name of the column or list of column names containing the page content.
              Defaults to "text".
        """
        self.data_frame = data_frame
        self.page_content_column = page_content_column

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one `Document` per row of the dataframe.

        Yields:
            Document whose ``page_content`` is the ``name:value`` rendering of
            the content column(s) and whose ``metadata`` holds the other columns.
        """
        # Normalise to a list so the single-column and multi-column cases share
        # one code path. The single-column branch used to reference an unbound
        # `col` variable and failed on the first row.
        if isinstance(self.page_content_column, list):
            content_columns = self.page_content_column
        else:
            content_columns = [self.page_content_column]

        for _, row in self.data_frame.iterrows():
            text = ' '.join(f'{col}:{row[col]}' for col in content_columns)
            # Whatever is not part of the page content is kept as metadata.
            metadata = row.to_dict()
            for col in content_columns:
                metadata.pop(col, None)
            yield Document(page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """Load full dataframe."""
        return list(self.lazy_load())


class DataFrameLoader(BaseDataFrameLoader):
    """Document loader backed by a `pandas.DataFrame`."""

    def __init__(self, data_frame: Any, page_content_column: Union[str, List[str]] = "text"):
        """Validate the input and hand it to the base loader.

        Args:
            data_frame: The pandas DataFrame whose rows become documents.
            page_content_column: A column name, or a list of column names, whose
              values make up the page content. Defaults to "text".

        Raises:
            ImportError: If pandas cannot be imported.
            ValueError: If ``data_frame`` is not a pandas DataFrame.
        """
        # pandas is imported lazily so a missing install gives an actionable message.
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "Unable to import pandas, please install with `pip install pandas`."
            ) from exc

        if isinstance(data_frame, pd.DataFrame):
            super().__init__(data_frame, page_content_column=page_content_column)
            return

        raise ValueError(
            f"Expected data_frame to be a pd.DataFrame, got {type(data_frame)}"
        )

In [None]:
# from langchain_community.document_loaders.dataframe import DataFrameLoader
# loader = CSVLoader(file_path=file, csv_args={"fieldnames": ["recipe_name", "ingredients", "recipe"]})
# Build one Document per recipe row: the four listed columns are rendered into
# page_content as "column:value" pairs and removed from the metadata.
loader = DataFrameLoader(csv_df, page_content_column=["recipe_name", "ingredients", "recipe", "tags"])
docs = loader.load()
# Peek at the first document to sanity-check the formatting.
docs[0]

### Embeddings

##### TODO:
- Try out the OpenAI ada embedding model: https://platform.openai.com/docs/guides/embeddings/frequently-asked-questions
- See https://github.com/langchain-ai/langchain/issues/2442
- Try the FAISS, Pinecone and Milvus vector databases.
- Check different document formats.

In [None]:
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
# Embedding model (library-default HuggingFace settings) used to build the vector store below.
embedding_model = HuggingFaceEmbeddings()

# individual_docs = []
# for d in docs:
#     individual_docs.append(d.page_content)

# custom_embed = embedding_model.embed_documents(individual_docs)
# len(custom_embed), len(custom_embed[0])

In [None]:
# from langchain_openai.embeddings import OpenAIEmbeddings
# os.environ["OPENAI_API_KEY"] = '<KEY>'
# embedding_model = OpenAIEmbeddings()

### Vector Store

<b> Doc retrieval improvements </b>

- https://archive.ph/yn9Lr

In [None]:
from langchain_chroma import Chroma
# db = DocArrayInMemorySearch.from_documents(
#     docs, 
#     embedding_model
# )
# db = Chroma.from_documents(docs, embedding_model)

In [None]:
from langchain_community.vectorstores import FAISS
# Embed every recipe document with `embedding_model` and index them in a FAISS store.
db = FAISS.from_documents(docs, embedding_model)

In [None]:
# Other queries tried during exploration:
#   "Show me all recipes which uses Yellow moong"
#   "Show me all shaak recipe"
#   "Show me all dhokla recipe"
#   "Recipes which has besan as ingredients"
# Only the active query is assigned; the earlier assignment was overwritten
# immediately and never used.
query = "Show me all shaak recipes dish. Must avoid to look at word which are not shaak or shak"

In [None]:
# Fetch the 4 closest matches for `query`; the document is element 0 of each hit
# (presumably paired with its relevance score, going by the method name).
searched_docs = db.similarity_search_with_relevance_scores(query, k=4)

In [None]:
# Dump each retrieved hit, followed by a divider line.
for hit in searched_docs:
    print(hit, "----", sep="\n")

In [None]:
# Retriever configured for threshold-based search with a score threshold of 0.45.
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.45})
# Run it once to inspect what comes back for `query`.
retriever.invoke(query)

### Generation using LLMs

In [None]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain_community.llms import Ollama

# llm = HuggingFaceEndpoint(
#     repo_id="stabilityai/stablelm-3b-4e1t", #"meta-llama/Meta-Llama-3-8B",
#     task="text-generation",
#     max_new_tokens=512,
#     do_sample=False,
#     repetition_penalty=1.03,
#     timeout=600
# )

# q = "Provide all the recipe names, its recipe and all the ingrediants that uses besan. Do not provide or generate anything else."
# LLM used for the generation steps below: the llama3 model via Ollama, temperature 0.5.
llm = Ollama(model="llama3", temperature=0.5)
# response = index.query(q, llm=llm)
# display(Markdown(response))

# from langchain_openai import OpenAI
# llm = OpenAI(model="gpt-3.5-turbo-instruct")

In [None]:
# Stitch the retrieved recipe texts together and append the question to form the prompt.
# NOTE(review): the texts are concatenated with no separator, so one recipe's last
# field runs straight into the next recipe's first field — confirm this is intended.
qdocs = "".join(hit[0].page_content for hit in searched_docs)
final_query = f"{qdocs} Question: {query} in a table in markdown and summarize each one"

In [None]:
# Send the assembled prompt to the LLM and keep its reply.
response = llm.invoke(final_query)

In [None]:
# Render the reply as Markdown in the cell output.
display(Markdown(response))

### Langchain

In [None]:
from langchain.chains.retrieval_qa.base import RetrievalQA
import langchain

langchain.debug = False

# Threshold-based retriever (score threshold 0.5) that feeds documents to the chain.
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.5})
# Wrap the retrieve-then-generate steps above in a single LangChain chain.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    verbose=True,
    retriever=retriever,
)

# Hallucinating with similar-looking shak recipes, like Shakarpara. But this is a very wide query.
# prompt = "Show me all shaak recipes dish. Must avoid to look at word which are not shaak or shak. Do not include Shakarpara"

# prompt = "Show me all recipes which uses Yellow moong"

# prompt = "I have dudhi/doodhi at home. Can you find me a recipe?" # Hallucinating big time!
# prompt = "I am craving for dosa. Can you show me recipes?" # which are not farali or for upvas?"
prompt = "Can you share recipe for onion, potato and peas. Don't include shaak or sabji?"
question = prompt + " List all of the ingredients and the whole recipe to make from the document. Present in good format"
# `Chain.run` is deprecated; `invoke` takes the chain's input dict (RetrievalQA
# reads the "query" key) and returns a dict whose "result" entry is the answer text.
response = qa_stuff.invoke({"query": question})["result"]
display(Markdown(response))