In [None]:
import os
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain_community.vectorstores.docarray.in_memory import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain_openai import OpenAI

In [None]:
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment
# (presumably the API keys used by the OpenAI / LangSmith clients below - confirm).
# The return value is bound to `_` so the cell produces no output.
_ = load_dotenv(find_dotenv())

## Data Load

In [None]:
import pandas as pd

# Columns read from every recipe page.
RECIPE_COLUMNS = ["recipe_name", "ingredients", "recipe", "tags"]
# Fixed seed so the shuffled order (and everything derived from it) is
# identical after Restart & Run All.
SHUFFLE_SEED = 42

# Recipe pages are stored as ../data/recipe_page_1.csv ... recipe_page_38.csv.
# Read them all first and concatenate once: calling pd.concat inside the loop
# re-copies every previously accumulated row on each iteration.
recipe_pages = [
    pd.read_csv(f"../data/recipe_page_{i}.csv", usecols=RECIPE_COLUMNS)
    for i in range(1, 39)
]
csv_df = pd.concat(recipe_pages, ignore_index=True)
print(csv_df.shape)

# Shuffle rows
csv_df = csv_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
csv_df.head()

### Make Documents

In [None]:
# from https://github.com/langchain-ai/langchain/issues/12601
# Modified to create dictionary with column name and value
from typing import Any, Iterator, List, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class BaseDataFrameLoader(BaseLoader):
    """Turn each row of a dataframe into a `Document`.

    The page content is built as space-separated ``column:value`` pairs from the
    configured content column(s); every other column of the row is kept as
    metadata.
    """

    def __init__(self, data_frame: Any, *, page_content_column: Union[str, List[str]] = "text"):
        """Initialize with dataframe object.

        Args:
            data_frame: DataFrame object.
            page_content_column: Name of the column or list of column names containing the page content.
              Defaults to "text".
        """
        self.data_frame = data_frame
        self.page_content_column = page_content_column

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one `Document` per dataframe row.

        Yields:
            Document whose ``page_content`` is ``"col:value"`` for each content
            column (joined by a single space) and whose ``metadata`` holds the
            remaining columns of the row.
        """
        # Normalise the single-column case to a one-element list so both cases
        # share one code path. The previous single-column branch formatted the
        # text with an undefined name and raised NameError on the first row.
        if isinstance(self.page_content_column, list):
            content_columns = self.page_content_column
        else:
            content_columns = [self.page_content_column]

        for _, row in self.data_frame.iterrows():
            text = ' '.join(f'{col}:{row[col]}' for col in content_columns)
            metadata = row.to_dict()
            # Content columns are already in the text; keep them out of metadata.
            for col in content_columns:
                metadata.pop(col, None)
            yield Document(page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """Load full dataframe."""
        return list(self.lazy_load())


class DataFrameLoader(BaseDataFrameLoader):
    """Document loader backed by a `Pandas` DataFrame."""

    def __init__(self, data_frame: Any, page_content_column: Union[str, List[str]] = "text"):
        """Validate the input and delegate to the base loader.

        Args:
            data_frame: Pandas DataFrame object.
            page_content_column: Column name, or list of column names, whose
              values make up the page content. Defaults to "text".

        Raises:
            ImportError: If pandas is not installed.
            ValueError: If ``data_frame`` is not a ``pd.DataFrame``.
        """
        try:
            import pandas as pd
        except ImportError as e:
            install_hint = "Unable to import pandas, please install with `pip install pandas`."
            raise ImportError(install_hint) from e

        frame_is_valid = isinstance(data_frame, pd.DataFrame)
        if not frame_is_valid:
            actual_type = type(data_frame)
            raise ValueError(
                f"Expected data_frame to be a pd.DataFrame, got {actual_type}"
            )

        super().__init__(data_frame, page_content_column=page_content_column)

In [None]:
# Build one Document per recipe row. All four columns are used as page content,
# so they are removed from each document's metadata by the loader.
loader = DataFrameLoader(csv_df, page_content_column=["recipe_name", "ingredients", "recipe", "tags"])
docs = loader.load()
# Show the first document as a quick sanity check.
docs[0]

## Vector Store

### Embeddings

- `Alibaba-NLP/gte-large-en-v1.5` and OpenAI's `text-embedding-3-large` embedding models have worked best so far.
-  `Alibaba-NLP/gte-large-en-v1.5` is very slow because of its large size.
-  OpenAI's `text-embedding-3-large` can be expensive.

In [None]:
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
# Alternative Hugging Face embedding models, currently disabled; uncomment one
# (and comment out the OpenAI line below) to switch:
# embedding_model = HuggingFaceEmbeddings(model_name='Alibaba-NLP/gte-large-en-v1.5', model_kwargs=dict(trust_remote_code=True))
# embedding_model = HuggingFaceEmbeddings(model_name='Snowflake/snowflake-arctic-embed-m-long', model_kwargs=dict(trust_remote_code=True))

from langchain_openai.embeddings import OpenAIEmbeddings
# Active choice: OpenAI `text-embedding-3-large`.
embedding_model = OpenAIEmbeddings(model='text-embedding-3-large')

### Store

In [None]:
# db = DocArrayInMemorySearch.from_documents(
#     docs, 
#     embedding_model
# )
# db = Chroma.from_documents(docs, embedding_model)

In [None]:
from langchain_community.vectorstores import FAISS
# Build the FAISS vector store from the recipe documents using `embedding_model`.
db = FAISS.from_documents(docs, embedding_model)

In [None]:
# Example: fetch the 4 best-matching documents (with relevance scores) for one query
# query = "Show me all recipes which uses Yellow moong" #"Show me all shaak recipe" # "Show me all dhokla recipe" #"Recipes which has besan as ingredients"
# # query = "Show me all shaak recipes dish. Must avoid to look at word which are not shaak or shak"
query = "Can you share recipe for onion, potato and peas. Don't include shaak or sabji?"
searched_docs = db.similarity_search_with_relevance_scores(query, k=4)
for hit in searched_docs:
    # One result per block, followed by a separator line.
    print(hit, "----", sep="\n")

In [None]:

# Queries used to spot-check retrieval here and the QA chain further down.
queries = [
    "Show me all recipes which uses Yellow moong",
    "Show me all shaak recipe",
    "Show me all shaak recipes dish. Must avoid to look at word which are not shaak or shak",
    "Show me all dhokla recipe",
    "Recipes which has besan as ingredients",
    "I have dudhi/doodhi at home. Can you find me a recipe?",
    "I am craving for dosa. Can you show me recipes?",
    "Can you share recipe for onion, potato and peas. Don't include shaak or sabji?",
]
for q in queries:
    print(f"Query: {q}")
    # Use a loop-specific name so `searched_docs` (the results retrieved for
    # `query`) is not overwritten; a later cell combines `searched_docs` with
    # `query` and must see matching results regardless of this list's order.
    query_hits = db.similarity_search_with_relevance_scores(q, k=4)
    for d in query_hits:
        print(d)
        print("----")

## LangChain

### LLMs

In [None]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain_community.llms import Ollama

# Alternative 1 (disabled): hosted Hugging Face endpoint.
# llm = HuggingFaceEndpoint(
#     repo_id="stabilityai/stablelm-3b-4e1t", #"meta-llama/Meta-Llama-3-8B",
#     task="text-generation",
#     max_new_tokens=512,
#     do_sample=False,
#     repetition_penalty=1.03,
#     timeout=600
# )

# NOTE(review): the snippet below references `index`, which is not defined in this notebook.
# q = "Provide all the recipe names, its recipe and all the ingrediants that uses besan. Do not provide or generate anything else."
# response = index.query(q, llm=llm)
# display(Markdown(response))

# Alternative 2 (disabled): OpenAI completion model.
# from langchain_openai import OpenAI
# llm = OpenAI(model="gpt-3.5-turbo-instruct")

# Active choice: `llama3` served through Ollama (presumably a local server - confirm).
llm = Ollama(model="llama3", temperature=0.5)

In [None]:
# Example: hand-built retrieval-augmented prompt for `query`.
# Each entry of `searched_docs` is a (document, relevance score) pair; only the
# document text goes into the prompt. Documents are separated by a blank line so
# the end of one recipe does not run into the start of the next.
qdocs = "\n\n".join(doc.page_content for doc, _score in searched_docs)
final_query = f"{qdocs} Question: {query} in a table in markdown and summarize each one"
response = llm.invoke(final_query)
display(Markdown(response))

### Chain

In [None]:
import langchain
from langchain.chains.retrieval_qa.base import RetrievalQA

# Keep LangChain's global debug output switched off.
langchain.debug = False

# Retriever over the vector store, filtered with a score threshold of 0.2.
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.2},
)

# Wrap the retrieve-then-ask steps done by hand above in a single RetrievalQA chain.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
)

In [None]:
# Hallucinates on similar-looking "shak" names such as Shakarpara, though this is a very broad query.
# prompt = f"Show me all shaak recipes dish. Must avoid to look at word which are not shaak or shak. Do not include Shakarpara" 

# prompt = f"Show me all recipes which uses Yellow moong"

# prompt = f"I have dudhi/doodhi at home. Can you find me a recipe?" # Hallucinating big time!
# prompt = f"I am craving for dosa. Can you show me recipes?" # which are not farali or for upvas?"
# prompt = f"Can you share recipe for onion, potato and peas. Exclude shaak or sabji?"
prompt = "Show me one dhokla recipe"

# Append the fixed formatting instruction to the question before running the chain.
response = qa_stuff.run(
    f"{prompt} List all of the ingredients and the whole recipe to make from the document. Present in good format"
)
display(Markdown(response))

In [None]:
# Run every spot-check query through the QA chain and render each answer.
for p in queries:
    print(f"***Prompt: {p}")
    response = qa_stuff.run(p + " List all of the ingredients and the whole recipe to make from the document. Present in good format")
    # display() renders the Markdown itself and returns None, so wrapping it in
    # print() only added a stray "None" line after every answer.
    display(Markdown(response))

In [None]:
# Show the raw (unrendered) text of the value most recently assigned to `response`.
response

## Eval

In [None]:
from langsmith import Client

In [None]:
# LangSmith client (presumably configured from environment variables loaded from .env - confirm).
client = Client()

# Define dataset: these are your test cases
# NOTE(review): create_dataset presumably fails when a dataset with this name
# already exists, which would make this cell fail on a second run - confirm.
dataset_name = "QA Example Dataset"
dataset = client.create_dataset(dataset_name)

In [None]:
# Add the reference examples to the dataset. Each active question in `inputs`
# has its reference answer at the same position in `outputs` (presumably paired
# by position - confirm). Commented-out questions have no reference answer yet.
client.create_examples(
    inputs=[
        # {"question": "Show me all recipes which uses Yellow moong"},
        # {"question": "Show me all shaak recipe"},
        {"question": "Show me one dhokla recipe"},
        # {"question": "Recipes which has besan as ingredients"},
        {"question": "I have dudhi/doodhi at home. Can you find me a recipe?"}
        # {"question": "I am craving for dosa. Can you show me recipes?"},
    ],
    outputs=[
         {"answer": "**Soft and Tasty Khaman Dhokla**\n\n**Ingredients:**\n\n* 2 cups besan (bengal gram flour)\n* 2 green chillies, chopped\n* 2 tbsp sugar\n* Salt to taste\n* 5 to 6 curry leaves (kadi patta)\n* 1 tsp citric acid (nimbu ka phool)\n* 1 tsp baking soda\n* 1/4 cup freshly grated coconut\n* 1/4 cup chopped coriander (dhania)\n\n**Recipe:**\n\n1. Combine the besan, salt and sugar in a bowl, add little water and mix well using hands.\n2. Add the citric acid and mix well.\n3. Add the baking soda, while stirring continuously for 1 minute.\n4. Pour this solution in a greased plate and steam for 8-10 mins or till the dokla is cooked.\n5. Heat the oil in a small pan and add the mustard seeds, curry leaves and green chillies.\n6. Pour this tempering to the dhoklas and keep aside.\n7. Heat 1/2 cup water and 1 tbsp sugar in a pan and pour the solution on it.\n8. Garnish with coriander and grated coconut."},
         {"answer": "Mixed Vegetable Handvo, Instant Gujarati Handvo recipe - How to make Mixed Vegetable Handvo, Instant Gujarati Handvo	For The Handvo Batter:  2 cups readymade idli batter  1/4 cup grated bottle gourd (doodhi / lauki)  1/4 cup finely chopped spinach (palak)  2 tbsp grated onions  2 tbsp grated carrot  2 tbsp finely chopped coriander (dhania)  1 tsp sugar  1 tsp green chilli paste  1/2 tsp garlic (lehsun) paste  1/4 tsp turmeric powder (haldi)  1/2 tsp chilli powder  salt to taste  Other Ingredients For Mixed Vegetable Handvo:  6 tsp oil  1 1/2 tsp mustard seeds ( rai / sarson)  1 1/2 tsp sesame seeds (til)  3 pinches of asafoetida (hing)  For Serving With Mixed Vegetable Handvo:  green chutney	For the handvo batter Combine all the ingredients along with approx. 2 tbsp of water in a deep bowl and mix well. Keep aside. How to proceed To make  mixed vegetable handvo  , divide the handvo batter into 3 equal portions and keep aside. Heat 2 tsp of oil in a small non-stick pan, add ½ tsp of mustard seeds, ½ tsp of sesame seeds and a pinch of asafoetida and sauté on a medium flame for 30 seconds. Pour a portion of the prepared handvo batter and spread it evenly, cover with a lid and cook on a medium flame for 3 to 4 minutes or till it turns golden brown in colour. Turnover and again cover with a lid and cook on a medium flame for 3 to 4 minutes or till it turns golden brown in colour from the other side as well. Remove on a chopping board, cut the   into 4 equal pieces. Repast steps 2 to 5 to make 2 more  mixed vegetable handvos . Serve the handvo  immediately with green chutney."}
    ],
    dataset_id=dataset.id,
)

In [None]:
from langchain_core.prompts.prompt import PromptTemplate
from langchain_openai import ChatOpenAI
from langsmith.evaluation import LangChainStringEvaluator

# Grading prompt: the grader sees the request ({query}), the reference answer
# ({answer}) and the chain's answer ({result}) and is asked for GOOD or BAD.
_PROMPT_TEMPLATE = """You are an expert gujarati chef to find out whether the recipies are correct or not.
You are checking the following recipe requests:
{query}
Here is the real answer:
{answer}
You are grading the following predicted answer:
{result}
Respond with GOOD or BAD:
Grade:
"""

PROMPT = PromptTemplate(
    template=_PROMPT_TEMPLATE,
    input_variables=["query", "answer", "result"],
)

# Grader model, run at temperature 0.
eval_llm = ChatOpenAI(temperature=0.0)

# "qa" string evaluator wired to the custom grading prompt and grader model.
qa_evaluator = LangChainStringEvaluator(
    "qa",
    config={"prompt": PROMPT, "llm": eval_llm},
)

In [None]:
from langsmith.schemas import Run, Example

def evaluate_length(run: Run, example: Example) -> dict:
    """Score whether the predicted answer is shorter than twice the reference answer.

    Args:
        run: Evaluated run; the prediction is read from ``run.outputs["output"]``.
        example: Dataset example; the reference is read from ``example.outputs["answer"]``.

    Returns:
        ``{"key": "length", "score": 1}`` when ``len(prediction) < 2 * len(reference)``,
        otherwise the same dict with score ``0``.
    """
    # `outputs` may be None (e.g. when the evaluated function raised); treat
    # that as an empty mapping so the evaluator scores the run instead of
    # failing with AttributeError.
    prediction = (run.outputs or {}).get("output") or ""
    required = (example.outputs or {}).get("answer") or ""
    score = int(len(prediction) < 2 * len(required))
    return {"key": "length", "score": score}

In [None]:
def recipe_app(prompt):
    """Run the QA chain on `prompt` with the fixed formatting instruction appended and return its answer."""
    formatting_instruction = " List all of the ingredients and the whole recipe to make from the document. Present in good format"
    return qa_stuff.run(prompt + formatting_instruction)

In [None]:
def langsmith_app(inputs):
    """Adapter for evaluation: answer ``inputs["question"]`` and return it under the ``"output"`` key."""
    return {"output": recipe_app(inputs["question"])}

In [None]:
from langsmith.evaluation import evaluate

# Run the recipe app over the dataset and score every answer with both the
# length check and the LLM-based QA grader.
experiment_results = evaluate(
    langsmith_app, # Target under test: maps {"question": ...} to {"output": ...}
    data=dataset_name, # Name of the dataset whose examples are predicted and graded
    evaluators=[evaluate_length, qa_evaluator], # Evaluators applied to each result
    experiment_prefix="ollama3", # Prefix that makes this experiment easy to identify
)