<a href="https://colab.research.google.com/github/raghavan93513/Langchain-Food-Recommender/blob/main/Langchain_Food_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Installs, Imports and API Keys

In [2]:
# RUN THIS CELL FIRST!
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable by the cells below.
%pip install -q langchain==0.0.150 pypdf pandas matplotlib tiktoken textract transformers openai faiss-cpu sentence_transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/648.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/648.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m553.0/648.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.4/648.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.1/256.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   

In [4]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [5]:
# Take the OpenAI key from the environment when it is already set; otherwise
# prompt for it without echoing, so the secret never lives in the notebook source.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# 2. Loading PDFs and chunking with LangChain

In [6]:
# Advanced method - Split by chunk

# Step 1: Convert PDF to text
# NOTE: the PDF must be present in the working directory (upload it first when
# running on Colab); otherwise textract raises MissingFileError.
import textract
doc = textract.process("./RaghavanBestFood.pdf")

# Step 2: Save to .txt and reopen (helps prevent issues)
# The bytes are decoded as UTF-8, so the file is written and read back as UTF-8
# too, instead of relying on the platform's default codec.
with open('RaghavanBestFood.txt', 'w', encoding='utf-8') as f:
    f.write(doc.decode('utf-8'))

with open('RaghavanBestFood.txt', 'r', encoding='utf-8') as f:
    text = f.read()

def count_tokens(text: str) -> int:
    """Return the number of tokens in `text` according to the global `tokenizer`.

    No visible cell of this notebook defines `tokenizer`, so calling this
    function used to fail with NameError. If a `tokenizer` already exists in
    the notebook namespace it is used as is; otherwise one is created on the
    first call and stored globally for reuse.

    NOTE(review): GPT-2's BPE tokenizer is chosen as the fallback because
    `transformers` is installed by the first cell; swap it for the tokenizer
    that matches your LLM if exact counts matter.

    Args:
        text: The string to measure.

    Returns:
        The length of `tokenizer.encode(text)`.
    """
    global tokenizer
    if globals().get("tokenizer") is None:
        # Imported lazily so the (slow) model download happens only when needed.
        from transformers import GPT2TokenizerFast
        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    return len(tokenizer.encode(text))

# Step 3: Split text into chunks
# Lengths are measured with count_tokens, so chunk_size and chunk_overlap
# below are expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=24, length_function=count_tokens)

# A single input text goes in; a list of LangChain documents comes out.
chunks = text_splitter.create_documents([text])

MissingFileError: ignored

In [None]:
# Report how many chunks the splitter produced
print(f"Number of Chunks: {len(chunks)}")

In [None]:
# The splitter yields many LangChain 'Documents' of roughly 500 tokens or fewer
# (the recursive splitter sometimes allows more tokens to retain context).
# Show the type of the first chunk to confirm.
type(chunks[0])

In [None]:
# Token count of every chunk, in chunk order
token_counts = list(map(lambda piece: count_tokens(piece.page_content), chunks))
token_counts

In [None]:
# Sanity check for the chunking step: one bar per chunk, bar height = token count

# x positions, one per chunk
indices = range(len(token_counts))

# Draw the bars
plt.bar(indices, token_counts)

# Label both axes and title the chart in a single call on the current Axes
plt.gca().set(
    xlabel='Index',
    ylabel='Token Counts',
    title='Bar Chart of Token Counts per Chunk',
)

# Render the figure
plt.show()

# 3. Embed text and store embeddings

In [None]:
# Embedding model (HuggingFaceEmbeddings with its default settings)
embeddings = HuggingFaceEmbeddings()

# Embed every chunk and index the vectors in a FAISS store
db = FAISS.from_documents(documents=chunks, embedding=embeddings)

# 4. Query to get information about Food Recommendation

In [None]:
# Answer one question from the knowledge base: look up chunks similar to the
# query in the vector store, then hand them to a QA chain together with the query.

query = "What dish should I eat in India and why?"

chain = load_qa_chain(llm=OpenAI(temperature=0), chain_type="stuff")
docs = db.similarity_search(query)

chain.run(question=query, input_documents=docs)

# 5. Create Food Recommending Chatbot

In [None]:
from IPython.display import display
import ipywidgets as widgets

# Conversational chain backed by the vector store as its retriever;
# it takes the chat history alongside each new question.
qa = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0.5),
    retriever=db.as_retriever(),
)

In [None]:
# Running conversation log: on_submit appends one (query, answer) tuple per turn
# and passes the whole list back to the chain with every new question.
chat_history = []

def on_submit(_):
    """Handle one submission from `input_box`.

    Reads and clears the text box, then:
    - ignores blank submissions, so no empty question is sent to the chain;
    - prints a goodbye message and returns when the user typed 'exit'
      (case-insensitive, surrounding whitespace ignored);
    - otherwise asks `qa` with the current `chat_history`, records the
      (query, answer) pair, and displays both sides of the exchange.

    Args:
        _: The widget that fired the event; unused.
    """
    import html  # local import keeps this cell self-contained

    query = input_box.value
    input_box.value = ""

    command = query.strip()
    if not command:
        return

    if command.lower() == 'exit':
        print("Thank you for using the Raghavan's Recommended  Food list chatbot!")
        return

    result = qa({"question": query, "chat_history": chat_history})
    answer = result['answer']
    chat_history.append((query, answer))

    # Both the user's text and the model's reply are untrusted: escape them so
    # they are shown literally instead of being interpreted as HTML markup.
    display(widgets.HTML(f'<b>User:</b> {html.escape(query)}'))
    display(widgets.HTML(f'<b><font color="blue">Chatbot:</font></b> {html.escape(answer)}'))

# Build the text field and register the handler that runs on every submission
input_box = widgets.Text(placeholder='Please enter your question:')
input_box.on_submit(on_submit)

# Greet the user, then show the field beneath the greeting
print("Welcome to the Raghavan's Recommended  Food List Chatbot! Type 'exit' to stop.")
display(input_box)