# Basic RAG with LM Studio
This notebook demonstrates how to use LM Studio for a basic Retrieval-Augmented Generation (RAG) pipeline.

In [9]:
import os
from dotenv import load_dotenv
import pandas as pd
from tqdm.auto import tqdm
from qdrant_client.http.exceptions import ResponseHandlingException
# from utils.vectordb_client import add_documents_with_retry
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils.llm_client import lm_studio_client
from utils.vectordb_client import get_vector_store

# Pull settings from a local .env file (if any) into the process environment
# before anything below reads them.
load_dotenv()

# Model identifier passed to the LM Studio client; set the LM_STUDIO_MODEL
# environment variable to override the default.
LM_STUDIO_MODEL = os.environ.get("LM_STUDIO_MODEL", "qwen/qwen3-30b-a3b")


Load datasets and prepare them for vectorization

In [10]:
# Register DataFrame.progress_apply so the row-wise steps below show a progress bar.
tqdm.pandas(desc="Generating Documents")

# df1 and df2 are intentionally empty placeholders: their CSV loads are disabled
# below, but the names stay defined so the combination step works unchanged
# when those datasets are switched back on.
df1 = pd.DataFrame()
df2 = pd.DataFrame()
# Load all three datasets
# df1 = pd.read_csv('./datasets/The_Flavors_of_India.csv')
# df2 = pd.read_csv('./datasets/indian_food.csv')
df3 = pd.read_csv('./datasets/IndianFoodDataset.csv')

# Standardize column names for df2
# df2 = df2.rename(columns={
#     'name': 'RecipeName',
#     'ingredients': 'Ingredients',
#     'prep_time': 'PrepTimeInMins',
#     'cook_time': 'CookTimeInMins',
#     'flavor_profile': 'FlavorProfile',
#     'course': 'Course',
#     'state': 'State',
#     'region': 'Region'
# })

# Standardize column names for df3 to match df1 structure.
# rename() returns a new frame, so df3 itself keeps its original columns.
df3_standardized = df3.rename(columns={
    'Diet': 'diet'  # Standardize the diet column name
})

# Add missing columns to df2 to match the structure
# df2['TranslatedRecipeName'] = df2['RecipeName']
# df2['TranslatedIngredients'] = df2['Ingredients']
# df2['TranslatedInstructions'] = ''
# df2['URL'] = ''
# df2['Cuisine'] = df2['Region']

# Add missing columns to df3 to match the structure.
# None of these is listed in common_columns, so they do not reach combined_df;
# they only keep df3_standardized shaped like the other datasets.
df3_standardized['FlavorProfile'] = ''
df3_standardized['State'] = ''
df3_standardized['Region'] = df3_standardized['Cuisine']
df3_standardized['Cleaned-Ingredients'] = ''
df3_standardized['image-url'] = ''
df3_standardized['Ingredient-count'] = ''

# Select common columns for a combination
common_columns = [
    'TranslatedRecipeName', 'RecipeName', 'TranslatedIngredients', 'Ingredients',
    'PrepTimeInMins', 'CookTimeInMins', 'TotalTimeInMins', 'Cuisine', 'Course',
    'diet', 'TranslatedInstructions', 'URL'
]

# Ensure all dataframes have these columns (absent ones are filled with '')
for df in [df1, df2, df3_standardized]:
    for col in common_columns:
        if col not in df.columns:
            df[col] = ''

# Select only common columns from each dataframe
df1_subset = df1[common_columns]
df2_subset = df2[common_columns]
df3_subset = df3_standardized[common_columns]

# Combine the datasets. Empty placeholder frames are left out: recent pandas
# versions warn (FutureWarning) when empty inputs are passed to pd.concat,
# because such inputs will start to influence the resulting column dtypes.
non_empty_subsets = [
    subset for subset in (df1_subset, df2_subset, df3_subset) if not subset.empty
]
if non_empty_subsets:
    combined_df = pd.concat(non_empty_subsets, axis=0, ignore_index=True)
else:
    # Nothing was loaded: keep the expected columns so later cells still run.
    combined_df = pd.DataFrame(columns=common_columns)

def create_index(row):
    """
    Build the "<name>_<cuisine>" label for one recipe row.

    The name is the translated recipe name when it is present (neither NaN/None
    nor an empty string); otherwise the raw 'RecipeName' value is used as-is.
    A cuisine that is not present is replaced by 'Unknown'.
    """
    def _is_present(value):
        return pd.notna(value) and value != ''

    translated_name = row['TranslatedRecipeName']
    if _is_present(translated_name):
        name = translated_name
    else:
        name = row['RecipeName']

    cuisine_value = row['Cuisine']
    cuisine = cuisine_value if _is_present(cuisine_value) else 'Unknown'

    return f"{name}_{cuisine}"

# Label every recipe with its "<name>_<cuisine>" string, use that label as the
# index, and order the rows by it.
# NOTE(review): nothing here enforces that the labels are unique — two recipes
# with the same name and cuisine end up sharing one index label.
combined_df = (
    combined_df
    .assign(UniqueID=combined_df.apply(create_index, axis=1))
    .set_index('UniqueID')
    .sort_index()
)

# Display the combined dataset info
# print(f"Combined dataset shape: {combined_df.shape}")
# print(f"Number of recipes from df1: {len(df1)}")
# print(f"Number of recipes from df2: {len(df2)}")
# print(f"Number of recipes from df3: {len(df3)}")
# print(f"Total combined recipes: {len(combined_df)}")


def _row_to_markdown(recipe_row):
    """Render one recipe row as a markdown table string."""
    return recipe_row.to_markdown()


# Prepare data for vectorization: one markdown table per recipe, keyed by UniqueID.
# NOTE(review): the rendered table appears to pad cells for alignment (see the
# sample content preview further down), so long fields inflate every document —
# confirm this is acceptable for the chunking step.
data = combined_df.progress_apply(_row_to_markdown, axis=1)

# combined_df.head()

Generating Documents:   0%|          | 0/6871 [00:00<?, ?it/s]

Now, let's create documents with proper metadata so they can be put in the vector store.

In [11]:
from langchain.schema import Document
def create_documents_with_metadata_from_data(data_series, df):
    """
    Create LangChain documents with proper metadata from the data series and original DataFrame.

    Args:
        data_series: pandas Series whose values become each document's
            ``page_content`` and whose index labels identify the matching row of ``df``.
        df: DataFrame holding the recipe fields copied into the metadata.

    Returns:
        A list with one ``Document`` per entry of ``data_series``, in the same
        order. Every metadata value is a stripped string; a field that is
        absent, blank, or NaN is stored as ''.

    Raises:
        KeyError: if the two indexes differ and a label of ``data_series`` is
            not found in ``df``.
    """
    # Index labels may repeat. For a repeated label ``df.loc[label]`` returns a
    # DataFrame instead of a single row, and str() of its columns would store a
    # whole Series repr as metadata. When both indexes are identical the rows
    # are therefore paired by position, which is unambiguous.
    rows_are_aligned = data_series.index.equals(df.index)

    documents = []
    progress = tqdm(
        data_series.items(),
        total=len(data_series),
        desc="Creating documents with metadata",
    )

    for position, (idx, markdown_content) in enumerate(progress):
        # Get the corresponding row from the original DataFrame
        if rows_are_aligned:
            row = df.iloc[position]
        else:
            row = df.loc[idx]
            if isinstance(row, pd.DataFrame):
                # Repeated label with no positional alignment to disambiguate:
                # fall back to the first matching row.
                row = row.iloc[0]

        # Create metadata dictionary with actual recipe fields
        metadata = {
            'TranslatedRecipeName': str(row.get('TranslatedRecipeName', '')),
            'RecipeName': str(row.get('RecipeName', '')),
            'Cuisine': str(row.get('Cuisine', '')),
            'Diet': str(row.get('diet', '')),  # Map 'diet' column to 'Diet' metadata
            'Course': str(row.get('Course', '')),
            'TotalTimeInMins': str(row.get('TotalTimeInMins', '')),
            'PrepTimeInMins': str(row.get('PrepTimeInMins', '')),
            'CookTimeInMins': str(row.get('CookTimeInMins', '')),
            'UniqueID': str(idx),  # Use the index as UniqueID
        }

        # Clean up empty values and NaN (str(float('nan')) is exactly 'nan')
        cleaned_metadata = {}
        for key, value in metadata.items():
            stripped = value.strip()
            cleaned_metadata[key] = stripped if stripped and value != 'nan' else ''

        documents.append(Document(page_content=markdown_content, metadata=cleaned_metadata))

    return documents


Now, let's chunk the documents using RecursiveCharacterTextSplitter and put them in the vector store.

In [12]:
# Build one Document per recipe, carrying the recipe fields as metadata
documents = create_documents_with_metadata_from_data(data, combined_df)

# Quick sanity check: total count plus a look at the first document
print(f"Created {len(documents)} documents")
print("Sample document metadata:")
sample_document = documents[0]
print(sample_document.metadata)
print("\nSample document content preview:")
print(sample_document.page_content[:300])

# Chunking settings: sizes are measured in characters because length_function is len
splitter_settings = {
    "chunk_size": 3000,
    "chunk_overlap": 500,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Split the documents into chunks
chunked_docs = text_splitter.split_documents(documents)
print(f"After chunking: {len(chunked_docs)} document chunks")

# Create the vector store and add documents
vector_store_unchunked = get_vector_store("my-fav-indian-food")

# Add the chunks in fixed-size batches from the start. A single bulk call that
# fails part-way and is then retried from index 0 could store the already
# written chunks a second time; inserting batch by batch sends each chunk once.
batch_size = 100
failed_batches = []
added_count = 0

for start in tqdm(range(0, len(chunked_docs), batch_size), desc="Adding documents in batches"):
    batch = chunked_docs[start:start + batch_size]
    batch_number = start // batch_size + 1
    try:
        vector_store_unchunked.add_documents(batch)
        added_count += len(batch)
    except Exception as batch_error:
        # Best effort: remember the failure and keep going with the next batch.
        failed_batches.append(batch_number)
        print(f"Error adding batch {batch_number}: {batch_error}")

# Report the outcome explicitly so an incomplete vector store is not mistaken for success.
if failed_batches:
    print(
        f"Added {added_count} of {len(chunked_docs)} documents to vector store; "
        f"failed batches: {failed_batches}"
    )
else:
    print(f"Successfully added {len(chunked_docs)} documents to vector store")


Creating documents with metadata:   0%|          | 0/6871 [00:00<?, ?it/s]

Created 6871 documents
Sample document metadata:
{'TranslatedRecipeName': 'Bengali Style Cholar Dal Recipe', 'RecipeName': 'बंगाली छोलार दाल रेसिपी -  Bengali Style Cholar Dal Recipe', 'Cuisine': 'Bengali Recipes', 'Diet': 'High Protein Vegetarian', 'Course': 'Lunch', 'TotalTimeInMins': '40', 'PrepTimeInMins': '10', 'CookTimeInMins': '30', 'UniqueID': 'Bengali Style Cholar Dal Recipe_Bengali Recipes'}

Sample document content preview:
|                        |   Bengali Style Cholar Dal Recipe_Bengali Recipes                                                                                                                                                                                                                                
After chunking: 48787 document chunks
Successfully added 48787 documents to vector store


Now it's time to query the vector store and get the context for our RAG model. Then we will use LM Studio to generate a response based on the user query and the context retrieved from the vector store.

In [14]:
user_query = "I am looking for Bengali Quick non veg dish" # input("Enter your query: ")

# Prompt skeleton; {user_query} and {context} are filled in after retrieval.
prompt_template = """
You are a helpful assistant specializing in Indian cuisine and food recipes. Based on the provided context, please answer the user's question accurately and thoroughly.

Guidelines:
- Be polite, friendly, and informative in your response
- Use the context information to provide specific details when available
- If the answer isn't found in the context, clearly state "I don't have enough information about that in my knowledge base" and offer to help with related topics you can assist with
- When discussing recipes, include relevant details like ingredients, cooking methods, regional origins, or dietary information when available

User Question: {user_query}

Context Information:
{context}

Response:
"""

# Fetch the 5 entries the vector store considers most similar to the query
_docs = vector_store_unchunked.similarity_search(user_query, k=5)

# Stitch the retrieved chunks together, separated by a blank line
context = '\n\n'.join(doc.page_content for doc in _docs)

# Final prompt sent to the model
prompt = prompt_template.format(user_query=user_query, context=context)

# Single-turn chat completion: the whole prompt goes in one user message
chat_messages = [{"role": "user", "content": prompt}]
response = lm_studio_client.chat.completions.create(
    model=LM_STUDIO_MODEL,
    messages=chat_messages,
)

# Text of the first returned choice
response_text = response.choices[0].message.content

print(response_text)


Sure! One of the quickest and most beloved non‑vegetarian Bengali dishes that you can whip up in about 20–25 minutes is **Chicken Jhal (চিকেন ঝাল)** – a spicy, aromatic chicken curry that’s perfect for lunch or dinner. Below is a simple, time‑saving recipe along with some quick tips and variations.

---

## Chicken Jhal (চিকেন ঝাল) – Quick Bengali Chicken Curry

| **Category** | **Details** |
|--------------|-------------|
| **Cuisine**  | Bengali (East India) |
| **Course**   | Lunch / Dinner |
| **Diet**     | High‑Protein, Non‑Vegetarian |
| **Time**     | 20–25 minutes |
| **Serves**   | 2–3 people |

### Ingredients

| Ingredient | Quantity | Notes |
|------------|----------|-------|
| Chicken (boneless, cut into bite‑size pieces) | 300–400 g | You can use thigh or breast; thighs stay juicier |
| Onion (thinly sliced) | 1 medium | Adds sweetness and depth |
| Tomato (chopped or pureed) | 2 medium | For tanginess; use ripe tomatoes |
| Green chili (slit or finely chopped) | 2–3 | A