# Course Recommendation System

In [1]:
#!python -m pip install langchain-community langchainhub langchain-chroma langchain langchain-experimental --quiet

In [2]:
#!python -m pip install pypdf faiss-cpu --quiet

### Setup and Imports

In [3]:
import os
import pandas as pd
import logging
from typing import List, Tuple, Optional

from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.vectorstores import Chroma


  from .autonotebook import tqdm as notebook_tqdm


### Logging Setup


In [4]:

# Route INFO-and-above records to a stream handler using a
# "timestamp | level | message" layout, then create this notebook's logger.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


In [5]:

# Model identifiers passed as `model=` to the Azure OpenAI embedding client
# and chat client respectively when the clients are created.
embedding_model_name = "text-embedding-3-small"
model_name = "gpt4o"

### Initialize Embedding and LLM Models

In [6]:

def init_openai_clients() -> Tuple[AzureOpenAIEmbeddings, AzureChatOpenAI]:
    """Initialize the Azure OpenAI embedding and chat clients.

    All credentials come from environment variables; never paste API keys
    into a notebook cell.

    Environment variables:
        AZURE_OPENAI_API_KEY: required; only checked for presence here.
        AZURE_OPENAI_ENDPOINT: required; passed as ``azure_endpoint``.
        AZURE_OPENAI_API_VERSION: optional; defaults to ``"2024-02-01"``.

    Returns:
        An ``(embeddings, llm)`` tuple built with ``embedding_model_name`` and
        ``model_name``; the chat model uses ``temperature=0``.

    Raises:
        EnvironmentError: if the API key or the endpoint is missing or empty.
    """
    api_key = os.getenv("AZURE_OPENAI_API_KEY")
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    version = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-01")

    if not api_key or not endpoint:
        raise EnvironmentError("Azure OpenAI credentials are not set.")

    # The key is validated above but intentionally not passed on explicitly;
    # the clients are left to resolve it themselves, as before.
    embeddings = AzureOpenAIEmbeddings(
        model=embedding_model_name,
        azure_endpoint=endpoint,
        api_version=version,
    )

    llm = AzureChatOpenAI(
        model=model_name,
        azure_endpoint=endpoint,
        api_version=version,
        temperature=0
    )
    return embeddings, llm

In [7]:
embeddings, llm=init_openai_clients()

In [8]:
embeddings

AzureOpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x79654cfda240>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x79654cabde80>, model='text-embedding-3-small', dimensions=None, deployment=None, openai_api_version='2024-08-01-preview', openai_api_base=None, openai_api_type='azure', openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=2048, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True, azure_endpoint='https://eastus.api.cognitive.microsoft.com/', azure_ad_token=None, azure_ad_token_provider=None, azure_ad_async_token_provider=None, validate_base_url=Tr

In [9]:
llm

AzureChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x79654cabe900>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x79654cabeed0>, root_client=<openai.lib.azure.AzureOpenAI object at 0x79654ca19a30>, root_async_client=<openai.lib.azure.AsyncAzureOpenAI object at 0x79654cabe930>, model_name='gpt4o', temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'), disabled_params={'parallel_tool_calls': None}, azure_endpoint='https://eastus.api.cognitive.microsoft.com/', openai_api_version='2024-08-01-preview', openai_api_type='azure')

### Load Dataset

In [10]:

def load_dataset(url: str) -> pd.DataFrame:
    """Load the course dataset and derive the ``text`` column used for embedding.

    Args:
        url: Location of the CSV file; forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The loaded frame with an extra ``text`` column holding
        ``"<title> <description>"`` for every row. A missing title or
        description contributes an empty string instead of turning the whole
        value into NaN.

    Raises:
        ValueError: if the CSV contains no rows.
        KeyError: if the ``title`` or ``description`` column is absent.
    """
    df = pd.read_csv(url)
    if df.empty:
        raise ValueError("Dataset is empty.")

    missing = [col for col in ("title", "description") if col not in df.columns]
    if missing:
        raise KeyError(f"Dataset is missing required column(s): {missing}")

    # NaN + str is NaN, and NaN is not a usable text to embed, so blank out
    # missing cells before concatenating.
    title = df["title"].astype(object).fillna("").astype(str)
    description = df["description"].astype(object).fillna("").astype(str)
    df["text"] = (title + " " + description).str.strip()
    return df


In [11]:
url = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv"
courses=load_dataset(url)
courses.head(5)

Unnamed: 0,course_id,title,description,text
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...,Foundations of Machine Learning Understand fou...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...,Deep Learning with TensorFlow and Keras Explor...
2,C003,Natural Language Processing Fundamentals,Dive into NLP techniques for processing and un...,Natural Language Processing Fundamentals Dive ...
3,C004,Computer Vision and Image Processing,Learn the principles of computer vision and im...,Computer Vision and Image Processing Learn the...
4,C005,Reinforcement Learning Basics,Get introduced to reinforcement learning parad...,Reinforcement Learning Basics Get introduced t...


### Build VectorStore

In [12]:
def build_vectorstore(df: pd.DataFrame, embeddings: AzureOpenAIEmbeddings) -> Chroma:
    """Build the Chroma collection ``"courses"`` with one entry per course.

    Args:
        df: Course frame with ``course_id``, ``title`` and ``text`` columns.
        embeddings: Embedding client used to vectorise the ``text`` column.

    Returns:
        A ``Chroma`` vector store whose documents are the ``text`` values and
        whose metadata carries ``course_id`` and ``title``.

    Notes:
        Each document is stored under its ``course_id`` as a stable id, so
        building the store again for the same collection is expected to
        overwrite the existing entries rather than add a second copy of every
        course. Rows repeating an already-seen ``course_id`` are dropped
        (first occurrence wins) because ids must be unique within one insert.
    """
    unique_courses = df.drop_duplicates(subset="course_id")
    return Chroma.from_texts(
        texts=unique_courses["text"].tolist(),
        embedding=embeddings,
        metadatas=[
            {"course_id": cid, "title": title}
            for cid, title in zip(unique_courses["course_id"], unique_courses["title"])
        ],
        ids=unique_courses["course_id"].astype(str).tolist(),
        collection_name="courses"
    )


In [13]:
vectorstore = build_vectorstore(courses, embeddings)

2025-10-01 12:02:15,066 | INFO | Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-10-01 12:02:20,276 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-08-01-preview "HTTP/1.1 200 OK"


In [14]:
# Peek at stored documents through the vector store's public `get` API,
# requesting only the five entries that are displayed instead of reading the
# whole collection via the private `_collection` attribute.
print("📚 Sample documents from vectorstore:")
for i, doc in enumerate(vectorstore.get(limit=5)["documents"]):  # show first 5
    print(f"\n--- Document {i+1} ---")
    print(doc[:300], "...")  # print first 300 chars


📚 Sample documents from vectorstore:

--- Document 1 ---
Foundations of Machine Learning Understand foundational machine learning algorithms including regression, classification, clustering, and dimensionality reduction. This course covers data pre-processing, feature engineering, model selection, hyperparameter tuning, and evaluation metrics. Hands-on la ...

--- Document 2 ---
Deep Learning with TensorFlow and Keras Explore neural network architectures using TensorFlow and Keras frameworks. This course covers feedforward networks, convolutional neural networks, recurrent neural networks, and transfer learning. Learn to build, train, evaluate, and optimize deep learning mo ...

--- Document 3 ---
Natural Language Processing Fundamentals Dive into NLP techniques for processing and understanding human language. You will learn tokenization, stemming, lemmatization, part-of-speech tagging, named entity recognition, and sentiment analysis. The course includes transformer architectures, att

In [15]:
# Sanity check: print the number of stored entries, then one
# "<n>: <course_id> - <title>" line per entry taken from its metadata.
# NOTE(review): `_collection` is a private attribute of the vector store.
print(vectorstore._collection.count())  # how many docs stored?

docs = vectorstore._collection.get()
# `doc` is unpacked but not used; only the metadata is printed.
for i, (doc, meta) in enumerate(zip(docs["documents"], docs["metadatas"])):
    print(f"{i+1}: {meta['course_id']} - {meta['title']}")


25
1: C001 - Foundations of Machine Learning
2: C002 - Deep Learning with TensorFlow and Keras
3: C003 - Natural Language Processing Fundamentals
4: C004 - Computer Vision and Image Processing
5: C005 - Reinforcement Learning Basics
6: C006 - Data Engineering on AWS
7: C007 - Cloud Computing with Azure
8: C008 - DevOps Practices and CI/CD
9: C009 - Containerization with Docker and Kubernetes
10: C010 - APIs and Microservices Architecture
11: C011 - Big Data Analytics with Spark
12: C012 - SQL for Data Analysis
13: C013 - NoSQL Databases and MongoDB
14: C014 - Data Visualization with Tableau
15: C015 - Business Intelligence with Power BI
16: C016 - Python Programming for Data Science
17: C017 - R Programming and Statistical Analysis
18: C018 - Product Management Essentials
19: C019 - Agile and Scrum Mastery
20: C020 - User Experience (UX) Design Principles
21: C021 - Cybersecurity Fundamentals
22: C022 - Internet of Things (IoT) Development
23: C023 - Blockchain Technology and Smart Con

In [16]:
# Smoke-test retrieval: embed a raw query string (no LLM rewriting) and show
# the two closest stored courses.
query = "Data Analytics"
results = vectorstore.similarity_search(query, k=2)

print("\n Sample query results:")
for i, doc in enumerate(results, start=1):
    print(f"\nResult {i}")
    # Metadata keys are the ones written when the store was built; fall back
    # to "N/A" if a key is absent.
    print("Course Title:", doc.metadata.get("title", "N/A"))
    print("Course ID:", doc.metadata.get("course_id", "N/A"))
    print("Snippet:", doc.page_content[:200], "...")  # first 200 characters


2025-10-01 12:02:21,651 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-08-01-preview "HTTP/1.1 200 OK"



 Sample query results:

Result 1
Course Title: Big Data Analytics with Spark
Course ID: C011
Snippet: Big Data Analytics with Spark Process and analyze large datasets using Apache Spark and PySpark. The course covers RDDs, DataFrames, Spark SQL, and MLlib for machine learning at scale. You’ll learn cl ...

Result 2
Course Title: Data Visualization with Tableau
Course ID: C014
Snippet: Data Visualization with Tableau Transform raw data into compelling visual stories using Tableau. Learn to connect to diverse data sources, create interactive dashboards, and apply best practices in ch ...


### Build Query Chain
''' This step refines the user input so that the LLM can turn it into a query in a standard format for the vector database. '''

In [17]:

def build_query_chain(llm: AzureChatOpenAI) -> LLMChain:
    """Create the chain that condenses learner details into one search query.

    The prompt expects two variables: ``completed`` (the learner's finished
    courses) and ``profile`` (their stated interests).
    """
    rewrite_template = """
Rewrite the following learner information into a standardized search query.

Completed courses: {completed}
User interests: {profile}

Make it short, specific, and include both past learning and new interests.
"""
    rewrite_prompt = PromptTemplate(
        template=rewrite_template,
        input_variables=["profile", "completed"],
    )
    return LLMChain(prompt=rewrite_prompt, llm=llm)


In [18]:
query_chain = build_query_chain(llm)
query_chain

  return LLMChain(llm=llm, prompt=query_prompt)


LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['completed', 'profile'], input_types={}, partial_variables={}, template='\nRewrite the following learner information into a standardized search query.\n\nCompleted courses: {completed}\nUser interests: {profile}\n\nMake it short, specific, and include both past learning and new interests.\n'), llm=AzureChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x79654cabe900>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x79654cabeed0>, root_client=<openai.lib.azure.AzureOpenAI object at 0x79654ca19a30>, root_async_client=<openai.lib.azure.AsyncAzureOpenAI object at 0x79654cabe930>, model_name='gpt4o', temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'), disabled_params={'parallel_tool_calls': None}, azure_endpoint='https://eastus.api.cognitive.microsoft.com/', openai_api_version='2024-08-01-preview', openai_api_type='azure'), output

### Identify Top-K Course Recommendations Using Similarity Search on Chroma DB

In [19]:
def recommend_courses(
    profile: str,
    completed_ids: Optional[List[str]],
    df: pd.DataFrame,
    vectorstore: Chroma,
    query_chain: LLMChain,
    top_k: int = 5
) -> List[Tuple[str, float, str]]:
    """Generate up to ``top_k`` unique course recommendations.

    The learner's interests and completed course titles are rewritten by the
    LLM into a single search query, which is then matched against the vector
    store. Completed courses and repeated course IDs are filtered out.

    Args:
        profile: Free-text description of the learner's interests; may be empty
            if ``completed_ids`` is not.
        completed_ids: IDs of courses already taken, or ``None``.
        df: Course frame with ``course_id`` and ``title`` columns.
        vectorstore: Store whose document metadata has ``course_id`` and ``title``.
        query_chain: Chain accepting ``profile`` and ``completed`` inputs.
        top_k: Maximum number of recommendations to return; must be >= 1.

    Returns:
        A list of ``(course_id, score, title)`` tuples in the order returned by
        ``similarity_search_with_score``. NOTE(review): the notebook output
        shows ascending scores, so the score is presumably a distance where
        lower means closer — confirm.

    Raises:
        ValueError: if both ``profile`` and ``completed_ids`` are empty, or if
            ``top_k`` is smaller than 1.
    """
    if not profile and not completed_ids:
        raise ValueError("Profile and completed courses cannot both be empty.")
    if top_k < 1:
        raise ValueError("top_k must be a positive integer.")

    completed = set(completed_ids or [])

    # IDs absent from the dataset can neither be described to the LLM nor
    # filtered from the results, so report them instead of ignoring them.
    unknown_ids = sorted(completed - set(df["course_id"]), key=str)
    if unknown_ids:
        logger.warning("Completed course IDs not found in dataset: %s", unknown_ids)

    # Get completed course titles for rewriting
    completed_titles = df[df["course_id"].isin(list(completed))]["title"].tolist()

    # Use LLM to rewrite into a standardized query
    rewritten_query = query_chain.run({
        "profile": profile or "",
        "completed": ", ".join(completed_titles) if completed_titles else "None"
    })
    logger.info("🔎 Rewritten Query: %s", rewritten_query)

    # Search more than needed so that dropping duplicates and completed
    # courses still leaves enough candidates to fill top_k.
    results = vectorstore.similarity_search_with_score(
        rewritten_query, k=top_k * 3 + len(completed)
    )

    # Seeding `excluded` with the completed IDs lets one set handle both the
    # completed-course filter and the deduplication by course_id.
    excluded = set(completed)
    recs = []
    for doc, score in results:
        cid = doc.metadata["course_id"]
        if cid in excluded:
            continue
        excluded.add(cid)
        recs.append((cid, score, doc.metadata["title"]))
        if len(recs) >= top_k:
            break

    return recs


In [20]:

# Demo: a deliberately vague one-word profile with no completed courses.
profile = "data"  # vague query
completed_ids = []

recs = recommend_courses(profile, completed_ids, courses, vectorstore, query_chain)
print("\nTop Recommendations:")
# Each recommendation is a (course_id, score, title) tuple.
for cid, score, title in recs:
    print(f"  {cid} | {title} | score={score:.4f}")




  rewritten_query = query_chain.run({
2025-10-01 12:02:27,216 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/gpt4o/chat/completions?api-version=2024-08-01-preview "HTTP/1.1 200 OK"
2025-10-01 12:02:27,226 | INFO | 🔎 Rewritten Query: "Completed courses: None; Interests: data"
2025-10-01 12:02:28,205 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-08-01-preview "HTTP/1.1 200 OK"



Top Recommendations:
  C012 | SQL for Data Analysis | score=1.3674
  C017 | R Programming and Statistical Analysis | score=1.4019
  C011 | Big Data Analytics with Spark | score=1.4146
  C006 | Data Engineering on AWS | score=1.4337
  C001 | Foundations of Machine Learning | score=1.4523


### Evaluation Section

In [21]:


# Evaluation inputs: (free-text profile, completed course IDs) pairs.
# NOTE(review): the completed IDs used here (C101, C202, C303, C404) do not
# appear among the course IDs listed from the vector store earlier in this
# notebook (C001, C002, ...), so recommend_courses has nothing to exclude for
# these profiles — confirm the intended IDs.
test_profiles = [
    ("I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization.", ["C101"]),
    ("I know Azure basics and want to manage containers and build CI/CD pipelines.", ["C202"]),
    ("My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.", ["C303"]),
    ("I want to learn to build and deploy microservices with Kubernetes—what courses fit best?", ["C404"]),
    ("I’m interested in blockchain and smart contracts but have no prior experience.", []),
]

# One dict per profile; turned into a DataFrame after the loop.
results_table = []

for profile, completed in test_profiles:
    recs = recommend_courses(profile, completed, courses, vectorstore, query_chain)
    # Split the (course_id, score, title) tuples into parallel ID/title lists;
    # the score is not kept in the table.
    rec_ids = [cid for cid, _, _ in recs]
    rec_titles = [title for _, _, title in recs]

    results_table.append({
        "User Query": profile,
        "Completed Courses": completed,
        "Recommended Course IDs": rec_ids,
        "Recommended Titles": rec_titles
    })

eval_df = pd.DataFrame(results_table)


2025-10-01 12:02:28,993 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/gpt4o/chat/completions?api-version=2024-08-01-preview "HTTP/1.1 200 OK"
2025-10-01 12:02:28,995 | INFO | 🔎 Rewritten Query: "Completed: Python Programming for Data Science; Interests: Data visualization; Search for advanced data visualization courses."
2025-10-01 12:02:29,273 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-08-01-preview "HTTP/1.1 200 OK"
2025-10-01 12:02:29,830 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/gpt4o/chat/completions?api-version=2024-08-01-preview "HTTP/1.1 200 OK"
2025-10-01 12:02:29,832 | INFO | 🔎 Rewritten Query: "User with basic Azure knowledge seeks courses on container management and CI/CD pipeline development."
2025-10-01 12:02:30,109 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com

In [22]:
eval_df

Unnamed: 0,User Query,Completed Courses,Recommended Course IDs,Recommended Titles
0,I’ve completed the ‘Python Programming for Dat...,[C101],"[C016, C011, C017, C004, C014]","[Python Programming for Data Science, Big Data..."
1,I know Azure basics and want to manage contain...,[C202],"[C007, C009, C008, C006, C025]","[Cloud Computing with Azure, Containerization ..."
2,My background is in ML fundamentals; I’d like ...,[C303],"[C001, C025, C003, C002, C011]","[Foundations of Machine Learning, MLOps: Produ..."
3,I want to learn to build and deploy microservi...,[C404],"[C009, C007, C010, C008, C006]","[Containerization with Docker and Kubernetes, ..."
4,I’m interested in blockchain and smart contrac...,[],"[C023, C021, C024, C003, C001]","[Blockchain Technology and Smart Contracts, Cy..."


### So far, the pipeline only does the following:
'''User Input about Completed Courses and profile information -> LLM query rewriting → embedding similarity search (Chroma) → top-k results.'''

We can enhance the recommender so that it still gives good results when the user input is vague or incomplete.
Query Expansion with LLM

Instead of just rewriting, expand the query into multiple semantically related sub-queries, then merge results.

Example: User says "data".

Expanded queries:

"data analysis"

"data science"

"data visualization"

"data engineering"

In addition to this, we can make recommendation system robust by adding following Improvements:


1. Unique Top-K Recommendations → avoid duplicates across queries.

2. Query Expansion → if user input is incomplete/vague, the LLM will enrich it with context before embedding search.

3. Deduplication Logic → ensures different users or repeated queries don’t get the same narrow set of courses.

4. Robustness Enhancements:

Handle missing/incomplete input (e.g., “data science” alone → expand to “completed basic Python, want advanced data science visualization”).

5. Retry mechanism if embedding/vector search fails.

6. Normalize scores.

7. Evaluation: automatically test all 5 queries, enforce unique results, and add LLM-based scoring commentary (1–5 relevance).

In [27]:
# Query Chains (Rewrite + Expand)

def build_query_rewriter(llm: AzureChatOpenAI) -> LLMChain:
    """Create the chain that rewrites, and if needed enriches, a learner query.

    The prompt expects ``completed`` and ``profile`` and instructs the model
    to return a single concise query, filling in reasonable assumptions when
    the input is vague.
    """
    enrich_template = """
You are an assistant for a course recommender system.
Rewrite the learner's info into a clear search query.

Completed courses: {completed}
User interests: {profile}

If input is vague or incomplete, expand it with reasonable assumptions
(e.g., "data science" -> "completed basics, looking for ML and visualization").
Return a single concise enriched query.
"""
    enrich_prompt = PromptTemplate(
        template=enrich_template,
        input_variables=["profile", "completed"],
    )
    return LLMChain(prompt=enrich_prompt, llm=llm)

In [28]:
def build_eval_chain(llm: AzureChatOpenAI) -> LLMChain:
    """Create the chain that asks the LLM to grade a set of recommendations.

    The prompt expects ``query`` (the learner's request) and
    ``recommendations`` (the suggested courses) and asks for a 1-5 relevance
    score with a short comment, returned as JSON.
    """
    grading_template = """
You are evaluating a course recommendation system.

User Query: {query}
Recommended Courses: {recommendations}

Rate how relevant these courses are on a scale of 1 (poor) to 5 (excellent).
Provide 1-2 sentences explaining the score.

Return JSON with keys: "score" and "comment".
"""
    grading_prompt = PromptTemplate(
        template=grading_template,
        input_variables=["query", "recommendations"],
    )
    return LLMChain(prompt=grading_prompt, llm=llm)

In [29]:
# Recommendation Logic

def recommend_courses(
    profile: str,
    completed_ids: Optional[List[str]],
    df: pd.DataFrame,
    vectorstore: Chroma,
    query_chain: LLMChain,
    top_k: int = 5
) -> List[Tuple[str, float, str]]:
    """Generate up to ``top_k`` unique recommendations from an LLM-enriched query.

    The learner's interests and completed course titles are handed to
    ``query_chain``, whose output is matched against the vector store.
    Completed courses and repeated course IDs are filtered out.

    Args:
        profile: Free-text description of the learner's interests; may be empty
            if ``completed_ids`` is not.
        completed_ids: IDs of courses already taken, or ``None``.
        df: Course frame with ``course_id`` and ``title`` columns.
        vectorstore: Store whose document metadata has ``course_id`` and ``title``.
        query_chain: Chain accepting ``profile`` and ``completed`` inputs.
        top_k: Maximum number of recommendations to return; must be >= 1.

    Returns:
        A list of ``(course_id, score, title)`` tuples in the order returned by
        ``similarity_search_with_score``. NOTE(review): the score is presumably
        a distance where lower means closer — confirm.

    Raises:
        ValueError: if both ``profile`` and ``completed_ids`` are empty, or if
            ``top_k`` is smaller than 1.
    """
    if not profile and not completed_ids:
        raise ValueError("Profile and completed courses cannot both be empty.")
    if top_k < 1:
        raise ValueError("top_k must be a positive integer.")

    completed = set(completed_ids or [])

    # IDs absent from the dataset can neither be described to the LLM nor
    # filtered from the results, so report them instead of ignoring them.
    unknown_ids = sorted(completed - set(df["course_id"]), key=str)
    if unknown_ids:
        logger.warning("Completed course IDs not found in dataset: %s", unknown_ids)

    completed_titles = df[df["course_id"].isin(list(completed))]["title"].tolist()

    # Expand + rewrite query
    rewritten_query = query_chain.run({
        "profile": profile or "",
        "completed": ", ".join(completed_titles) if completed_titles else "None"
    })
    logger.info("🔎 Expanded Query: %s", rewritten_query)

    # Perform semantic search, over-fetching so that dropping duplicates and
    # completed courses still leaves enough candidates to fill top_k.
    results = vectorstore.similarity_search_with_score(
        rewritten_query, k=top_k * 3 + len(completed)
    )

    # Deduplicate & filter: seeding `excluded` with the completed IDs lets one
    # set cover both checks.
    excluded = set(completed)
    recs = []
    for doc, score in results:
        cid = doc.metadata["course_id"]
        if cid in excluded:
            continue
        excluded.add(cid)
        recs.append((cid, score, doc.metadata["title"]))
        if len(recs) >= top_k:
            break

    return recs


In [30]:

#Evaluation Section

# Rebuild every component from scratch so this cell can run on its own.
# NOTE(review): this calls build_vectorstore a second time with the same
# collection name ("courses") — confirm it does not add a second copy of
# every course to the collection.
embeddings, llm = init_openai_clients()
url = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv"
courses = load_dataset(url)
vectorstore = build_vectorstore(courses, embeddings)
query_chain = build_query_rewriter(llm)
eval_chain = build_eval_chain(llm)

# (free-text profile, completed course IDs) pairs.
# NOTE(review): C101/C202/C303/C404 do not appear among the course IDs listed
# from the vector store earlier in this notebook (C001, C002, ...) — confirm
# the intended IDs.
test_profiles = [
    ("I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization.", ["C101"]),
    ("I know Azure basics and want to manage containers and build CI/CD pipelines.", ["C202"]),
    ("My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.", ["C303"]),
    ("I want to learn to build and deploy microservices with Kubernetes—what courses fit best?", ["C404"]),
    ("Blockchain and smart contracts.", []),  # intentionally incomplete
    ("Data.", []), # incomplete input information
]

# One dict per profile; turned into a DataFrame after the loop.
eval_results = []

for profile, completed in test_profiles:
    recs = recommend_courses(profile, completed, courses, vectorstore, query_chain)
    rec_ids = [cid for cid, _, _ in recs]
    rec_titles = [title for _, _, title in recs]

    # Use LLM to evaluate relevance
    # The chain receives the original profile text and a comma-separated list
    # of recommended titles; its raw text output is stored unparsed.
    recommendations_text = ", ".join(rec_titles)
    eval_out = eval_chain.run({"query": profile, "recommendations": recommendations_text})

    eval_results.append({
        "User Query": profile,
        "Completed": completed,
        "Recommended IDs": rec_ids,
        "Recommended Titles": rec_titles,
        "Evaluation": eval_out
    })

eval_df = pd.DataFrame(eval_results)
eval_df


2025-10-01 12:09:55,886 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-08-01-preview "HTTP/1.1 200 OK"
2025-10-01 12:09:58,251 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/gpt4o/chat/completions?api-version=2024-08-01-preview "HTTP/1.1 200 OK"
2025-10-01 12:09:58,253 | INFO | 🔎 Expanded Query: "Looking for advanced courses in data visualization and machine learning, having completed 'Python Programming for Data Science'."
2025-10-01 12:09:58,535 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-08-01-preview "HTTP/1.1 200 OK"
2025-10-01 12:09:59,510 | INFO | HTTP Request: POST https://eastus.api.cognitive.microsoft.com/openai/deployments/gpt4o/chat/completions?api-version=2024-08-01-preview "HTTP/1.1 200 OK"
2025-10-01 12:10:00,229 | INFO | HTTP Request: POST 

Unnamed: 0,User Query,Completed,Recommended IDs,Recommended Titles,Evaluation
0,I’ve completed the ‘Python Programming for Dat...,[C101],"[C016, C001, C011, C004, C017]","[Python Programming for Data Science, Foundati...","```json\n{\n ""score"": 4,\n ""comment"": ""The r..."
1,I know Azure basics and want to manage contain...,[C202],"[C007, C008, C009, C006, C025]","[Cloud Computing with Azure, DevOps Practices ...","```json\n{\n ""score"": 4,\n ""comment"": ""The r..."
2,My background is in ML fundamentals; I’d like ...,[C303],"[C001, C025, C003, C002, C004]","[Foundations of Machine Learning, MLOps: Produ...","```json\n{\n ""score"": 4,\n ""comment"": ""The r..."
3,I want to learn to build and deploy microservi...,[C404],"[C009, C010, C007, C008, C011]","[Containerization with Docker and Kubernetes, ...","```json\n{\n ""score"": 4,\n ""comment"": ""The r..."
4,Blockchain and smart contracts.,[],"[C023, C022, C001, C024, C021]","[Blockchain Technology and Smart Contracts, In...","```json\n{\n ""score"": 3,\n ""comment"": ""The r..."
5,Data.,[],"[C001, C016, C011, C004, C014]","[Foundations of Machine Learning, Python Progr...","```json\n{\n ""score"": 5,\n ""comment"": ""All r..."


In [31]:
#Pretty Print Example from eval_df
# ================================================================

def print_eval_example(df: pd.DataFrame, idx: int = 0):
    """Pretty-print one row of the evaluation table.

    Shows the user query, the completed courses, a numbered list of the
    recommended titles and the stored evaluation text, framed by two
    80-character rules.

    Raises:
        IndexError: if ``idx`` is not smaller than the number of rows.
    """
    if not idx < len(df):
        raise IndexError("Index out of range for eval_df")

    record = df.iloc[idx]
    rule = "=" * 80
    numbered_titles = [
        f"   {position}. {name}"
        for position, name in enumerate(record["Recommended Titles"], 1)
    ]

    print(rule)
    print(f"📝 User Query: {record['User Query']}")
    print(f"✅ Completed Courses: {record['Completed']}")
    print("\n🎯 Recommended Courses:")
    for entry in numbered_titles:
        print(entry)
    print("\n🔍 Evaluation:")
    print(record["Evaluation"])
    print(rule)


# Example: print the first test case
print_eval_example(eval_df, idx=0)


📝 User Query: I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization.
✅ Completed Courses: ['C101']

🎯 Recommended Courses:
   1. Python Programming for Data Science
   2. Foundations of Machine Learning
   3. Big Data Analytics with Spark
   4. Computer Vision and Image Processing
   5. R Programming and Statistical Analysis

🔍 Evaluation:
```json
{
  "score": 4,
  "comment": "The recommended courses are generally relevant, especially 'Foundations of Machine Learning' and 'Big Data Analytics with Spark', which build on data science skills. However, 'R Programming and Statistical Analysis' may be less relevant given the user's interest in data visualization specifically."
}
```


In [32]:
print_eval_example(eval_df, idx=4)

📝 User Query: Blockchain and smart contracts.
✅ Completed Courses: []

🎯 Recommended Courses:
   1. Blockchain Technology and Smart Contracts
   2. Internet of Things (IoT) Development
   3. Foundations of Machine Learning
   4. Augmented and Virtual Reality Development
   5. Cybersecurity Fundamentals

🔍 Evaluation:
```json
{
  "score": 3,
  "comment": "The recommended course 'Blockchain Technology and Smart Contracts' is highly relevant to the user's query, while the other courses are less directly related. The inclusion of IoT, machine learning, and AR/VR may provide some interdisciplinary insights, but they do not specifically address blockchain or smart contracts."
}
```


Here’s how it works step by step:

User input (profile + completed courses) → sent to the LLM query rewriter.

Purpose: rewrite / expand vague input into a clean search query string.

Output is just text, never course IDs.

That rewritten query is then sent to:

results = vectorstore.similarity_search_with_score(rewritten_query, k=top_k * 3)


This calls ChromaDB with embeddings to retrieve the most similar course vectors.

Each returned result is a (doc, score) pair where doc.metadata contains course_id and title.

The filtering step enforces deduplication and removes already completed courses:

seen, recs = set(), []
for doc, score in results:
    cid = doc.metadata["course_id"]
    if cid not in seen and cid not in (completed_ids or []):
        seen.add(cid)
        recs.append((cid, score, doc.metadata["title"]))
    if len(recs) >= top_k:
        break


✅ This ensures the only courses recommended are those indexed in ChromaDB.

The LLM evaluation chain is used only for scoring relevance & commentary, not for generating course IDs.

Example: It gets passed a list of already-chosen titles and says "score": 4, "comment": "Aligned with CI/CD but slightly generic.".


Improvements Delivered

Unique Top-K Recommendations: enforced with seen set.

Query Expansion: vague queries rewritten into rich ones by LLM.

Robustness:

Handles empty/missing inputs.

Deduplication of results.

Uses LLM for relevance evaluation automatically.

Evaluation Section: table with query, recommendations, and LLM-generated commentary.

### Thank You!