In [1]:
%pip install sentence_transformers faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [3]:
# !pip install pandas sentence-transformers faiss-cpu
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from typing import List, Tuple

# Load dataset
url = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB02-training/refs/heads/main/Assignments/assignment2dataset.csv"
df = pd.read_csv(url)
df = df.dropna(subset=['course_id', 'title', 'description'])

# Combine title + description for better context
df["text"] = df["title"] + ". " + df["description"]

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute course embeddings
course_embeddings = model.encode(df["text"].tolist(), show_progress_bar=True)
course_embeddings = np.array(course_embeddings).astype("float32")

# Index embeddings using FAISS
dimension = course_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(course_embeddings)

# Mapping from index to course_id
id_mapping = df["course_id"].tolist()

# Recommendation function
def recommend_courses(profile: str, completed_ids: List[str], top_k=5) -> List[Tuple[str, float]]:
    query_embedding = model.encode([profile])[0].astype("float32")
    distances, indices = index.search(np.array([query_embedding]), top_k + len(completed_ids))
    recommendations = []
    for i, dist in zip(indices[0], distances[0]):
        course_id = id_mapping[i]
        if course_id not in completed_ids:
            similarity = 1 / (1 + dist)
            recommendations.append((course_id, round(similarity, 4)))
        if len(recommendations) >= top_k:
            break
    return recommendations

# Display helper
def print_recommendations(profile: str, completed_ids: List[str]):
    print(f" Profile: {profile}")
    print(f" Completed: {completed_ids}")
    results = recommend_courses(profile, completed_ids)
    print("\n Top-5 Course Recommendations:")
    for course_id, score in results:
        course_info = df[df["course_id"] == course_id].iloc[0]
        print(f"- {course_info['title']} (ID: {course_id}, Score: {score})")

# ----------------------------
# Test Cases
# ----------------------------

test_profiles = [
    {
        "profile": "I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization.",
        "completed": ["DS101"]
    },
    {
        "profile": "I know Azure basics and want to manage containers and build CI/CD pipelines.",
        "completed": ["AZ101"]
    },
    {
        "profile": "My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.",
        "completed": ["ML201"]
    },
    {
        "profile": "I want to learn to build and deploy microservices with Kubernetes—what courses fit best?",
        "completed": ["CN101"]
    },
    {
        "profile": "I’m interested in blockchain and smart contracts but have no prior experience.",
        "completed": []
    },
]

# Run all test cases
for case in test_profiles:
    print_recommendations(case["profile"], case["completed"])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.76it/s]

 Profile: I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization.
 Completed: ['DS101']

 Top-5 Course Recommendations:
- Python Programming for Data Science (ID: C016, Score: 0.6652)
- R Programming and Statistical Analysis (ID: C017, Score: 0.5432)
- Data Visualization with Tableau (ID: C014, Score: 0.4931)
- Foundations of Machine Learning (ID: C001, Score: 0.4729)
- Big Data Analytics with Spark (ID: C011, Score: 0.4624)
 Profile: I know Azure basics and want to manage containers and build CI/CD pipelines.
 Completed: ['AZ101']

 Top-5 Course Recommendations:
- Cloud Computing with Azure (ID: C007, Score: 0.568)
- DevOps Practices and CI/CD (ID: C008, Score: 0.5403)
- Containerization with Docker and Kubernetes (ID: C009, Score: 0.5311)
- MLOps: Productionizing Machine Learning (ID: C025, Score: 0.4568)
- Data Engineering on AWS (ID: C006, Score: 0.452)
 Profile: My background is in ML fundamentals; I’d like to specialize in neural networks and


