Import.env and dataset.csv file

In [5]:
from google.colab import files

print("📂 Please choose your `.env` file...")
uploaded_env = files.upload()

print("📂 Now choose your `assignment2dataset.csv` file...")
uploaded_csv = files.upload()


📂 Please choose your `.env` file...


Saving embedding.env to embedding (1).env
📂 Now choose your `assignment2dataset.csv` file...


Saving assignment2dataset.csv to assignment2dataset (1).csv


Load .env file

In [7]:
from dotenv import load_dotenv
import os

# Load  .env file
load_dotenv("embedding.env")


True

Azure OpenAI Configuration

In [22]:
# Set Azure OpenAI config
import openai

openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_ENDPOINT")
openai.api_version = os.getenv("OPENAI_API_VERSION")
embedding_deployment = os.getenv("DEPLOYMENT_NAME")

print("Azure OpenAI config loaded:")
print(f"Base URL         : {openai.api_base}")
print(f"API Version      : {openai.api_version}")
print(f"Deployment Name  : {embedding_deployment}")

Azure OpenAI config loaded:
Base URL         : https://courserecommendation.openai.azure.com/
API Version      : 2023-05-15
Deployment Name  : text-embedding-ada-002


In [10]:
# Check the actual columns in your CSV
print("Available columns in the dataset:")
print(df.columns.tolist())



Available columns in the dataset:
['course_id', 'title', 'description']


Load Dataset and combine columns for better context

In [12]:
import pandas as pd

# Get uploaded file name
dataset_filename = list(uploaded_csv.keys())[0]

# Load the CSV
df = pd.read_csv(dataset_filename)

# Combine 'title' and 'description' for better embedding context
df['text'] = df['title'] + " " + df['description']

print("Dataset loaded and combined text column created!")
df.head()


Dataset loaded and combined text column created!


Unnamed: 0,course_id,title,description,text
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...,Foundations of Machine Learning Understand fou...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...,Deep Learning with TensorFlow and Keras Explor...
2,C003,Natural Language Processing Fundamentals,Dive into NLP techniques for processing and un...,Natural Language Processing Fundamentals Dive ...
3,C004,Computer Vision and Image Processing,Learn the principles of computer vision and im...,Computer Vision and Image Processing Learn the...
4,C005,Reinforcement Learning Basics,Get introduced to reinforcement learning parad...,Reinforcement Learning Basics Get introduced t...


In [24]:
# Function to get embedding using Azure OpenAI
def get_embedding(text, engine=embedding_deployment):
    response = openai.Embedding.create(
        input=[text],
        engine=engine
    )
    return response["data"][0]["embedding"]

# Apply the embedding function to each course text
import time

print("⏳ Generating embeddings. Please wait...")

df['embedding'] = df['text'].apply(lambda x: get_embedding(x))
time.sleep(1)  # Add delay if needed to avoid hitting rate limits

print("Embeddings generated for all courses!")
df[['course_id', 'title', 'embedding']].head()


⏳ Generating embeddings. Please wait...
Embeddings generated for all courses!


Unnamed: 0,course_id,title,embedding
0,C001,Foundations of Machine Learning,"[0.00013334676623344421, 0.009072089567780495,..."
1,C002,Deep Learning with TensorFlow and Keras,"[-0.009982207790017128, -0.0030571341048926115..."
2,C003,Natural Language Processing Fundamentals,"[0.001481331535615027, 0.01443442888557911, 0...."
3,C004,Computer Vision and Image Processing,"[3.342208219692111e-05, 0.011540415696799755, ..."
4,C005,Reinforcement Learning Basics,"[0.006548785604536533, 0.005374978296458721, 0..."


In [25]:
import pickle

with open("course_embeddings.pkl", "wb") as f:
    pickle.dump(df, f)

print("Embeddings saved to 'course_embeddings.pkl'")


Embeddings saved to 'course_embeddings.pkl'


In [26]:
# Convert embeddings to string for saving in CSV
df['embedding_str'] = df['embedding'].apply(lambda x: ','.join(map(str, x)))
df[['course_id', 'title', 'description', 'embedding_str']].to_csv("course_embeddings.csv", index=False)

print("Embeddings saved to 'course_embeddings.csv'")


Embeddings saved to 'course_embeddings.csv'


In [27]:
import numpy as np

# Load the saved DataFrame from the pickle file
with open("course_embeddings.pkl", "rb") as f:
    df = pickle.load(f)

print("Pickle file loaded successfully!")


Pickle file loaded successfully!


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Prepare matrix of embeddings for similarity calculation
embedding_matrix = np.vstack(df['embedding'].values)
course_ids = df['course_id'].tolist()


In [19]:
def recommend_courses(profile: str, completed_ids: list, top_k: int = 5):
    # Get embedding for the user profile
    profile_embedding = get_embedding(profile)

    # Compute cosine similarities
    similarities = cosine_similarity([profile_embedding], embedding_matrix)[0]

    # Create DataFrame with scores
    sim_df = pd.DataFrame({
        'course_id': course_ids,
        'similarity': similarities
    })

    # Exclude completed courses
    sim_df = sim_df[~sim_df['course_id'].isin(completed_ids)]

    # Return top-k most similar courses
    top_recommendations = sim_df.sort_values(by='similarity', ascending=False).head(top_k)

    return top_recommendations.merge(df[['course_id', 'title', 'description']], on='course_id')


In [28]:
# Define sample user queries
test_queries = [
    {
        "query": "I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?",
        "completed_ids": ["C101"]  # Replace with actual ID if available
    },
    {
        "query": "I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.",
        "completed_ids": ["C102"]
    },
    {
        "query": "My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.",
        "completed_ids": ["C103"]
    },
    {
        "query": "I want to learn to build and deploy microservices with Kubernetes—what courses fit best?",
        "completed_ids": ["C104"]
    },
    {
        "query": "I’m interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?",
        "completed_ids": []
    }
]

# Run recommendations for each query
for i, test in enumerate(test_queries, 1):
    print(f"\nTest Profile {i}: {test['query']}")
    result = recommend_courses(test['query'], test['completed_ids'])
    display(result[['course_id', 'title', 'similarity']])



Test Profile 1: I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?


Unnamed: 0,course_id,title,similarity
0,C016,Python Programming for Data Science,0.833197
1,C011,Big Data Analytics with Spark,0.805967
2,C014,Data Visualization with Tableau,0.795988
3,C004,Computer Vision and Image Processing,0.787141
4,C017,R Programming and Statistical Analysis,0.785616



Test Profile 2: I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.


Unnamed: 0,course_id,title,similarity
0,C007,Cloud Computing with Azure,0.839967
1,C009,Containerization with Docker and Kubernetes,0.831279
2,C008,DevOps Practices and CI/CD,0.821829
3,C025,MLOps: Productionizing Machine Learning,0.799056
4,C010,APIs and Microservices Architecture,0.792597



Test Profile 3: My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.


Unnamed: 0,course_id,title,similarity
0,C025,MLOps: Productionizing Machine Learning,0.840254
1,C002,Deep Learning with TensorFlow and Keras,0.817367
2,C004,Computer Vision and Image Processing,0.807474
3,C003,Natural Language Processing Fundamentals,0.807295
4,C001,Foundations of Machine Learning,0.807115



Test Profile 4: I want to learn to build and deploy microservices with Kubernetes—what courses fit best?


Unnamed: 0,course_id,title,similarity
0,C009,Containerization with Docker and Kubernetes,0.881675
1,C010,APIs and Microservices Architecture,0.833318
2,C008,DevOps Practices and CI/CD,0.810695
3,C025,MLOps: Productionizing Machine Learning,0.80958
4,C007,Cloud Computing with Azure,0.808407



Test Profile 5: I’m interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?


Unnamed: 0,course_id,title,similarity
0,C023,Blockchain Technology and Smart Contracts,0.851611
1,C010,APIs and Microservices Architecture,0.754336
2,C009,Containerization with Docker and Kubernetes,0.748929
3,C013,NoSQL Databases and MongoDB,0.748423
4,C022,Internet of Things (IoT) Development,0.747743


In [29]:
# Accept user profile and completed course IDs
user_profile = input("📥 Describe your learning goal (e.g., 'I want to master DevOps and containers using Azure'): ")

completed_input = input("📄 Enter completed course IDs (comma-separated), or leave blank if none: ")
completed_ids = [cid.strip() for cid in completed_input.split(",")] if completed_input else []

# Run recommendation
recommendations = recommend_courses(user_profile, completed_ids)

# Display result
print("\nTop Course Recommendations:")
display(recommendations[['course_id', 'title', 'similarity']])


📥 Describe your learning goal (e.g., 'I want to master DevOps and containers using Azure'): I want to learn to build and deploy microservices with Kubernetes—what courses fit best?
📄 Enter completed course IDs (comma-separated), or leave blank if none: 

Top Course Recommendations:


Unnamed: 0,course_id,title,similarity
0,C009,Containerization with Docker and Kubernetes,0.881675
1,C010,APIs and Microservices Architecture,0.833318
2,C008,DevOps Practices and CI/CD,0.810695
3,C025,MLOps: Productionizing Machine Learning,0.80958
4,C007,Cloud Computing with Azure,0.808407


USING CHROMADB

In [30]:
import chromadb
from chromadb.config import Settings

client = chromadb.Client(Settings())

collection = client.create_collection(name="courses")

# Ingest course data into ChromaDB
for idx, row in df.iterrows():
    collection.add(
        documents=[row['text']],
        metadatas=[{"course_id": row['course_id'], "title": row['title']}],
        ids=[row['course_id']],
        embeddings=[row['embedding']]
    )


In [31]:
query_embedding = get_embedding("I want to learn about Azure DevOps and containers")

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

for match in results["metadatas"][0]:
    print(f"📘 Course: {match['title']} (ID: {match['course_id']})")


📘 Course: Containerization with Docker and Kubernetes (ID: C009)
📘 Course: DevOps Practices and CI/CD (ID: C008)
📘 Course: Cloud Computing with Azure (ID: C007)
📘 Course: MLOps: Productionizing Machine Learning (ID: C025)
📘 Course: APIs and Microservices Architecture (ID: C010)


In [33]:
# Create ChromaDB client and collection
client = chromadb.Client(Settings())
collection = client.get_or_create_collection(name="courses")

# Populate with course embeddings (only once)
for idx, row in df.iterrows():
    collection.add(
        documents=[row['text']],
        metadatas=[{
            "course_id": row['course_id'],
            "title": row['title'],
            "description": row['description']
        }],
        ids=[row['course_id']],
        embeddings=[row['embedding']]
    )

print("Course collection populated in ChromaDB")


Course collection populated in ChromaDB


In [34]:
# Get input from user
profile_input = input("Describe your learning goal: ")
completed_input = input("Enter completed course IDs (comma-separated, optional): ")
completed_ids = [cid.strip() for cid in completed_input.split(",")] if completed_input else []

# Get embedding for user query
user_embedding = get_embedding(profile_input)

# Query ChromaDB for top 5 results
results = collection.query(
    query_embeddings=[user_embedding],
    n_results=10  # Fetch more to allow filtering
)

# Filter out completed courses
filtered_results = []
for meta, score in zip(results["metadatas"][0], results["distances"][0]):
    if meta["course_id"] not in completed_ids:
        filtered_results.append((meta["course_id"], meta["title"], 1 - score))  # convert distance to similarity

# Show top 5
print("\nTop 5 Recommended Courses:")
for cid, title, sim in filtered_results[:5]:
    print(f"🔹 {title} (ID: {cid}) - Similarity: {sim:.4f}")


Describe your learning goal: I want to learn about Azure DevOps and containers
Enter completed course IDs (comma-separated, optional): 

Top 5 Recommended Courses:
🔹 Containerization with Docker and Kubernetes (ID: C009) - Similarity: 0.7106
🔹 DevOps Practices and CI/CD (ID: C008) - Similarity: 0.6836
🔹 Cloud Computing with Azure (ID: C007) - Similarity: 0.6662
🔹 MLOps: Productionizing Machine Learning (ID: C025) - Similarity: 0.6237
🔹 APIs and Microservices Architecture (ID: C010) - Similarity: 0.6035
