In [1]:
# Setup

# Install NLTK and other packages
!pip list | grep nltk
! pip install -U kaleido
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import nltk




nltk.download('punkt')  
nltk.download('wordnet')  

# Unzip per this stackoverflow: https://stackoverflow.com/questions/73849624/getting-error-while-submitting-notebook-on-kaggle-even-after-importing-nltk-libr
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

nltk                               3.2.4
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/word

In [2]:
# Data cleaning and normalization

# Normalize/clean course data to the name, topic, link, text format for now

dataMit = pd.read_csv("/kaggle/input/dataset-of-1200-coursera-courses/MIT ocw.csv")
dataMit.columns = map(str.lower, dataMit.columns)
dataMit.rename(columns={'name ': 'name'}, inplace=True)
dataMit.rename(columns={'course link': 'link'}, inplace=True)
dataMit['text'] = dataMit['name'] + " " + dataMit['topic'] 
dataMit['provider'] = 'Massachussets Institute of Technology'
dataMit = dataMit[['name', 'topic', 'link', 'provider', 'text']]


dataHarvard = pd.read_csv("/kaggle/input/dataset-of-1200-coursera-courses/Harvard_university.csv")
dataHarvard.columns = map(str.lower, dataHarvard.columns)
dataHarvard.rename(columns={'link to course': 'link', 'about': 'topic'}, inplace=True)
dataHarvard = dataHarvard[dataHarvard['price'] == 'Free']
dataHarvard['text'] = dataHarvard['name'] + " " + dataHarvard['topic'] 
dataHarvard['provider'] = 'Harvard University'
dataHarvard = dataHarvard[['name', 'topic', 'link', 'provider', 'text']]


dataEdx = pd.read_csv("/kaggle/input/edx-courses-dataset-2021/EdX.csv")
dataEdx.columns = map(str.lower, dataEdx.columns)
dataEdx["topic"] = dataEdx['about'] + '. ' + dataEdx['course description']
dataEdx["provider"] = 'edX - ' + dataEdx['university']
dataEdx['text'] = dataEdx['name'] + " " + dataEdx["topic"]
dataEdx = dataEdx[['name', 'topic', 'link', 'provider', 'text']]


dataUdemy = pd.read_csv("/kaggle/input/udemy-course-dataset-categories-ratings-and-trends/udemy_courses.csv")
dataUdemy.columns = map(str.lower, dataUdemy.columns)
dataUdemy.rename(columns={
    'title': 'name',
    'headline': 'topic',
    'url': 'link',
}, inplace=True)
# only keep free courses
dataUdemy = dataUdemy[dataUdemy['is_paid'] == False]
# Since Udemy courses are user generated, filter only courses with rating over 4.5
dataUdemy['provider'] = 'Udemy'
dataUdemy = dataUdemy[dataUdemy['rating'] > 4.5 ]
dataUdemy['text'] = dataUdemy['name'] + " " + dataUdemy['topic']
dataUdemy = dataUdemy[['name', 'topic', 'link', 'provider', 'text']]


dataCoursera = pd.read_csv("/kaggle/input/coursera-free-courses-dataset/coursera.csv")
dataCoursera.rename(columns={
    'title': 'name',
    'skills': 'topic',
    'url': 'link',
}, inplace=True)
dataCoursera = dataCoursera[dataCoursera['price'] == 'Free']
dataCoursera['text'] = dataCoursera['name'] + " " + np.where(pd.notna(dataCoursera['topic']), dataCoursera['topic'], "")

dataCoursera['provider'] = 'Coursera - ' + dataCoursera['course_by']
dataCoursera = dataCoursera[['name', 'topic', 'link', 'provider', 'text']]

In [3]:
def clean_text(text):
    lemma = WordNetLemmatizer() # lemmatizer
    text = re.sub("[^A-Za-z0-9 ]", "", text)
    text = text.lower()
    tokens = word_tokenize(text) # look into this tokenization
    tokens = [lemma.lemmatize(word) for word in tokens # lemmatize words and remove stopwords 
                if word not in stopwords.words("english")]
    return " ".join(tokens) # SBERT rrequires joined tokens

#Combine and clean data
data = pd.concat([dataUdemy, dataMit, dataHarvard, dataEdx, dataCoursera])
data['cleaned_text'] = data['text'].apply(clean_text) # Add clean text column to dataframe

# Drop non-english courses
indices_to_drop = [index for index, row in data.iterrows() if bool(re.search(r'[^\x00-\x7F\u2000-\u206F\u2600-\u26FF\u2700-\u27BF]', str(row['text'])))]
data = data.drop(indices_to_drop)

data.head()

Unnamed: 0,name,topic,link,provider,text,cleaned_text
26443,Stock Market Foundations,"The Market isn't a Mystery, It’s a Playground....",https://www.udemy.com/course/how-to-invest-in-...,Udemy,Stock Market Foundations The Market isn't a My...,stock market foundation market isnt mystery pl...
26445,The Complete Course On Understanding Blockchai...,A Beginner's Guide to Authentic Knowledge on B...,https://www.udemy.com/course/understanding-blo...,Udemy,The Complete Course On Understanding Blockchai...,complete course understanding blockchain techn...
26446,Bitcoin or How I Learned to Stop Worrying and ...,The definitive guide to understand what the bi...,https://www.udemy.com/course/bitcoin-or-how-i-...,Udemy,Bitcoin or How I Learned to Stop Worrying and ...,bitcoin learned stop worrying love crypto defi...
26448,Blockchain cryptocurrency course 101 for absol...,A complete guide to anyone who wants to really...,https://www.udemy.com/course/blockchain-crypto...,Udemy,Blockchain cryptocurrency course 101 for absol...,blockchain cryptocurrency course 101 absolute ...
26449,Trading Options For Consistent Returns: Option...,The Foundation For Consistency,https://www.udemy.com/course/trading-options-f...,Udemy,Trading Options For Consistent Returns: Option...,trading option consistent return option basic ...


In [4]:
# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Get a list of the document embedding vector for each sentence in the cleaned text data. The indices will be aligned with the original course rows in dataframe
document_embeddings = model.encode(data['cleaned_text'].tolist())

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/164 [00:00<?, ?it/s]

In [5]:
# Export a csv with embeddings for fastapi
data = data[['name', 'topic', 'link', 'provider']]
data.to_csv('courses.csv', index=False)
embeddings = pd.DataFrame(document_embeddings)
embeddings.to_csv('embeddings.csv', index=False)

In [8]:
# Use previous functions to process user input into vector and use cosine 
# Cosine Similarity to find the most related courses
def recommend_courses(user_input, document_embeddings, data, model, top_n=5):
    cleaned_input = clean_text(user_input)
    input_embedding = model.encode([cleaned_input]) # Model must be initialized
    similarities = cosine_similarity(input_embedding, document_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    recommendations = data.iloc[top_indices][['name', 'topic', 'link', 'provider']]
    return recommendations


user_input = "Python Data Analytics"
recommendations = recommend_courses(user_input, document_embeddings, data, model)
recommendations.head()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,name,topic,link,provider
65976,Marketing Analytics with Python: From Data to ...,Beginner to Advanced,https://www.udemy.com/course/python-for-market...,Udemy
61,Introduction to Data Science with Python,Join Harvard University instructor Pavlos Prot...,https://pll.harvard.edu/course/introduction-da...,Harvard University
780,Data Processing Using Python,"Computer Programming, Python Programming, Comp...",https://www.coursera.org/learn/python-data-pro...,Coursera - Nanjing University
309,Analytics in Python,Learn the fundamental of programming in Python...,https://www.edx.org/course/analytics-in-python,edX - Columbia University
138,Probability and Statistics in Data Science usi...,"Using Python, learn statistical and probabilis...",https://www.edx.org/course/probability-and-sta...,"edX - The University of California, San Diego"


In [9]:
# Testing qwen workflow setup
# For Qwen 1.5b inference

# Imports + installs

import gc
import torch
from IPython.display import display, Markdown, Latex, HTML
import time
import re
from transformers import AutoModelForCausalLM, AutoTokenizer

!pip install mistletoe
import mistletoe

torch.cuda.empty_cache()
gc.collect()

torch.cuda.empty_cache()  # Clears unused cached memory
torch.cuda.ipc_collect()  # Collects unused memory

print("Using GPU:", torch.cuda.get_device_name(0))
print(f'\n\nMemory Usage:')
print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Collecting mistletoe
  Downloading mistletoe-1.4.0-py3-none-any.whl.metadata (1.2 kB)
Downloading mistletoe-1.4.0-py3-none-any.whl (51 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mistletoe
Successfully installed mistletoe-1.4.0
Using GPU: Tesla P100-PCIE-16GB


Memory Usage:
Allocated: 0.1 GB
Cached:    0.1 GB


In [10]:
# Inference LLM/setup


# Load the Qwen 1.5b model
model_name = "/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-1.5b/2"
model_qwen = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the Qwen query function
def ask_model(system="You are a search query optimizer.", prompt="Optimize this search query:"):
    messages = [{"role": "system", "content": system}, {"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model_qwen.device)
    generated_ids = model_qwen.generate(**model_inputs, max_new_tokens=3000, pad_token_id=tokenizer.eos_token_id)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

# Example usage
user_input = "Chaucer and Middle English Literature"
optimized_query = ask_model(prompt=f"Optimize this search query: {user_input}")
recommendations = recommend_courses(optimized_query, document_embeddings, data, model_bert)
print(recommendations)



KeyboardInterrupt: 

* https://huggingface.co/docs/transformers/en/model_doc/bert