In [6]:
# Setup

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

# Install NLTK and other packages
!pip list | grep nltk
! pip install -U kaleido
import nltk

nltk.download('punkt')  
nltk.download('wordnet')  

# Unzip per this stackoverflow: https://stackoverflow.com/questions/73849624/getting-error-while-submitting-notebook-on-kaggle-even-after-importing-nltk-libr
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

nltk                               3.2.4
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [9]:
# Data cleaning and normalization

# Normalize/clean course data to the name, topic, link, text format for now

dataMit = pd.read_csv("/kaggle/input/dataset-of-1200-coursera-courses/MIT ocw.csv")
dataMit.columns = map(str.lower, dataMit.columns)
dataMit.rename(columns={'name ': 'name'}, inplace=True)
dataMit.rename(columns={'course link': 'link'}, inplace=True)
dataMit['text'] = dataMit['name'] + " " + dataMit['topic'] 
dataMit['provider'] = 'Massachussets Institute of Technology'
dataMit = dataMit[['name', 'topic', 'link', 'provider', 'text']]


dataHarvard = pd.read_csv("/kaggle/input/dataset-of-1200-coursera-courses/Harvard_university.csv")
dataHarvard.columns = map(str.lower, dataHarvard.columns)
dataHarvard.rename(columns={'link to course': 'link', 'about': 'topic'}, inplace=True)
dataHarvard = dataHarvard[dataHarvard['price'] == 'Free']
dataHarvard['text'] = dataHarvard['name'] + " " + dataHarvard['topic'] 
dataHarvard['provider'] = 'Harvard University'
dataHarvard = dataHarvard[['name', 'topic', 'link', 'provider', 'text']]


dataEdx = pd.read_csv("/kaggle/input/edx-courses-dataset-2021/EdX.csv")
dataEdx.columns = map(str.lower, dataEdx.columns)
dataEdx["topic"] = dataEdx['about'] + '. ' + dataEdx['course description']
dataEdx["provider"] = 'edX - ' + dataEdx['university']
dataEdx['text'] = dataEdx['name'] + " " + dataEdx["topic"]
dataEdx = dataEdx[['name', 'topic', 'link', 'provider', 'text']]


dataUdemy = pd.read_csv("/kaggle/input/udemy-course-dataset-categories-ratings-and-trends/udemy_courses.csv")
dataUdemy.columns = map(str.lower, dataUdemy.columns)
dataUdemy.rename(columns={
    'title': 'name',
    'headline': 'topic',
    'url': 'link',
}, inplace=True)
# only keep free courses
dataUdemy = dataUdemy[dataUdemy['is_paid'] == False]
# Since Udemy courses are user generated, filter only courses with rating over 4.5
dataUdemy['provider'] = 'Udemy'
dataUdemy = dataUdemy[dataUdemy['rating'] > 4.5 ]
dataUdemy['text'] = dataUdemy['name'] + " " + dataUdemy['topic']
dataUdemy = dataUdemy[['name', 'topic', 'link', 'provider', 'text']]


dataCoursera = pd.read_csv("/kaggle/input/coursera-free-courses-dataset/coursera.csv")
dataCoursera.rename(columns={
    'title': 'name',
    'skills': 'topic',
    'url': 'link',
}, inplace=True)
dataCoursera = dataCoursera[dataCoursera['price'] == 'Free']
dataCoursera['text'] = dataCoursera['name'] + " " + np.where(pd.notna(dataCoursera['topic']), dataCoursera['topic'], "")

dataCoursera['provider'] = 'Coursera - ' + dataCoursera['course_by']
dataCoursera = dataCoursera[['name', 'topic', 'link', 'provider', 'text']]
dataCoursera = dataCoursera.fillna("") # Fill null values



                                                    name  \
26443                           Stock Market Foundations   
26445  The Complete Course On Understanding Blockchai...   
26446  Bitcoin or How I Learned to Stop Worrying and ...   
26448  Blockchain cryptocurrency course 101 for absol...   
26449  Trading Options For Consistent Returns: Option...   

                                                   topic  \
26443  The Market isn't a Mystery, It’s a Playground....   
26445  A Beginner's Guide to Authentic Knowledge on B...   
26446  The definitive guide to understand what the bi...   
26448  A complete guide to anyone who wants to really...   
26449                     The Foundation For Consistency   

                                                    link provider  \
26443  https://www.udemy.com/course/how-to-invest-in-...    Udemy   
26445  https://www.udemy.com/course/understanding-blo...    Udemy   
26446  https://www.udemy.com/course/bitcoin-or-how-i-...    Udemy   
26

In [None]:
def clean_text(text):
    lemma = WordNetLemmatizer() # lemmatizer
    text = re.sub("[^A-Za-z0-9 ]", "", text)
    text = text.lower()
    tokens = word_tokenize(text) # look into this tokenization
    tokens = [lemma.lemmatize(word) for word in tokens # lemmatize words and remove stopwords 
                if word not in stopwords.words("english")]
    return " ".join(tokens) # SBERT rrequires joined tokens

#Combine and clean data
data = pd.concat([dataUdemy, dataMit, dataHarvard, dataEdx, dataCoursera])
data['cleaned_text'] = data['text'].apply(clean_text) # Add clean text column to dataframe
data.head()

In [13]:
# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Get a list of the document embedding vector for each sentence in the cleaned text data. The indices will be aligned with the original course rows in dataframe
document_embeddings = model.encode(data['cleaned_text'].tolist())

Batches:   0%|          | 0/188 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
# Use previous functions to process user input into vector and use cosine 
# Cosine Similarity to find the most related courses
def recommend_courses(user_input, document_embeddings, data, model, top_n=5):
    cleaned_input = clean_text(user_input)
    input_embedding = model.encode([cleaned_input]) # Model must be initialized
    similarities = cosine_similarity([input_embedding], document_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    recommendations = data.iloc[top_indices][['name', 'topic', 'link', 'provider']]
    return recommendations


user_input = "Lagrange Multipliers"
recommendations = recommend_courses(user_input, document_embeddings, data, model)
recommendations.head()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

ValueError: Found array with dim 3. check_pairwise_arrays expected <= 2.

# Important Notes

* Original Source: https://www.kaggle.com/code/shtrausslearning/nlp-edx-course-recommendations
* DeepSeek AI for original base which also calls the document embeddings for the input
* I have added comments to better study and understand the code as a base to build off of
* Instead of directly matching to a course index in the dataset which limits the use of the model

# Todo (to better understand and be able to present on this topic)

* Study cosine similarity\
* Word2Vec and stopwords/lemmatization - switched to SBERT - be able to explain this
* Coursera, futurelearn, udemy
* Build a frontend for the app

# Improvements

reshaped corpus as list instead of series, increased context window, 