In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/multi-platform-online-courses-dataset/edx.csv
/kaggle/input/multi-platform-online-courses-dataset/skillshare.csv
/kaggle/input/multi-platform-online-courses-dataset/Udemy.csv
/kaggle/input/multi-platform-online-courses-dataset/Coursera.csv
/kaggle/input/edx-courses-dataset-2021/EdX.csv
/kaggle/input/dataset-of-1200-coursera-courses/edx.csv
/kaggle/input/dataset-of-1200-coursera-courses/Barkeley_extension.csv
/kaggle/input/dataset-of-1200-coursera-courses/Oxford.csv
/kaggle/input/dataset-of-1200-coursera-courses/Stanford.csv
/kaggle/input/dataset-of-1200-coursera-courses/udacity.csv
/kaggle/input/dataset-of-1200-coursera-courses/alison.csv
/kaggle/input/dataset-of-1200-coursera-courses/MIT ocw.csv
/kaggle/input/dataset-of-1200-coursera-courses/london school of economics.csv
/kaggle/input/dataset-of-1200-coursera-courses/coursera_update.csv
/kaggle/input/dataset-of-1200-coursera-courses/pluralsight.csv
/kaggle/input/dataset-of-1200-coursera-courses/futurelearn.csv
/kaggle/i

In [2]:
# Install NLTK
!pip list | grep nltk
! pip install -U kaleido
import nltk

nltk.download('punkt')  
nltk.download('wordnet')  

# Unzip per this stackoverflow: https://stackoverflow.com/questions/73849624/getting-error-while-submitting-notebook-on-kaggle-even-after-importing-nltk-libr
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

nltk                               3.2.4
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/word

In [3]:
# Setup

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

# Clean text 
def clean_text(text):
    lemma = WordNetLemmatizer() # lemmatizer
    text = re.sub("[^A-Za-z0-9 ]", "", text)
    text = text.lower()
    tokens = word_tokenize(text) # look into this tokenization
    tokens = [lemma.lemmatize(word) for word in tokens # lemmatize words and remove stopwords 
                if word not in stopwords.words("english")]
    return tokens


# Get the sentence embeddings for each course and user input with this function
# First get the word embeddings and average them out for the sentence (aka course/input)
# overall embedding

def get_document_embedding(doc, model):
    embeddings = [model.wv[word] for word in doc if word in model.wv] # Get individual embeddings into a list
    # Consider implementing exception handling 
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0) 
    else:
        return np.zeros(model.vector_size)

# Use previous functions to process user input into vector and use cosine 
# Similarity to find the most related courses
def recommend_courses(user_input, document_embeddings, data, top_n=5):
    cleaned_input = clean_text(user_input)
    input_embedding = get_document_embedding(cleaned_input, model)
    similarities = cosine_similarity([input_embedding], document_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    recommendations = data.iloc[top_indices][['name', 'topic', 'link', 'provider']]
    return recommendations

        

In [19]:
# Experiment with other data
# edx, coursera, harvard, mit ocw
# https://sparkbyexamples.com/pandas/pandas-read-multiple-csv-files/#:~:text=Load%20each%20file%20into%20individual,each%20file%20individually%20if%20needed.

# Normalize/clean course data to the name, topic, link, text format for now

dataMit = pd.read_csv("/kaggle/input/dataset-of-1200-coursera-courses/MIT ocw.csv")
# print(dataMit.head())
dataMit.columns = map(str.lower, dataMit.columns)
dataMit.rename(columns={'name ': 'name'}, inplace=True)
dataMit.rename(columns={'course link': 'link'}, inplace=True)
dataMit['text'] = dataMit['name'] + " " + dataMit['topic'] 
dataMit['provider'] = 'Massachussets Institute of Technology'
dataMit = dataMit[['name', 'topic', 'link', 'provider', 'text']]


dataHarvard = pd.read_csv("/kaggle/input/dataset-of-1200-coursera-courses/Harvard_university.csv")
# print(dataHarvard.head())
dataHarvard.columns = map(str.lower, dataHarvard.columns)
dataHarvard.rename(columns={'link to course': 'link', 'about': 'topic'}, inplace=True)
dataHarvard = dataHarvard[dataHarvard['price'] == 'Free']
dataHarvard['text'] = dataHarvard['name'] + " " + dataHarvard['topic'] 
dataHarvard['provider'] = 'Harvard University'
dataHarvard = dataHarvard[['name', 'topic', 'link', 'provider', 'text']]


dataEdx = pd.read_csv("/kaggle/input/edx-courses-dataset-2021/EdX.csv")
# print(dataEdx.head())
dataEdx.columns = map(str.lower, dataEdx.columns)
dataEdx["topic"] = dataEdx['about'] + '. ' + dataEdx['course description']
dataEdx["provider"] = 'edX - ' + dataEdx['university']
dataEdx['text'] = dataEdx['name'] + " " + dataEdx["topic"]
dataEdx = dataEdx[['name', 'topic', 'link', 'provider', 'text']]


# Udemy
dataUdemy = pd.read_csv("/kaggle/input/udemy-course-dataset-categories-ratings-and-trends/udemy_courses.csv")
dataUdemy.columns = map(str.lower, dataUdemy.columns)
dataUdemy.rename(columns={
    'title': 'name',
    'headline': 'topic',
    'url': 'link',
}, inplace=True)
# only keep free courses
dataUdemy = dataUdemy[dataUdemy['is_paid'] == False]
# Since Udemy courses are user generated, filter only courses with rating over 4.5
dataUdemy['provider'] = 'Udemy'
dataUdemy = dataUdemy[dataUdemy['rating'] > 4.5 ]
dataUdemy['text'] = dataUdemy['name'] + " " + dataUdemy['topic']
dataUdemy = dataUdemy[['name', 'topic', 'link', 'provider', 'text']]
print(dataUdemy.head())


# Coursera
dataCoursera = pd.read_csv("/kaggle/input/coursera-free-courses-dataset/coursera.csv")
dataCoursera.rename(columns={
    'title': 'name',
    'skills': 'topic',
    'url': 'link',
}, inplace=True)
dataCoursera = dataCoursera[dataCoursera['price'] == 'Free']
dataCoursera['text'] = dataCoursera['name'] + " " + np.where(pd.notna(dataCoursera['topic']), dataCoursera['topic'], "")

dataCoursera['provider'] = 'Coursera - ' + dataCoursera['course_by']
dataCoursera = dataCoursera[['name', 'topic', 'link', 'provider', 'text']]
dataCoursera = dataCoursera.fillna("") # Fill null values

# print(dataCoursera.head())

                                                    name  \
26443                           Stock Market Foundations   
26445  The Complete Course On Understanding Blockchai...   
26446  Bitcoin or How I Learned to Stop Worrying and ...   
26448  Blockchain cryptocurrency course 101 for absol...   
26449  Trading Options For Consistent Returns: Option...   

                                                   topic  \
26443  The Market isn't a Mystery, It’s a Playground....   
26445  A Beginner's Guide to Authentic Knowledge on B...   
26446  The definitive guide to understand what the bi...   
26448  A complete guide to anyone who wants to really...   
26449                     The Foundation For Consistency   

                                                    link provider  \
26443  https://www.udemy.com/course/how-to-invest-in-...    Udemy   
26445  https://www.udemy.com/course/understanding-blo...    Udemy   
26446  https://www.udemy.com/course/bitcoin-or-how-i-...    Udemy   
26

In [20]:
# Create a Combined Dataframe
data = pd.concat([dataUdemy, dataMit, dataHarvard, dataEdx, dataCoursera])
data.head()

Unnamed: 0,name,topic,link,provider,text
26443,Stock Market Foundations,"The Market isn't a Mystery, It’s a Playground....",https://www.udemy.com/course/how-to-invest-in-...,Udemy,Stock Market Foundations The Market isn't a My...
26445,The Complete Course On Understanding Blockchai...,A Beginner's Guide to Authentic Knowledge on B...,https://www.udemy.com/course/understanding-blo...,Udemy,The Complete Course On Understanding Blockchai...
26446,Bitcoin or How I Learned to Stop Worrying and ...,The definitive guide to understand what the bi...,https://www.udemy.com/course/bitcoin-or-how-i-...,Udemy,Bitcoin or How I Learned to Stop Worrying and ...
26448,Blockchain cryptocurrency course 101 for absol...,A complete guide to anyone who wants to really...,https://www.udemy.com/course/blockchain-crypto...,Udemy,Blockchain cryptocurrency course 101 for absol...
26449,Trading Options For Consistent Returns: Option...,The Foundation For Consistency,https://www.udemy.com/course/trading-options-f...,Udemy,Trading Options For Consistent Returns: Option...


In [61]:
# The list of tokenized sentences, ie our Corpus 
data['cleaned_text'] = data['text'].apply(clean_text) # Add clean text column to dataframe
data['cleaned_text'].head()

0    [energy, economics, science, economics, social...
1    [identity, difference, social, science, societ...
2    [single, variable, calculus, mathematics, diff...
3    [libertarianism, history, humanity, history, s...
4    [introductory, biology, science, health, medic...
Name: cleaned_text, dtype: object

In [62]:
# Corpus = list of tokenized sentences (already cleaned)
corpus = data['cleaned_text'].tolist()
print(corpus[0])

# Train Word2Vec
model = Word2Vec(
    sentences=corpus,
    vector_size=40,
    window=5,       # Larger window for broader context
    min_count=5,    # Ignore very rare words
    workers=4,
    epochs=10
)

['energy', 'economics', 'science', 'economics', 'social', 'science']


In [63]:
# Get a list of the document embedding vector for each sentence in the cleaned text data. The indices will be aligned with the original course rows in dataframe
document_embeddings = [get_document_embedding(doc, model)
                      for doc in data['cleaned_text']]
print(f'list of sentence vectors/sentences: {len(document_embeddings)}')
print(f'each sentence has {document_embeddings[0].shape} dimensions')

list of sentence vectors/sentences: 4338
each sentence has (40,) dimensions


In [65]:
# User interface (abstracted away)
user_input = "data and python"
recommendations = recommend_courses(user_input, document_embeddings, data)
recommendations.head()

Unnamed: 0,name,topic,link,provider
590,用Python玩转数据 Data Processing Using Python,"Computer Programming, Python Programming, Stat...",https://www.coursera.org/learn/hipython,Coursera - Nanjing University
113,Data Science: R Basics,Build a foundation in R and learn how to wrang...,https://pll.harvard.edu/course/data-science-r-...,Harvard University
71,Statistics and R,An introduction to basic statistical concepts ...,https://pll.harvard.edu/course/statistics-and-r,Harvard University
596,Базы данных и SQL в обработке и анализе данных,"Data Management, Databases, SQL, Statistical P...",https://www.coursera.org/learn/sql-data-scienc...,Coursera - IBM Skills Network
174,Python Data Structures,The second course in Python for Everybody expl...,https://www.edx.org/course/python-data-structures,edX - The University of Michigan


# Important Notes

* Original Source: https://www.kaggle.com/code/shtrausslearning/nlp-edx-course-recommendations
* DeepSeek AI for original base which also calls the document embeddings for the input
* I have added comments to better study and understand the code as a base to build off of
* Instead of directly matching to a course index in the dataset which limits the use of the model

# Todo (to better understand and be able to present on this topic)

* Study cosine similarity\
* Word2Vec and stopwords/lemmatization
* Coursera, futurelearn, udemy
* Build a frontend for the app

# Improvements

reshaped corpus as list instead of series, increased context window, 