In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/multi-platform-online-courses-dataset/edx.csv
/kaggle/input/multi-platform-online-courses-dataset/skillshare.csv
/kaggle/input/multi-platform-online-courses-dataset/Udemy.csv
/kaggle/input/multi-platform-online-courses-dataset/Coursera.csv
/kaggle/input/edx-courses-dataset-2021/EdX.csv
/kaggle/input/dataset-of-1200-coursera-courses/edx.csv
/kaggle/input/dataset-of-1200-coursera-courses/Barkeley_extension.csv
/kaggle/input/dataset-of-1200-coursera-courses/Oxford.csv
/kaggle/input/dataset-of-1200-coursera-courses/Stanford.csv
/kaggle/input/dataset-of-1200-coursera-courses/udacity.csv
/kaggle/input/dataset-of-1200-coursera-courses/alison.csv
/kaggle/input/dataset-of-1200-coursera-courses/MIT ocw.csv
/kaggle/input/dataset-of-1200-coursera-courses/london school of economics.csv
/kaggle/input/dataset-of-1200-coursera-courses/coursera_update.csv
/kaggle/input/dataset-of-1200-coursera-courses/pluralsight.csv
/kaggle/input/dataset-of-1200-coursera-courses/futurelearn.csv
/kaggle/i

In [30]:
# Install NLTK
!pip list | grep nltk

import nltk

nltk.download('punkt')  
nltk.download('wordnet')  

# Unzip per this stackoverflow: https://stackoverflow.com/questions/73849624/getting-error-while-submitting-notebook-on-kaggle-even-after-importing-nltk-libr
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

nltk                               3.2.4
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [3]:
# Setup

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

# Clean text 
def clean_text(text):
    lemma = WordNetLemmatizer() # lemmatizer
    text = re.sub("[^A-Za-z1-9 ]", "", text)
    text = text.lower()
    tokens = word_tokenize(text) # look into this tokenization
    tokens = [lemma.lemmatize(word) for word in tokens # lemmatize words and remove stopwords 
                if word not in stopwords.words("english")]
    return tokens


# Get the sentence embeddings for each course and user input with this function
# First get the word embeddings and average them out for the sentence (aka course/input)
# overall embedding

def get_document_embedding(doc, model):
    embeddings = [model.wv[word] for word in doc if word in model.wv] # Get individual embeddings into a list
    # Consider implementing exception handling 
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0) #How does np.mean work
    else:
        return np.zeros(model.vector_size)

# Use previous functions to process user input into vector and use cosine 
# Similarity to find the most related courses
def recommend_courses(user_input, document_embeddings, data, top_n=5):
    cleaned_input = clean_text(user_input)
    input_embedding = get_document_embedding(cleaned_input, word2vec_model)
    similarities = cosine_similarity([input_embedding], document_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    recommendations = data.iloc[top_indices][['name', 'university', 'difficulty level', 'course description', 'link']]
    return recommendations

        

In [49]:
# Experiment with other data
# edx, coursera, harvard, mit ocw
# https://sparkbyexamples.com/pandas/pandas-read-multiple-csv-files/#:~:text=Load%20each%20file%20into%20individual,each%20file%20individually%20if%20needed.

# Normalize/clean course data to the name, topic, link, text format for now

dataMit = pd.read_csv("/kaggle/input/dataset-of-1200-coursera-courses/MIT ocw.csv")
dataMit.columns = map(str.lower, dataMit.columns)
dataMit.rename(columns={'name ': 'name'}, inplace=True)
dataMit = dataMit.drop(columns=['sub category', 'category', 'resource type', 'course code', 'instructure'])
dataMit.rename(columns={'course link': 'link'}, inplace=True)
dataMit['text'] = dataMit['name'] + " " + dataMit['topic'] 
print(column_names)

dataMit = dataMit[['name', 'topic', 'link']]
print(dataMit.head())


dataHarvard = pd.read_csv("/kaggle/input/dataset-of-1200-coursera-courses/Harvard_university.csv")
dataHarvard.columns = map(str.lower, dataHarvard.columns)
dataHarvard.rename(columns={'link to course': 'link', 'about': 'topic'}, inplace=True)
# Remove rows where the price is not 'Free' (gpt code)
dataHarvard = dataHarvard[dataHarvard['price'] == 'Free']
# Select only the columns 'name', 'topic', and 'link'
dataHarvard = dataHarvard[['name', 'topic', 'link']]
dataHarvard['text'] = dataHarvard['name'] + " " + dataHarvard['topic'] 
# print(dataHarvard.head())

# Coursera data

# Read in 2021 EdXdata
dataEdx = pd.read_csv("/kaggle/input/edx-courses-dataset-2021/EdX.csv")
dataEdx.columns = map(str.lower, dataEdx.columns)
dataEdx["topic"] = dataEdx['about'] + '. ' + dataEdx['course description']
dataEdx = dataEdx[['name', 'topic', 'link']]
dataEdx['text'] = dataEdx['name'] + " " + dataEdx["topic"]
# dataEdx.head()

# Read in 

link, name, topic, text
                        name                                          topic  \
0           Energy Economics             Science, Economics, Social Science   
1    Identity and Difference          Social Science, Society, Anthropology   
2   Single Variable Calculus  Mathematics, Differential Equations, Calculus   
3  Libertarianism in History            Humanities, History, Social Science   
4       Introductory Biology          Science, Health and Medicine, Biology   

                                                link  
0  https://ocw.mit.edu/courses/14-44-energy-econo...  
1  https://ocw.mit.edu/courses/21a-218j-identity-...  
2  https://ocw.mit.edu/courses/18-01-single-varia...  
3  https://ocw.mit.edu/courses/21h-181-libertaria...  
4  https://ocw.mit.edu/courses/7-016-introductory...  


In [45]:
# Create a Combined Dataframe
data = pd.concat([dataMit, dataHarvard, dataEdx])
#data.head()

Unnamed: 0,name,topic,link,text
0,Energy Economics,"Science, Economics, Social Science",https://ocw.mit.edu/courses/14-44-energy-econo...,
1,Identity and Difference,"Social Science, Society, Anthropology",https://ocw.mit.edu/courses/21a-218j-identity-...,
2,Single Variable Calculus,"Mathematics, Differential Equations, Calculus",https://ocw.mit.edu/courses/18-01-single-varia...,
3,Libertarianism in History,"Humanities, History, Social Science",https://ocw.mit.edu/courses/21h-181-libertaria...,
4,Introductory Biology,"Science, Health and Medicine, Biology",https://ocw.mit.edu/courses/7-016-introductory...,


In [36]:
# The list of tokenized sentences, ie our Corpus 
data['cleaned_text'] = data['text'].apply(clean_text) # Add clean text column to dataframe
data['cleaned_text'].head()

0    [energy, economics, science, economics, social...
1    [identity, difference, social, science, societ...
2    [single, variable, calculus, mathematics, diff...
3    [libertarianism, history, humanity, history, s...
4    [introductory, biology, science, health, medic...
Name: cleaned_text, dtype: object

In [37]:
# Train the model (look into how this works/can be configured more specifically
# and see the todo list)

word2vec_model = Word2Vec(
    sentences=data['cleaned_text'],
    vector_size=100,
    min_count=1,
)


In [38]:
# Get a list of the document embedding vector for each sentence in the cleaned text data
document_embeddings = [get_document_embedding(doc, word2vec_model)
                      for doc in data['cleaned_text']]
document_embeddings[0]

array([-0.33050957,  1.3129758 , -0.23675087, -0.39530584,  0.32115155,
       -0.28052858,  0.12256752,  1.9098924 , -0.13483056, -0.6778844 ,
        0.4702141 , -1.0808438 , -0.04577271, -0.02055188,  0.18441686,
       -0.00902548,  1.696166  ,  0.0675119 , -0.74802935, -1.2348891 ,
       -0.06763209, -0.7299306 ,  0.91267395, -0.21569045,  0.19937897,
        0.76901245,  0.30119827, -0.3375313 , -0.35949424,  0.9457317 ,
        0.50089926, -0.6765849 , -0.48110262, -0.78745323, -0.3824605 ,
        0.5359365 ,  1.2158784 ,  0.09308362, -0.20864157,  0.27100348,
        1.008581  , -0.40488777, -0.24397121, -0.41719952,  0.04743673,
        0.5546692 , -0.6616823 , -0.4978992 ,  0.39706534,  1.6474438 ,
        0.8744562 , -1.153931  , -0.9011733 , -0.0357539 , -1.4920715 ,
        0.0615241 ,  0.94522655,  0.57195944,  0.14965867, -0.44829467,
       -0.35497734, -0.22344853,  0.9656228 , -0.04121501, -0.50352687,
        0.73420733,  0.2710003 , -0.02785451, -0.6998839 ,  0.40

In [42]:
# User interface (abstracted away)
user_input = "cars"
recommendations = recommend_courses(user_input, document_embeddings, data)
recommendations.head()

Unnamed: 0,name,university,difficulty level,course description,link
45,Energy Within Environmental Constraints,Harvard University,Beginner,Humanity faces an immense challenge: providing...,https://www.edx.org/course/energy-within-envir...
429,Origins of the Human Mind,Kyoto University,Beginner,"The human mind is an evolutionary product, jus...",https://www.edx.org/course/origins-of-the-huma...
37,Boosting Your Energy,,,,
380,"One Planet, One Ocean",SDG Academy,Intermediate,Is the ocean the real final frontier? Humans h...,https://www.edx.org/course/one-planet-one-ocean
136,"Literati China: Examinations, Neo-Confucianism...",,,,


# Important Notes

* Original Source: https://www.kaggle.com/code/shtrausslearning/nlp-edx-course-recommendations
* DeepSeek AI for original base which also calls the document embeddings for the input
* I have added comments to better study and understand the code as a base to build off of
* Instead of directly matching to a course index in the dataset which limits the use of the model

# Todo (to better understand and be able to present on this topic)

* Study cosine similarity\
* Word2Vec and stopwords/lemmatization