In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [31]:
# Install NLTK
!pip list | grep nltk

import nltk

nltk.download('punkt')  
nltk.download('wordnet')  

# Unzip per this stackoverflow: https://stackoverflow.com/questions/73849624/getting-error-while-submitting-notebook-on-kaggle-even-after-importing-nltk-libr
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

nltk                               3.2.4
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
 

In [32]:
# Setup

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re




# Clean text 
def clean_text(text):
    lemma = WordNetLemmatizer() # lemmatizer
    text = re.sub("[^A-Za-z1-9 ]", "", text)
    text = text.lower()
    tokens = word_tokenize(text) # look into this tokenization
    tokens = [lemma.lemmatize(word) for word in tokens # lemmatize words and remove stopwords 
                if word not in stopwords.words("english")]
    return tokens


# Get the sentence embeddings for each course and user input with this function
# First get the word embeddings and average them out for the sentence (aka course/input)
# overall embedding

def get_document_embedding(doc, model):
    embeddings = [model.wv[word] for word in doc if word in model.wv] # Get individual embeddings into a list
    # Consider implementing exception handling 
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0) #How does np.mean work
    else:
        return np.zeros(model.vector_size)

# Use previous functions to process user input into vector and use cosine 
# Similarity to find the most related courses
def recommend_courses(user_input, document_embeddings, data, top_n=5):
    cleaned_input = clean_text(user_input)
    input_embedding = get_document_embedding(cleaned_input, word2vec_model)
    similarities = cosine_similarity([input_embedding], document_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    recommendations = data.iloc[top_indices][['name', 'university', 'difficulty level', 'course description', 'link']]
    return recommendations

        



In [33]:
# Read in data
data = pd.read_csv("/kaggle/input/edx-courses-dataset-2021/EdX.csv")
data.columns = map(str.lower, data.columns)
data['text'] = data['name'] + " " + data['about'] + ' ' + data['course description']
data.head()

Unnamed: 0,name,university,difficulty level,link,about,course description,text
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,"Designed for those who are new to elearning, t...",How to Learn Online Learn essential strategies...
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...,Programming for Everybody (Getting Started wit...
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,"This is CS50x , Harvard University's introduct...",CS50's Introduction to Computer Science An int...
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...","In the last decade, the amount of data availab...",The Analytics Edge Through inspiring examples ...
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...,Marketing Analytics: Marketing Measurement Str...


In [34]:
# The list of tokenized sentences, ie our Corpus 
data['cleaned_text'] = data['text'].apply(clean_text) # Add clean text column to dataframe
data['cleaned_text'].head()

WordNet directory exists.
Contents: ['abc', 'crubadan.zip', 'cess_esp.zip', 'europarl_raw', 'omw.zip', 'gutenberg.zip', 'opinion_lexicon', 'conll2000.zip', 'conll2002.zip', 'senseval.zip', 'twitter_samples.zip', 'inaugural.zip', 'wordnet_ic', 'biocreative_ppi', 'product_reviews_1', 'sentence_polarity.zip', 'indian.zip', 'crubadan', 'biocreative_ppi.zip', 'rte.zip', 'twitter_samples', 'problem_reports', 'sinica_treebank', 'movie_reviews', 'paradigms.zip', 'pl196x.zip', 'gazetteers', 'stopwords.zip', 'verbnet.zip', 'reuters.zip', 'mac_morpho', 'names.zip', 'shakespeare.zip', 'alpino.zip', 'udhr2', 'opinion_lexicon.zip', 'brown.zip', 'cess_cat', 'stopwords', 'qc', 'smultron.zip', 'masc_tagged.zip', 'genesis', 'unicode_samples', 'verbnet', 'floresta.zip', 'udhr', 'conll2000', 'chat80', 'movie_reviews.zip', 'ppattach', 'machado.zip', 'treebank', 'ppattach.zip', 'problem_reports.zip', 'inaugural', 'toolbox.zip', 'floresta', 'pil', 'kimmo.zip', 'swadesh.zip', 'mte_teip5', 'ieer', 'switchboard

0    [learn, online, learn, essential, strategy, su...
1    [programming, everybody, getting, started, pyt...
2    [cs5s, introduction, computer, science, introd...
3    [analytics, edge, inspiring, example, story, d...
4    [marketing, analytics, marketing, measurement,...
Name: cleaned_text, dtype: object

In [35]:
# Train the model (look into how this works/can be configured more specifically
# and see the todo list)

word2vec_model = Word2Vec(
    sentences=data['cleaned_text'],
    vector_size=100,
    min_count=1,
)


In [37]:
# Get a list of the document embedding vector for each sentence in the cleaned text data
document_embeddings = [get_document_embedding(doc, word2vec_model)
                      for doc in data['cleaned_text']]
document_embeddings[0]

array([-0.31738716,  0.48221073,  0.32072747,  0.05004184,  0.32212418,
       -0.7523945 ,  0.3692435 ,  1.0491297 , -0.33893737, -0.20738848,
       -0.19032927, -0.93290585,  0.12122314,  0.25934523,  0.26434666,
       -0.36828208,  0.0485232 , -0.36793157, -0.04782065, -0.9747415 ,
        0.15891145,  0.4775673 ,  0.01029137, -0.21422265,  0.08689366,
       -0.07887888, -0.17891529, -0.15225834, -0.55370545, -0.1017397 ,
        0.36613256,  0.16260162,  0.15690361, -0.4155114 , -0.2669959 ,
        0.37472662,  0.09390941, -0.6887385 , -0.23846349, -0.93619806,
        0.02015074, -0.40673998, -0.2684796 , -0.05975384,  0.4403232 ,
       -0.09997371, -0.3158841 ,  0.09628653,  0.18577877,  0.43670642,
        0.3757107 , -0.48167124, -0.3412305 , -0.06216598, -0.2623549 ,
        0.18438762,  0.32201773,  0.10410967, -0.3934521 ,  0.3219823 ,
        0.02335353,  0.00436224, -0.16847838, -0.02536898, -0.44287097,
        0.6097624 ,  0.1915251 ,  0.29961735, -0.53503144,  0.55

In [41]:
# User interface (abstracted away)
user_input = "video game development"
recommendations = recommend_courses(user_input, document_embeddings, data)
recommendations.head()

Unnamed: 0,name,university,difficulty level,course description,link
661,Sustainable Development: The Post-Capitalist O...,SDG Academy,Beginner,Our present system of global capitalism is fla...,https://www.edx.org/course/sustainable-develop...
493,How to Achieve the Sustainable Development Goals,SDG Academy,Intermediate,"On September 25, 2015, the United Nations Gene...",https://www.edx.org/course/how-to-achieve-the-...
718,Cities and the Challenge of Sustainable Develo...,SDG Academy,Beginner,"According to the United Nations, urbanization ...",https://www.edx.org/course/cities-and-the-chal...
255,Sustainable Cities,SDG Academy,Intermediate,Did you know that experts estimate an addition...,https://www.edx.org/course/sustainable-cities-2
13,Ethics in Action,SDG Academy,Intermediate,The challenges of sustainable development are ...,https://www.edx.org/course/ethics-in-action


# Important Notes

* Original Source: https://www.kaggle.com/code/shtrausslearning/nlp-edx-course-recommendations
* DeepSeek AI for original base which also calls the document embeddings for the input
* I have added comments to better study and understand the code as a base to build off of
* Instead of directly matching to a course index in the dataset which limits the use of the model

# Todo (to better understand and be able to present on this topic)

* Study cosine similarity\
* Word2Vec and stopwords/lemmatization