In [None]:
pip install -U kaleido

In [None]:
#importing libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


![](https://i.imgur.com/NVrbtzj.jpg)


## <b>1 <span style='color:#15C3BA'>|</span> BACKGROUND</b> 

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>WHAT WE WILL DO IN THIS SECTION</span></b></p></div>

- In this section we introduce the problem
- Talk about where our data is from
- What we would like to achieve in this notebook
   

### <b><span style='color:#15C3BA'> 1.1 |</span> Problem Formulation/Statement</b> 

- With the world becoming digital, any new skill can be acquired with just a click. However, many of us still need a dedicated curriculum in order to excel in a specific topic
- This is where e-learning platforms come in handy, and EdX is one such massive open online course (MOOC) provider
- So we've found a course we like and gone through it, so what next?
- With so many online courses available, it may take some effort and time to look through all of them
- We can utilise a recommendation system to give some tips on what course the user might like to go through next
- Whilst there are quite a number of approaches to recommendation systems, we'll utilise an approach which requires NLP

<br>

### <b><span style='color:#15C3BA'> 1.2 |</span> Recommendation system</b> 

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>GOALS</span></b></p></div>

The purpose of our recommendation system is to inform a user about possible courses they may like, based on a course they liked

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>METHOD</span></b></p></div>

We will utilise scraped course description data (our corpus); we'll convert each document into vector form (using bag-of-words or embeddings), then calculate the cosine similarity, from which we will be able to extract the courses which are most similar

<br>

### <b><span style='color:#15C3BA'> 1.3 |</span> The Dataset</b> 

- This dataset is scraped off the publicly available information on the **EdX** website
- This dataset consists of 720 rows and 6 columns namely Name of the Course, Name of the University, Difficulty Level, Course URL, short summary about the course and course description

What is edX?

> edX online courses are self-paced, interactive courses offered by leading universities and organizations around the world. These courses provide learners with a range of topics to explore and learn from, including computer science, business, health, engineering, humanities, and more. With edX courses, learners can gain valuable skills and knowledge in an engaging and convenient way.

![](https://i.imgur.com/3oYY48C.jpg)

<br>

### <b><span style='color:#15C3BA'> 1.4 |</span> Notebook Goals</b> 

Two subgoals are of interest:

- **EDA study** | Analyse and draw conclusions based on the courses that are available
- **Course Recommendation system** | Create a course recommendation based on a specified course 

## <b>2 <span style='color:#15C3BA'>|</span> edX DATASET</b> 

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>WHAT WE WILL DO IN THIS SECTION</span></b></p></div>
    
- We'll read the data <code>EdX.csv</code>
- Lower the register of column names
- Show for one course the name, about & description 
    
    
### <b><span style='color:#15C3BA'> 2.1 |</span> Read Data</b> 

Read our data and store it in <code>data</code> & make slight adjustment to column names


In [None]:
# Load the scraped edX course table from the Kaggle input directory
data = pd.read_csv('/kaggle/input/edx-courses-dataset-2021/EdX.csv')
# Lower-case every column label so later cells can use e.g. data['name']
data.columns = [column.lower() for column in data.columns]
# Preview the first rows
data.head()

For the **corpus** we will utilise three columns **name**, **about** & **course description**

In [None]:
def show_course_id(id):
    """Print the name, summary, description and link of one course.

    `id` is used as a label lookup into the notebook-level `data` frame
    (``data[column][id]``), so it must be a label present in its index.
    """
    # (heading, column, whether '\n' is passed as an extra print argument)
    sections = (
        ("Course Name", 'name', True),
        ("What is the course about?", 'about', True),
        ("Course Description", 'course description', False),
        ("Course Link", 'link', False),
    )
    for heading, column, blank_line_after in sections:
        body = f"{heading}:\n{data[column][id]}"
        if blank_line_after:
            print(body, '\n')
        else:
            print(body)
    
# Show example of course description contents
print('Sample from Dataset:\n')
show_course_id(50)

### <b><span style='color:#15C3BA'> 2.2 |</span> Dataset Features</b> 

The features we'll be working with:

- <code>Name</code> Course name
- <code>University</code> University which offers the course
- <code>Difficulty Level</code> The extent of how difficult the course is evaluated to be
- <code>Link</code> HTTP link to course 
- <code>About</code> A description of what the course is about

### <b><span style='color:#15C3BA'> 2.3 |</span> Type Data / Missing Data</b> 
- All data is present & data is in <code>object</code> type

In [None]:
data.info()

### <b><span style='color:#15C3BA'> 2.4 |</span> Duplicate Rows</b> 

- We only have one case of identical row data, let's drop it

In [None]:
# Count fully identical rows before removing them
duplicate_count = data.duplicated().sum()
print(f'number of duplicate rows: {duplicate_count}')

# Remove the repeats in place, then renumber the index from zero
data.drop_duplicates(inplace=True)
data.reset_index(inplace=True, drop=True)

print(f'number of rows: {len(data)}')

## <b>3 <span style='color:#15C3BA'>|</span> EXPLORATORY DATA ANALYSIS</b> 

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>WHAT WE WILL DO IN THIS SECTION</span></b></p></div>


Let's ask ourselves some questions:
- How many unique institutions are offering courses?
- Which institutions offers the most courses, which offer the least?
- What is the distribution of difficulty level of all courses?
- What is the distribution of difficulty level of all courses for a particular institution?

In [None]:
# nunique() counts the distinct, non-missing institution names
print(f"Number of unique institutions offering courses: {data['university'].nunique()}")

In [None]:
# value_counts() orders by frequency, so the head holds the institutions with
# the most courses and the tail those with the fewest
university_counts = data['university'].value_counts()
display(university_counts.head())
display(university_counts.tail())

In [None]:
data['difficulty level'].value_counts()

In [None]:
# Course counts for the institutions whose name starts with 'D'.
# NOTE(review): this expression is not the last statement of the cell, so a
# notebook will presumably not display its result — confirm whether it should
# live in its own cell.
data[data['university'].str.startswith('D')]['university'].value_counts()

# Mapping from one long, combined institution name to a short label.
# NOTE(review): 'Deft' looks like a typo for 'Delft' — confirm before changing it.
df_rename = {'Delft University of Technology-Wageningen University & Research-Delft University & Wageningen University':'Deft'}

In [None]:
# Series.rename() with a mapping relabels the *index*, so it never touched the
# institution names stored as values. replace() applies the df_rename mapping
# to the values themselves.
data['university'] = data['university'].replace(df_rename)

In [None]:
# Number of courses per (difficulty level, institution), largest groups first
ldf = (
    data.groupby(['difficulty level', 'university'], as_index=False)
    .size()
    .sort_values(by='size', ascending=False)
)

# One bar per institution, coloured by difficulty level
fig = px.bar(
    ldf,
    x='size',
    y='university',
    color='difficulty level',
    template='plotly_white',
    height=900,
)
fig.show()

### <b><span style='color:#15C3BA'> 3.1 |</span> n-gram of course description</b> 

Let's check the most common n-grams in the course descriptions

In [1]:
# importing the dependencies needed for pre processing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

en_stopwords = stopwords.words("english") # stop words 
lemma = WordNetLemmatizer() # lemmatiser

# define a function for preprocessing
def clean(text):
    """Normalise one document into a space-joined string of lemmas.

    The text is stripped of every character that is not a letter, digit or
    space, lower-cased, tokenised with `word_tokenize`, filtered against
    `en_stopwords` and lemmatised with `lemma`.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # Keep all ten digits: the previous class "1-9" also deleted every "0",
    # turning e.g. "2021" into "221".
    text = re.sub(r"[^A-Za-z0-9 ]", "", text)
    tokens = word_tokenize(text.lower())
    return " ".join(
        lemma.lemmatize(token) for token in tokens if token not in en_stopwords
    )

# applying the "clean" function on the text column
ldata = data['course description'].apply(clean)

NameError: name 'data' is not defined

In [None]:
import spacy
from collections import Counter
import plotly.express as px

nlp = spacy.load('en_core_web_sm')


def n_grams(tokens, n):
    """Return every run of `n` consecutive tokens, each joined by one space."""
    return [' '.join(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]


# n-gram name -> window size. Defining the helper and this table once, outside
# the loop, avoids re-creating the function for every document.
ngram_sizes = {'unigram': 1, 'bigram': 2, 'trigram': 3}
dict_ngrams = {ngram_name: [] for ngram_name in ngram_sizes}

for document in ldata:
    tokens = [token.text for token in nlp(document)]
    for ngram_name, size in ngram_sizes.items():
        dict_ngrams[ngram_name].extend(n_grams(tokens, size))

# Each label reports its own list (the unigram and bigram counts were
# previously printed under each other's label).
print('unigrams', len(dict_ngrams['unigram']))
print('bigrams', len(dict_ngrams['bigram']))
print('trigrams', len(dict_ngrams['trigram']))

# plot ngrams
def plot_counter(counter, top, name):
    """Draw a horizontal bar chart of the `top` largest counts in `counter`.

    `counter` maps labels to counts, and `name` becomes the name of the
    plotted series. The figure is shown with the 'svg' renderer.
    """
    labels, values = zip(*counter.items())
    counts = pd.Series(values, index=labels, name=name)
    highest = counts.sort_values(ascending=False)[:top]
    fig = px.bar(highest, orientation='h', template='plotly_white')
    fig.show('svg', dpi=300)

In [None]:
plot_counter(Counter(dict_ngrams['unigram']),20,'unigram')

In [None]:
plot_counter(Counter(dict_ngrams['bigram']),20,'bigrams')

In [None]:
plot_counter(Counter(dict_ngrams['trigram']),20,'trigrams')

## <b>4 <span style='color:#15C3BA'>|</span> NATURAL LANGUAGE PROCESSING</b> 

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>WHAT WE WILL DO IN THIS SECTION</span></b></p></div>

- Remove irrelevant columns in our data that won't be utilised in this study
- Create a new column **text**, which will be used in our analysis 
- Do some text cleaning & stemming of the **text** column data
- Prepare the data for both **TF-IDF** & **Word2Vec**, which require slightly different inputs

### <b><span style='color:#15C3BA'> 4.1 |</span> Drop Irrelevant Columns</b> 

Let's remove column data we will not utilise

In [None]:
# These two columns are not used from here on
unused_columns = ["university", "difficulty level"]
data.drop(columns=unused_columns, inplace=True)

# Independent snapshot of the trimmed frame
df = data.copy()

### <b><span style='color:#15C3BA'> 4.2 |</span> Adjust Description</b> 

- Create a document for each course, comprised of the **course name**, **about** & **course description**
- We will be utilising this as our corpus data we will feed into **TF-IDF** & **Word2Vec** models
- `data['text']` will be our corpus

In [None]:
data['text'] = data['name'] + ' ' + data['about'] + ' ' + data['course description']
data.head(5)

In [None]:
text_data = data[['name','about','course description','text']]
text_data.to_csv('text_data.csv',index=False)

### <b><span style='color:#15C3BA'> 4.3 |</span> Text Cleaning / Stemming</b> 

Why is cleaning text important?

- Text cleaning is an important step in NLP because it generally helps improve the accuracy of machine learning algorithms
- With the removal of stop words (eg. and) and other unnecessary elements in a sentence, 
- It can help reduce the noise in the data & help improve the quality of the input data that is being fed into the model
- Additionally, it can help reduce the size of the vocabulary and size of the input data, making it faster to process

Converting input words to base dictionary form:

- As with text cleaning, it also helps to utilise word stemmers
- They help machine learning algorithms by normalising words to their root/dictionary form
- What this does is help reduce the number of words (eg. player, playing, played -> play)
- This can further improve the accuracy of the algorithms by allowing them to identify patterns in the data
- Training time & computational resources, as a result can also be reduced

In [None]:
#importing the dependencies needed for pre processing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
en_stopwords = stopwords.words("english") # stop words 
lemma = WordNetLemmatizer() # lemmatiser

# define a function for preprocessing
def clean(text):
    """Normalise one document into a space-joined string of lemmas.

    The text is stripped of every character that is not a letter, digit or
    space, lower-cased, tokenised with `word_tokenize`, filtered against
    `en_stopwords` and lemmatised with `lemma`.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # Keep all ten digits: the previous class "1-9" also deleted every "0",
    # turning e.g. "2021" into "221".
    text = re.sub(r"[^A-Za-z0-9 ]", "", text)
    tokens = word_tokenize(text.lower())
    return " ".join(
        lemma.lemmatize(token) for token in tokens if token not in en_stopwords
    )

# applying the "clean" function on the text column
data.text = data.text.apply(clean)
data.text

- Let's clean & stem the pandas data frame corpus **text**

In [None]:
# Preprocessing, returns list instead
def clean_for_word2vec(text):
    """Normalise one document into a list of lemmas.

    Performs the same steps as `clean` but returns the tokens themselves
    instead of joining them into a string.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased, stop-word-filtered, lemmatised tokens in document order.
    """
    # Keep all ten digits: the previous class "1-9" also deleted every "0".
    text = re.sub(r"[^A-Za-z0-9 ]", "", text)
    tokens = word_tokenize(text.lower())
    return [lemma.lemmatize(token) for token in tokens if token not in en_stopwords]

#cleaning the documents
corpus_cleaned = data.text.apply(clean_for_word2vec)
lst_corpus = corpus_cleaned.tolist()

In [None]:
# Tokenised corpus: one whitespace-split token list per document in data['text']
corpus = [text.split() for text in data['text']]

# Report the number of documents. The earlier expression took len() of the
# formatted string, i.e. the character count of the whole corpus repr.
print(f'corpus length: {len(corpus)}')

## <b>5 <span style='color:#15C3BA'>|</span> COURSE RECOMMENDATIONS</b> 

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>WHAT WE WILL DO IN THIS SECTION</span></b></p></div>
    
- Our approach to providing recommendations is based on **cosine similarity** of input vectors
- The first approach we can utilise to generate vectors for each course is by utilising **Term Frequency-Inverse Document Frequency** (TF-IDF)
- The second approach we can utilise to generate vectors for each course is by utilising **Embedding Vectors** 
    
<br>
    
### <b><span style='color:#15C3BA'> 5.1 |</span> TF-IDF</b> 
    
- First let's try utilising TF-IDF for the generation of vector representation for  our documents in the corpus <code>data['text']</code>
- It's sufficient to input the **pandas series** into <code>TfidfVectorizer</code>

In [None]:
# course names
lst_names = list(data['name'])
lst_names[:20]

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>GENERATION OF VECTOR REPRESENTATION OF TEXT</span></b></p></div>

- TF-IDF was described in notebook **[nlp | Natural Language Processing Reference](https://www.kaggle.com/code/shtrausslearning/nlp-natural-language-processing-reference)**
- <code>test_matrix</code>, is input into our recommendation generation function <code>Recommendation_Cosine_similarity</code>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Show the first three documents and the container type passed to the vectoriser
print(f"fitting data on:\n{data['text'][:3]} of type: {type(data['text'])}")

# Fit TF-IDF on the corpus column; `test_matrix` is the result of
# fit_transform over data['text'] and is used by the recommendation cells below
vectoriser = TfidfVectorizer()
test_matrix = vectoriser.fit_transform(data['text'])
print(f'\noutput matrix size: {test_matrix.shape}')
print(f'length of the vectoriser vocabulary: {len(vectoriser.vocabulary_)}')

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>GENERATION OF VECTOR REPRESENTATION OF TEXT</span></b></p></div>

Example recommendation for: **MathTrackX: Differential Calculus**

> - 1 MathTrackX: Differential Calculus
> - 2 MathTrackX: Integral Calculus
> - 3 MathTrackX: Statistics
> - 4 MathTrackX: Polynomials, Functions and Graphs
> - 5 MathTrackX: Probability
> - 6 MathTrackX: Special Functions




In [None]:
# define a function that prints and returns the six closest courses
def Recommendation_Cosine_similarity(matrix, name):
    """Print and return the courses most similar to `name`.

    Parameters
    ----------
    matrix : sparse matrix or 2-D array-like
        One row per course, in the same order as `lst_names` / `data`.
    name : str
        Exact course name to look up in `lst_names`.

    Returns
    -------
    list of str
        Six course names ordered by descending cosine similarity. The query
        course is part of the ranking and normally comes first.

    Raises
    ------
    ValueError
        If `name` is not in `lst_names` (raised by ``list.index``).
    """
    # get its index from list
    row_num = lst_names.index(name)

    # Compare only the query row against every row of the matrix that was
    # passed in. The `matrix` argument used to be ignored in favour of the
    # global `test_matrix`, and a full square similarity matrix was built
    # just to read one row.
    similarity = cosine_similarity(matrix[row_num:row_num + 1], matrix)[0]

    # positions of the six highest-scoring courses
    ranked = sorted(enumerate(similarity), key=lambda pair: pair[1], reverse=True)[:6]

    print(f'recommended courses for {name}\n')
    recommendations = []
    for rank, (course_pos, _score) in enumerate(ranked, start=1):
        # matrix rows are positional, so look the name up by position
        course_name = data['name'].iloc[course_pos]
        print(f"{rank} {course_name}")
        recommendations.append(course_name)

    # Return the names themselves; `print()`'s return value (None) used to be
    # returned instead.
    return recommendations

Recommendation_Cosine_similarity(test_matrix,'MathTrackX: Differential Calculus')

### <b><span style='color:#15C3BA'> 5.2 |</span> Word2Vec w/ Gensim</b>

Gensim is a very useful library, let's look at some basic aspects of it, hopefully you will quickly grasp how to use it

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>EMBEDDING USAGE EXAMPLES</span></b></p></div>

- Load GoogleNews embedding vectors (already vectors, not model.wv -> just model gives **embedding vectors**)
- Existing vectors aren't trainable & can only be used as they are for embedding extraction of specific words
- We can preload existing word embedding vectors from files (eg. GoogleNews) using <code>KeyedVectors</code> 
- Which is not to be confused with <code>Word2Vec</code>, before version 4, we could have utilised pretrained KeyedVectors by merging them with <code>Word2Vec</code>

In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# preloaded embedding vectors
# word2vec_path = '/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
# w2v_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# '''
#               Some operations we can do with KeyedVectors
# '''

# result = w2v_model.most_similar(positive=['woman', 'king'], negative=['man'])
# print(result)

# result = w2v_model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
# print(result)

# print(w2v_model.doesnt_match("breakfast cereal dinner lunch".split()))

# similarity = w2v_model.similarity('woman', 'man')
# print(similarity)

# result = w2v_model.similar_by_word("cat")
# print(result)

# # embedding vectors

# vector = w2v_model['computer']  # numpy vector of a word
# # print(vector[:10])

# vector = w2v_model.get_vector('office', norm=True)

# for i in vector[:10]:
#     print(i)
# # print(vector[:10])

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>TRAINING OUR WORD2VEC MODEL</span></b></p></div>

- Unlike <code>KeyedVectors</code>, we can train the <code>Word2Vec</code> model
- Let's create word embedding vectors for each word in the corpus **text**

In [None]:
from gensim.models import Word2Vec

# Untrained model: 100-dimensional vectors, min_count=1
model = Word2Vec(vector_size=100,min_count=1)
# Build the vocabulary from the tokenised corpus
model.build_vocab(corpus)
print(f"words in corpus: {model.corpus_total_words}")
print(f'corpus count: {model.corpus_count}')
# Train for 50 epochs over the same corpus, then save the model as 'embeddings'
model.train(corpus, total_examples = model.corpus_count, epochs = 50)
model.save('embeddings')

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>TESTING THE MODEL</span></b></p></div>

- As we did above in the <code>KeyedVectors</code> part, let's check some relations between different words 

In [None]:
vocab_len = len(model.wv)
print(f'Vocabulary size: {vocab_len}')

print('First 10 words in vocabulary:')
key_vocab = model.wv.index_to_key[:10]
print(key_vocab)

In [None]:
result = model.wv.similar_by_word('deep')
for i in result:
    print(i)

word1 = 'deep'; word2 = 'learning'
similarity = model.wv.similarity(word1,word2)
print(f'\nsimilarity b/w {word1} and {word2} {round(similarity,2)}\n')

# embedding vectors

vector = model.wv['computer']  # numpy vector of a word
print(f'computer word embedding')
print(f'first {10} components')
print(vector[:10])

In [None]:
# View similar words based on gensim's model
print('Similar Words')
similar_words = {search_term: [item[0] for item in model.wv.most_similar([search_term], topn=5)]
                  for search_term in key_vocab}
similar_words

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>VISUALISATION EMBEDDING VECTORS</span></b></p></div>

- Embedding vectors encode information about a word in a very high dimensional space, let's utilise <code>TSNE</code>, to reduce the number of dimensions
- The resulting two dimensional array data can then be visualised, so we can understand similarity between words embeddings

In [None]:
# Lower dimensionality visualisation of embeddings (100->2)
import plotly.express as px
from sklearn.manifold import TSNE
import warnings; warnings.filterwarnings('ignore')

# Flatten {seed word: [neighbours]} into one list: seed, its neighbours, next seed, ...
words = [
    word
    for seed, neighbours in similar_words.items()
    for word in [seed, *neighbours]
]
wvs = model.wv[words]

# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` — confirm against the installed version.
tsne = TSNE(n_components=2,
            random_state=0,
            n_iter=10000)

X = tsne.fit_transform(wvs)
labels = words

# Pass the two t-SNE components by keyword: the first positional parameter of
# px.scatter is `data_frame`, so positional arrays were taken as the data
# frame and `x`, leaving `y` unset.
fig = px.scatter(x=X[:, 0], y=X[:, 1], text=labels,
                 template='plotly_white',
                 width=800,
                 title='Word Embedding Visualisation')
fig.show('svg',dpi=300)

<div style="color:white;display:fill;border-radius:8px;font-size:100%; letter-spacing:1.0px;"><p style="padding: 5px;color:white;text-align:left;"><b><span style='color:#15C3BA'>AVERAGE SENTENCE EMBEDDINGS</span></b></p></div>

- Each document is made up of multiple tokens, we can utilise a commonly used approach to merge the embedding vectors
- The combined embeddings will create a single sentence embedding, which will still make sense

In [None]:
# Get average embedding vector for each text
def doc_vectorizer(doc, model):
    """Return the mean embedding of the tokens in `doc`.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.
    model : object
        Must expose ``model.wv``, which is indexed by token (raising
        ``KeyError`` for unknown tokens) and has a ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        Average of the vectors of all tokens found in ``model.wv``. When no
        token is found, a zero vector of length ``model.wv.vector_size``.
    """
    vectors = []
    for word in doc:
        try:
            vectors.append(model.wv[word])
        except KeyError:
            # the token has no embedding vector; leave it out of the average
            continue

    if not vectors:
        # Previously an empty array was divided by zero and returned, giving
        # this document a different shape from every other document vector.
        return np.zeros(model.wv.vector_size)

    return np.sum(vectors, axis=0) / len(vectors)

# One averaged embedding per tokenised document in lst_corpus
X = [doc_vectorizer(tokens, model) for tokens in lst_corpus]

print(f'list of sentence vectors/sentences: {len(X)}')
print(f'each sentence has {X[0].shape} dimensions')

### <b><span style='color:#15C3BA'> 5.3 |</span> Recommendation using cosine similarity</b>

Example recommendation for: **Data Science: Inference and Modeling**

> - Introduction to Statistical Methods for Gene Mapping
> - Data Science: Inferential Thinking through Simulations
> - Data Science: R Basics
> - Probability and Statistics in Data Science using Python
> - Data Science: Linear Regression

In [None]:
def course_recommender(X,course):
    """Print the five courses whose vectors are closest to that of `course`.

    Parameters
    ----------
    X : sequence of vectors or 2-D array
        One vector per row of `data`, in the same order.
    course : str
        Exact course name as stored in ``data['name']``.

    Raises
    ------
    KeyError
        If no row of `data` carries that name.
    """
    names = data['name'].tolist()

    # Position of the first course with this name. X is positional, and
    # taking the first match keeps a duplicated name from yielding several
    # indices (the previous drop_duplicates() ran on the unique index values
    # and therefore removed nothing).
    try:
        idx = names.index(course)
    except ValueError:
        raise KeyError(course) from None

    # similarity of the query course to every course
    scores = cosine_similarity(X[idx:idx + 1], X)[0]

    # Rank the other courses by descending similarity. The query course is
    # excluded by position instead of assuming it is always ranked first.
    ranked = sorted(
        (pos for pos in range(len(names)) if pos != idx),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    for pos in ranked[:5]:
        print(names[pos])
        
course_recommender(X,'Data Science: Inference and Modeling')

## <b>6 <span style='color:#15C3BA'>|</span> PUTTING TOGETHER RECOMMENDER CLASS</b> 

Let's put together everything above into a single class, so that it's much more user-friendly and organised

In [None]:
import pandas as pd

corpus = pd.read_csv('/kaggle/input/edx-textdata/text_data.csv')
corpus.head()

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#get corpus as text_data.csv

class recommender:
    """Course recommender based on cosine similarity of document vectors.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must contain a ``name`` column (course titles) and a ``text`` column
        (the document describing each course). Rows are addressed by
        position throughout the class.

    Calling an instance with a course name fits TF-IDF on ``corpus['text']``
    and prints/returns the closest courses.
    """

    def __init__(self,corpus):
        self.corpus = corpus # corpus dataframe
        # Read the names from the frame that was passed in; the notebook-level
        # `data` frame used to be read here, tying the class to global state.
        self.course_names = list(corpus['name']) # course names only

    # ------------------------------------------------------------------
    # Text cleaning for corpus
    # ------------------------------------------------------------------

    @staticmethod
    def clean_for_word2vec(text):
        """Return the lower-cased, stop-word-filtered lemmas of `text` as a list."""
        # Keep all ten digits: the previous class "1-9" also deleted every "0".
        text = re.sub(r"[^A-Za-z0-9 ]", "", text)
        tokens = word_tokenize(text.lower())
        return [lemma.lemmatize(token) for token in tokens if token not in en_stopwords]

    @staticmethod
    def clean(text):
        """Return the cleaned lemmas of `text` joined by single spaces."""
        return " ".join(recommender.clean_for_word2vec(text))

    # create average embedding vector for sentence

    @staticmethod
    def doc_vectorizer(doc, model):
        """Return the mean embedding of the tokens in `doc`.

        `model.wv` is indexed by token; tokens that raise ``KeyError`` are
        skipped. When no token is found, a zero vector of length
        ``model.wv.vector_size`` is returned, so every document vector has
        the same shape.
        """
        vectors = []
        for word in doc:
            try:
                vectors.append(model.wv[word])
            except KeyError:
                # the token has no embedding vector; leave it out of the average
                continue

        if not vectors:
            return np.zeros(model.wv.vector_size)

        return np.sum(vectors, axis=0) / len(vectors)

    # cosine similarity for tfidf

    def course_recommender_tfidf(self,matrix, name):
        """Print and return the six courses most similar to `name`.

        Parameters
        ----------
        matrix : sparse matrix or 2-D array-like
            One row per course, in the same order as ``self.course_names``.
        name : str
            Exact course name.

        Returns
        -------
        list of str
            Course names by descending cosine similarity; the query course
            is part of the ranking and normally comes first.

        Raises
        ------
        ValueError
            If `name` is not a known course name (raised by ``list.index``).
        """
        # get its index from list
        row_num = self.course_names.index(name)

        # Compare only the query row with the matrix that was passed in (the
        # global `test_matrix` used to be read here instead of `matrix`).
        similarity = cosine_similarity(matrix[row_num:row_num + 1], matrix)[0]

        ranked = sorted(enumerate(similarity), key=lambda pair: pair[1], reverse=True)[:6]

        print(f'Recommended courses for \n{name}\n')
        recommendations = []
        for rank, (course_pos, _score) in enumerate(ranked, start=1):
            course_name = self.course_names[course_pos]
            print(f"{rank} {course_name}")
            recommendations.append(course_name)
        return recommendations

    # cosine similarity for word2vec

    def course_recommender_w2v(self, X, course):
        """Print the five courses whose vectors in `X` are closest to `course`.

        Embedding vectors can match courses beyond direct keyword overlap,
        at the cost of needing a large corpus to train on.
        NOTE(review): the original comment said this is alleviated by using
        the Google word2vec vectors — confirm which embeddings are meant to
        be passed in as `X`.

        `X` holds one vector per course in the order of
        ``self.course_names``. Raises ``KeyError`` if `course` is unknown.
        """
        try:
            idx = self.course_names.index(course)
        except ValueError:
            raise KeyError(course) from None

        # similarity of the query course to every course
        scores = cosine_similarity(X[idx:idx + 1], X)[0]

        # Rank the other courses, excluding the query course by position
        ranked = sorted(
            (pos for pos in range(len(self.course_names)) if pos != idx),
            key=lambda pos: scores[pos],
            reverse=True,
        )

        for pos in ranked[:5]:
            print(self.course_names[pos])

    def tfidf_recommend(self,course):
        """Fit TF-IDF on ``self.corpus['text']`` and recommend for `course`."""
        vectoriser = TfidfVectorizer()
        self.test_matrix = vectoriser.fit_transform(self.corpus['text'])
        # Use the matrix fitted on this instance's corpus, not a global one
        return self.course_recommender_tfidf(self.test_matrix, course)

    # input course currently taking

    def __call__(self,inputs):
        """Return the TF-IDF recommendations for the course name `inputs`."""
        return self.tfidf_recommend(inputs)

# input corpus
recomm = recommender(corpus)
recomm('Roblox')