# Load all courses data

In [1]:
import pandas as pd

In [2]:
# Read the full course catalogue into a DataFrame. The path is relative, so it
# is resolved against the kernel's current working directory.
all_courses_df = pd.read_csv('./all_courses_data.csv')

In [3]:
# Peek at the first five rows to confirm the CSV parsed into the expected columns
all_courses_df.head()

Unnamed: 0,course_id,course_title,course_url,course_instructor,course_rating,course_duration,course_details,course_level,course_no_of_reviews,course_no_of_enrolled
0,ps_1,Web Development: Executive Briefing,https://www.pluralsight.com/courses/web-develo...,Brice Wilson,4.6,0.500556,Tech leaders need a fundamental understanding ...,1,158.0,
1,ps_2,Front End Web Development: Get Started,https://www.pluralsight.com/courses/front-end-...,Joe Eames,4.6,3.799167,Front end web development involves many differ...,1,1250.0,
2,ps_3,Beyond ASP.NET MVC: Modern Web Development Dem...,https://www.pluralsight.com/courses/beyond-asp...,Chris Jones,4.6,1.8425,The web development landscape is constantly ch...,1,75.0,
3,ps_4,Tactics and Tools for Troubleshooting Front-en...,https://www.pluralsight.com/courses/tactics-to...,Shelley Benhoff,3.7,1.625833,At the core of any fully responsive website is...,2,54.0,
4,ps_5,Developing Web Applications and Web APIs Prote...,https://www.pluralsight.com/courses/developing...,Sahil Malik,4.1,2.447222,A large percentage of applications are accesse...,2,59.0,


# Pre-processing of text features

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [5]:
# NLTK resources needed by the preprocessing cells below:
#   wordnet   -> WordNetLemmatizer
#   punkt     -> nltk.word_tokenize on older NLTK releases
#   punkt_tab -> nltk.word_tokenize on newer NLTK releases (3.9+), which load
#                this tabular model instead of the pickled `punkt` one
#   stopwords -> stopwords.words('english')
# Packages that are already installed are reported as up to date and are not
# fetched again, so re-running this cell is cheap.
for nltk_package in ('wordnet', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_package)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shantanujoshi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shantanujoshi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shantanujoshi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Work on a copy so the frame loaded from the CSV is not modified by the
# filtering and the derived columns added in the following cells.
concatenated_course_data_copy = all_courses_df.copy()

In [7]:
# Drop every course whose title contains a non-ASCII character
# (the whole row is removed; the characters themselves are not stripped)
ascii_title_mask = concatenated_course_data_copy['course_title'].apply(
    lambda title: not any(ord(ch) >= 128 for ch in title)
)
concatenated_course_data_copy = concatenated_course_data_copy[ascii_title_mask]

In [8]:
# Renumber the rows 0..n-1 after the filter so index labels equal row positions
concatenated_course_data_copy = concatenated_course_data_copy.reset_index(drop=True)

In [9]:
# Number of courses left after dropping the rows with non-ASCII titles
len(concatenated_course_data_copy)

6343

In [10]:
# The stemmer, lemmatizer and stop-word set are the same for every call, so
# they are built lazily on first use and cached here instead of being rebuilt
# (and the stop-word list re-read) for every row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stemmer, lemmatizer, stop_words)``, creating them on the first call."""
    if not _PREPROCESS_RESOURCES:
        stop_words = set(stopwords.words('english'))

        # Extra stop words in the context of web development courses
        stop_words.update({
            'learn', 'course', 'sure', 'finally', 'understand', 'also', 'like', 'take', 'get', 'need',
            'know', 'go', 'start', 'use', 'create',
        })

        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        # Populate the cache only once everything was built successfully
        _PREPROCESS_RESOURCES.update(
            stemmer=stemmer, lemmatizer=lemmatizer, stop_words=stop_words
        )

    return (
        _PREPROCESS_RESOURCES['stemmer'],
        _PREPROCESS_RESOURCES['lemmatizer'],
        _PREPROCESS_RESOURCES['stop_words'],
    )


# Define text preprocessing function
def preprocess_text(text):
    """Normalise free text into a space-separated string of stemmed tokens.

    Steps:
      1. Glue dotted names together (``next.js`` -> ``nextjs``) so they survive step 2.
      2. Replace every non-letter with a space and lowercase the result.
      3. Tokenize with ``nltk.word_tokenize``.
      4. Drop English and domain-specific stop words.
      5. Lemmatize, then stem, each remaining token.

    Example: ``'Front End Web Development: Get Started'`` -> ``'front end web develop start'``.

    Parameters
    ----------
    text : str
        Raw title or description. Any non-string value (e.g. a NaN from a
        missing cell) is treated as empty text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``''`` for non-string input.
    """
    # Missing cells come through as float NaN, which re.sub cannot handle
    if not isinstance(text, str):
        return ''

    stemmer, lemmatizer, stop_words = _get_preprocess_resources()

    # for words like next.js react.js
    text = re.sub(r'\w+\.\w+', lambda match: match.group().replace('.', ''), text)

    # Remove non-alphabetic characters (this also covers line breaks) and lowercase the text
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Using both stemmer and lemmatizer as some words like explore won't work
    # correctly for lemmatizer but works well for stemmer and vice-versa.
    # NOTE(review): stop words are matched against the raw token, before
    # lemmatizing/stemming, so inflected forms still get through
    # (e.g. "started" is kept and becomes "start").
    tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in tokens
        if word not in stop_words
    ]

    # Join the tokens back into a single string
    return ' '.join(tokens)

In [11]:
# Normalised version of every course title, stored alongside the original
raw_titles = concatenated_course_data_copy['course_title']
concatenated_course_data_copy['title_preprocessed'] = raw_titles.map(preprocess_text)

In [12]:
# Normalised version of every course description, stored alongside the original
raw_details = concatenated_course_data_copy['course_details']
concatenated_course_data_copy['details_preprocessed'] = raw_details.map(preprocess_text)

In [13]:
# How many courses carry each course_level value
level_counts = concatenated_course_data_copy['course_level'].value_counts()


def calc_freq(level):
    """Return the fraction of all courses whose ``course_level`` equals ``level``."""
    n_with_level = level_counts[level]
    n_courses = len(concatenated_course_data_copy)
    return n_with_level / n_courses


# Encode each row's level as the share of the catalogue that has that level
concatenated_course_data_copy['course_level_freq'] = (
    concatenated_course_data_copy['course_level'].map(calc_freq)
)

In [14]:
# Inspect the derived columns: title_preprocessed, details_preprocessed, course_level_freq
concatenated_course_data_copy.head()

Unnamed: 0,course_id,course_title,course_url,course_instructor,course_rating,course_duration,course_details,course_level,course_no_of_reviews,course_no_of_enrolled,title_preprocessed,details_preprocessed,course_level_freq
0,ps_1,Web Development: Executive Briefing,https://www.pluralsight.com/courses/web-develo...,Brice Wilson,4.6,0.500556,Tech leaders need a fundamental understanding ...,1,158.0,,web develop execut brief,tech leader fundament understand tool technolo...,0.34542
1,ps_2,Front End Web Development: Get Started,https://www.pluralsight.com/courses/front-end-...,Joe Eames,4.6,3.799167,Front end web development involves many differ...,1,1250.0,,front end web develop start,front end web develop involv mani differ techn...,0.34542
2,ps_3,Beyond ASP.NET MVC: Modern Web Development Dem...,https://www.pluralsight.com/courses/beyond-asp...,Chris Jones,4.6,1.8425,The web development landscape is constantly ch...,1,75.0,,beyond aspnet mvc modern web develop demystifi,web develop landscap constantli chang hard kee...,0.34542
3,ps_4,Tactics and Tools for Troubleshooting Front-en...,https://www.pluralsight.com/courses/tactics-to...,Shelley Benhoff,3.7,1.625833,At the core of any fully responsive website is...,2,54.0,,tactic tool troubleshoot front end web develop,core fulli respons websit thorough knowledg tr...,0.177676
4,ps_5,Developing Web Applications and Web APIs Prote...,https://www.pluralsight.com/courses/developing...,Sahil Malik,4.1,2.447222,A large percentage of applications are accesse...,2,59.0,,develop web applic web api protect azur activ ...,larg percentag applic access via browser rest ...,0.177676


# Use text vectorizers to form feature matrix
### ( Calculate cosine similarity for these features )

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, hstack
import numpy as np

In [19]:
# Multipliers that make title evidence count for more than description text
TITLE_UNIGRAM_WEIGHT = 1.25
TITLE_BIGRAM_WEIGHT = 1.75

title_text = concatenated_course_data_copy['title_preprocessed']
details_text = concatenated_course_data_copy['details_preprocessed']

# One vectorizer per feature family so each keeps its own vocabulary and IDF
tfidf_vectorizer_title = TfidfVectorizer()
tfidf_vectorizer_title_bigrams = TfidfVectorizer(
    ngram_range=(2, 2),
    min_df=2,  # only bigrams occurring in at least two courses
)
tfidf_vectorizer_details = TfidfVectorizer()

# TF-IDF matrix for each feature family (rows follow the DataFrame row order)
tfidf_title_matrix = tfidf_vectorizer_title.fit_transform(title_text)
tfidf_title_bigrams_matrix = tfidf_vectorizer_title_bigrams.fit_transform(title_text)
tfidf_details_matrix = tfidf_vectorizer_details.fit_transform(details_text)

# Up-weight the title features
tfidf_title_matrix_weighted = tfidf_title_matrix * TITLE_UNIGRAM_WEIGHT
tfidf_title_bigrams_matrix_weighted = tfidf_title_bigrams_matrix * TITLE_BIGRAM_WEIGHT

# Course level enters as a single extra column holding its relative frequency
course_level_matrix = csr_matrix(
    concatenated_course_data_copy[['course_level_freq']].to_numpy()
)

# Stack the four blocks side by side into one sparse feature matrix
feature_blocks = (
    tfidf_title_matrix_weighted,
    tfidf_title_bigrams_matrix_weighted,
    tfidf_details_matrix,
    course_level_matrix,
)
concatenated_tfidf_matrix = hstack(feature_blocks)

# Pairwise cosine similarity between every pair of courses
cosine_sim_weighted = cosine_similarity(concatenated_tfidf_matrix)

In [20]:
# (number of courses, size of the vocabulary learned from the preprocessed details)
tfidf_details_matrix.shape

(6343, 16043)

In [21]:
# Column count is the total over the four horizontally stacked feature blocks
concatenated_tfidf_matrix.shape

(6343, 21667)

**A total of 21667 features**

In [22]:
# Square matrix: one similarity score for every pair of courses
cosine_sim_weighted.shape

(6343, 6343)

# Recommendation function

In [23]:
# Function to recommend courses based on cosine similarity
def recommend_courses(course_title, cosine_sim_matrix, df, top_n=5):
    """Return the ``top_n`` courses most similar to the course named ``course_title``.

    Parameters
    ----------
    course_title : str
        Exact value to look up in ``df['course_title']``. If several rows share
        the title, the first one is used as the query.
    cosine_sim_matrix : 2-D array-like
        Pairwise similarity scores; row/column ``i`` must describe the ``i``-th
        row (by position) of ``df``.
    df : pandas.DataFrame
        Course table containing a ``course_title`` column.
    top_n : int, default 5
        Maximum number of recommendations. Values below 1 yield an empty result.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered from most to least similar. The query row itself
        is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    # Locate the query by row *position* rather than index label: the
    # similarity matrix and the final ``iloc`` lookup are both positional, so
    # this stays correct even when ``df`` does not carry a 0..n-1 index.
    title_matches = (df['course_title'] == course_title).to_numpy().nonzero()[0]
    if len(title_matches) == 0:
        raise IndexError(f"No course titled {course_title!r} found in the DataFrame")
    query_pos = int(title_matches[0])

    # Similarity of the query course to every course
    sim_row = cosine_sim_matrix[query_pos]

    # Exclude the query explicitly instead of assuming it sorts first: another
    # course with an identical feature vector also scores 1.0 and could take
    # the top spot, which would otherwise let the query recommend itself.
    candidates = [pos for pos in range(len(sim_row)) if pos != query_pos]

    # Stable sort, highest similarity first; ties keep their original row order
    candidates.sort(key=lambda pos: sim_row[pos], reverse=True)

    course_indices = candidates[:max(top_n, 0)]

    return df.iloc[course_indices]

In [24]:
# Example query: neighbours of a high-level web development overview course
course_title = 'Web Development: Executive Briefing'
recommended_courses = recommend_courses(
    course_title,
    cosine_sim_weighted,
    concatenated_course_data_copy,
)
print(f"Recommended courses for '{course_title}':")
recommended_courses[['course_id', 'course_title', 'course_level']].head()

Recommended courses for 'Web Development: Executive Briefing':


Unnamed: 0,course_id,course_title,course_level
216,ce_101067,How to Get Into Web Development,1
6048,ud_2636020,Web development road map series,1
1486,ud_1093608,The Most Comprehensive Web Development Course,0
5713,ud_5490334,Non Technical to a amazing Web Developer,0
9,ps_10,Web Development with ExpressJS,2


In [25]:
# Example query: neighbours of an introductory front-end course
course_title = 'Front End Web Development: Get Started'
recommended_courses = recommend_courses(
    course_title,
    cosine_sim_weighted,
    concatenated_course_data_copy,
)
print(f"Recommended courses for '{course_title}':")
recommended_courses[['course_id', 'course_title', 'course_level']].head()

Recommended courses for 'Front End Web Development: Get Started':


Unnamed: 0,course_id,course_title,course_level
2283,ud_1460662,JavaScript: Learn Front End Web Development,0
5729,ud_311538,Foundations of Front-End Web Development,1
195,ce_101046,Getting Started with Front-End and Web Develop...,1
5182,ud_5404420,Straightforwardy Learn Front-End Web Development,0
3423,ud_5273224,Front end web development in Rust,2


In [26]:
# Example query: checks that dotted framework names such as Next.js are matched
course_title = 'Next.js - Build Full Stack Apps with Next.js & TypeScript'
recommended_courses = recommend_courses(
    course_title,
    cosine_sim_weighted,
    concatenated_course_data_copy,
)
print(f"Recommended courses for '{course_title}':")
recommended_courses[['course_id', 'course_title', 'course_level']].head()

Recommended courses for 'Next.js - Build Full Stack Apps with Next.js & TypeScript':


Unnamed: 0,course_id,course_title,course_level
4897,ud_5413116,"MERN Stack vs Next.js , Build A Full-stack App...",1
5435,ud_5669998,Full Stack Markdown App With Next.js,2
4036,ud_5484748,Next.js 13 Bootcamp - From Scratch to Full-sta...,1
2556,ud_5360872,Building a Full-stack Multilingual Blog with N...,0
5080,ud_4530202,Learn NextJs (App Router) by building a full-s...,0


In [29]:
# Export the similarity matrix to a pickle file that the backend loads (uncomment the two lines below to run)
#import pickle
#pickle.dump(cosine_sim_weighted ,open('tf_idf_cosine_similarity.pkl','wb'))