## Course Recommendation System

In [6]:
# Importing the required libraries
import os
from pathlib import Path

import neattext.functions as nfx
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings

# Ignore all warnings raised from this point on.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. SettingWithCopyWarning);
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Cap how much of a DataFrame is rendered: at most 50 columns and 20 rows.
for _option, _limit in (('display.max_columns', 50), ('display.max_rows', 20)):
    pd.set_option(_option, _limit)

In [7]:
# Loading the data
# The CSV location is resolved in this order so the notebook is not tied to one machine:
#   1. the UDEMY_COURSE_DATA environment variable, if it is set and non-empty;
#   2. 'udemy_course_data.csv' in the current working directory, if that file exists;
#   3. the original absolute path (kept as a fallback so the existing setup keeps working).
LEGACY_DATA_PATH = Path(r'C:\Users\shail\OneDrive\Shailesh\Personal\Personal Learning\Udemy Projects\Course Recommendation System\udemy_course_data.csv')
LOCAL_DATA_PATH = Path('udemy_course_data.csv')
DATA_PATH = Path(
    os.environ.get('UDEMY_COURSE_DATA')
    or (LOCAL_DATA_PATH if LOCAL_DATA_PATH.exists() else LEGACY_DATA_PATH)
)

df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3678,775618,Learn jQuery from Scratch - Master of JavaScri...,https://www.udemy.com/easy-jquery-for-beginner...,True,100,1040,14,21,All Levels,2 hours,2016-06-14T17:36:46Z,Web Development,104000,2016-06-14,17:36:46Z,2016,6,14
3679,1088178,How To Design A WordPress Website With No Codi...,https://www.udemy.com/how-to-make-a-wordpress-...,True,25,306,3,42,Beginner Level,3.5 hours,2017-03-10T22:24:30Z,Web Development,7650,2017-03-10,22:24:30Z,2017,3,10
3680,635248,Learn and Build using Polymer,https://www.udemy.com/learn-and-build-using-po...,True,40,513,169,48,All Levels,3.5 hours,2015-12-30T16:41:42Z,Web Development,20520,2015-12-30,16:41:42Z,2015,12,30
3681,905096,CSS Animations: Create Amazing Effects on Your...,https://www.udemy.com/css-animations-create-am...,True,50,300,31,38,All Levels,3 hours,2016-08-11T19:06:15Z,Web Development,15000,2016-08-11,19:06:15Z,2016,8,11


In [8]:
# Build the 'Clean_Title' column from 'course_title' in two passes:
#   pass 1 strips stopwords from every title,
#   pass 2 strips special characters from the result of pass 1.
titles_without_stopwords = df['course_title'].apply(nfx.remove_stopwords)
df['Clean_Title'] = titles_without_stopwords.apply(nfx.remove_special_characters)

# Preview the first rows, now including the new 'Clean_Title' column
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,Clean_Title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18,Ultimate Investment Banking Course
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9,Complete GST Course Certification Grow Practice
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19,Financial Modeling Business Analysts Consultants
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30,Beginner Pro Financial Analysis Excel 2017
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13,Maximize Profits Trading Options


### Vectorising 'Clean_Title' column

In [9]:
# Bag-of-words vectoriser with default settings
countvect = CountVectorizer()

# Learn the vocabulary of the cleaned titles and encode them in one step;
# the result is a sparse document-term matrix (one row per title, one column per term)
clean_titles = df['Clean_Title']
cv_mat = countvect.fit_transform(clean_titles)

# Show the sparse matrix summary (dtype, stored elements, shape)
cv_mat

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 18364 stored elements and shape (3683, 3564)>

### Calculating the Cosine Similarity

In [10]:
# Pairwise cosine similarity between all rows of the document-term matrix (cv_mat);
# entry [i, j] is the similarity between title i and title j based on their word counts
cos_sim = cosine_similarity(cv_mat)

# Shape of the similarity matrix: (n, n) for n titles
print(cos_sim.shape)

# Blank line, an 80-character rule, and another blank line as a visual separator
print('', '-' * 80, '', sep='\n')

# The similarity matrix itself
print(cos_sim)

(3683, 3683)

--------------------------------------------------------------------------------

[[1.         0.20412415 0.         ... 0.         0.         0.        ]
 [0.20412415 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.23570226]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.23570226 0.         1.        ]]


### Building the Recommendation System

In [11]:
# Title of the course to base the recommendations on; it is looked up by exact match
# against the values of df['course_title'].
title = 'Ultimate Investment Banking Course'

In [12]:
# Lookup table: course title -> positional row number in df (and therefore in cos_sim).
# Positions are used rather than df.index labels because the values are fed to
# cos_sim[...] and df.iloc[...], both of which are positional.
course_index = pd.Series(np.arange(len(df)), index=df['course_title'])

# Keep only the first occurrence of each title. Series.drop_duplicates() would not do this:
# it de-duplicates the *values* (row numbers, which are always unique), so repeated titles
# would survive and course_index[title] would return a Series instead of a single row number.
course_index = course_index[~course_index.index.duplicated(keep='first')]

# Retrieving the row number of the course the user is interested in (stored in 'title')
if title not in course_index.index:
    raise KeyError(f"Course title not found in the dataset: {title!r}")
index = int(course_index[title])

# List of (row number, similarity score) tuples for every course compared to the selected one
scores = list(enumerate(cos_sim[index]))

# Sorting the tuples by similarity score in descending order (ties keep row order)
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

# Dropping the selected course itself by row number. It is not guaranteed to be the first
# element: another course with an identical cleaned title also scores 1.0 and sorts ahead of
# it when it appears earlier in df.
ranked_scores = [pair for pair in sorted_scores if pair[0] != index]

# Row numbers of the remaining courses, most similar first
selected_course_index = [i[0] for i in ranked_scores]

# Corresponding similarity scores
selected_course_score = [i[1] for i in ranked_scores]

# Fetching the rows of the recommended courses; .copy() makes this an independent frame so the
# column assignment below does not write into a slice of df (SettingWithCopyWarning)
recommended_df = df.iloc[selected_course_index].copy()

# Adding the similarity score as a new column to the DataFrame
recommended_df['Similarity_Score'] = selected_course_score

# Selecting and renaming relevant columns for the final recommendation output
recommended_courses = recommended_df[['course_title', 'Similarity_Score', 'url', 'price', 'num_subscribers']].rename(
    columns={
        'course_title': 'Course_Title',
        'url': 'URL',
        'price': 'Price',
        'num_subscribers': 'Number_Of_Subscribers'
    }
)

# Displaying the recommended courses
recommended_courses


Unnamed: 0,Course_Title,Similarity_Score,URL,Price,Number_Of_Subscribers
39,The Complete Investment Banking Course 2017,0.67082,https://www.udemy.com/the-complete-investment-...,195,8575
3479,The Ultimate jQuery Course,0.57735,https://www.udemy.com/essential-jquery-training/,20,1098
242,Advanced Accounting for Investment Banking,0.50000,https://www.udemy.com/advanced-accounting-for-...,50,1260
419,The Investment Banking Recruitment Series,0.50000,https://www.udemy.com/investmentbanking/,40,17
2719,The Ultimate Web Development Course,0.50000,https://www.udemy.com/build-an-instant-update-...,50,9881
...,...,...,...,...,...
3678,Learn jQuery from Scratch - Master of JavaScri...,0.00000,https://www.udemy.com/easy-jquery-for-beginner...,100,1040
3679,How To Design A WordPress Website With No Codi...,0.00000,https://www.udemy.com/how-to-make-a-wordpress-...,25,306
3680,Learn and Build using Polymer,0.00000,https://www.udemy.com/learn-and-build-using-po...,40,513
3681,CSS Animations: Create Amazing Effects on Your...,0.00000,https://www.udemy.com/css-animations-create-am...,50,300


In [14]:
# Write the recommendation table to a CSV file in the working directory,
# without the DataFrame index column
recommended_courses.to_csv(
    'Cleaned_Course_Data.csv',
    index=False,
)