In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("udemy_courses.csv")
df.head(3)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance


In [3]:
df.shape

(3678, 12)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3678 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3678 non-null   int64  
 1   course_title         3678 non-null   object 
 2   url                  3678 non-null   object 
 3   is_paid              3678 non-null   bool   
 4   price                3678 non-null   int64  
 5   num_subscribers      3678 non-null   int64  
 6   num_reviews          3678 non-null   int64  
 7   num_lectures         3678 non-null   int64  
 8   level                3678 non-null   object 
 9   content_duration     3678 non-null   float64
 10  published_timestamp  3678 non-null   object 
 11  subject              3678 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.8+ KB


In [5]:
# checking null values
df.isna().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [6]:
df.duplicated().sum()

6

In [7]:
df[df.duplicated()]

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
787,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,0.616667,2016-05-16T18:28:30Z,Business Finance
788,1157298,Introduction to Forex Trading Business For Beg...,https://www.udemy.com/introduction-to-forex-tr...,True,20,0,0,27,Beginner Level,1.5,2017-04-23T16:19:01Z,Business Finance
894,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1.0,2016-12-15T14:56:17Z,Business Finance
1100,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5,2017-07-02T14:29:35Z,Business Finance
1473,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,0.616667,2014-04-15T21:48:55Z,Graphic Design
2561,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4.0,2013-01-03T00:55:31Z,Web Development


In [8]:
df = df.drop_duplicates(keep="first")

In [9]:
df.duplicated().sum()

0

Popularity based recommendation system


In [10]:
def pop_remmdtn(df):
  # calculating popularity score for each course
  df["pop_score"] = 0.6*df['num_subscribers'] + 0.4*df["num_reviews"]
  # sorting based on pop_score
  df_sorted = df.sort_values(by="pop_score", ascending=False)
  return df_sorted[["course_title", "pop_score"]].reset_index()

In [11]:
pop_remmdtn(df).head(3)

Unnamed: 0,index,course_title,pop_score
0,2827,Learn HTML5 Programming From Scratch,164805.4
1,3032,Coding for Entrepreneurs Basic,96729.0
2,3230,The Web Developer Bootcamp,83928.4


Content Based Recommendation System


In [12]:
# recommendation based on past history

In [13]:
df.head(3)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,pop_score
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,1297.4
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance,2044.4
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance,1334.0


In [14]:
# using the course title for the recommendation
# removing stop words(articles, prepositions, etc) as well as special characters as they dont contribute meaningful information to the text
# !pip install neattext    # a simple NLP package for cleaning text

In [15]:
import neattext.functions as nfx

In [16]:
df["course_title"]

0                      Ultimate Investment Banking Course
1       Complete GST Course & Certification - Grow You...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro - Financial Analysis in Excel ...
4            How To Maximize Your Profits Trading Options
                              ...                        
3673    Learn jQuery from Scratch - Master of JavaScri...
3674    How To Design A WordPress Website With No Codi...
3675                        Learn and Build using Polymer
3676    CSS Animations: Create Amazing Effects on Your...
3677    Using MODX CMS to Build Websites: A Beginner's...
Name: course_title, Length: 3672, dtype: object

In [17]:
df.course_title = df.course_title.apply(nfx.remove_stopwords)
df.course_title = df.course_title.apply(nfx.remove_special_characters)
df.course_title

0                      Ultimate Investment Banking Course
1       Complete GST Course  Certification  Grow Practice
2        Financial Modeling Business Analysts Consultants
3             Beginner Pro  Financial Analysis Excel 2017
4                        Maximize Profits Trading Options
                              ...                        
3673      Learn jQuery Scratch  Master JavaScript library
3674                      Design WordPress Website Coding
3675                                  Learn Build Polymer
3676        CSS Animations Create Amazing Effects Website
3677              MODX CMS Build Websites Beginners Guide
Name: course_title, Length: 3672, dtype: object

In [18]:
df["title_subject"] = df["course_title"] + " " + df["subject"]
# recommending course not just based on the course title but also on the subject
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,pop_score,title_subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,1297.4,Ultimate Investment Banking Course Business Fi...
1,1113822,Complete GST Course Certification Grow Practice,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance,2044.4,Complete GST Course Certification Grow Pract...
2,1006314,Financial Modeling Business Analysts Consultants,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance,1334.0,Financial Modeling Business Analysts Consultan...
3,1210588,Beginner Pro Financial Analysis Excel 2017,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance,1475.0,Beginner Pro Financial Analysis Excel 2017 Bu...
4,1011058,Maximize Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance,783.6,Maximize Profits Trading Options Business Finance


In [19]:
# converting these texts into vectors using count vectorizer(converts texts into numerical format)
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
cv = CountVectorizer(max_features=3000)   # (forms matrix)
# cv will only include top 3000 most frequent words from the corpus(collection of all course titles)
vectors = cv.fit_transform(df.title_subject).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
vectors.shape

(3672, 3000)

In [22]:
len(cv.get_feature_names_out()) # gives all the 3000 most frequent words

3000

In [23]:
# the similarities between the vectors is utilized in building the reccomendation system
# Cosine Similarity - metric used to measure the similarity between two vectors in a multi-dimensional space
# by calculating the cosine of the angles between the two vectors
# -1 ot 1

# CS(A, B) = (A.B)/(||A||*||B||)

In [24]:
# recommending courses which has high cosine similarity

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.4330127 , 0.40824829, ..., 0.        , 0.        ,
        0.        ],
       [0.4330127 , 1.        , 0.35355339, ..., 0.        , 0.        ,
        0.        ],
       [0.40824829, 0.35355339, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.31622777,
        0.50709255],
       [0.        , 0.        , 0.        , ..., 0.31622777, 1.        ,
        0.26726124],
       [0.        , 0.        , 0.        , ..., 0.50709255, 0.26726124,
        1.        ]])

In [27]:
similarity.shape # similarity with each other

(3672, 3672)

In [28]:
similarity[0]

array([1.        , 0.4330127 , 0.40824829, ..., 0.        , 0.        ,
       0.        ])

In [29]:
sorted(similarity[0], reverse=True)[:10]
# to identify top similarity words

[1.0000000000000002,
 0.7715167498104596,
 0.6666666666666669,
 0.6666666666666669,
 0.6172133998483676,
 0.6172133998483676,
 0.6172133998483676,
 0.5773502691896258,
 0.5773502691896258,
 0.5773502691896258]

In [30]:
# but then the index of the words is lost. To get the index as well, we use enumerate
sim1 = sorted(enumerate(similarity[0]), reverse=True, key=lambda x : x[1]) # key specified to sort on the basis of cosine similarity and not its index
sim1[:10]

[(0, 1.0000000000000002),
 (39, 0.7715167498104596),
 (240, 0.6666666666666669),
 (417, 0.6666666666666669),
 (418, 0.6172133998483676),
 (657, 0.6172133998483676),
 (1066, 0.6172133998483676),
 (227, 0.5773502691896258),
 (279, 0.5773502691896258),
 (280, 0.5773502691896258)]

In [31]:
sim1[:6]
# instead of the index, we want course title

[(0, 1.0000000000000002),
 (39, 0.7715167498104596),
 (240, 0.6666666666666669),
 (417, 0.6666666666666669),
 (418, 0.6172133998483676),
 (657, 0.6172133998483676)]

In [32]:
df.loc[df.index==39, ["title_subject"]]

Unnamed: 0,title_subject
39,Complete Investment Banking Course 2017 Busine...


In [33]:
df.iloc[39]["title_subject"]

'Complete Investment Banking Course 2017 Business Finance'

In [34]:
df.iloc[39]["course_title"]

'Complete Investment Banking Course 2017'

In [35]:
def content_recomm(course):
  course_idx = df.loc[df.course_title == course].index[0]
  sim = similarity[course_idx]
  course_recomm = sorted(enumerate(sim), reverse=True, key=lambda x : x[1])[:5]  # top 5 recommendations
  for i in course_recomm:
    print(df.iloc[i[0]]["course_title"])

In [36]:
df.sample(1)["course_title"]

1669    Corel Draw X7 Eitimi Temelden leri Seviyeye
Name: course_title, dtype: object

In [37]:
content_recomm("Graphic Design Secrets Revealed")
# top 5 courses recommended based on the given course

Surface Pattern Design  Create Scandinavian Patterns
Illustrative Surface Patterns Design  Life Products
Gestalt Principles  Pattern Design  Create Festive Pattern




In [38]:
content_recomm("Complete Investment Banking Course 2017")

Complete Investment Banking Course 2017
Ultimate Investment Banking Course
Complete Financial Analyst Course 2017
Cryptocurrency BTC  ETH Investment  Trading Course 2017
2017


In [39]:
content_recomm("Ultimate Investment Banking Course")

Ultimate Investment Banking Course
Complete Investment Banking Course 2017
Advanced Accounting Investment Banking
Investment Banking Recruitment Series
Business Banking 101


In [40]:
# here course recommendation is done not just on the basis of course_title but also on the  basis of subject - hence provides better recommendation

In [41]:
import pickle
# used for serializing and deserializing objects
# convert complex Python objects, such as DataFrames, into a byte stream that can be saved to a file.

In [42]:
pickle.dump(df, open("course_dict.pkl", "wb"))
# This is a function in the pickle module used to serialize an object (in this case, the DataFrame) and save it to a file.

In [43]:
pickle.dump(similarity, open("similarity.pkl", "wb"))

In [44]:
import tkinter as tk
from tkinter import ttk, messagebox
# the tkinter module in Python is a standard GUI (Graphical User Interface) toolkit
# to define and create a GUI window, add widgets (such as buttons, labels, etc.), and set up event handlers.

def popularity_based_recommendation(df, top_n=5):
    df["popularity_score"] = 0.6*df["num_subscribers"] + 0.4*df["num_reviews"]
    df_sorted = df.sort_values(by="popularity_score", ascending=False)
    recommended_courses = df_sorted[['course_title', 'popularity_score']].head(top_n)
    return recommended_courses

# define recommend function
def recommend(course):
    try:
        course_idx = df.loc[df.course_title == course].index[0]
        sim = similarity[course_idx]
        courses_list = sorted(enumerate(sim), reverse=True, key=lambda x : x[1])[:5]  
        recommended_courses = [df.iloc[i[0]]["course_title"] for i in courses_list]  # top 5 recommendations
        return recommended_courses
    except IndexError:
        messagebox.showerror("Error", f"Course {course} not found.")

# Event handler for the "Recommend" button
def recommend_btn_click():
    course_title = course_var.get()
    recommended_courses = recommend(course_title)
    if recommended_courses:
        popularity_label.pack_forget()
        result_label.config(text="Recommended Courses:\n\n"+"\n".join(recommended_courses))


# create the main application window
root = tk.Tk()
root.title("Course Recommender")
root.geometry("400x300")

# change the font and the color
font_style = ("Arial", 12)
label_color = "blue"
heading_color="red"
button_color = "green"
result_label_color = "black"

# Create and place GUI elements
label = tk.Label(root, text="Select Course:", font=font_style, fg=label_color)
label.pack(pady=10)

course_titles = df['course_title'].tolist()
course_var = tk.StringVar(value=course_titles[0])
course_dropdown = ttk.Combobox(root, textvariable=course_var, values=course_titles, width=40, font=font_style)
course_dropdown.pack(pady=5)

popularity_recommendations = popularity_based_recommendation(df, top_n=5)
popularity_label = tk.Label(root, text="Popularity-based Recommendations:\n\n" + popularity_recommendations.to_string(index=False),
                             font=font_style, fg=label_color)
popularity_label.pack()

recommend_button = tk.Button(root, text="Recommend", command=recommend_btn_click, width=20, font=font_style, fg=button_color)
recommend_button.pack(pady=10)

result_label = tk.Label(root, text="", wraplength=350, font=font_style, fg=result_label_color)
result_label.pack()

root.mainloop()