In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
df=pd.read_csv('/content/udemy_courses.csv')

In [None]:
pd.set_option('display.max_columns',None)

In [None]:
df.head(1)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance


In [None]:
df.shape

(3678, 12)

In [None]:
df.columns

Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')

#course_id : A unique identifier for each course.
#course_title: The title of the course, describing its content.
#url :A link to the course page on Udemy.
#is_paid : A boolean (True/False) indicating whether the course is paid or free.
#price:The cost of the course (if paid). Free courses might have a price of 0.
#num_subscribers : The total number of students enrolled in the course.
#num_reviews : The total number of reviews given by students.
#num_lectures :The total number of lectures included in the course.
#level: The difficulty level of the course (e.g., Beginner, Intermediate, Expert).
#content_duration: The total duration of the course content (usually in hours).
#published_timestamp : The date and time when the course was published on Udemy.
#subject:The category or subject of the course (e.g., Business, IT, Music).

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3678 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3678 non-null   int64  
 1   course_title         3678 non-null   object 
 2   url                  3678 non-null   object 
 3   is_paid              3678 non-null   bool   
 4   price                3678 non-null   int64  
 5   num_subscribers      3678 non-null   int64  
 6   num_reviews          3678 non-null   int64  
 7   num_lectures         3678 non-null   int64  
 8   level                3678 non-null   object 
 9   content_duration     3678 non-null   float64
 10  published_timestamp  3678 non-null   object 
 11  subject              3678 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.8+ KB


In [None]:
df.isnull().sum()

Unnamed: 0,0
course_id,0
course_title,0
url,0
is_paid,0
price,0
num_subscribers,0
num_reviews,0
num_lectures,0
level,0
content_duration,0


In [None]:
df.duplicated().any()

True

In [None]:
df=df.drop_duplicates()

In [None]:
df.duplicated().any()

False

Popularity based recommendation system

In [None]:
def pop_rem(df):
  df['pop_score']=0.6*df['num_subscribers']*0.4*df['num_reviews']
  df_sorted=df.sort_values(by='pop_score',ascending=False)[['course_title','pop_score']].head()
  return df_sorted

In [None]:
pop_rem(df)

Unnamed: 0,course_title,pop_score
3230,The Web Developer Bootcamp,800849500.0
3232,The Complete Web Developer Course 2.0,615946300.0
2827,Learn HTML5 Programming From Scratch,556928800.0
3204,Angular 4 (formerly Angular 2) - The Complete ...,347942900.0
3247,JavaScript: Understanding the Weird Parts,324358400.0


In [None]:
df.head(1)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,pop_score
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,11851.44


content based recommendation system

In [None]:
df.head(1)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,pop_score
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,11851.44


In [None]:
!pip install neattext


Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl.metadata (12 kB)
Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/114.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3


In [None]:
import neattext.functions as nfx

In [None]:
df['course_title']=df['course_title'].apply(nfx.remove_stopwords)
df['course_title']=df['course_title'].apply(nfx.remove_special_characters)

In [None]:
df['course_title']

Unnamed: 0,course_title
0,Ultimate Investment Banking Course
1,Complete GST Course Certification Grow Practice
2,Financial Modeling Business Analysts Consultants
3,Beginner Pro Financial Analysis Excel 2017
4,Maximize Profits Trading Options
...,...
3673,Learn jQuery Scratch Master JavaScript library
3674,Design WordPress Website Coding
3675,Learn Build Polymer
3676,CSS Animations Create Amazing Effects Website


In [None]:
df['title_subject'] = df['course_title'] + ' ' + df['subject']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv=CountVectorizer(max_features=3000)
vectors=cv.fit_transform(df['title_subject']).toarray()

In [None]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
df.shape

(3672, 14)

In [None]:
len(cv.get_feature_names_out())


3000

In [None]:
vectors.shape

(3672, 3000)

Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity=cosine_similarity(vectors)

In [None]:
similarity

array([[1.        , 0.4330127 , 0.40824829, ..., 0.        , 0.        ,
        0.        ],
       [0.4330127 , 1.        , 0.35355339, ..., 0.        , 0.        ,
        0.        ],
       [0.40824829, 0.35355339, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.31622777,
        0.50709255],
       [0.        , 0.        , 0.        , ..., 0.31622777, 1.        ,
        0.26726124],
       [0.        , 0.        , 0.        , ..., 0.50709255, 0.26726124,
        1.        ]])

In [None]:
similarity.shape

(3672, 3672)

In [None]:
similarity[0]

array([1.        , 0.4330127 , 0.40824829, ..., 0.        , 0.        ,
       0.        ])

In [None]:
sorted(enumerate(similarity[0]),reverse=True,key=lambda x:x[1])[1][1:6]

(0.7715167498104596,)

In [None]:
df.iloc[39]['course_title']

'Complete Investment Banking Course 2017'

In [None]:
def content_recomm(course):
    course_index = df[df['course_title']==course].index[0]
    slm = similarity[course_index]
    course_list = sorted(enumerate(slm),reverse=True,key=lambda x:x[1])[1:6]
    for i in course_list:
        print(df.iloc[i[0]]['course_title'])


In [None]:
content_recomm('Complete Investment Banking Course 2017')

Ultimate Investment Banking Course
Complete Financial Analyst Course 2017
Cryptocurrency BTC  ETH Investment  Trading Course 2017
2017
Complete Short Course Ethereum


In [None]:
course_index=df[df['course_title']=='Complete Investment Banking Course 2017'].index[0]

In [None]:
course_index

39