In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="section-one"></a>
# Reading the Datasets

In [None]:
# Load the Udemy courses CSV and preview the first rows.
udemy_csv_path = "/kaggle/input/udemy-courses/udemy_courses.csv"
udemy_data = pd.read_csv(udemy_csv_path)
udemy_data.head()

<a id="section-two"></a>
# Exploratory Data Analysis
1. [Initial Cleaning](#eda-zero)
2. [Data Information](#eda-one)
3. [Which type of Courses have most number of Subscribers](#eda-two)
4. [Which Subject has most number of Subscribers](#eda-three)
5. [Which number of Lectures range has most Subscribers](#eda-3b)
6. [Which Course level has most number of Subscribers](#eda-four)
7. [Which Content duration range has most number of subscribers](#eda-five)

<a id="eda-zero"></a>
### Initial Cleaning

In [None]:
# Initial cleaning: make is_paid a genuine boolean column and repair the stray level value.
#
# The values are compared as lower-cased text so 'TRUE', 'True' and True all become True.
# Everything else -- 'FALSE', the stray course URL that appears in this column, or a
# missing entry -- becomes False, i.e. the course is treated as free.
# The result is assigned back to the frame (instead of calling replace(..., inplace=True)
# on the attribute) so the update also takes effect under pandas Copy-on-Write.
is_paid_text = udemy_data['is_paid'].astype(str).str.strip().str.lower()
udemy_data['is_paid'] = is_paid_text == 'true'

# Map the stray level value '52' to 'All Levels'.
udemy_data['level'] = udemy_data['level'].replace('52', 'All Levels')

# Remove exact duplicate rows and renumber the index.
udemy_data = udemy_data.drop_duplicates().reset_index(drop=True)

<a id="eda-one"></a>
### Data Information

In [None]:
# Call info() so the column/dtype/non-null summary is printed; without the
# parentheses the cell only shows the repr of the bound method.
udemy_data.info()

In [None]:
# Drop the columns that are irrelevant for the analysis.
# errors='ignore' keeps the cell re-runnable: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
udemy_data = udemy_data.drop(columns=['course_id', 'url'], errors='ignore')

<a id="eda-two"></a>
### Which type of Courses have most number of Subscribers ----------> UnPaid

In [None]:
# Total subscribers for paid vs. free courses.
subscribers = udemy_data.groupby('is_paid')['num_subscribers'].agg('sum').to_frame()

# Build the slice labels from the group keys themselves. groupby sorts its keys,
# so the False (free) group comes first; a hard-coded ['Paid', 'UnPaid'] list
# therefore attaches each label to the wrong slice.
# The key is compared as text so both boolean and 'True'/'False' string values work.
# NOTE(review): re-check the conclusion stated in this section's heading after this fix.
names = ['Paid' if str(flag).strip().lower() == 'true' else 'UnPaid' for flag in subscribers.index]

fig = px.pie(subscribers, values='num_subscribers', names=names, title='Subscribers Correlation Chart')
fig.show()


<a id="eda-three"></a>
### Which Subject has most number of Subscribers ----------> Web Development

In [None]:
# Total subscribers per subject.
subjects = udemy_data.groupby('subject')['num_subscribers'].agg('sum').to_frame()

# Take the labels from the aggregated index so each label is guaranteed to sit next to
# its own total. unique() returns subjects in order of first appearance, which only
# matches the sorted groupby order by coincidence.
names = list(subjects.index)

fig = px.pie(subjects, values='num_subscribers', names=names, title='Subscribers and Subject Correlation Chart')
fig.update_traces(rotation=90)
fig.show()

<a id="eda-3b"></a>
### Which number of Lectures range has most Subscribers ----------> <25 Lectures

In [None]:
# Convert the number of lectures into labelled ranges.
# Intervals are right-closed, e.g. the '25-50' bucket holds 26..50 lectures.
bins = [0, 25, 50, 75, 100, 150, 200, 250, 300, 350, 400, 450, 500, np.inf]
# These are lecture counts, not hours, so the labels carry no 'h' suffix.
bin_names = ['<25', '25-50', '50-75', '75-100', '100-150', '150-200', '200-250', '250-300', '300-350', '350-400', '400-450', '450-500', '500+']
# include_lowest=True puts courses with 0 lectures into the first bucket
# instead of leaving them as NaN (and thus silently dropping them from the chart).
udemy_data['lectures_range'] = pd.cut(udemy_data['num_lectures'], bins, labels=bin_names, include_lowest=True)

In [None]:
# Total subscribers per lectures_range bucket.
# observed=False is spelled out so every bucket (even an empty one) is kept,
# regardless of the pandas version's default for categorical groupers.
lectures = udemy_data.groupby('lectures_range', observed=False)['num_subscribers'].agg('sum').to_frame()

# Use the bucket labels carried by the result itself rather than the global bin_names,
# which a later cell rebinds to the content-duration labels.
names = [str(label) for label in lectures.index]

fig = px.pie(lectures, values='num_subscribers', names=names, title='Subscribers and Number of Lectures Correlation Chart')
fig.show()


<a id="eda-four"></a>
### Which Course level has most Subscribers ----------> All Levels

In [None]:
# Total subscribers per course level.
level = (
    udemy_data
    .groupby('level')['num_subscribers']
    .sum()
    .to_frame()
)

# Sorted distinct levels; groupby also sorts its keys, so labels and totals line up.
names = udemy_data['level'].unique()
level_labels = sorted(names)

fig = px.pie(
    level,
    names=level_labels,
    values='num_subscribers',
    title='Subscribers and Course Level Correlation Chart',
)
fig.show()


<a id="eda-five"></a>
### Which Content duration range has most number of subscribers ----------> 0-5h

In [None]:
# Convert the content duration into labelled ranges.
# Intervals are right-closed, e.g. the '5-10h' bucket holds durations in (5, 10].
bins = [0, 5, 10, 15, 20, np.inf]
bin_names = ['0-5h', '5-10h', '10-15h', '15-20h', '20h+']
# include_lowest=True keeps a duration of exactly 0 in the '0-5h' bucket
# instead of turning it into NaN.
udemy_data['content_range'] = pd.cut(udemy_data['content_duration'], bins, labels=bin_names, include_lowest=True)

In [None]:
# Total subscribers per content_range bucket.
# observed=False is spelled out so every bucket (even an empty one) is kept,
# regardless of the pandas version's default for categorical groupers.
content_subs = udemy_data.groupby('content_range', observed=False)['num_subscribers'].agg('sum').to_frame()

# Labels come from the aggregated index so they always match their totals,
# independent of whatever the global bin_names currently holds.
names = [str(label) for label in content_subs.index]

fig = px.pie(content_subs, values='num_subscribers', names=names, title='Subscribers and Content duration Correlation Chart')
fig.show()

<a id="section-three"></a>
# Analysis of Subjects and Courses
1. [Which subject has most number of reviews?](#sec3-1)
2. [Which subject has most high paid courses?](#sec3-2)
3. [Which subject has most paid courses?](#sec3-2b)
4. [Which subject has most free courses?](#sec3-3)
5. [Which course has most number of reviews?](#sec3-4)
6. [Which course has least number of reviews?](#sec3-5)
7. [Which course has most subscribers?](#sec3-6)
8. [Which course has least subscribers?](#sec3-7)

<a id="sec3-1"></a>
### Which subject has most number of reviews? ----------> Web Development

In [None]:
# Total number of reviews per subject.
subjects = udemy_data.groupby('subject')['num_reviews'].sum().to_frame()

# Labels are read from the aggregated index: unique() yields first-appearance order,
# which is not guaranteed to match the sorted order of the groupby result.
names = list(subjects.index)

fig = px.pie(subjects, values='num_reviews', names=names, title='Reviews for each subject')
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.show()

<a id="sec3-2"></a>
### Which subject has most high paid courses? ----------> Web Development

In [None]:
# Sum of course prices per subject, restricted to paid courses.
# The flag is compared as text so the filter works whether is_paid holds real
# booleans or the strings 'True'/'False'.
paid_mask = udemy_data['is_paid'].astype(str).str.strip().str.lower() == 'true'
paid_courses = udemy_data.loc[paid_mask, ['price', 'subject']]

# Aggregate the filtered frame (not the full data set) and keep 'subject' as a column,
# so the pie labels come from the same rows as the values.
most_paid = paid_courses.groupby('subject')['price'].sum().reset_index()

fig = px.pie(most_paid, values='price', names='subject', title='Most high paid Courses')
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.show()

<a id="sec3-2b"></a>
### Which subject has most paid courses? ----------> Web Development

In [None]:
# Number of paid courses per subject.
# The flag is compared as text so the filter works whether is_paid holds real
# booleans or the strings 'True'/'False'.
paid_mask = udemy_data['is_paid'].astype(str).str.strip().str.lower() == 'true'
paid_courses = udemy_data.loc[paid_mask, ['is_paid', 'subject']]

# Count rows of the *filtered* frame. Counting is_paid on the full data set counts
# every course (paid or free), because count() only skips missing values.
most_paid = paid_courses.groupby('subject')['is_paid'].count().reset_index()

fig = px.pie(most_paid, values='is_paid', names='subject', title='Most paid Courses')
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.show()

<a id="sec3-3"></a>
### Which subject has most free courses? ----------> Web Development

In [None]:
# Number of free courses per subject.
# The flag is compared as text so the filter works whether is_paid holds real
# booleans or the strings 'True'/'False'.
free_mask = udemy_data['is_paid'].astype(str).str.strip().str.lower() == 'false'
free_courses = udemy_data.loc[free_mask, ['is_paid', 'subject']]

# Keep 'subject' as a column so each label is taken from the same row as its count;
# unique() order is not guaranteed to match the sorted groupby order.
most_free = free_courses.groupby('subject')['is_paid'].count().reset_index()

fig = px.pie(most_free, values='is_paid', names='subject', title='Most free Courses')
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.show()

<a id="sec3-4"></a>
### Which course has most number of reviews? ----------> The Web Developer Bootcamp

In [None]:
# Ten course titles with the highest review counts.
# Titles that occur more than once are collapsed to their largest review count.
courses = udemy_data[['course_title', 'num_reviews']]
most_reviews = courses.groupby('course_title', as_index=False)['num_reviews'].max()
most_reviews_sort = (
    most_reviews
    .sort_values('num_reviews', ascending=False)
    .iloc[:10]
)

fig = px.pie(
    most_reviews_sort,
    names='course_title',
    values='num_reviews',
    title='Most Reviews for Course titles',
    template='seaborn',
)
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.show()

<a id="sec3-5"></a>
### Which course has least number of reviews? ----------> Multiple courses

In [None]:
# Ten course titles with the fewest reviews among those that have more than 10 reviews.
# Titles that occur more than once are collapsed to their largest review count.
courses = udemy_data[['course_title', 'num_reviews']]
least_reviews = courses.groupby('course_title', as_index=False)['num_reviews'].max()
# Only titles above the 10-review cutoff are considered.
least_reviews = least_reviews.loc[least_reviews['num_reviews'].gt(10)]
least_reviews_sort = least_reviews.sort_values('num_reviews').iloc[:10]

fig = px.pie(
    least_reviews_sort,
    names='course_title',
    values='num_reviews',
    title='Least Reviews for Course titles',
    template='seaborn',
)
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.show()

<a id="sec3-6"></a>
### Which course has most number of subscribers? ----------> Learn HTML5 Programming From Scratch

In [None]:
# Ten course titles with the highest subscriber counts.
# Titles that occur more than once are collapsed to their largest subscriber count.
courses = udemy_data[['course_title', 'num_subscribers']]
most_subs = courses.groupby('course_title', as_index=False)['num_subscribers'].max()
most_subs_sort = (
    most_subs
    .sort_values('num_subscribers', ascending=False)
    .iloc[:10]
)

fig = px.pie(
    most_subs_sort,
    names='course_title',
    values='num_subscribers',
    title='Most subscribers for Course titles',
    template='seaborn',
)
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.show()

<a id="sec3-7"></a>
### Which course has least number of subscribers? ----------> Multiple Courses

In [None]:
# Ten course titles with the fewest subscribers among those that have more than 10 subscribers.
# Titles that occur more than once are collapsed to their largest subscriber count.
courses = udemy_data[['course_title', 'num_subscribers']]
least_subs = courses.groupby('course_title', as_index=False)['num_subscribers'].max()
# Only titles above the 10-subscriber cutoff are considered.
least_subs = least_subs.loc[least_subs['num_subscribers'].gt(10)]
least_subs_sort = least_subs.sort_values('num_subscribers').iloc[:10]

fig = px.pie(
    least_subs_sort,
    names='course_title',
    values='num_subscribers',
    title='Least subscribers for Course titles',
    template='seaborn',
)
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.show()