In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## Importing the data

In [None]:
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# Read the Udemy courses CSV into the DataFrame used by the rest of the notebook.
data = pd.read_csv('/kaggle/input/udemy-courses/udemy_courses.csv')

In [None]:
# First five rows of the freshly loaded frame.
data.head(n=5)

## Exploring the data

In [None]:
# The previous cell already shows the first rows; look at the last rows as well
# so both ends of the file are inspected.
data.tail()

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics, transposed so that each described column is one row.
data.describe().T

In [None]:
# Distinct values found in the `level` column.
data["level"].unique()

In [None]:
# Distinct values found in the `subject` column.
data["subject"].unique()

In [None]:
## published_timestamp is read in as object dtype; convert it to a datetime dtype

data["published_timestamp"] = pd.to_datetime(data["published_timestamp"])

In [None]:
# Re-check the dtypes after converting `published_timestamp`.
data.info()

In [None]:
## Removing the unwanted columns.

# Rebind `data` instead of dropping in place, and ignore columns that are
# already gone, so re-running this cell does not raise KeyError.
data = data.drop(columns=['course_id', 'url'], errors='ignore')

In [None]:
# First five rows after removing the unwanted columns.
data.head(n=5)

In [None]:
# Summary statistics of the remaining columns, one described column per row.
data.describe().T

## Insights

In [None]:
# First ten rows, as a reference for the analysis below.
data.head(n=10)

In [None]:
# Number of courses for each subject, as a Series indexed by subject.
data_sub_count = data["subject"].value_counts()

In [None]:
# Display the per-subject course counts.
data_sub_count

In [None]:
# Same per-subject counts, with the index moved into a regular column.
data_sub_count = data["subject"].value_counts().reset_index()

In [None]:
# Display the per-subject counts in DataFrame form.
data_sub_count

In [None]:
# Count courses per subject with explicit column names. The default column
# names from `value_counts().reset_index()` differ between pandas versions
# ("index"/"subject" before 2.0, "subject"/"count" from 2.0), so they are
# set here instead of being relied on.
subject_counts = (
    data["subject"]
    .value_counts()
    .rename_axis("subject")
    .reset_index(name="num_courses")
)

plt.figure(figsize=(10,6))
# Subjects on the y-axis, number of courses as the bar length.
# dodge=False keeps the bars full width even though hue repeats the category.
v1 = sns.barplot(
    x="num_courses", y="subject", hue="subject", dodge=False, data=subject_counts
)

Here we can see that the Web Development subject has the most courses and Graphic Design has the fewest.

In [None]:
## Price range of the courses

# Histogram of the `price` column across all courses.
v2 = sns.histplot(x="price", data=data)

Most of the courses fall in the price range around 25.

In [None]:
# Paid vs Free courses

# Count the courses for each value of `is_paid` and move the index into a column.
data_ispaid = data["is_paid"].value_counts().reset_index()
data_ispaid

In [None]:
# Count the courses per `is_paid` value straight from `data`. Reading the
# counts and labels from the Series itself avoids the column names created by
# `value_counts().reset_index()`, which changed in pandas 2.0 (there is no
# "index" column any more).
paid_counts = data["is_paid"].value_counts()

plt.figure(figsize=(10,6))
# One wedge per `is_paid` value, labelled with that value.
v3 = plt.pie(paid_counts.to_numpy(), labels=paid_counts.index.astype(str))

Most of the courses are paid.

In [None]:
# Levels

# Count courses per level with explicit column names; the defaults from
# `value_counts().reset_index()` changed in pandas 2.0 and no longer include
# an "index" column.
data_level_count = (
    data["level"]
    .value_counts()
    .rename_axis("level")
    .reset_index(name="num_courses")
)
plt.figure(figsize=(10,6))
# Levels on the x-axis, number of courses as the bar height.
# dodge=False keeps the bars full width even though hue repeats the category.
v4 = sns.barplot(
    x="level", y="num_courses", hue="level", dodge=False, data=data_level_count
)


Most of the courses are for All Levels, followed by Beginner Level; Expert Level has the fewest courses.

In [None]:
## most subscribers

# Order the courses by subscriber count, highest first, and keep the first five.
data_subscribe_sort = (
    data
    .sort_values(by='num_subscribers', ascending=False)
    .head(5)
)
data_subscribe_sort

In [None]:
# Subscriber count (x) for each course title (y) in the top-five frame.
v5 = sns.barplot(data=data_subscribe_sort, y="course_title", x="num_subscribers")

Top 5 most subscribed courses

In [None]:
# courses with most number of reviews

# Order the courses by review count, highest first, and keep the first five.
data_reviews_sort = (
    data
    .sort_values(by='num_reviews', ascending=False)
    .head(5)
)
data_reviews_sort

In [None]:
# Review count (x) for each course title (y) in the top-five frame.
v6 = sns.barplot(data=data_reviews_sort, y="course_title", x="num_reviews")

In [None]:
## Add a separate column holding the year in which each course was published
published_index = pd.DatetimeIndex(data['published_timestamp'])
data['publish_year'] = published_index.year
data.head(n=4)

In [None]:
# Number of courses for each publishing year, with the index moved into a column.
data_year_count = (
    data['publish_year']
    .value_counts()
    .reset_index()
)
data_year_count

In [None]:
# Count courses per publishing year with explicit column names; the defaults
# from `value_counts().reset_index()` changed in pandas 2.0 and no longer
# include an "index" column.
year_counts = (
    data["publish_year"]
    .value_counts()
    .rename_axis("publish_year")
    .reset_index(name="num_courses")
)
# Publishing year on the x-axis, number of courses as the bar height.
v7 = sns.barplot(x="publish_year", y="num_courses", data=year_counts)

- Here we can see that the most courses were published in 2016.
- The number of courses published increased from 2011 to 2016, followed by a decline from 2016 to 2017.

In [None]:
## Courses with no subscribers at all

# Keep only the rows whose subscriber count is exactly zero.
has_no_subscribers = data["num_subscribers"].eq(0)
data_with0_subscribers = data.loc[has_no_subscribers]
data_with0_subscribers.head(n=4)

In [None]:
# Every row in this frame has zero subscribers, so its length is the count.
# Unlike `value_counts()[0]`, this returns 0 instead of raising KeyError when
# no course has zero subscribers.
len(data_with0_subscribers)

There are 70 courses that have no subscribers at all.

In [None]:
## Price per subject, split by level

plt.figure(figsize=(10, 6))
# One group of bars per subject, one bar per level inside each group.
v6 = sns.barplot(data=data, x='subject', y='price', hue='level')

Expert Level courses have the highest average price in every subject except Musical Instruments.

In [None]:
## Subscribers vs content duration scatterplot

plt.figure(figsize=(10, 6))
# Each point is one course: subscriber count on x, content duration on y.
v7 = sns.scatterplot(data=data, x='num_subscribers', y='content_duration')

- Here we can see that most of the courses with a large number of subscribers have a short content duration.
- Courses with a longer duration tend to have fewer subscribers.

In [None]:
# Number of zero-subscriber courses per subject, with the index moved into a column.
data_nouse_courses = (
    data_with0_subscribers["subject"]
    .value_counts()
    .reset_index()
)
data_nouse_courses

In [None]:
# Subjects that appear among the zero-subscriber courses.
data_with0_subscribers["subject"].unique()

In [None]:
# Count the zero-subscriber courses per subject straight from the filtered
# frame. Reading counts and labels from the Series itself avoids the column
# names created by `value_counts().reset_index()`, which changed in pandas 2.0
# (there is no "index" column any more).
unsubscribed_by_subject = data_with0_subscribers["subject"].value_counts()

plt.figure(figsize=(10,6))
# One wedge per subject, sized by its number of zero-subscriber courses.
v8 = plt.pie(
    unsubscribed_by_subject.to_numpy(), labels=unsubscribed_by_subject.index
)

- Here we can see that most of the courses with no subscribers are in Business Finance.
- Every Web Development course has at least one subscriber.

In [None]:
# Keep only the rows where `is_paid` equals False, i.e. the free courses.
is_free = data["is_paid"].eq(False)
data_free_course = data.loc[is_free]
data_free_course.head(n=4)

- There are 310 courses which are free.

In [None]:
## Free courses.

plt.figure(figsize=(10,6))
# Count free courses per subject with explicit column names; the defaults from
# `value_counts().reset_index()` changed in pandas 2.0 and no longer include
# an "index" column.
data_free_count = (
    data_free_course['subject']
    .value_counts()
    .rename_axis("subject")
    .reset_index(name="num_courses")
)
# One wedge per subject, sized by its number of free courses.
v9 = plt.pie(data_free_count["num_courses"], labels=data_free_count["subject"])

- Most of the free courses are in Web Development, and the fewest are in Graphic Design.

In [None]:
def _courses_in_subject(courses, subject):
    """Return the rows of `courses` whose `subject` column equals `subject`."""
    return courses.loc[courses["subject"] == subject]


def _top_by_subscribers(courses, n=5):
    """Return the first `n` rows of `courses` ordered by `num_subscribers`, highest first."""
    return courses.sort_values(["num_subscribers"], ascending=False).head(n)


# One frame per subject.
data_Business = _courses_in_subject(data, 'Business Finance')
data_Web = _courses_in_subject(data, 'Web Development')
data_Music = _courses_in_subject(data, 'Musical Instruments')
data_Graphic = _courses_in_subject(data, 'Graphic Design')

# The five most-subscribed courses within each subject.
data_subs_bussort = _top_by_subscribers(data_Business)
data_subs_websort = _top_by_subscribers(data_Web)
data_subs_musicsort = _top_by_subscribers(data_Music)
data_subs_graphsort = _top_by_subscribers(data_Graphic)

## Subject-wise top subscribed courses on Udemy

Top Courses in Business Finance segment

In [None]:
# Five most-subscribed Business Finance courses.
data_subs_bussort

Top Courses in Web Development segment

In [None]:
# Five most-subscribed Web Development courses.
data_subs_websort

Top Courses in Musical Instruments segment

In [None]:
# Five most-subscribed Musical Instruments courses.
data_subs_musicsort

Top Courses in Graphic Design segment

In [None]:
# Five most-subscribed Graphic Design courses.
data_subs_graphsort