# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly 
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

In [None]:
courses=pd.read_csv("../input/udemy-courses/udemy_courses.csv")
courses.head(3)

In [None]:
courses.shape

In [None]:
courses.info()

In [None]:
#checking for null values 
courses.isnull().sum()

In [None]:
#checking for null values 
sns.heatmap(courses.isnull(),cbar=False,yticklabels=False)

In [None]:
courses.describe()

# Let's summarize what we have got from the dataset.
Our dataset contains information about the courses offered by UDEMY. The course ID ('course_id') and the course 'url' are not necessary for our analysis, so we will drop them. The course publication date is stored in object format and needs to be converted to a datetime object. There are no missing values, which is very helpful at the data preparation stage. The 'level' column is a categorical variable; it would be good to see whether there are any significant differences among the levels. The numerical variables deserve special attention in further analysis. Let's make the necessary adjustments before moving on to the analysis part.

In [None]:
courses.head(1)

In [None]:
courses["date"]=pd.to_datetime(courses["published_timestamp"])
courses.head(1)

In [None]:
courses=courses.drop(["published_timestamp","url","course_id"],axis=1)
courses.head(1)

In [None]:
courses.info()

# Analysis Part

In [None]:
courses.describe()

# Prices of UDEMY Courses

In [None]:
px.histogram(courses, x='price')

# Number of Reviews of UDEMY Courses

In [None]:
px.histogram(courses, x= 'num_reviews')

# Number of Subscribers of UDEMY Courses

In [None]:
px.histogram(courses, x= "num_subscribers")

# Number of Lectures of UDEMY Courses

In [None]:
px.histogram(courses, x= 'num_lectures')

# Durations of UDEMY Courses

In [None]:
 px.histogram(courses, x= 'content_duration')

# UDEMY Courses Based on the Subject

In [None]:
courses['subject'].value_counts()

In [None]:
np.round(courses['subject'].value_counts(normalize=True),3)*100


In [None]:
px.histogram(courses,x="subject")

In [None]:
courses.head(1)

In [None]:
courses['year']= courses['date'].dt.year

In [None]:
courses.head(1)

In [None]:
courses['year'].value_counts()

In [None]:
year_2016=courses[courses['year']==2016]
year_2016["subject"].value_counts()

In [None]:
year_2017=courses[courses['year']==2017]
year_2017["subject"].value_counts()

In [None]:
year_2015=courses[courses['year']==2015]
year_2015["subject"].value_counts()

In [None]:
year_2014=courses[courses['year']==2014]
year_2014["subject"].value_counts()

In [None]:
year_2013=courses[courses['year']==2013]
year_2013["subject"].value_counts()

In [None]:
year_2012=courses[courses['year']==2012]
year_2012["subject"].value_counts()

In [None]:
year_2011=courses[courses['year']==2011]
year_2011["subject"].value_counts()

# Based on the Level of the Courses

In [None]:
courses["level"].value_counts()

In [None]:
np.round(courses["level"].value_counts(normalize=True),2)

In [None]:
#percentage
np.round(courses['level'].value_counts(normalize=True),2)*100


In [None]:
sns.countplot(x=courses["level"])

In [None]:
courses["level"].value_counts()

# UDEMY Courses - Number of Subscribers, Number of Reviews and Number of Lectures by Year

In [None]:
year_2011=courses[courses['year']==2011]
year_2011["num_subscribers"].sum()

In [None]:
df1 = courses.groupby('year')[['num_subscribers','num_reviews','num_lectures']].sum().reset_index()

In [None]:
df1

In [None]:
px.line(df1, x='year', y=['num_subscribers','num_reviews','num_lectures'])

# Price & Courses

In [None]:
courses.head(1)

In [None]:
year_2011=courses[courses['year']==2011]
year_2011["is_paid"].value_counts()

In [None]:
year_2012=courses[courses['year']==2012]
year_2012["is_paid"].value_counts()

In [None]:
year_2013=courses[courses['year']==2013]
year_2013["is_paid"].value_counts()

In [None]:
year_2014=courses[courses['year']==2014]
year_2014["is_paid"].value_counts()

In [None]:
year_2015=courses[courses['year']==2015]
year_2015["is_paid"].value_counts()

In [None]:
year_2016=courses[courses['year']==2016]
year_2016["is_paid"].value_counts()

In [None]:
year_2017=courses[courses['year']==2017]
year_2017["is_paid"].value_counts()

In [None]:
courses.head(1)

In [None]:
courses_paid=courses[courses["price"]!=0]

In [None]:
courses_paid.head()

In [None]:
courses_paid=courses_paid.sort_values(by=["num_subscribers"],ascending=False)[0:15]

In [None]:
courses_paid

In [None]:
courses_paid_obj=courses_paid[["course_title","subject","price","year","num_subscribers"]]

In [None]:
courses_paid_obj

In [None]:
sns.barplot(x="num_subscribers",y="course_title",data=courses_paid_obj,hue="subject")

# Top Free Courses

In [None]:
courses_free=courses[courses["price"]==0]

In [None]:
courses_free=courses_free.sort_values(by=["num_subscribers"],ascending=False)
courses_free

In [None]:
courses_free_obj=courses_free[["course_title","subject","num_subscribers","price","year"]][0:15]

In [None]:
courses_free_obj

In [None]:
sns.barplot(x="num_subscribers",y="course_title",data=courses_free_obj,hue="subject")