In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import libraries

In [None]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import missingno as msno
import plotly.figure_factory as ff
import datetime as dt

Read the data

In [None]:
# Load the Udemy courses CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer=r"/kaggle/input/udemy-courses/udemy_courses.csv")


Print the first five rows:

In [None]:
# Show the first five rows with a light-blue background and blue text.
preview_css = {"background-color": "lightblue", "color": "blue"}
df.head().style.set_properties(**preview_css)

In [None]:
# Report the dataset dimensions (rows first, then columns).
n_rows, n_cols = df.shape
print("Number of observation", n_rows)
print("Number of Columns", n_cols)


# The dataset comprises 3678 observations and 12 characteristics.

In [None]:
# Print a heading, then the DataFrame summary produced by `DataFrame.info()`.
print("Information inside the data:")
df.info()

# The data has 1 bool column, 1 float column, 5 int64 columns, and 5 object columns.

# No column has null (missing) values.

In [None]:
# Identify the number of missing values in each column of the dataset.
df.isna().sum()

There are no null values.

In [None]:
# Check for missing values visually with a missingno bar chart.
msno.bar(df,color="lightpink")

In [None]:
# Get the column names of the DataFrame.
df.columns

In [None]:
# Transposed summary statistics, styled for readability: data bars on the
# mean column and colour gradients on the std and median (50%) columns.
styled_summary = (
    df.describe().T.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='viridis')
    .background_gradient(subset=['50%'], cmap='mako')
)
styled_summary


# The describe() function returns the count, mean, standard deviation, min, and max values

In [None]:
# Parse the publication timestamp once, store it back as a datetime column,
# and derive a new "Year" column from it.
published = pd.to_datetime(df["published_timestamp"])
df["published_timestamp"] = published
df["Year"] = published.dt.year

In [None]:
# Count paid vs. free courses.
# The columns are named explicitly because the default names produced by
# `value_counts().reset_index()` differ between pandas versions
# ("index"/"is_paid" before 2.0, "is_paid"/"count" from 2.0 on), which made
# the original `paid_or_not["index"]` lookup fail on newer pandas.
paid_or_not = (
    df["is_paid"].value_counts()
    .rename_axis("is_paid")
    .reset_index(name="course_count")
)

# Pie chart: one slice per is_paid value, sized by the number of courses.
fig = px.pie(paid_or_not, values="course_count", names="is_paid",
             title="How many p% courses are free or not",
             color_discrete_sequence=px.colors.sequential.Plasma_r,
             width=400, height=400)
fig.show()

True means the course is paid and False means the course is free.
So most courses are paid.

In [None]:
# Count courses per subject.
# The columns are named explicitly because the default names produced by
# `value_counts().reset_index()` differ between pandas versions
# ("index"/"subject" before 2.0, "subject"/"count" from 2.0 on), which made
# the original `subject["index"]` lookup fail on newer pandas.
subject = (
    df["subject"].value_counts()
    .rename_axis("subject")
    .reset_index(name="course_count")
)

# Donut chart (hole=.7): one slice per subject, sized by its course count.
fig = go.Figure(data=[go.Pie(labels=subject["subject"],
                             values=subject["course_count"],
                             hole=.7,
                             title="Which course higest percentage",
                             marker_colors=px.colors.sequential.RdBu)])

fig.update_layout(title="Percentage of courses:")
fig.show()

In [None]:
# Number of courses published per year, ordered from fewest to most,
# with the count column given a display-friendly name.
year = (
    df.groupby(["Year"])["course_id"]
    .count()
    .sort_values()
    .reset_index()
    .rename(columns={"course_id": "Number of Courses"})
)

fig = px.bar(
    year,
    x="Year",
    y="Number of Courses",
    title="In which year was the most additional courses",
)
fig.show()

In [None]:
# Horizontal box plot of content duration, one box per is_paid value.
fig = px.box(
    df,
    x="content_duration",
    y="is_paid",
    color="is_paid",
    orientation="h",
    color_discrete_sequence=["darkblue", "black"],
    title="Duration Distribution across type of courses is paid or Not",
)

# Compute quartiles with the "exclusive" method, then label both axes.
fig.update_traces(quartilemethod="exclusive")
fig.update_xaxes(title="Which content is highly duration")
fig.update_yaxes(title="Is paid or Not")
fig.show()

In [None]:
# Horizontal box plot of content duration per subject, coloured by is_paid.
fig = px.box(
    df,
    x="content_duration",
    y="subject",
    color="is_paid",
    orientation="h",
    color_discrete_sequence=["#03cffc", '#eb03fc'],
    title="Content duration according to subject and type of course",
)
fig.update_xaxes(title="content duration")
fig.update_yaxes(title="course subject")
fig.show()

In [None]:
# Sunburst with subject as the inner ring and content_duration as the outer
# ring; sector sizes come from the is_paid column.
# NOTE(review): color_continuous_scale is supplied without a `color=` argument —
# TODO confirm whether a colour column was intended here.
fig = px.sunburst(
    df,
    path=["subject", "content_duration"],
    values="is_paid",
    color_continuous_scale="RdYlGn",
    title="Which content is higly duration and paid ",
    width=500,
    height=500,
)
fig.show()

In [None]:
# Funnel-area chart of review counts, labelled by course subject.
fig = px.funnel_area(
    names=df["subject"],
    values=df["num_reviews"],
    color_discrete_sequence=["wheat", "blue", "black", "wheat"],
    title="Number of Reviews",
    width=500,
    height=500,
)
fig.show()

In [None]:
# Funnel-area chart of lecture counts, labelled by course subject.
fig = px.funnel_area(
    names=df["subject"],
    values=df["num_lectures"],
    color_discrete_sequence=["pink", "wheat", "blue", "red"],
    title="Number of lectures",
    width=500,
    height=500,
)
fig.show()

In [None]:
# Split the data into two parts: rows whose price equals 0 and all other rows.
zero_price = df["price"] == 0
free = df[zero_price]
paid = df[~zero_price]

In [None]:
def generate_df(df):
    """Count courses per subject.

    Groups ``df`` by its ``subject`` column, counts the non-null ``course_id``
    values in each group, drops subjects whose count is zero, and returns a
    frame with columns ``subject`` and ``counts`` sorted by ``counts`` in
    ascending order.
    """
    per_subject = df.groupby(["subject"])["course_id"].count()
    table = per_subject.reset_index(name="counts")
    nonzero = table[table["counts"] != 0]
    return nonzero.sort_values("counts")

In [None]:
# Per-subject course counts for the free and the paid subsets.
new_free = generate_df(free)
new_paid = generate_df(paid)

# Two pie charts side by side: free courses on the left, paid on the right.
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])
for col_idx, counts_df in enumerate((new_free, new_paid), start=1):
    fig.add_trace(
        go.Pie(labels=counts_df['subject'], values=counts_df['counts']),
        row=1, col=col_idx,
    )

# Shared trace styling: donut hole, labels inside, white slice borders.
fig.update_traces(
    textposition='inside',
    hole=.4,
    hoverinfo="label+percent+name",
    marker=dict(line=dict(color="white", width=2)),
)

# Centre labels placed inside each donut hole.
centre_labels = [
    dict(text='Free ', x=0.18, y=0.5, font_size=17, showarrow=False, font_color='black'),
    dict(text='Paid ', x=0.83, y=0.5, font_size=17, showarrow=False, font_color='black'),
]
fig.update_layout(title_text="Which courses is free and Paid", annotations=centre_labels)
fig.show()

In [None]:
# Count courses per difficulty level.
# The columns are named explicitly because the default names produced by
# `value_counts().reset_index()` differ between pandas versions
# ("index"/"level" before 2.0, "level"/"count" from 2.0 on), which made the
# original `level["index"]` lookup fail on newer pandas.
level = (
    df["level"].value_counts()
    .rename_axis("level")
    .reset_index(name="course_count")
)

# Render the counts as a plotly table.
colorscale = [[0, "#272D31"], [.5, "#ffffff"], [1, "#ffffff"]]
fig = ff.create_table(level, colorscale=colorscale)

x = level["level"]
y = level["course_count"]

# Add a scatter of the same counts, bound to a second pair of axes (x2/y2).
fig.add_trace(go.Scatter(x=x, y=y,
                         marker=dict(color="#0099ff"),
                         name="Which level has the most additional courses",
                         xaxis="x2", yaxis="y2"))

# Give the table and the scatter separate horizontal domains so the scatter
# is drawn next to the table instead of on top of it, and anchor the second
# pair of axes to each other.
fig.update_layout(
    xaxis=dict(domain=[0, 0.45]),
    xaxis2=dict(domain=[0.55, 1], anchor="y2"),
    yaxis2=dict(anchor="x2"),
)

# Show the figure once, after both the table and the scatter are in place.
fig.show()



In [None]:
# Count courses for every (subject, level) pair, ordered from fewest to most.
per_subject_level = df.groupby(["subject", "level"])["course_id"].count()
subject = per_subject_level.sort_values().reset_index()


In [None]:
# Treemap nested by subject, then level; tile sizes come from the course_id
# column of `subject`.
# NOTE(review): the title mentions students, but the value column is named
# course_id — TODO confirm which quantity the title should describe.
fig = px.treemap(
    subject,
    path=["subject", "level"],
    values="course_id",
    title="Which level has the most students:",
    width=600,
    height=600,
)
fig.show()

In [None]:
# Take the 25 rows with the most subscribers, then re-sort them ascending.
# The old index is discarded and the fresh 0..24 positions are exposed as an
# "index" column.
most_subscribed = df.sort_values("num_subscribers", ascending=False).head(25)
top25 = (
    most_subscribed
    .sort_values("num_subscribers", ascending=True)
    .reset_index(drop=True)
    .reset_index()
)


In [None]:
# Horizontal bar chart of the top-25 courses: one bar per "index" position,
# with length and colour set by subscriber count and the course title shown
# on hover.
fig = px.bar(
    top25,
    x='num_subscribers',
    y='index',
    orientation='h',
    color='num_subscribers',
    color_continuous_scale=px.colors.sequential.Viridis,
    hover_name='course_title',
    opacity=0.8,
    height=800,
    title='Top25 most popular courses and higest subscribers',
)

# Hide the legend and the y tick labels (the y values are row positions).
fig.update_layout(showlegend=False)
fig.update_xaxes(title='Number of Subscribers')
fig.update_yaxes(title='Course Title', showticklabels=False)
fig.show()

In [None]:
# Single-ring sunburst: one sector per Year, sized by num_subscribers and
# coloured by Year.
fig = px.sunburst(
    df,
    path=["Year"],
    values="num_subscribers",
    color="Year",
    title="In which year most number of subscribers:",
)
fig.show()

##Thank You******