# Udemy Finance courses - EDA

## Importing Modules and Data

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
import matplotlib.gridspec as gridspec
%matplotlib inline

In [None]:
# Relative location of the course listing CSV under ../input.
csv_path = "../input/finance-accounting-courses-udemy-13k-course/udemy_output_All_Finance__Accounting_p1_p626.csv"
# Load the raw table into the working frame that the rest of the notebook operates on.
inp0 = pd.read_csv(csv_path)

Inspecting the data frame and looking at the data types

In [None]:
# Print the frame summary (DataFrame.info) to check the columns and their data types
inp0.info()

In [None]:
# Preview the first rows of the frame (DataFrame.head with its default row count)
inp0.head()

## Data Cleaning

### Removing the redundant columns

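Removing the columns `id`, `url`, `discount_price__currency`, `price_detail__currency`, `is_wishlisted`, `discount_price__price_string`, `price_detail__price_string` and `created`, as they are not useful for our further analysis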

In [None]:
# Columns that are not used anywhere in the analysis below; removed from the
# working frame in one in-place call.
unused_columns = [
    "id",
    "url",
    "discount_price__currency",
    "price_detail__currency",
    "is_wishlisted",
    "discount_price__price_string",
    "price_detail__price_string",
    "created",
]
inp0.drop(columns=unused_columns, inplace=True)

### Number of Missing values

In [None]:
# Share of missing values per column, expressed as a percentage of the row count.
mis_val = 100 * inp0.isnull().sum() / inp0.shape[0]
# reset_index turns the Series into a two-column frame: the column names land
# under 'index' and the percentages under 'Mis_perc'.
mis_val = mis_val.reset_index(name='Mis_perc')

plt.figure(figsize=[8, 5])
plt.barh(mis_val['index'], mis_val['Mis_perc'])
# Percent-formatted x ticks, one every 2 points from 0% to 10%.
tick_positions = np.arange(0, 12, 2)
tick_labels = ["{}%".format(pos) for pos in tick_positions]
plt.xticks(tick_positions, tick_labels)
# Plain strings on purpose: wrapping a label in $...$ makes matplotlib parse it
# as math text, which drops the spaces between the words.
plt.xlabel("Percentage of Missing Values")
plt.ylabel("Attributes in dataset")
plt.title("Missing Values comparision across attribues")
plt.show()

Six columns contain missing values. On closer inspection, wherever a course is freely available, the `price_detail__amount` and `discount_price__amount` columns contain null values, and when no discount is offered on a course, the last three columns are null. So, we will fill the price column with 0

And creating a column Discount, which gives the percentage of the discount

In [None]:
# A missing list price is recorded as 0. The result is assigned back to the
# column instead of calling fillna(inplace=True) on `inp0.price_detail__amount`:
# the in-place form acts on a selected column (chained assignment) and is not
# guaranteed to update inp0 itself.
inp0['price_detail__amount'] = inp0['price_detail__amount'].fillna(0)

# Discount as a percentage of the list price.
# - Rows without a discounted price stay NaN because NaN propagates through the
#   arithmetic; no explicit NaN test is needed (the previous
#   `Series is not np.nan` check was an identity test that was always True).
# - Rows whose list price is 0 are masked to NaN so that dividing by zero can
#   never leave +/-inf in the column.
list_price = inp0['price_detail__amount']
discount_pct = 100 * (list_price - inp0['discount_price__amount']) / list_price
inp0['Discount'] = discount_pct.where(list_price > 0)

### Adding Year and Month Columns

In [None]:
# published_time is treated as text (presumably 'YYYY-MM-...' timestamps - verify
# against the raw file): the first four characters are taken as the year and
# characters 5-6 as the month. Both new columns therefore hold strings.
# The .str accessor slices the whole column at once and propagates missing
# values, whereas a per-row lambda raises on a NaN entry.
inp0['PublishedYear'] = inp0['published_time'].str[:4]
inp0['PublishedMonth'] = inp0['published_time'].str[5:7]
# The raw timestamp column is no longer needed once year and month are extracted.
inp0.drop('published_time', axis=1, inplace=True)

## Univariate Analysis

In this segment we will study the behaviours of the single variables

#### Number of Subscribers

The number of subscribers spans from `0` to almost `370000`, with a median of `533`

In [None]:
# One figure, two stacked panels on a 3-row grid: a slim box plot on the top
# row and the histogram with density curve on the remaining two rows.
fig = plt.figure(figsize=[10, 6])
grid = gridspec.GridSpec(3, 3)

box_ax = fig.add_subplot(grid[0, :])
sns.boxplot(x='num_subscribers', data=inp0, ax=box_ax)
box_ax.set_title("Distribution for number of subscribers")

hist_ax = fig.add_subplot(grid[1:, :])
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept here
# so the cell behaves the same on the seaborn version the notebook was written for.
sns.distplot(inp0['num_subscribers'], bins=100, hist_kws={"edgecolor": "white"}, ax=hist_ax)
hist_ax.set_ylabel('density')
plt.show()

Most popular courses with more than 150k subscribers are

In [None]:
# Titles of the courses whose subscriber count exceeds 150,000.
over_150k = inp0.num_subscribers > 150000
inp0.loc[over_150k, "title"]

#### Paid vs Free

Out of all the courses, we can see that most of them (`96.3%`) are paid

In [None]:
# Relative frequency of each is_paid value, drawn as a pie chart with the
# first slice (the most frequent value) pulled away from the centre.
paid_share = inp0.is_paid.value_counts(normalize=True)
paid_share.plot.pie(figsize=[5, 5], explode=[0.2, 0.])
plt.show()

### Average Rating

The average rating for most of the courses spans from around `2.7` to `5`. The distribution plot shows that most of the mass is concentrated towards the right, which suggests that people mostly rate a course when they like it; there is also quite a number of ratings at `0`. We can conclude that people tend to rate when they either like the course (towards higher ratings) or hate it (towards low ratings)

In [None]:
# One figure, two stacked panels on a 3-row grid: a slim box plot on the top
# row and the histogram with density curve on the remaining two rows.
fig = plt.figure(figsize=[10, 6])
grid = gridspec.GridSpec(3, 3)

box_ax = fig.add_subplot(grid[0, :])
sns.boxplot(x='avg_rating', data=inp0, ax=box_ax)
box_ax.set_title("Distribution for average rating")

hist_ax = fig.add_subplot(grid[1:, :])
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept here
# so the cell behaves the same on the seaborn version the notebook was written for.
sns.distplot(inp0['avg_rating'], bins=10, hist_kws={"edgecolor": "white"}, ax=hist_ax)
hist_ax.set_ylabel('density')
plt.show()

### Number of reviews

The number of reviews, like the number of subscribers, is highly skewed, with a minimum of `0`, a maximum of `78k` and a median of `24`

In [None]:
# One figure, two stacked panels on a 3-row grid: a slim box plot on the top
# row and the histogram with density curve on the remaining two rows.
fig = plt.figure(figsize=[10, 6])
grid = gridspec.GridSpec(3, 3)

box_ax = fig.add_subplot(grid[0, :])
sns.boxplot(x='num_reviews', data=inp0, ax=box_ax)
box_ax.set_title("Distribution for number of reviews")

hist_ax = fig.add_subplot(grid[1:, :])
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept here
# so the cell behaves the same on the seaborn version the notebook was written for.
sns.distplot(inp0['num_reviews'], bins=100, hist_kws={"edgecolor": "white"}, ax=hist_ax)
hist_ax.set_ylabel('density')
plt.show()

### Prices for the Courses

96.3% of the courses are paid; the distribution of the prices is shown below. The price ranges from `0` to `12800` rupees, with a median value of `3200` rupees

In [None]:
# One figure, two stacked panels on a 3-row grid: a slim box plot on the top
# row and the histogram with density curve on the remaining two rows.
fig = plt.figure(figsize=[10, 6])
grid = gridspec.GridSpec(3, 3)

box_ax = fig.add_subplot(grid[0, :])
sns.boxplot(x='price_detail__amount', data=inp0, ax=box_ax)
box_ax.set_title("Distribution for price of the courses")

hist_ax = fig.add_subplot(grid[1:, :])
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept here
# so the cell behaves the same on the seaborn version the notebook was written for.
sns.distplot(inp0['price_detail__amount'], bins=50, hist_kws={"edgecolor": "white"}, ax=hist_ax)
hist_ax.set_ylabel('density')
plt.show()

### Discount

The discount on the courses ranges from `40%` to `95%`

In [None]:
# Box plot of the Discount column, drawn on the top row of a 3-row grid so it
# has the same slim proportions as the box plots in the other cells.
fig = plt.figure(figsize=[10, 6])
grid = gridspec.GridSpec(3, 3)

box_ax = fig.add_subplot(grid[0, :])
sns.boxplot(x='Discount', data=inp0, ax=box_ax)
box_ax.set_title("Distribution for Discount percentages")
plt.show()

## Number of courses published per years

The number of courses published increases gradually, with the largest numbers published in `2018` and `2020`

In [None]:
# value_counts() orders by frequency; sort by the index (the year) instead so
# the bars read chronologically and the year-over-year trend is visible.
courses_per_year = inp0.PublishedYear.value_counts().sort_index()
courses_per_year.plot.barh(figsize=[8, 4])
plt.title('Count of the courses published')
plt.xlabel('Number of courses')
plt.ylabel('Published year')
plt.show()

## Number of courses published per month

The largest number of courses is published in April and May, during the holidays, and the fewest courses are published around Christmas

In [None]:
# value_counts() orders by frequency; sort by the index (the month) instead so
# the bars follow calendar order and seasonal patterns are easy to read.
courses_per_month = inp0.PublishedMonth.value_counts().sort_index()
courses_per_month.plot.barh(figsize=[8, 4], width=0.5)
plt.title('Count of the courses published')
plt.xlabel('Number of courses')
plt.ylabel('Published month')
plt.show()

### Correlation between variables

In [None]:
# Pairwise correlation (DataFrame.corr defaults) between the numeric course
# attributes, shown as an annotated heatmap.
corr_columns = [
    'num_subscribers',
    'avg_rating',
    'num_reviews',
    'num_published_lectures',
    'price_detail__amount',
    'Discount',
]
res = inp0[corr_columns].corr()

sns.heatmap(res, annot=True, cmap='RdYlGn')
plt.show()

We observe that there is a high correlation between the number of reviews and the number of subscribers, and that as the price of a course goes up, the discount usually goes up as well

In [None]:
def q5(x):
    """Return the 0.05 quantile (5th percentile) of ``x``, computed with ``np.quantile``."""
    fifth_percentile = np.quantile(x, q=0.05)
    return fifth_percentile
def q90(x):
    """Return the 0.9 quantile (90th percentile) of ``x``, computed with ``np.quantile``."""
    ninetieth_percentile = np.quantile(x, q=0.9)
    return ninetieth_percentile

In [None]:
# 2x2 comparison of paid vs. free courses.
# Top row: box plots of rating and lecture count.
# Bottom row: bar plots of a tail quantile of the same two columns, using the
# q5 / q90 helpers as the seaborn estimator.
fig = plt.figure(figsize=[10, 10])
grid = gridspec.GridSpec(2, 2)

rating_box_ax = fig.add_subplot(grid[0, 0])
sns.boxplot(x='is_paid', y='avg_rating', data=inp0, ax=rating_box_ax)
rating_box_ax.set_title("Rating for paid and free courses")

lectures_box_ax = fig.add_subplot(grid[0, 1])
sns.boxplot(x='is_paid', y='num_published_lectures', data=inp0, ax=lectures_box_ax)
lectures_box_ax.set_title("Number of lectures for free and paid lectures")

rating_q5_ax = fig.add_subplot(grid[1, 0])
sns.barplot(x='is_paid', y='avg_rating', data=inp0, estimator=q5, ax=rating_q5_ax)
rating_q5_ax.set_title("5th Quantile Rating for paid and free courses")

lectures_q90_ax = fig.add_subplot(grid[1, 1])
sns.barplot(x='is_paid', y='num_published_lectures', data=inp0, estimator=q90, ax=lectures_q90_ax)
lectures_q90_ax.set_title("90th Quantile Lectures for paid and free courses")
plt.show()

Average ratings are usually better for free courses, and the number of lectures is higher for paid courses

### Binning the number of lectures to low, medium, high

In [None]:
# Bin the lecture count by quantile: bottom 25% -> 'low', middle 50% -> 'medium',
# top 25% -> 'high'.
lecture_quantiles = [0, .25, .75, 1]
lecture_labels = ['low', 'medium', 'high']
inp0['lec_cat'] = pd.qcut(inp0.num_published_lectures, q=lecture_quantiles, labels=lecture_labels)

# Three panels on a 2x2 grid: rating and price box plots per lecture category
# on the top row, and the q5 rating estimate per category below.
fig = plt.figure(figsize=[11, 10])
grid = gridspec.GridSpec(2, 2)

rating_box_ax = fig.add_subplot(grid[0, 0])
sns.boxplot(x='lec_cat', y='avg_rating', data=inp0, ax=rating_box_ax)

price_box_ax = fig.add_subplot(grid[0, 1])
sns.boxplot(x='lec_cat', y='price_detail__amount', data=inp0, ax=price_box_ax)

rating_q5_ax = fig.add_subplot(grid[1, 0])
sns.barplot(x='lec_cat', y='avg_rating', data=inp0, estimator=q5, ax=rating_q5_ax)
rating_q5_ax.set_title("5th Quantile Rating for lecture categories")
plt.show()

As the number of lectures increases, the rating usually increases as well

### More to Come!

### This is my first python notebook, please support, thanks