## **Live Streamed at** 

**https://www.twitch.tv/datascience_simpleyogurt**

In [None]:
#Libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress every warning for the rest of the notebook. The stdlib `warnings`
# module is used directly: `np.warnings` was never public NumPy API and newer
# NumPy releases no longer expose it, so the old spelling raises AttributeError.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw course listing into `udf` (path is relative to the notebook's
# working directory).
udf = pd.read_csv(
    '../input/finance-accounting-courses-udemy-13k-course/'
    'udemy_output_All_Finance__Accounting_p1_p626.csv'
)

In [None]:
# Preview the first 3 rows of the raw frame
udf.head(3)

In [None]:
# Shape of the raw dataset as (rows, columns)
udf.shape

In [None]:
# List the column names of the raw frame
udf.columns

In [None]:
# Summarise the raw frame: info() prints each column's dtype and non-null count
udf.info()

In [None]:
# Per-column dtypes of the raw frame
udf.dtypes

In [None]:
# For each column, True if it contains at least one missing value
udf.isna().any()

## Handling the Missing Values

In [None]:
# Show the discount-price and price-detail columns (amount, currency, price string) side by side
udf[['discount_price__amount', 'discount_price__currency','discount_price__price_string', 
     'price_detail__amount','price_detail__currency', 'price_detail__price_string']]

In [None]:
# Distinct values found in discount_price__currency (NaN included if present)
udf['discount_price__currency'].unique()

In [None]:
# Distinct values found in price_detail__currency (NaN included if present)
udf['price_detail__currency'].unique()

In [None]:
# First 20 rows of the two amount columns next to their price-string counterparts
udf[['discount_price__amount','discount_price__price_string', 
     'price_detail__amount', 'price_detail__price_string']].head(20)

### **Dropping the following columns**
* 'discount_price__currency'
* 'discount_price__price_string'
* 'price_detail__currency'
* 'price_detail__price_string'

In [None]:
# Column names of the raw frame
udf.columns

In [None]:
# First 5 rows of the raw frame
udf.head(5)

In [None]:
# The course URL alongside the three rating columns
udf[['url', 'avg_rating','avg_rating_recent', 'rating']]

In [None]:
# First 20 rows of the URL and rating columns
udf[['url', 'avg_rating','avg_rating_recent', 'rating']].head(20)

In [None]:
# Correlation matrix of avg_rating_recent and rating, to see how closely the two columns track each other
rw=['avg_rating_recent', 'rating']
udf[rw].corr()

In [None]:
# Compare the two rating columns on the rows labelled 4500 through 4520 (.loc label slices include both ends)
udf.loc[4500:4520,['avg_rating_recent', 'rating']]

### Dropping the `url` and `rating` columns

In [None]:
# Column names of the raw frame
udf.columns

In [None]:
# Working frame with only the columns used in the rest of the notebook.
# The explicit .copy() marks `df` as an independent frame, so pandas does not
# treat the column assignments made in later cells as writes to a slice of
# `udf` (the source of SettingWithCopyWarning).
df=udf[['id', 'title', 'is_paid', 'num_subscribers', 'avg_rating',
        'avg_rating_recent', 'num_reviews', 'is_wishlisted',
        'num_published_lectures', 'num_published_practice_tests',
        'created','published_time', 'discount_price__amount', 'price_detail__amount']].copy()

In [None]:
# First 3 rows of the reduced frame
df.head(3)

In [None]:
# (rows, columns) of the reduced frame
df.shape

In [None]:
# (rows, columns) of the raw frame
udf.shape

In [None]:
# Number of missing values in each column of df
df.isna().sum()

In [None]:
# First rows (head() default) whose discount_price__amount is missing
df[df['discount_price__amount'].isnull()].head()

In [None]:
# Rows whose discount_price__amount is exactly 0
df[df['discount_price__amount']==0]

In [None]:
# Replace missing discount amounts with 0
df['discount_price__amount'] = df['discount_price__amount'].fillna(0)

In [None]:
# Null check on discount_price__amount: lists any rows where it is still missing
df[df['discount_price__amount'].isnull()]

In [None]:
# First rows (head() default) whose price_detail__amount is missing
df[df['price_detail__amount'].isnull()].head()

In [None]:
# (rows, columns) of the subset with a missing price_detail__amount; the first number is the count of such rows
df[df['price_detail__amount'].isnull()].shape

In [None]:
# Paid courses that nevertheless have no price.
# Each comparison is parenthesised because `&` binds tighter than `==`; without
# the parentheses the condition is parsed as `(isnull & is_paid) == True`.
df[(df['price_detail__amount'].isnull()) & (df['is_paid']==True)]

In [None]:
# Drop paid courses that have no price detail.
# Rows are selected by condition rather than by a hard-coded index label, so
# the cell can be re-run without a KeyError for an already-removed label and
# keeps working if the row labels of the input change.
paid_without_price = df['price_detail__amount'].isnull() & df['is_paid'].eq(True)
df = df[~paid_without_price].copy()

In [None]:
# Free courses (is_paid is False) whose price_detail__amount is missing
df[df['price_detail__amount'].isnull() & df['is_paid'].eq(False)]

In [None]:
# Missing-value count per column of df
df.isna().sum()

In [None]:
# Replace missing price amounts with 0
df['price_detail__amount'] = df['price_detail__amount'].fillna(0)

In [None]:
# Missing-value count per column of df
df.isna().sum()

In [None]:
# Per-column dtypes of df
df.dtypes

In [None]:
# First 3 rows of df
df.head(3)

In [None]:
# Parse the `created` column into pandas datetimes
df['created'] = df['created'].pipe(pd.to_datetime)

In [None]:
# Per-column dtypes of df (shows the dtype `created` has after pd.to_datetime)
df.dtypes

In [None]:
# Parse the `published_time` column into pandas datetimes
df['published_time'] = df['published_time'].pipe(pd.to_datetime)

In [None]:
# Per-column dtypes of df (shows the dtype `published_time` has after pd.to_datetime)
df.dtypes

## **Data Visualization**

In [None]:
# First rows of df (head() default)
df.head()

## **Free vs Paid Courses**

In [None]:
# Bar chart of how many courses are paid vs. free.
# The column is passed by keyword: from seaborn 0.12 on, `data` is the only
# argument accepted positionally, so a bare Series is no longer read as `x`.
sns.countplot(x='is_paid', data=df)

In [None]:
# Count and percentage of paid vs. free courses.
# The table is built from the counts Series with explicit column names, because
# the name pandas gives a value_counts() result differs between versions (the
# source column's name in older releases, `count` in pandas 2.x), which made
# the former `paid_courses['is_paid']` lookup fail with a KeyError.
paid_counts = df['is_paid'].value_counts().rename_axis(None)
paid_courses = pd.DataFrame({
    'is_paid': paid_counts,
    '% is_paid': paid_counts / paid_counts.sum() * 100,
})
paid_courses

### **Wishlisted vs. Non-Wishlisted Courses**

In [None]:
# Bar chart of how many courses are wishlisted vs. not.
# The column is passed by keyword: from seaborn 0.12 on, `data` is the only
# argument accepted positionally, so a bare Series is no longer read as `x`.
sns.countplot(x='is_wishlisted', data=df)

In [None]:
# Count and percentage of wishlisted vs. non-wishlisted courses.
# The table is built from the counts Series with explicit column names, because
# the name pandas gives a value_counts() result differs between versions (the
# source column's name in older releases, `count` in pandas 2.x), which made
# the former `is_wishlisted_courses['is_wishlisted']` lookup fail with a KeyError.
wishlisted_counts = df['is_wishlisted'].value_counts().rename_axis(None)
is_wishlisted_courses = pd.DataFrame({
    'is_wishlisted': wishlisted_counts,
    '% is_wishlisted': wishlisted_counts / wishlisted_counts.sum() * 100,
})
is_wishlisted_courses

### **Correlation between columns**

In [None]:
# Column names of the raw frame `udf`
# NOTE(review): the correlation cell that follows reads from `df`, not `udf`
udf.columns

In [None]:
# Numeric columns to include in the correlation matrix
continous_col = [
    'num_subscribers', 'avg_rating', 'avg_rating_recent',
    'num_reviews', 'num_published_lectures', 'num_published_practice_tests',
    'discount_price__amount', 'price_detail__amount',
]
fig, ax = plt.subplots(figsize=(20, 10))
# Annotated heatmap of the pairwise correlations, colour scale fixed to [-1, 1]
sns.heatmap(
    df[continous_col].corr(),
    vmin=-1,
    vmax=1,
    annot=True,
    linewidths=0.5,
    ax=ax,
    cmap="YlGnBu",
)
plt.xticks(rotation=10)
plt.show()

## **Column pairs that show some correlation**
* **num_subscribers & num_reviews**   0.78
* **num_subscribers & num_published_lectures** 0.21
* **price_detail__amount & num_subscribers** 0.12
* **avg_rating & price_detail__amount** 0.12
* **avg_rating & num_published_lectures** 0.12
* **num_published_lectures & price_detail__amount** 0.28
* **discount_price__amount & price_detail__amount** 0.17
* **num_reviews & num_published_lectures** 0.18

In [None]:
# 2x2 grid of scatter plots with a fitted regression line, one column pair per panel
fig, ax = plt.subplots(2, 2, figsize=(20, 20))

# Top row: subscribers vs. reviews, subscribers vs. lecture count
sns.regplot(
    x='num_subscribers', y='num_reviews', data=df, ax=ax[0, 0],
    color='g', scatter_kws={"alpha": 0.3},
)
sns.regplot(
    x='num_subscribers', y='num_published_lectures', data=df, ax=ax[0, 1],
    marker='+', scatter_kws={"alpha": 0.3},
)
# Bottom row: subscribers vs. price, average rating vs. price
sns.regplot(
    x='num_subscribers', y='price_detail__amount', data=df, ax=ax[1, 0],
    line_kws={"color": "black"}, scatter_kws={"alpha": 0.3},
)
sns.regplot(
    x='avg_rating', y='price_detail__amount', data=df, ax=ax[1, 1],
    color='r', scatter_kws={"alpha": 0.3}, line_kws={"color": "black"},
)


In [None]:
# Switch the matplotlib style, then draw a second 2x2 grid of regression scatters
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(2, 2, figsize=(20, 20))

# Top row: average rating vs. lecture count, reviews vs. lecture count
sns.regplot(
    x='avg_rating', y='num_published_lectures', data=df, ax=ax[0, 0],
    color='g', scatter_kws={"alpha": 0.3}, line_kws={"color": "red"},
)
sns.regplot(
    x='num_reviews', y='num_published_lectures', data=df, ax=ax[0, 1],
    color='r', scatter_kws={"alpha": 0.3}, line_kws={"color": "black"},
)
# Bottom row: lecture count vs. price, discount amount vs. price
sns.regplot(
    x='num_published_lectures', y='price_detail__amount', data=df, ax=ax[1, 0],
    marker='+', scatter_kws={"alpha": 0.3},
)
sns.regplot(
    x='discount_price__amount', y='price_detail__amount', data=df, ax=ax[1, 1],
    line_kws={"color": "black"}, scatter_kws={"alpha": 0.3},
)


In [None]:
# Column names of df
df.columns

## **Days between course creation and publication**

In [None]:
# Whole days elapsed between a course's creation and its publication
df['Days'] = (df['published_time'] - df['created']).dt.days

In [None]:
# Histogram of the creation-to-publication gap, in days
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(figsize=(8, 8))
df['Days'].hist(ax=ax)
ax.set_xlabel("Number of Days between course created and published")
ax.set_ylabel("Number of courses")
plt.show()

## **Number of courses released per year**

In [None]:
plt.style.use("fivethirtyeight")
fig,ax=plt.subplots(figsize=(8,8))
# Count courses per publication year, oldest year first. The count column is
# named explicitly because the name pandas gives a value_counts() result
# differs between versions (the source column's name in older releases,
# `count` in pandas 2.x), which made `year_pub['published_time']` fail with a
# KeyError.
year_pub=(
    df['published_time'].dt.to_period('Y')
    .value_counts()
    .sort_index()
    .rename_axis(None)
    .to_frame(name='published_time')
)
sns.barplot(x=year_pub.index,y=year_pub['published_time'],ax=ax)
ax.set_ylabel("Number of Courses published in a year")
ax.set_xlabel("Year")
plt.show()


In [None]:
# Number of courses per publication year, most recent year first
year_pub = (
    df['published_time']
    .dt.to_period('Y')
    .value_counts()
    .sort_index(ascending=False)
)
year_pub

## **Total subscribers by course publication year**

In [None]:
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(figsize=(8, 8))
# Publication year as a string label, stored on df as `year_pub`
df['year_pub'] = df['published_time'].dt.to_period('Y').astype('str')
# Sum of num_subscribers over the courses published in each year
subs_year = df.groupby('year_pub')['num_subscribers'].sum()
ax.plot(subs_year)
ax.set_ylabel("Number of Subscribers per year")
ax.set_xlabel("Year")
plt.show()


## **Average course price per year**

In [None]:
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(figsize=(8, 8))
# Mean price_detail__amount of the courses in each `year_pub` group
price_year = df.groupby('year_pub')['price_detail__amount'].mean()
ax.plot(price_year)
ax.set_ylabel("Average course prices per year")
ax.set_xlabel("Year")
plt.show()


In [None]:
# Per-year values currently held in price_year, largest first
price_year.sort_values(ascending=False)

## **Maximum course price per year**

In [None]:
plt.style.use("fivethirtyeight")
fig,ax=plt.subplots(figsize=(8,8))
# Highest price_detail__amount among the courses in each `year_pub` group
price_year=df.groupby('year_pub')['price_detail__amount'].max()
ax.plot(price_year)
# The label now names the plotted statistic (it previously said "Average")
ax.set_ylabel("Maximum course price per year")
ax.set_xlabel("Year")
plt.show()


In [None]:
# Per-year values currently held in price_year, largest first
price_year.sort_values(ascending=False)

## **Median course price per year**

In [None]:
plt.style.use("fivethirtyeight")
fig,ax=plt.subplots(figsize=(8,8))
# Median price_detail__amount of the courses in each `year_pub` group
price_year=df.groupby('year_pub')['price_detail__amount'].median()
ax.plot(price_year)
# The label now names the plotted statistic (it previously said "Average")
ax.set_ylabel("Median course price per year")
ax.set_xlabel("Year")
plt.show()


In [None]:
# Per-year values currently held in price_year, largest first
price_year.sort_values(ascending=False)

## **Top 5 Most Expensive Courses**

In [None]:
# Five rows with the largest price_detail__amount
df.sort_values('price_detail__amount', ascending=False).head(5)

## **Top 5 Most Subscribed Courses**

In [None]:
# Five rows with the largest num_subscribers
df.sort_values('num_subscribers', ascending=False).head(5)

## **Top 5 Highest Rated Courses**

In [None]:
# Five rows with the largest avg_rating_recent
df.sort_values('avg_rating_recent', ascending=False).head(5)

## **Top 5 Highest Rated Free Courses**

In [None]:
# Keep only the free courses, then show the five with the largest avg_rating_recent
df_free_courses = df[df['is_paid'].eq(False)]
df_free_courses.sort_values('avg_rating_recent', ascending=False).head(5)

## **Top 5 Courses by Number of Lectures**

In [None]:
# Five rows with the largest num_published_lectures
df.sort_values('num_published_lectures', ascending=False).head(5)