# Live Session's Notebook

This is part of a live-session joint analysis conducted by myself (Tal Mizrachi), with participants from various sources and, of course, with the full cooperation of the Appleseed Academy Data Analysis course - check 'em out!

You can find the video here: https://youtu.be/MKfF25Teyfo

In [None]:
#Importing the relevant packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Reading the file into memory

In [None]:

# Tip: to peek at only the first rows, pass the **nrows** argument, like so:
# df = pd.read_csv('udemy_courses.csv', nrows=5)

# Load the full CSV into a DataFrame
csv_path = 'udemy_courses.csv'
df = pd.read_csv(csv_path)

In [None]:
# A bird's-eye view of the data: the summary statistics produced by describe()

summary_stats = df.describe()
summary_stats

In [None]:
# When looking at course prices, free courses distort the picture,
# so describe only the rows whose price is greater than zero
has_positive_price = df['price'] > 0
df[has_positive_price].describe()


In [None]:
# Total distribution of paid vs. free courses: row count per is_paid value
is_paid_counts = df['is_paid'].value_counts()
is_paid_counts

In [None]:
# The same counts can be drawn as a bar chart, but a word of warning:
# too many visualizations can distract you while analyzing -
# try to visualize the things you can't see through the numbers alone
is_paid_counts = df['is_paid'].value_counts()
is_paid_counts.plot.bar()

In [None]:
# Price distribution - we can see that the assumption of normality doesn't apply
course_prices = df['price']
course_prices.plot.hist()

In [None]:
# Dates can be represented in many ways - if we want to work on a date
# column where the time has any meaning, converting it is a good start:

parsed_timestamps = pd.DatetimeIndex(df['published_timestamp'])
df['published_timestamp'] = parsed_timestamps


In [None]:
# Print the latest and earliest publication months in the dataset - we checked this because we talked about COVID-19.
# Which leads to an important point - whatever we are experiencing NOW is not necessarily fully relevant to the dataset.
# The timestamps are formatted as 'YYYY-MM' strings once, then max/min are taken on those strings.
published_months = df['published_timestamp'].dt.strftime('%Y-%m')
print(published_months.max())
print(published_months.min())

In [None]:
# Using the inplace argument is not best practice in my eyes - either avoid it or use it with care,
# and keep in mind that it changes the DataFrame itself

# df.drop('url', axis=1, inplace=True)


In [None]:
# Sampling 5 rows.
# random_state pins the draw so that Restart & Run All shows the same 5 rows
# every time (the value 42 is arbitrary - any fixed integer works).
df.sample(5, random_state=42)

In [None]:
# Let's look for duplicates
# Note - we could use .drop_duplicates(), but let's say that we want to KNOW which courses repeat themselves.
#
# value_counts() returns one entry per course_id (as the index) with the number of rows it appears in.
# We filter that Series directly instead of looking the column up by name, because the name pandas
# gives the counts changed between versions ('course_id' before pandas 2.0, 'count' from 2.0 on),
# which made the by-name lookup raise a KeyError on newer versions.
course_id_counts = df['course_id'].value_counts()
appears_more_than_once = course_id_counts >= 2

# Kept as a one-column boolean DataFrame, as before, in case it is inspected later
double_courses = appears_more_than_once.to_frame()

# The course_ids that appear in two or more rows
double_courses_index = course_id_counts[appears_more_than_once].index

# We are also sorting by title, just so we get a better grasp
df[df['course_id'].isin(double_courses_index)].sort_values('course_title')

In [None]:
# Remove fully identical rows and report the row count before and after.
# The result is re-assigned rather than using inplace=True, so the change to
# `df` is explicit in the code.
print(f"Before dropping duplicates, this is the length - {len(df)}")
df = df.drop_duplicates()
print(f"After dropping duplicates, this is the length - {len(df)}")

In [None]:
# Just making sure... count again how many rows each course_id has
course_id_counts = df['course_id'].value_counts()
course_id_counts

In [None]:
# The distinct values found in the subject column
subject_column = df['subject']
subject_column.unique()

In [None]:
# How many distinct subjects there are

subject_column = df['subject']
subject_column.nunique()

In [None]:
# Let's try to see when courses were published:
# take an independent copy of just the timestamp and subject columns
columns_of_interest = ['published_timestamp', 'subject']
subject_monthly = df[columns_of_interest].copy()

In [None]:
# Add a 'YYYY-MM' string column derived from the publication timestamp
published_at = subject_monthly['published_timestamp']
subject_monthly['month'] = published_at.dt.strftime('%Y-%m')

In [None]:
# Number of courses published per month (size of each month group)
grouped_by_month = subject_monthly[['subject', 'month']].groupby('month')
grouped_by_month.size()

In [None]:
# An alternative: count() on the same per-month grouping
grouped_by_month = subject_monthly[['subject', 'month']].groupby('month')
grouped_by_month.count()

In [None]:
# How can we get to this format?
# (The sketch is kept as comments so this cell runs without a syntax error.)
#
#            sub_1, sub_2, ..., sub_N
#   month_1
#   month_2
#   month_3

In [None]:
# Thanks to Ziv Gostinskey we found another way:
# count rows per (subject, month) pair, unstack the inner level, then transpose
pair_counts = (
    subject_monthly[['subject', 'month']]
    .groupby(['subject', 'month'])
    .size()
)
subject_monthly_unstack = pair_counts.unstack().transpose()

In [None]:
# Display the unstacked (subject, month) counts table as this cell's output
subject_monthly_unstack

In [None]:
# Plot the unstacked counts table on a 12x8 figure
figure_size = (12, 8)
subject_monthly_unstack.plot(figsize=figure_size)

In [None]:
# Another way: let pivot_table build the table,
# with months as the index, subjects as the columns and len as the aggregation
subject_month_pairs = subject_monthly[['subject', 'month']]
subject_monthly_pivot_table = subject_month_pairs.pivot_table(
    index='month',
    columns='subject',
    aggfunc=len,
)

In [None]:
# Plot the pivot table on a 12x8 figure
figure_size = (12, 8)
subject_monthly_pivot_table.plot(figsize=figure_size)

In [None]:
# I had a typo here - don't yell at me :)
## Occam's razor - the simplest explanation is usually the right one


##Read more https://en.wikipedia.org/wiki/Occam%27s_razor

In [None]:
# Let's try the same thing on a daily basis

# Add a 'YYYY-MM-DD' string column derived from the publication timestamp
published_at = subject_monthly['published_timestamp']
subject_monthly['day'] = published_at.dt.strftime('%Y-%m-%d')

# Count rows per (subject, day) pair, unstack, transpose and plot
daily_counts = (
    subject_monthly[['subject', 'day']]
    .groupby(['subject', 'day'])
    .size()
    .unstack()
    .T
)
daily_counts.plot(figsize=(12, 8))

# Looks like way too much for us

In [None]:
# Let's work on a yearly basis

# Add a 'YYYY' string column derived from the publication timestamp
published_at = subject_monthly['published_timestamp']
subject_monthly['year'] = published_at.dt.strftime('%Y')

# Count rows per (subject, year) pair, unstack, transpose and plot
yearly_counts = (
    subject_monthly[['subject', 'year']]
    .groupby(['subject', 'year'])
    .size()
    .unstack()
    .T
)
yearly_counts.plot(figsize=(12, 8))

In [None]:
#Feel free to reach out at - talnmizrachi@gmail.com