In [1]:
import pandas as pd
import numpy as np

import plotly.express as px

In [2]:
# Load the "most subscribed YouTube channels" dataset from the local data folder.
youtube = pd.read_csv(
    filepath_or_buffer='./data/most_subscribed_youtube_channels.csv',
)

In [3]:
# Check the (rows, columns) dimensions of the loaded frame; it is not a big one.
youtube.shape

(1000, 7)

In [4]:
# Look at the first rows and count the missing values in each column.
display(youtube.head(2))
missing_per_column = youtube.isna().sum()
print(missing_per_column)

# Only a small share of the rows has a missing value, so those rows are dropped.
youtube = youtube.dropna()
# Confirm that no missing categories are left after the drop.
print(youtube['category'].isna().sum())

# Normalise every column name to lowercase.
youtube.columns = [name.lower() for name in youtube.columns]
youtube.head(2)

Unnamed: 0,rank,Youtuber,subscribers,video views,video count,category,started
0,1,T-Series,222000000,198459090822,17317,Music,2006
1,2,YouTube Movies,154000000,0,0,Film & Animation,2015


rank            0
Youtuber        0
subscribers     0
video views     0
video count     0
category       27
started         0
dtype: int64
0


Unnamed: 0,rank,youtuber,subscribers,video views,video count,category,started
0,1,T-Series,222000000,198459090822,17317,Music,2006
1,2,YouTube Movies,154000000,0,0,Film & Animation,2015


In [5]:
# Check the dtype and non-null count of every column before doing any numeric work.
youtube.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 973 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         973 non-null    int64 
 1   youtuber     973 non-null    object
 2   subscribers  973 non-null    object
 3   video views  973 non-null    object
 4   video count  973 non-null    object
 5   category     973 non-null    object
 6   started      973 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 60.8+ KB


In [6]:
# The numeric columns were read as strings containing thousands separators;
# remove the commas so the values can later be converted to integers.
for column in ('video views', 'video count', 'subscribers'):
    youtube[column] = youtube[column].str.replace(',', '')

In [7]:
# Preview the first two rows after removing the commas from the numeric columns.
youtube.head(2)

Unnamed: 0,rank,youtuber,subscribers,video views,video count,category,started
0,1,T-Series,222000000,198459090822,17317,Music,2006
1,2,YouTube Movies,154000000,0,0,Film & Animation,2015


In [8]:
# Convert the cleaned digit strings to integers in a single astype call.
# The width is pinned to 64 bits on purpose: plain `int` maps to the platform
# C long, which is only 32 bits on Windows with NumPy < 2.0 and cannot hold
# view counts such as 198459090822.
youtube = youtube.astype({
    'video views': 'int64',
    'video count': 'int64',
    'subscribers': 'int64',
})

In [9]:
# Preview the first rows after casting the numeric columns to integers.
youtube.head()

Unnamed: 0,rank,youtuber,subscribers,video views,video count,category,started
0,1,T-Series,222000000,198459090822,17317,Music,2006
1,2,YouTube Movies,154000000,0,0,Film & Animation,2015
2,3,Cocomelon - Nursery Rhymes,140000000,135481339848,786,Education,2006
3,4,SET India,139000000,125764252686,91271,Shows,2006
5,6,PewDiePie,111000000,28469458228,4497,Gaming,2010


In [10]:
# Re-check the dtypes to confirm the converted columns are now integer typed.
youtube.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 973 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         973 non-null    int64 
 1   youtuber     973 non-null    object
 2   subscribers  973 non-null    int64 
 3   video views  973 non-null    int64 
 4   video count  973 non-null    int64 
 5   category     973 non-null    object
 6   started      973 non-null    int64 
dtypes: int64(5), object(2)
memory usage: 60.8+ KB


In [44]:
# Plot the number of posted videos against two popularity metrics
# (subscribers first, then video views) to see whether either relation looks linear.
for metric in ('subscribers', 'video views'):
    fig = px.scatter(
        youtube,
        x='video count',
        y=metric,
        title=f'Video count and {metric} relation.',
    )
    fig.show()

In [42]:
# The same two scatter plots, with every point coloured by the channel's category.
for metric in ('subscribers', 'video views'):
    fig = px.scatter(
        youtube,
        x='video count',
        y=metric,
        color='category',
        title=f'Video count and {metric} relation per category',
    )
    fig.show()

# News & Politics and Entertainment channels usually produce more videos, but that does not mean they gain more subscribers.

In [45]:
# List the distinct category labels present in the data.
youtube['category'].unique()

array(['Music', 'Film & Animation', 'Education', 'Shows', 'Gaming',
       'Entertainment', 'People & Blogs', 'Sports', 'Howto & Style',
       'News & Politics', 'Comedy', 'Trailers', 'Nonprofits & Activism',
       'Science & Technology', 'Movies', 'Pets & Animals',
       'Autos & Vehicles', 'Travel & Events'], dtype=object)

In [64]:
# Print how many channels fall into each category, most frequent first.
category_counts = youtube['category'].value_counts()
for value, count in zip(category_counts.index, category_counts):
    print(value, count)

# Most of the data comes from Entertainment, Music, People & Blogs, Gaming and Comedy channels.

Entertainment 241
Music 222
People & Blogs 119
Gaming 102
Comedy 63
Film & Animation 52
Education 46
Howto & Style 45
News & Politics 27
Science & Technology 18
Shows 14
Sports 10
Pets & Animals 6
Trailers 2
Nonprofits & Activism 2
Movies 2
Autos & Vehicles 1
Travel & Events 1


In [46]:
# Let's check for outliers... This is going to be interesting