In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Set up libraries**

In [2]:
#load libraries 

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import geopy.distance
from math import radians,cos,sin,asin,sqrt
import folium
import datetime
from folium.plugins import HeatMap
from scipy.stats import ttest_ind

# Use a 12pt default font for every figure drawn in this notebook.
matplotlib.rcParams['font.size'] = 12

# **Load data**

In [3]:
# Shell escape: list the US trending CSV to confirm the file is present before loading it.
!ls ../input/youtube-trending-video-dataset/US_youtube_trending_data.csv

In [4]:
# Load the US trending-videos CSV into a DataFrame.
us_trending_csv = '../input/youtube-trending-video-dataset/US_youtube_trending_data.csv'
us_data = pd.read_csv(us_trending_csv)

# ****Explore Data****

In [5]:
# Preview the first five rows (head() defaults to n=5).
us_data.head()

In [6]:
# Preview the last five rows (tail() defaults to n=5).
us_data.tail()

The dataset includes all of the videos trending from 2020-08-12 to today's date (2022-03-05)

Hence, let's use only data from 2021-01-01 to 2021-12-31 to have a better year-round analysis.

In [7]:
# Inspect the Python type of a single publishedAt value (row label 2).
type(us_data.at[2, 'publishedAt'])

In [8]:
# Inspect the Python type of a single trending_date value (row label 6).
type(us_data.at[6, 'trending_date'])

**Both `publishedAt` and `trending_date` are stored as str.**

Convert them into date/time before making any changes

In [9]:
# Replace the publishedAt column with its parsed datetime values.
published_raw = us_data['publishedAt']
us_data['publishedAt'] = pd.to_datetime(published_raw)

In [10]:
# Replace the trending_date column with its parsed datetime values.
trending_raw = us_data['trending_date']
us_data['trending_date'] = pd.to_datetime(trending_raw)

In [11]:
# Display the frame again to check the two converted date columns.
us_data

In [12]:
# Keep only the rows whose trending_date falls in calendar year 2021.
# .copy() makes the result an independent frame: the Date/Month columns that
# are assigned to us_2021_data further down would otherwise be written onto a
# boolean-filtered slice of us_data and raise SettingWithCopyWarning.
in_2021 = us_data["trending_date"].dt.year == 2021
us_2021_data = us_data[in_2021].copy()
us_2021_data


Let us divide each day in existing **publishedAt and trending_date** column into 24 smaller bins of 1 hour each

This will allow us to visualize the time series more precisely.

# **Visualize data**

First, let's take a big picture of data distributed each month in 2021

In [13]:
# Bar chart: number of trending rows per calendar month of 2021.
plt.figure(figsize=(15, 8))
monthly_counts = us_2021_data['trending_date'].dt.month.value_counts().sort_index()
p = monthly_counts.plot(kind='bar', color='#96A480')
plt.title('Trending date Month Distribution')
# Tilt every x tick label by 45 degrees in one call.
plt.setp(plt.gca().get_xticklabels(), rotation=45)
plt.xlabel('Month')
plt.ylabel('Number of videos')

We can clearly see that **June** is the month with the highest number of trending videos.

One interesting observation is that each of the months before June 2021 has **slightly lower** number of videos compared to each of the months after June 2021

Let's take a closer look at the data, say day by day in 2021

In [14]:
# Line chart: number of trending rows per trending_date value across 2021.
plt.figure(figsize=(15, 8))
daily_counts = us_2021_data['trending_date'].value_counts().sort_index()
ax = daily_counts.plot(color='#96A480')
ax.set_title('Trending videos Day Distribution')
ax.set_xlabel('Month')
ax.set_ylabel('Number of videos')
# Restrict the view to calendar year 2021 and to the 150-450 count band.
ax.set_xlim(['2021-01-01', '2021-12-31'])
ax.set_ylim([150, 450])

The underlying trend is clearly visible now. It shows that:
* There are tiny decreases in February and April
* There is a big jump in the number of videos from late February to early March and from the end of May to mid-June

**Question:** Which time periods correspond to the highest and lowest peaks in the plot?

In [15]:
# Rows per trending_date; value_counts() sorts by count, largest first.
us_2021_data['trending_date'].value_counts()

As observed from the plot and the data above, we have more details about the data:
* The highest number of trending videos in one day is 400, reached from late February to early March and around mid-June
* The lowest number of trending videos in one day is 197-199 videos in early February

**Rearranging dataset for date and month analysis**

In [16]:
# Store the day of the month of each trending_date in its own 'Date' column.
# assign() builds a new frame instead of writing a column into us_2021_data in
# place; us_2021_data was produced by boolean-filtering us_data, so an in-place
# column assignment raises SettingWithCopyWarning.
us_2021_data = us_2021_data.assign(Date=us_2021_data['trending_date'].dt.day)
us_2021_data['Date']


In [17]:
# Store the month number of each trending_date in its own 'Month' column.
# assign() builds a new frame instead of writing a column into us_2021_data in
# place; us_2021_data was produced by boolean-filtering us_data, so an in-place
# column assignment raises SettingWithCopyWarning.
us_2021_data = us_2021_data.assign(Month=us_2021_data['trending_date'].dt.month)
# Only the last expression of a cell is displayed, so show the frame once here.
us_2021_data

In [18]:
# Number of trending rows for every (Month, Date) pair.
# Select trending_date before counting so only that one column is aggregated,
# instead of counting every column and then discarding all but one.
us_month_data = us_2021_data.groupby(['Month', 'Date'])['trending_date'].count()
us_month_data

In [19]:
# Pivot the 'Month' index level into columns, giving a Date x Month grid for the heatmap.
us_month_data = us_month_data.unstack('Month')
us_month_data

In [20]:
# Heatmap of trending-row counts: rows are day of month, columns are month.
plt.figure(figsize=(15, 15))
sns.heatmap(us_month_data, vmin=0, cmap='Greens')
# plt.title has to be called. Assigning a string to it replaces the function on
# the pyplot module, leaves the figure untitled and breaks later plt.title() calls.
plt.title('Heatmap of trending video counts in day vs month grid')
plt.show()

The heatmap indicates that the maximum number of videos occurred specifically on **2/24, 3/2, and from 5/31 to 6/17**

We can also see why February has the lowest number of trending videos: it has the fewest days (the NaN cells in the heatmap)

# Explore YouTube channels performance

In [21]:
#Counting total number of videos trending in 2021
# Count rows of the 2021 subset (us_2021_data); the full us_data frame also
# holds rows from other years, which would not match the "in 2021" wording.
number_of_rows = len(us_2021_data)
print("There are total {0} videos on trending in 2021".format(number_of_rows))

#Counting total number of DISTINCT channel trending in 2021
number_of_channels = us_2021_data['channelTitle'].nunique()
print("There are {0} channels on trending in 2021".format(number_of_channels))

These results show that many YouTube channels appear on trending more than once.

In [22]:
#Creating a new video count dataframe
# Built from the 2021 subset so the counts match the "in 2021" questions below.
# rename_axis() + reset_index(name=...) name both columns explicitly, so the
# result is always ['channelTitle', 'video_counts'] regardless of how the
# installed pandas version labels the output of value_counts().
us_channel = (
    us_2021_data['channelTitle']
    .value_counts()
    .rename_axis('channelTitle')
    .reset_index(name='video_counts')
)
us_channel

**Question:** Which channel stayed on trending the most in 2021?

In [23]:
#Counting number of videos on trending of each YouTube channel
# Use the 2021 subset so the counts answer the "in 2021" question above.
channel_counts = us_2021_data['channelTitle'].value_counts()
print("Number of times on Youtube trending for each YouTube channel are:\n{0}".format(channel_counts))

**Question:** How many channels hit the trending on YouTube twice or more in 2021? 

Let's call them **potential channels**

In [24]:
# Potential channels: rows of us_channel whose video_counts is at least 2.
us_potential_channel = us_channel.loc[us_channel['video_counts'].ge(2)]
index = us_potential_channel.index
num_of_channel = len(index)
headline = "There are {0} ".format(num_of_channel)
listing = "potential channels in 2021 and they are\n{0}".format(us_potential_channel)
print(headline + listing)

How about channels that hit the trending only one time? Let's call them **one time channels**

**Question:** How many one-time channels are there?

In [25]:
# One-time channels: rows of us_channel whose video_counts is exactly 1.
us_one_time_channel = us_channel.loc[us_channel['video_counts'].eq(1)]
index = us_one_time_channel.index
num_of_channel = len(index)
headline = "There are {0}".format(num_of_channel)
listing = " one-time channels in 2021 and they are\n{0}".format(us_one_time_channel)
print(headline + listing)

Besides, we can see that several YouTube channels appeared on trending 365 times or more. Since 2021 has 365 days, any count above 365 means that the channel had at least two videos on trending on the same day at least once. Let's call them **viral channels**

**Question:** Which channels are viral, and how many are there?

In [26]:
# Viral channels: rows of us_channel whose video_counts is 365 or more.
us_viral_channel = us_channel.loc[us_channel['video_counts'].ge(365)]
index = us_viral_channel.index
num_of_channel = len(index)
headline = "There are {0}".format(num_of_channel)
listing = " viral channels in 2021 and they are\n{0}".format(us_viral_channel)
print(headline + listing)