In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Build a dictionary that maps `category_id` to the category title.
# Keys are taken as-is from the `id` field of each entry in the JSON `items` list.
import json

# Pass the encoding explicitly so decoding the JSON file does not depend on the
# platform's default locale encoding.
with open('/kaggle/input/trending-youtube-videos-by-category/videoCategories.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

id_to_category = {item['id']: item['snippet']['title'] for item in data['items']}

In [None]:
# Convert the date columns from strings to `datetime.date` objects
def convert_todate(df):
    """Convert `publishedAt` and `trending_date` of `df` to `datetime.date`, in place.

    `publishedAt` is parsed with pandas' default format inference, while
    `trending_date` is parsed with the explicit format '%y.%d.%m'
    (two-digit year, then day, then month). Only the calendar date is kept
    (`.dt.date`), so any time-of-day component is discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `publishedAt` and `trending_date` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, so the call can be chained or used with `.pipe`.
        Callers that ignore the return value keep working unchanged.
    """
    df['publishedAt'] = pd.to_datetime(df['publishedAt']).dt.date
    df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m').dt.date
    return df

In [None]:
import datetime
from datetime import date, timedelta

In [None]:
# `dataframe` holds everything collected before the YouTube API call.
dataframe = pd.read_csv('/kaggle/input/trending-youtube-videos-by-category/21.01.07_VN_videos.csv')
convert_todate(dataframe)

# Attach the readable category name next to the id. The ids are cast to str first
# so they can be looked up in `id_to_category`. The new column goes to position 6;
# later cells select columns by position, so this placement matters.
dataframe['categoryId'] = dataframe['categoryId'].astype(str)
dataframe.insert(6, 'category', dataframe['categoryId'].map(id_to_category))

# Keep a single row per video: the last occurrence of each `video_id`.
dataframe = dataframe.drop_duplicates(subset='video_id', keep='last')

In [None]:
# `now_data` is the dataset obtained when calling the YouTube API.
now_data = pd.read_csv('/kaggle/input/trending-youtube-videos-by-category/21.02.07_VN_videos.csv')
convert_todate(now_data)

# Attach the readable category name next to the id. The ids are cast to str first
# so they can be looked up in `id_to_category`. The new column goes to position 6;
# later cells select columns by position, so this placement matters.
now_data['categoryId'] = now_data['categoryId'].astype(str)
now_data.insert(6, 'category', now_data['categoryId'].map(id_to_category))

# Keep a single row per video: the last occurrence of each `video_id`.
now_data = now_data.drop_duplicates(subset='video_id', keep='last')

In [None]:
# Append `now_data` to `dataframe`.
# This cell reassigns `dataframe` from itself, so without the de-duplication every
# extra execution would append `now_data` again and inflate the counts computed
# from the combined frame. Dropping repeated (video_id, trending_date) pairs makes
# the cell safe to re-run; `keep='last'` lets the rows coming from `now_data` win.
# `ignore_index=True` avoids carrying duplicate index labels from the two inputs.
dataframe = (
    pd.concat([dataframe, now_data], ignore_index=True)
    .drop_duplicates(subset=['video_id', 'trending_date'], keep='last')
    .reset_index(drop=True)
)

In [None]:
# Build `data_trending_today`: only the rows whose trending date is "today",
# where "today" is the most recent trending date present in `dataframe`.

today = dataframe['trending_date'].max()
yesterday = today - datetime.timedelta(days=1)

# The counters get a `_t` suffix so they do not collide with yesterday's counters
# when both are placed side by side later on.
# `rename` is chained (not `inplace=True`) so it returns a fresh frame instead of
# mutating a boolean-mask slice of `dataframe`, which can raise SettingWithCopyWarning.
data_trending_today = (
    dataframe.loc[dataframe['trending_date'] == today]
    .rename(columns={'view_count': 'view_count_t',
                     'likes': 'likes_t',
                     'dislikes': 'dislikes_t',
                     'comment_count': 'comment_count_t'})
)

In [None]:
# Build `data_trending`: today's rows joined with yesterday's counters for the same video.
# Yesterday's columns are selected by name rather than by position so the merge does
# not silently pick the wrong columns if the column order ever changes.
yesterday_counts = dataframe.loc[
    dataframe['trending_date'] == yesterday,
    ['video_id', 'view_count', 'likes', 'dislikes', 'comment_count'],
]
# Left merge: videos without a row for yesterday keep NaN in yesterday's counters.
data_trending = pd.merge(data_trending_today, yesterday_counts, on=['video_id'], how='left')

# Day-over-day growth in percent, rounded to 2 decimals:
# (today - yesterday) / yesterday * 100
for growth_col, metric in [('growth_view', 'view_count'),
                           ('growth_likes', 'likes'),
                           ('growth_dislikes', 'dislikes'),
                           ('growth_comt', 'comment_count')]:
    before = data_trending[metric]
    after = data_trending[metric + '_t']
    data_trending[growth_col] = (((after - before) / before) * 100).round(2)

# A zero comment count yesterday makes the ratio infinite; cap those at 100 (%).
# The result is assigned back to the column: an in-place `replace` on the attribute
# access (`data_trending.growth_comt.replace(..., inplace=True)`) acts on a temporary
# Series and does not update the frame when pandas Copy-on-Write is active.
data_trending['growth_comt'] = data_trending['growth_comt'].replace(np.inf, 100)
# NOTE(review): the other growth columns are not capped, so a zero value yesterday
# there still yields inf/NaN in `total_growth` — confirm whether that is intended.

# Combined score; dislikes growth is weighted by 0.5.
data_trending['total_growth'] = (
    data_trending['growth_view']
    + data_trending['growth_likes']
    + 0.5 * data_trending['growth_dislikes']
    + data_trending['growth_comt']
)

# **1. VISUALIZATION FOR TRENDING TODAY**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [None]:
# Bar chart: number of videos trending today per YouTube channel (top 10).
# Each channel gets its video count and the sum of today's views; only the count is plotted.
df_channel = (
    data_trending_today
    .groupby(['channelTitle'])
    .agg(count_video=('video_id', 'count'),
         total_views=('view_count_t', 'sum'))
    .sort_values('count_video', ascending=False)
    .reset_index()
    .head(10)
)
# Bind the Axes to `ax`, like the other plotting cells, so the cell does not echo the Axes repr.
ax = sns.barplot(x='count_video', y='channelTitle', data=df_channel)

In [None]:
# Bar chart: number of videos trending today per category (top 5).
# NOTE: the result is stored under the existing name `df_channel`, although its rows are categories.
category_groups = data_trending_today.groupby(['category'])
category_stats = category_groups.agg(count_video=('video_id', 'count'),
                                     total_views=('view_count_t', 'sum'))
df_channel = category_stats.sort_values('count_video', ascending=False).reset_index().head(5)
ax = sns.barplot(x='count_video', y='category', data=df_channel)

# **2. VISUALIZATION FOR TODAY'S TRENDING vs YESTERDAY**

In [None]:
# Videos that are new on trending today: rows whose view growth is missing.
# Columns are picked by position (presumably the title plus today's counters —
# this depends on the column order of the loaded CSV).
is_new_today = data_trending['growth_view'].isnull()
(
    data_trending[is_new_today]
    .iloc[:, [1, 9, 10, 11, 12]]
    .sort_values('view_count_t', ascending=False)
    .reset_index(drop=True)
)


In [None]:
# Top 10 videos by total_growth (views + likes + 0.5 x dislikes + comments growth).
top_total_growth = data_trending.sort_values('total_growth', ascending=False).head(10)
ax = sns.barplot(x='total_growth', y='title', data=top_total_growth)

In [None]:
# Top 10 videos by view growth (today's views compared with yesterday's).
top_view_growth = data_trending.sort_values('growth_view', ascending=False).head(10)
ax = sns.barplot(x='growth_view', y='title', data=top_view_growth)

In [None]:
# Top 10 videos by likes growth.
top_likes_growth = data_trending.sort_values('growth_likes', ascending=False).head(10)
ax = sns.barplot(x='growth_likes', y='title', data=top_likes_growth)

In [None]:
# Top 10 channels by mean total_growth (views + likes + 0.5 x dislikes + comments growth).
channel_growth = (
    data_trending
    .groupby(['channelTitle'])
    .agg(mean_total_growth=('total_growth', 'mean'))
    .reset_index()
    .sort_values('mean_total_growth', ascending=False)
    .head(10)
)
ax = sns.barplot(x='mean_total_growth', y='channelTitle', data=channel_growth)

In [None]:
# Top 10 categories by mean total_growth (views + likes + 0.5 x dislikes + comments growth).
category_growth = (
    data_trending
    .groupby(['category'])
    .agg(mean_total_growth=('total_growth', 'mean'))
    .reset_index()
    .sort_values('mean_total_growth', ascending=False)
    .head(10)
)
ax = sns.barplot(x='mean_total_growth', y='category', data=category_growth)