In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns
pd.set_option('float_format', '{:.2f}'.format)
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the US trending-videos CSV from the Kaggle input directory into a DataFrame.
trend_videos = pd.read_csv(
    filepath_or_buffer='/kaggle/input/youtube-trending-video-dataset/US_youtube_trending_data.csv',
)

In [None]:
# Print a concise summary of the loaded DataFrame (columns, dtypes, non-null counts).
trend_videos.info()

In [None]:
# Preview the first five rows of the dataset.
trend_videos.head(n=5)

In [None]:
# Number of missing values in each column.
trend_videos.isna().sum()

In [None]:
# Some rows have no description (the author counted 284 of them); give those
# rows a fixed placeholder text so the column contains no missing values.
description_missing = trend_videos["description"].isna()
trend_videos.loc[description_missing, "description"] = "Video has no description"

# Statistics for numerical & non-numerical columns

In [None]:
# Summary statistics for the numeric columns
# (describe() with no arguments reports on numeric columns only when the frame has mixed dtypes).
trend_videos.describe()

In [None]:
# Summary statistics for the object-dtype (non-numeric) columns.
trend_videos.describe(include=[object])

In [None]:
# Share of rows per publication year.
# The year is taken as the first four characters of `publishedAt`
# (assumes an ISO-8601-style "YYYY-MM-DD..." string — TODO confirm against the data).
# The vectorised `.str` accessor replaces a per-row Python lambda and yields NaN
# for a missing timestamp instead of raising; value_counts() then ignores it.
trend_videos["publishedAt"].str[:4].value_counts(normalize=True)

In [None]:
# Fraction of rows for each value of the `comments_disabled` flag.
trend_videos.loc[:, "comments_disabled"].value_counts(normalize=True)

In [None]:
# Fraction of rows for each value of the `ratings_disabled` flag.
trend_videos.loc[:, "ratings_disabled"].value_counts(normalize=True)

# Correlation Matrix

In [None]:
# Heatmap of the pairwise correlations between the numeric/boolean columns.
#
# The frame also holds text columns (title, description, ...). On pandas >= 2.0
# `DataFrame.corr()` no longer drops those silently and raises instead, so the
# numeric and boolean columns are selected explicitly before correlating.
# `figure` keeps the (Figure, Axes) tuple it was originally bound to.
figure = fig, ax = plt.subplots(figsize=(10, 6))
numeric_columns = trend_videos.select_dtypes(include=["number", "bool"])
corr_map = sns.heatmap(numeric_columns.corr(), annot=True, cmap="YlGnBu", ax=ax)
ax.set_title("Correlation Matrix\n")
plt.show()

# Top 10 channels with the most trending videos

In [None]:
# Count the rows per channel, keep the ten channels with the most rows,
# and draw them as a horizontal bar chart.

rows_per_channel = trend_videos.groupby("channelTitle").size()
videos_by_channel = (
    rows_per_channel
    .reset_index(name="no_of_videos")
    .sort_values("no_of_videos", ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(10,6))
vbc_plot = sns.barplot(
    data=videos_by_channel,
    x="no_of_videos",
    y="channelTitle",
    palette="YlGnBu_r",
    ax=ax,
)
vbc_plot = ax.set(xlabel="Number of videos", ylabel="Channel")
ax.set_title("Top 10 Trending Video Channels \n")
plt.show()



# Top 10 categories with the most trending videos

In [None]:
# Top 10 categories by number of rows in the trending data.
# Categories are shown by their numeric id (the original note states that
# category names are not available in the dataset).

videos_by_category = (
    trend_videos.groupby("categoryId").size().reset_index(name="no_of_videos")
    .sort_values("no_of_videos", ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(10,6))
# `categoryId` is numeric, so seaborn would otherwise order the bars by id value.
# Passing `order` keeps them ranked from most to fewest videos, which is what a
# "top 10" chart is expected to show.
vbcat_plot = sns.barplot(
    x="no_of_videos",
    y="categoryId",
    data=videos_by_category,
    palette="YlGnBu_r",
    orient='h',
    order=videos_by_category["categoryId"].tolist(),
    ax=ax,
)
vbcat_plot = ax.set(xlabel="Number of Videos", ylabel="Category")
ax.set_title("Top 10 Trending Video Categories\n")
plt.show()



# Top 10 trending videos in the category with the most trending videos

In [None]:
# Top 10 most-viewed videos within the category that has the most trending rows.
#
# The original comment identified that category as 10 by reading the previous
# plot, but the code never filtered on it and so ranked videos from every
# category. The category is now derived from the data and used as a filter.
# The `category10_*` names are kept so any later cell that uses them still works.
top_category_id = trend_videos["categoryId"].value_counts().idxmax()
in_top_category = trend_videos["categoryId"] == top_category_id

# A title can occur on several rows; sorting by views first and keeping the
# first occurrence retains each title's highest view count.
category10_videos = (
    trend_videos.loc[in_top_category, ["title", "view_count"]]
    .sort_values(by="view_count", ascending=False)
    .drop_duplicates("title", keep="first")
)
category10_top10 = category10_videos.nlargest(10, ["view_count"])

fig, ax = plt.subplots(figsize=(10,8))
top10_plot = sns.barplot(data=category10_top10, x='view_count', y='title', palette="YlGnBu_r", ax=ax, ci=None)
top10_plot = ax.set(xlabel="Number of views", ylabel="Title of the Video")
# Show view counts as plain integers rather than scientific notation.
ax.ticklabel_format(axis="x", style="plain")
ax.set_title("Top 10 Trending videos in the category having highest trending videos\n")
plt.show()


# Impact of publish hour on trending videos

In [None]:
# Extract the two-digit publish hour from `publishedAt` and store it as a new column.
# Characters 11-12 are taken as the hour (assumes an ISO-8601-style
# "YYYY-MM-DDTHH:MM:SS..." string — TODO confirm against the data).
# The vectorised `.str` accessor replaces a per-row Python lambda and yields NaN
# for a missing timestamp instead of raising.
trend_videos["publish_hour"] = trend_videos["publishedAt"].str[11:13]

# Number of rows per publish hour.
videos_by_pub_hour = trend_videos.groupby("publish_hour").size().reset_index(name="no_of_videos")

fig, ax = plt.subplots(figsize=(8,8))
vph_plot = sns.barplot(x="publish_hour", y="no_of_videos", data=videos_by_pub_hour, palette="YlGnBu", ax=ax)
vph_plot = ax.set(xlabel="Hour in which the video was published", ylabel="Number of Videos")
ax.set_title("Trending Videos by Published hour\n")
plt.show()

In [None]:
# Heatmap of row counts per (category, publish hour).
# Requires the `publish_hour` column to already exist on `trend_videos`.
#
# `video_id` is selected before counting, so only that one column is aggregated.
# Category/hour combinations with no rows are filled with 0, so they render as
# a zero count rather than as blank (NaN) cells.
videos_by_cat_pub_hour = (
    trend_videos.groupby(["categoryId", "publish_hour"])["video_id"]
    .count()
    .unstack(fill_value=0)
)

fig, ax = plt.subplots(figsize=(10,8))
vcph_plot = sns.heatmap(videos_by_cat_pub_hour, cmap="YlGnBu", ax=ax)
vcph_plot = ax.set(xlabel="Hour in which video was published", ylabel="Category")
ax.set_title("Trending Videos by Category & Publish hour\n")
plt.show()

# 