In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook analyses the trending data for India. The same analysis can be done for the remaining countries by simply reading a different CSV file.

In [None]:
# Path of the India trending file; point it at another country's CSV to repeat the analysis.
TRENDING_CSV_PATH = "/kaggle/input/youtube-trending-video-dataset/IN_youtube_trending_data.csv"

df = pd.read_csv(TRENDING_CSV_PATH)
# Show five random rows as a first look at the data.
df.sample(n=5)

In this notebook, Let's try to find some insights from the Youtube Trending Dataset.
I am trying to find answers for following Questions.
1. Which Category Videos are being watched and are in trending for most of the times? (Categories such as: Movies, Education,Videoblogging, etc)
2. What's the average time taken for a video to be in trending categorically?
3. What's the average no.of views required to enter into trending?
4. Which Youtube channel in India has more videos in Trending?
5. Understanding the significance of Likes,Dislikes and Comments.

Let's read the category JSON file, which will be used later in the analysis to translate category ids into category names.

In [None]:
import json
import re

# Each entry under "items" carries its own category id next to its title.
# The ids are sparse (1, 2, 10, 15, 17, ...), so they must be read from the
# file; numbering the titles by position would attach the wrong title to
# most values of the categoryId column.
with open("/kaggle/input/youtube-trending-video-dataset/IN_category_id.json", encoding="utf-8") as category_file:
    category_items = json.load(category_file)["items"]

# Titles in file order (kept for backward compatibility with the old name).
Category_list = [item["snippet"]["title"] for item in category_items]

# categoryId (int, as in the trending CSV) -> category title.
categories = {int(item["id"]): item["snippet"]["title"] for item in category_items}

# NOTE(review): some regional category files reportedly omit id 29 although
# videos can carry it; add the official title if absent so that lookups by
# categoryId do not fail -- TODO confirm against the IN file.
categories.setdefault(29, "Nonprofits & Activism")
categories

In [None]:
# Summary statistics of the loaded frame, shown as the cell output.
df.describe()

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

Let's check the no.of records for a random video: wzpLZWWEA4w

In [None]:
# Boolean mask that selects every row recorded for this one video id.
is_sample_video = df["video_id"] == "wzpLZWWEA4w"
df[is_sample_video]

Details inferred:
1. The same video can appear multiple times in the dataset, because a video can stay in trending for many consecutive days.
2. View count, likes and dislikes change as time goes on. So we shouldn't group by video_id and sum the view counts, likes and dislikes, because the same video would be counted repeatedly.

Though there are 61474 entries, no.of unique videos are: 17411

In [None]:
# Count of distinct video ids across all trending rows.
df["video_id"].nunique()

2219 Unique Channels

In [None]:
# Count of distinct channel titles across all trending rows.
df["channelTitle"].nunique()

In [None]:
# Show one random row to recall which columns are available.
df.sample(n=1)

Removing unnecessary columns for further analysis

In [None]:
# Columns that the rest of the analysis does not use.
UNUSED_COLUMNS = ["channelId","thumbnail_link","ratings_disabled","comments_disabled","description","tags"]

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
df = df.drop(columns=UNUSED_COLUMNS, errors="ignore")

For further Analysis, Let's gather enough details


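df_maxViews stores, for each video_id, the row with the highest view_count.
Intuition: assuming views keep growing while a video is trending, this row holds the final views, likes and comments before the video dropped out of the trending data.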

In [None]:
# Order rows from most to least viewed, then keep the first row met for each
# video, i.e. the snapshot with the highest view_count.
rows_by_views_desc = df.sort_values(by='view_count', ascending=False)
df_maxViews = rows_by_views_desc.drop_duplicates(subset=['video_id'], keep='first')
df_maxViews.sample(n=5)

df_minViews stores, for each video_id, the row with the lowest view_count, which we treat as the video's first appearance in the trending data.

In [None]:
# Order rows from least to most viewed, then keep the first row met for each
# video, i.e. the snapshot with the lowest view_count.
rows_by_views_asc = df.sort_values(by='view_count', ascending=True)
df_minViews = rows_by_views_asc.drop_duplicates(subset=['video_id'], keep='first')
df_minViews.sample(n=5)

Let's have the Total count of a video for it being in Trending

In [None]:
# For every video_id, count its rows with a non-null view_count; each row is
# one appearance of the video in the trending data.
rows_per_video = df.groupby('video_id')['view_count'].count()
NoOfDaysInTrending = dict(rows_per_video)

Let's append the categories to all dataframes we used till now

In [None]:
def fetch_video_category(value):
    """Return the title stored in `categories` for the category id `value`."""
    return categories[value]

# Add the category title to every frame built so far, in the same order as before.
for frame in (df_maxViews, df_minViews, df):
    frame["Video_Category"] = frame["categoryId"].apply(fetch_video_category)

df.sample(n=5)
    

Let's see the total count of videos with respect to Categories

In [None]:
# Number of trending rows per category title, largest first.
df["Video_Category"].value_counts()

Let's start analysing the data

In [None]:
# One grouping of the per-video maximum rows, reused for every total below.
category_groups = df_maxViews.groupby('Video_Category')


def _category_total(column):
    """Sum `column` per Video_Category and return it as a two-column frame."""
    return category_groups[column].sum().reset_index()


df_Views_Analysis = _category_total("view_count")
df_Like = _category_total("likes")
df_Dislike = _category_total("dislikes")
df_Comments = _category_total("comment_count")

# All four frames share the same sorted category order, so the totals line up row by row.
df_Views_Analysis = df_Views_Analysis.assign(
    Likes=df_Like["likes"],
    Dislikes=df_Dislike["dislikes"],
)
df_Views_Analysis

# View Count vs Type of Videos

In [None]:
# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

ax.set_title("View Count vs Type of Videos")
# Vertical category names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

sns.barplot(data=df_Views_Analysis, x="Video_Category", y="view_count", ax=ax)

# Label for the vertical axis
ax.set_ylabel("View Count")

The Above Data infers:
1. All the videos uploaded are categorised to 14 Types.
2. Out of all, **People & Blogs and Family has highest view count**, Likes and Dislikes.

But is that right way of analysis?
1. Dislikes would be more as there are more views. Shouldn't we consider the percentage ?
2. Does highest views say that the public are highly interested in watching those Category videos?

Let's dive deep further to understand even more


**Usually, viewers just watch a video and move on to another one without liking, disliking or commenting. So when they do respond to a video with a like, comment or dislike, that signifies something, and we need to capture it.**

Let me create a Response percentage column which signifies the public response to a video and engagement b/w viewers and the video

In [None]:
# Likes plus dislikes, expressed per 100 views of the category.
reactions = df_Views_Analysis["Likes"] + df_Views_Analysis["Dislikes"]
df_Views_Analysis["Response_Percentage"] = reactions * 100 / df_Views_Analysis["view_count"]
df_Views_Analysis

# Public Response vs Type of Videos

In [None]:
# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

ax.set_title("Public Response vs Type of Videos")
# Vertical category names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

sns.barplot(data=df_Views_Analysis, x="Video_Category", y="Response_Percentage", ax=ax)

# Label for the vertical axis
ax.set_ylabel("Public Response(in percentage)")

So now,
We can see that Viewers are highly reacting to Action/Adventure , Classics, Drama Videos. Let's randomly see what those videos are

In [None]:
# Videos (one row each) that belong to the categories with the strongest response.
reactive_categories = ["Action/Adventure","Classics","Drama"]
reactive_videos = df_maxViews[df_maxViews.Video_Category.isin(reactive_categories)]

# sample() raises ValueError when asked for more rows than exist, so cap the
# sample size at the number of matching videos (possibly zero).
reactive_videos.sample(n=min(20, len(reactive_videos)))

Let's create Similar Percentages and see what they infer

In [None]:
# Every percentage below is "count per 100 views" of the category.
category_views = df_Views_Analysis["view_count"]

df_Views_Analysis["LikesPercentage"] = df_Views_Analysis["Likes"] * 100 / category_views
df_Views_Analysis["DislikesPercentage"] = df_Views_Analysis["Dislikes"] * 100 / category_views
# Comment totals come from the per-category frame computed with the other totals.
df_Views_Analysis["Comments"] = df_Comments.comment_count
df_Views_Analysis["CommentsPercentage"] = df_Views_Analysis["Comments"] * 100 / category_views
df_Views_Analysis

In [None]:
# DataFrame.plot opens its own figure (sized through figsize), so no separate
# plt.figure() call is made here: it would only leave an empty figure behind.
df_Views_Analysis.plot(
    x="Video_Category",
    y=["LikesPercentage", "DislikesPercentage", "CommentsPercentage"],
    kind="bar",
    figsize=(20,10),
)


In [None]:
# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

ax.set_title("ViewCount vs Type of Videos")
# Vertical category names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

sns.barplot(data=df_Views_Analysis, x="Video_Category", y="view_count", ax=ax)

# Label for the vertical axis
ax.set_ylabel("ViewCount")

# CommentsPercentage vs Type of Videos

In [None]:
# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=df_Views_Analysis.Video_Category, y=df_Views_Analysis.CommentsPercentage, ax=ax)

ax.set_title("CommentsPercentage vs Type of Videos")
# Vertical category names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

# The bars show comments per 100 views, not raw comment counts, so the axis
# label states the unit.
ax.set_ylabel("Comments (% of views)")

From above graphs, We can understand that people tend to comment more over Action/Adventure Category Videos. Let's see few such data

In [None]:
# Videos (one row each) of the most commented category.
most_commented_videos = df_maxViews[df_maxViews.Video_Category.isin(["Action/Adventure"])]

# sample() raises ValueError when asked for more rows than exist, so cap the
# sample size at the number of matching videos (possibly zero).
most_commented_videos.sample(n=min(10, len(most_commented_videos)))

# DislikesPercentage vs Type of Videos

In [None]:
# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=df_Views_Analysis.Video_Category, y=df_Views_Analysis.DislikesPercentage, ax=ax)

ax.set_title("DislikesPercentage vs Type of Videos")
# Vertical category names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

# The bars show dislikes per 100 views, not raw dislike counts, so the axis
# label states the unit.
ax.set_ylabel("Dislikes (% of views)")

It seems viewers disliked videos of the Foreign and Film & Animation categories the most. Let's look at a few of them.

In [None]:
# Videos (one row each) of the categories with the highest dislike percentage.
most_disliked_videos = df_maxViews[df_maxViews.Video_Category.isin(["Foreign","Film & Animation"])]

# sample() raises ValueError when asked for more rows than exist, so cap the
# sample size at the number of matching videos (possibly zero).
most_disliked_videos.sample(n=min(10, len(most_disliked_videos)))

# LikesPercentage vs Type of Videos

In [None]:
# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=df_Views_Analysis.Video_Category, y=df_Views_Analysis.LikesPercentage, ax=ax)

ax.set_title("LikesPercentage vs Type of Videos")
# Vertical category names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

# The bars show likes per 100 views, not raw like counts, so the axis label
# states the unit.
ax.set_ylabel("Likes (% of views)")

Highest like percentage is for Action/Adventure, Classics and Drama

# Channel Analysis

In [None]:
# One grouping of the per-video maximum rows, reused for every channel total.
channel_groups = df_maxViews.groupby('channelTitle')


def _channel_total(column):
    """Sum `column` per channelTitle and return it as a two-column frame."""
    return channel_groups[column].sum().reset_index()


df_Channel = _channel_total("view_count")
df_Like = _channel_total("likes")
df_Dislike = _channel_total("dislikes")
df_Comments = _channel_total("comment_count")

# All four frames share the same sorted channel order, so the totals line up row by row.
df_Channel = df_Channel.assign(
    Likes=df_Like["likes"],
    Dislikes=df_Dislike["dislikes"],
    Comments=df_Comments["comment_count"],
)

df_Channel.sample(n=10)

In [None]:
# The 25 channels with the largest summed view_count, largest first.
channels_by_views = df_Channel.sort_values(by='view_count', ascending=False)
channels_by_views.head(25)

# Channel vs total views

In [None]:
# The 25 channels with the largest summed view_count.
Channel_Views = df_Channel.sort_values('view_count', ascending=False).iloc[:25,:]

# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=Channel_Views.channelTitle, y=Channel_Views.view_count, ax=ax)

ax.set_title("Channel vs total views")
# The x label is set explicitly; a bare `plt.xlabel` (without a call) does nothing.
ax.set_xlabel("Channel")
ax.set_ylabel("Total Views")
# Vertical channel names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

# Channel vs total Likes

In [None]:
# The 25 channels with the largest summed likes.
Channel_Views = df_Channel.sort_values('Likes', ascending=False).iloc[:25,:]

# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=Channel_Views.channelTitle, y=Channel_Views.Likes, ax=ax)

ax.set_title("Channel vs total Likes")
# The x label is set explicitly; a bare `plt.xlabel` (without a call) does nothing.
ax.set_xlabel("Channel")
ax.set_ylabel("Total Likes")
# Vertical channel names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

# Channel vs total Comments

In [None]:
# The 25 channels with the largest summed comment counts.
Channel_Views = df_Channel.sort_values('Comments', ascending=False).iloc[:25,:]

# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=Channel_Views.channelTitle, y=Channel_Views.Comments, ax=ax)

ax.set_title("Channel vs total Comments")
# The x label is set explicitly; a bare `plt.xlabel` (without a call) does nothing.
ax.set_xlabel("Channel")
ax.set_ylabel("Total Comments")
# Vertical channel names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

Let's try some different analysis now

In [None]:
def fetch_video_times(value):
    """Return the trending row count stored in NoOfDaysInTrending for video id `value`."""
    trending_rows = NoOfDaysInTrending[value]
    return trending_rows

Let's try to find the Channels that have their videos in trending for most of the times.

In [None]:
# Attach to every video the number of rows it has in the trending data.
trending_counts = df_maxViews["video_id"].apply(fetch_video_times)
df_maxViews["times_in_trending"] = trending_counts
df_maxViews.sample(n=5)

Top 25 Videos that are in trending for many days 

In [None]:
# The 25 videos with the most appearances in the trending data, most first.
videos_by_trending_count = df_maxViews.sort_values(by='times_in_trending', ascending=False)
df_trend_video_tt = videos_by_trending_count.head(25)
df_trend_video_tt

Top Channels whose video is in trending for most of the times

# Channel's Video vs no.of times in Trending

In [None]:
# Top 25 videos with the most appearances in trending, shown by channel.
# seaborn draws one bar per channel; when a channel has several of these
# videos, the bar is the average of their counts.

# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=df_trend_video_tt.channelTitle, y=df_trend_video_tt.times_in_trending, ax=ax)

ax.set_title("Channels vs no.of times in Trending")
# The x label is set explicitly; a bare `plt.xlabel` (without a call) does nothing.
ax.set_xlabel("Channel")
ax.set_ylabel("no.of trending")
# Vertical channel names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

Top 30 Channels whose Videos are in Trending

In [None]:
# One grouping of the per-video maximum rows by channel, used for both aggregates.
videos_by_channel = df_maxViews.groupby("channelTitle")

# Total trending appearances of all videos of each channel.
df_Channel_Trending = videos_by_channel["times_in_trending"].sum().reset_index()

# Most frequent category among each channel's videos; both frames are sorted
# by channelTitle, so the rows line up by position.
majority_category = videos_by_channel["Video_Category"].agg(pd.Series.mode).reset_index()
df_Channel_Trending["Majority_Videos_Category_Type"] = majority_category["Video_Category"]

# Keep the 30 channels with the most appearances, most first.
df_Channel_Trending = df_Channel_Trending.sort_values(by='times_in_trending', ascending=False).head(30)
df_Channel_Trending

# Channel vs total no.of times in trending

In [None]:

# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=df_Channel_Trending.channelTitle, y=df_Channel_Trending.times_in_trending, ax=ax)

ax.set_title("Channel vs total no.of times in trending")
# The x label is set explicitly; a bare `plt.xlabel` (without a call) does nothing.
ax.set_xlabel("Channel")
ax.set_ylabel("no.of times in trending")
# Vertical channel names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

Category of the Videos Uploaded of Top Channels which are in Trending

# Category vs total no.of times in trending

In [None]:

# Sum the trending appearances of the top channels by their majority category.
Category_Trending = df_Channel_Trending.groupby("Majority_Videos_Category_Type")["times_in_trending"].sum().reset_index()

# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=Category_Trending.Majority_Videos_Category_Type, y=Category_Trending.times_in_trending, ax=ax)

ax.set_title("Category vs total no.of times in trending")
# The x label is set explicitly; a bare `plt.xlabel` (without a call) does nothing.
ax.set_xlabel("Majority video category of the channel")
ax.set_ylabel("no.of times in trending")
# Vertical category names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

In [None]:
# Check that a trending_date value parses as a timestamp. df_minViews keeps
# the row labels of the original frame after sorting and de-duplicating, so a
# given label (such as 1) is not guaranteed to exist; take the first row by
# position instead.
pd.Timestamp(df_minViews.trending_date.iloc[0])

Let's create a column that holds the number of days it took for a video to enter trending after it was uploaded.
A value of 0 means less than 24 hours.

In [None]:
# Raw string: in a normal string literal "\s" is an invalid escape sequence
# and triggers a warning on recent Python versions. The pattern itself is unchanged.
DATE_SPLIT_PATTERN = r',\s*'

# Keep the part before the first comma (if any) and parse it as a timestamp.
df_minViews["JoinedTrending"] = pd.to_datetime(df_minViews.trending_date.str.split(DATE_SPLIT_PATTERN).str[0])
df_minViews["UploadedTime"] = pd.to_datetime(df_minViews.publishedAt.str.split(DATE_SPLIT_PATTERN).str[0])

# Whole days between upload and the first trending row.
days_to_trending = (df_minViews["JoinedTrending"] - df_minViews["UploadedTime"]).dt.days

# A value of -1 means the trending timestamp is less than a day before the
# upload timestamp; count it as 0 days.
df_minViews["DaysTakenToBeOnTrending"] = days_to_trending.replace(-1, 0)
df_minViews.sample(n=10)

# Average no.of days took to be in trending

In [None]:
# Days taken to enter trending, per category. seaborn draws one bar per
# category whose height is the average over that category's videos.

# One 20x10 figure with a single Axes to draw on.
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=df_minViews.Video_Category, y=df_minViews.DaysTakenToBeOnTrending, ax=ax)

ax.set_title("Average no.of days took to be in trending")
# The x label is set explicitly; a bare `plt.xlabel` (without a call) does nothing.
ax.set_xlabel("Video Category")
ax.set_ylabel("Avg no.of days")
# Vertical category names so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

(Will Add further analysis soon)