In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from plotly.offline import init_notebook_mode, iplot 
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py


# Disable warnings
# NOTE: the 'ignore' action silences every warning category for the rest of the
# session, including DeprecationWarning / FutureWarning raised by libraries.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the US trending-videos CSV from the attached Kaggle dataset into a DataFrame.
data=pd.read_csv("/kaggle/input/youtube-trending-video-dataset/US_youtube_trending_data.csv")

In [None]:
data.head(5)

In [None]:
data.shape  # (rows, columns) of the dataset as loaded.

In [None]:
data.info()  # Column dtypes and non-null counts of the dataset.

In [None]:
data.isna().sum()  # Number of null values in each column.

In [None]:
data=data.dropna()  # Drop every row that contains at least one null value.

In [None]:
data.shape

In [None]:
# Print the number of distinct (non-null) values held by each column.
separator = "-------------------------------------"
for column in data.columns:
    distinct_count = data[column].nunique()
    print(column + ":", distinct_count)
    print(separator)

In [None]:
data.columns

In [None]:
data["video_id"].value_counts()[:20]# we can clearly see that video_id's are repeated.
#Hence we will consider only the row with the most recent trending date for each video_id, as it contains the cumulative comment and like counts.

In [None]:
# Convert the trending_date column to datetimes and derive month, day and week columns.
data['trending_date'] = pd.to_datetime(data['trending_date'])
data["month"] = data["trending_date"].dt.month
data["day"] = data["trending_date"].dt.day
# DatetimeIndex.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# isocalendar().week is its replacement (same ISO week number). Going through the
# Series .dt accessor keeps the row index, so the assignment stays aligned, and the
# cast restores the plain int64 dtype that the old attribute produced.
data["week"] = data["trending_date"].dt.isocalendar().week.astype("int64")

In [None]:
# Remove the columns the analysis treats as redundant.
# (`columns=` already names the axis, so no separate axis argument is needed.)
redundant_columns = ["thumbnail_link", "channelId"]
data = data.drop(columns=redundant_columns)

In [None]:
# Keep a single row per video_id: the one carrying the latest trending_date.
chronological = data.sort_values(by="trending_date")
fdata = chronological.drop_duplicates(subset=["video_id"], keep="last")

In [None]:
fdata.shape

In [None]:
# Rank the de-duplicated videos by view count and chart the 20 most viewed ones.
fdata = fdata.sort_values(by=['view_count'], ascending=False)
top20views = fdata.head(20)

# Fonts shared by both axes.
axis_title_font = {"size": 14}
axis_tick_font = dict(family='Rockwell', size=10)

fig = px.bar(
    top20views,
    x='channelTitle',
    y='view_count',
    color='view_count',
    hover_data=['view_count', "title"],
)
fig.update_xaxes(title_text='Channel title', title_font=axis_title_font, tickfont=axis_tick_font)
fig.update_yaxes(title_text='View_count in Millions', title_font=axis_title_font, tickfont=axis_tick_font)
fig.update_layout(title_text='View count by channel title')
fig.show()


In [None]:
# Mean view count per categoryId, drawn as bars coloured by that same mean.
cat_count = fdata.groupby("categoryId")["view_count"].mean()
category_ids = cat_count.index
mean_views = cat_count.values

fig = px.bar(cat_count, x=category_ids, y=mean_views, color=mean_views)

# Fonts shared by both axes.
axis_title_font = {"size": 14}
axis_tick_font = dict(family='Rockwell', size=10)
fig.update_xaxes(title_text='Category Id', title_font=axis_title_font, tickfont=axis_tick_font)
fig.update_yaxes(title_text='Average View Count', title_font=axis_title_font, tickfont=axis_tick_font)

fig.update_layout(title_text='Average View count by Category id')
fig.show()

#### We can clearly see that category 10 has the highest average view count overall.

In [None]:
# Parse the publish timestamp and derive day, ISO week, time-of-day and weekday columns.
fdata['publishedAt'] = pd.to_datetime(fdata['publishedAt'])

fdata["Published_day"] = fdata["publishedAt"].dt.day
# DatetimeIndex.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# isocalendar().week is its replacement (same ISO week number). The Series .dt
# accessor keeps the row index so the assignment stays aligned, and the cast
# restores the plain int64 dtype that the old attribute produced.
fdata["Published_week"] = fdata["publishedAt"].dt.isocalendar().week.astype("int64")
fdata["Published_time"] = fdata["publishedAt"].dt.time
# pandas numbers weekdays Monday=0 ... Sunday=6.
fdata["Published_weekday"] = fdata["publishedAt"].dt.weekday

### Relationship between published weekday and Views?

In [None]:
# Average view count per publishing weekday, drawn as a bubble chart.
day_count = fdata.groupby("Published_weekday")["view_count"].mean()

# Published_weekday holds pandas weekday numbers, which run Monday=0 ... Sunday=6,
# so the name list must start at Monday (starting at Sunday labels every bubble one
# day off). Names are looked up per group key so the labels stay correct, and the
# list stays the same length as the data, even if some weekday has no rows.
weekday_names = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
color_code = [weekday_names[int(day)] for day in day_count.index]

fig = px.scatter(day_count, x=day_count.index, y=day_count.values, color=color_code,
                 color_discrete_sequence=px.colors.qualitative.Vivid,
                 size=day_count.values*10)


fig.update_xaxes(title_text='Weekdays',title_font = {"size": 14},tickfont=dict(family='Rockwell', size=10))
fig.update_yaxes(title_text='Average View Count',title_font = {"size": 14},tickfont=dict(family='Rockwell', size=10))

fig.update_layout(title_text='Average View count by Weekdays')

fig.show()

In [None]:
fdata.columns

In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english'))


def _strip_stopwords(text):
    """Tokenize ``text`` with NLTK and re-join the tokens that are not English stop words.

    The membership test uses the lower-cased token because the NLTK English
    stop-word list is all lower case; a case-sensitive test lets capitalised
    stop words such as "The" or "And" slip through.

    Parameters
    ----------
    text : str
        Raw text of one cell.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    kept_tokens = [token for token in word_tokenize(text) if token.lower() not in stop_words]
    return ' '.join(kept_tokens)


# Apply the same cleaning to each free-text column instead of repeating it per column.
for text_column in ['title', 'tags', 'description']:
    fdata[text_column] = fdata[text_column].apply(_strip_stopwords)

In [None]:
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

# Load the YouTube logo as a pixel array; it is passed to WordCloud as the mask.
mask_image = Image.open('../input/youtube-mask-logo/youtube logo.jpg')
Youtube = np.array(mask_image)

In [None]:
def WordCloudfunction(title,text):
    """Draw a word cloud, shaped by the ``Youtube`` mask, for one text column of ``fdata``.

    Parameters
    ----------
    title : str
        Heading displayed above the figure.
    text : str
        Name of the ``fdata`` column whose values are joined with spaces
        to form the text the cloud is generated from.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    corpus = ' '.join(fdata[text].tolist())

    # sns.set applies its settings (here the figure size) globally, not per figure.
    sns.set(rc={'figure.figsize':(16,10)})

    generator = WordCloud(mask=Youtube, background_color="white")
    rendered_cloud = generator.generate(corpus)

    plt.figure()
    plt.imshow(rendered_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.title(title, size=24)
    plt.show()

In [None]:
WordCloudfunction('Tags','tags')

In [None]:
WordCloudfunction('Video title','title')

#### Official Music, Minecraft, Shorts, Fortnite and Challenge are the most used title words among trending YouTubers.

In [None]:
WordCloudfunction('Description','description')

In [None]:
WordCloudfunction('channelTitle','channelTitle')

#### Sports and Music channels are at the top of the trending list.

In [None]:
# Count how many videos mention each popular keyword somewhere in their tags.
Popular_words = ['tiktok','shorts','pranks',"news","sports","music","official","reallife"]

# case=False  : the keywords are lower case, while tags may be capitalised differently,
#               so a case-sensitive search would undercount.
# regex=False : keywords are matched literally rather than as regular expressions.
# int(...)    : turns the numpy integer returned by .sum() into a plain Python int
#               so the printed dict shows bare numbers.
Popular_words_count = {
    word: int(fdata['tags'].str.contains(word, case=False, regex=False).sum())
    for word in Popular_words
}

print(Popular_words_count)