In [12]:
import json
import pandas as pd
import re
from pyspark import SparkContext

In [18]:
# Reuse the already-running SparkContext when there is one. Constructing
# SparkContext() directly fails if this cell is re-run while a context is
# still alive, which previously required a manual sc.stop() beforehand.
sc = SparkContext.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/18 10:04:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [19]:
# Region whose trending data is analysed in this notebook.
# Data is included for the IN, US, GB, DE, CA, FR, RU, BR, MX, KR, and JP regions
# (India, USA, Great Britain, Germany, Canada, France, Russia, Brazil, Mexico, South Korea, and Japan respectively).
# Set `region` to one of these two-letter codes, written as a quoted string.
region = "US"

#display_limit = 1000

In [20]:
from os import listdir

# Maps the first two characters of each CSV file name (used as the region
# code) to the path of that CSV file.
data_files_location = {}
# Maps each category id found in the JSON files to its category title.
categories = {}

for file_name in listdir('./data'):
    file_path = f'./data/{file_name}'
    if file_name.endswith('.json'):
        # Open through a context manager so the handle is closed immediately
        # rather than left dangling until garbage collection. JSON is read
        # as UTF-8 regardless of the platform's default encoding.
        with open(file_path, encoding='utf-8') as category_file:
            category_payload = json.load(category_file)
        for item in category_payload['items']:
            categories[item['id']] = item['snippet']['title']
    elif file_name.endswith('.csv'):
        data_files_location[file_name[:2]] = file_path

# Fail early with a clear message when the expected input files are absent.
if not categories:
    raise FileNotFoundError('can not locate any json category files')

if not data_files_location:
    raise FileNotFoundError('no CSV data files are located')

### Clean up the DataFrame 
- Load the CSV file for the selected region into a pandas DataFrame
- Remove unnecessary columns
- Map each 'categoryId' to its category title from the .json files and store the result in a new DataFrame column, labeled 'videoCategory'

In [21]:
# Read the CSV of the selected region and discard the columns that the
# analysis does not use.
data = (
    pd.read_csv(data_files_location[region])
    .drop(columns=['thumbnail_link', 'comments_disabled', 'ratings_disabled'])
)

# categoryId is converted to str before the lookup in `categories`;
# ids without a matching key end up as NaN.
data['videoCategory'] = data['categoryId'].astype(str).map(categories)

data

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,description,videoCategory
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11T19:20:14Z,UCvtRTOMP2TqYqu51xNrqAzg,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...,People & Blogs
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11T17:00:10Z,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2020-08-12T00:00:00Z,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,"While running her own modding shop, Ramya Pare...",Gaming
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11T16:34:06Z,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,I left youtube for a month and this is what ha...,Entertainment
3,kXLn3HkpjaA,XXL 2020 Freshman Class Revealed - Official An...,2020-08-11T16:38:55Z,UCbg_UMjlHJg_19SZckaKajg,XXL,10,2020-08-12T00:00:00Z,xxl freshman|xxl freshmen|2020 xxl freshman|20...,496771,23251,1856,7647,Subscribe to XXL → http://bit.ly/subscribe-xxl...,Music
4,VIUo6yapDbc,Ultimate DIY Home Movie Theater for The LaBran...,2020-08-11T15:10:05Z,UCDVPcEbVLQgLZX0Rt6jo34A,Mr. Kate,26,2020-08-12T00:00:00Z,The LaBrant Family|DIY|Interior Design|Makeove...,1123889,45802,964,2196,Transforming The LaBrant Family's empty white ...,Howto & Style
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87586,FdzPV69i4Aw,Great Idea 💡 #shorts #tips #short,2021-10-12T12:00:12Z,UCGLDtG2tl0uG8P0eNstbLUA,Tool_Tips,26,2021-10-17T00:00:00Z,[None],3241407,27687,9037,937,,Howto & Style
87587,40_wdosHJaQ,I Was In The Funniest Minecraft Competition Wi...,2021-10-12T00:15:10Z,UCFbZ2e9IrPejOdp8wsKUxvA,QuackiTwo,24,2021-10-17T00:00:00Z,Quackity|Quackitwo|Quackity Second Channel|Qua...,425760,48882,215,1238,STREAMED LIVE ON TWITCH: https://www.twitch.tv...,Entertainment
87588,cNG1QNQPlNE,Summoner Changes | FFXIV Endwalker Media Tour,2021-10-13T12:00:29Z,UCQjKMGUEzBmEHltb1OIMLUg,Larryzaur,20,2021-10-17T00:00:00Z,ffxiv|final fantasy xiv|ff14|final fantasy 14|...,203634,7065,52,754,Endwalker Media Tour Playlist ► https://www.yo...,Gaming
87589,iYTK_1Sq8ZQ,Reaper Overview | FFXIV Endwalker Media Tour,2021-10-13T12:00:32Z,UCQjKMGUEzBmEHltb1OIMLUg,Larryzaur,20,2021-10-17T00:00:00Z,ffxiv|final fantasy xiv|ff14|final fantasy 14|...,210207,6884,43,540,Endwalker Media Tour Playlist ► https://www.yo...,Gaming


### Simple Analysis of Dataset
Many videos have multiple entries in this dataset. Here, we drop the duplicate entries, keeping for each video the row with the highest view count, to obtain its most recent view, like, dislike, and comment counts. We also keep, for each video, the row with the lowest view count to get its view count at the time it first began trending.

For the US region, this dataset contains:
- a total of 87,591 entries
- 15,180 unique videos
- 4,252 unique channels

In [22]:
# count the distinct video ids present in the dataset
data['video_id'].nunique()

15180

In [23]:
# count the distinct channel ids present in the dataset
data['channelId'].nunique()

4252

In [24]:
# count the distinct channel titles; this can differ from the channelId count
# because one channelId may appear under more than one title (for example,
# UC3IZKseVpdzPSBaWxBxundA is listed as both "HYBE LABELS" and "Big Hit Labels")
data['channelTitle'].nunique()

4310

In [25]:
# for every video, keep only the row holding its largest view_count; that row
# is used as the video's most recent snapshot
final_views = (
    data
    .sort_values(by='view_count', ascending=False)
    .drop_duplicates(subset='video_id')
)
final_views

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,description,videoCategory
56374,WMweEpGlu_U,BTS (방탄소년단) 'Butter' Official MV,2021-05-21T03:46:13Z,UC3IZKseVpdzPSBaWxBxundA,HYBE LABELS,10,2021-05-30T00:00:00Z,BIGHIT|빅히트|방탄소년단|BTS|BANGTAN|방탄,264407389,16021534,150989,6738537,BTS (방탄소년단) 'Butter' Official MV Credits: Dire...,Music
3358,gdZLi9oWNZg,BTS (방탄소년단) 'Dynamite' Official MV,2020-08-21T03:58:10Z,UC3IZKseVpdzPSBaWxBxundA,Big Hit Labels,10,2020-08-28T00:00:00Z,BIGHIT|빅히트|방탄소년단|BTS|BANGTAN|방탄,232649205,15735533,714194,6065230,BTS (방탄소년단) 'Dynamite' Official MVCredits:Dire...,Music
73564,hdmx71UjBXs,Turn into orbeez - Tutorial #Shorts,2021-07-03T04:04:57Z,UCt8z2S30Wl-GQEluFVM8NUw,FFUNTV,24,2021-08-08T00:00:00Z,[None],206202284,6840430,240769,2826,Turn into orbeez - Tutorial #ShortsHey guys! W...,Entertainment
4980,vRXZj0DzXIA,BLACKPINK - 'Ice Cream (with Selena Gomez)' M/V,2020-08-28T04:00:11Z,UCOmHUn--16B90oW2L6FRR3A,BLACKPINK,10,2020-09-05T00:00:00Z,YG Entertainment|YG|와이지|K-pop|BLACKPINK|블랙핑크|블...,184778248,11795670,879354,2735997,BLACKPINK - ‘Ice Cream (with Selena Gomez)’Com...,Music
68979,CuklIb9d3fI,BTS (방탄소년단) 'Permission to Dance' Official MV,2021-07-09T03:59:12Z,UC3IZKseVpdzPSBaWxBxundA,HYBE LABELS,10,2021-07-16T00:00:00Z,HYBE|HYBE LABELS|하이브|하이브레이블즈,156482499,12117314,102132,2781218,BTS (방탄소년단) 'Permission to Dance' Official MVC...,Music
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15289,r7nYQXsxJdU,HBCU Homecoming 2020: Meet Me On The Yard,2020-10-25T01:40:31Z,UCqVDpXKLmKeBU_yyt_QkItQ,YouTube Originals,24,2020-10-27T00:00:00Z,2 CHAINZ|DESI BANKS|LIONEL RICHIE |LANCE GROSS...,0,4257,384,240,HBCU Homecoming 2020 is going down! We welcome...,Entertainment
65049,kmk5vciFbek,Demi Lovato performs their greatest hits this ...,2021-06-26T00:07:19Z,UCZkURf9tDolFOeuw_4RD7XQ,Demi Lovato,24,2021-06-27T00:00:00Z,Pride|Pride 2021|YouTube Pride|YouTube Pride 2...,0,0,0,138,"On June 25, celebrate Pride 2021 with Demi Lov...",Entertainment
74964,Hb3rmh-_FMw,Introducing the shorter side of YouTube,2021-08-10T15:04:25Z,UCBR8-60-B28hp2BmDPdntcQ,YouTube,27,2021-08-15T00:00:00Z,[None],0,22585,1636,0,*****EPILEPSY WARNING ********Watch and create...,Education
73992,HcSwBJY7Xew,Watch The Weeknd and create short videos on th...,2021-08-10T15:08:22Z,UCBR8-60-B28hp2BmDPdntcQ,YouTube,27,2021-08-11T00:00:00Z,[None],0,50071,7256,0,,Education


In [26]:
# for every video, keep only the row holding its smallest view_count; that row
# is used as the snapshot from when the video first entered the trending list
# NOTE(review): this relies on view_count only ever growing between entries -
# confirm, or sort by trending_date instead
initial_views = (
    data
    .sort_values(by='view_count', ascending=True)
    .drop_duplicates(subset='video_id')
)
initial_views

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,description,videoCategory
50455,hAxqygRdM4g,Earth Day 2021 Doodle,2021-04-21T11:00:14Z,UCdq61m8s_48EhJ5OM_MCeGw,GoogleDoodles,1,2021-04-25T00:00:00Z,[None],0,0,0,0,This year’s annual Earth Day Doodle highlights...,Film & Animation
74492,Hb3rmh-_FMw,Introducing the shorter side of YouTube,2021-08-10T15:04:25Z,UCBR8-60-B28hp2BmDPdntcQ,YouTube,27,2021-08-13T00:00:00Z,[None],0,22030,1604,0,*****EPILEPSY WARNING ********Watch and create...,Education
15289,r7nYQXsxJdU,HBCU Homecoming 2020: Meet Me On The Yard,2020-10-25T01:40:31Z,UCqVDpXKLmKeBU_yyt_QkItQ,YouTube Originals,24,2020-10-27T00:00:00Z,2 CHAINZ|DESI BANKS|LIONEL RICHIE |LANCE GROSS...,0,4257,384,240,HBCU Homecoming 2020 is going down! We welcome...,Entertainment
65263,ifJYb2An7wE,Gay And Not Proud - Daniel Howell | YouTube Pr...,2021-06-25T21:04:38Z,UCGjylN-4QCpn8XJ1uY-UOgA,Daniel Howell,24,2021-06-28T00:00:00Z,Pride|Pride 2021|YouTube Pride|YouTube Pride 2...,0,0,0,784,"On June 25, join Daniel Howell as he celebrate...",Entertainment
19967,AWXvClaRtsI,Celebrating Maria Tallchief,2020-11-14T22:39:13Z,UCdq61m8s_48EhJ5OM_MCeGw,GoogleDoodles,10,2020-11-20T00:00:00Z,[None],0,0,0,0,In honor of Native American Heritage Month in ...,Music
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1800,gdZLi9oWNZg,BTS (방탄소년단) 'Dynamite' Official MV,2020-08-21T03:58:10Z,UC3IZKseVpdzPSBaWxBxundA,Big Hit Labels,10,2020-08-21T00:00:00Z,BIGHIT|빅히트|방탄소년단|BTS|BANGTAN|방탄,57229275,7045907,165420,2314087,BTS (방탄소년단) 'Dynamite' Official MVCredits:Dire...,Music
65398,Fw7fbKoK3e8,MvRyhan Funny videos #tiktok #Shorts,2021-06-25T07:37:36Z,UCcFQLco2CA2uq9J2Uwcoi6Q,Mv Ryhan,24,2021-06-29T00:00:00Z,[None],59410897,1176893,39704,4850,#shorts,Entertainment
55605,WMweEpGlu_U,BTS (방탄소년단) 'Butter' Official MV,2021-05-21T03:46:13Z,UC3IZKseVpdzPSBaWxBxundA,HYBE LABELS,10,2021-05-21T00:00:00Z,BIGHIT|빅히트|방탄소년단|BTS|BANGTAN|방탄,67111752,7110071,8998,3400291,BTS (방탄소년단) 'Butter' Official MV Credits: Dire...,Music
51,hsm4poTWjMs,Cardi B - WAP feat. Megan Thee Stallion [Offic...,2020-08-07T04:00:10Z,UCxMAbVFmxKUVGAll0WVGpFw,Cardi B,10,2020-08-12T00:00:00Z,Cardi B|Cardi|Atlantic Records|rap|hip hop|tra...,76805026,2820345,382578,270263,Cardi B - WAP feat. Megan Thee StallionStream/...,Music


In [27]:
# count how many rows (trending appearances) each video_id has in the dataset
video_trend_count = data.value_counts(subset=['video_id'])

# displayed as a frame: video_id is the index, the single column is its count
video_trend_count.to_frame()

Unnamed: 0_level_0,0
video_id,Unnamed: 1_level_1
hdmx71UjBXs,36
aONT7atzqfw,35
mcUpeaSX8BE,34
wY6UyatwVTA,34
vrRfBmMWpLY,33
...,...
z58k37I4Crs,1
1fNQP597qkM,1
rr85rpSi0yI,1
-MnXYZCKrc0,1


In [28]:
# count how many rows (trending appearances) each channelId has in the dataset
channel_trend_count = data.value_counts(subset=['channelId'])
channel_trend_count.to_frame()

Unnamed: 0_level_0,0
channelId,Unnamed: 1_level_1
UCWJ2lWNubArHWmf3FIHbfcQ,550
UCDVYQ4Zhbm3S2dlz7P1GBDg,528
UCpB959t8iPrxQWj7G6n0ctQ,458
UCIPPMRA040LQr5QPyJEbmXA,377
UCke6I9N4KfC968-yRcd5YRg,336
...,...
UC4Llz_FxdMEU0iWO4aG_Xjg,1
UCHmv6QMb4xerJ8Tt5UBHjaw,1
UCulu1KUYJ_2OBDouc_d-cUA,1
UCLCCG11dS-HyB3up3OCRV_w,1


#### Categories
Here, we take a look at the number of trending entries in each category (a video is counted once for every entry it has in the dataset). The Entertainment category has the most trending entries, and the Nonprofits & Activism category has the fewest.

In [29]:
# tally the dataset rows per mapped category title, most frequent first
category_count = data['videoCategory'].value_counts()
category_count

Entertainment            17615
Music                    16552
Gaming                   14458
Sports                    9574
People & Blogs            7524
Comedy                    4721
Film & Animation          3364
News & Politics           3315
Science & Technology      3199
Howto & Style             2933
Education                 1870
Autos & Vehicles          1556
Pets & Animals             474
Travel & Events            351
Nonprofits & Activism       85
Name: videoCategory, dtype: int64