# Preprocessing (Joint)

In [1]:
import pandas as pd
# One trending-videos CSV per region; the first two letters of each name are the region code.
countries = [
    'CAvideos', 'DEvideos', 'FRvideos', 'GBvideos', 'INvideos',
    'JPvideos', 'KRvideos', 'MXvideos', 'RUvideos', 'USvideos',
]
df_dict = {}
for country in countries:
    frame = pd.read_csv(f"../Youtube_data/{country}.csv", encoding='utf-8')
    # A video_id can occur on several rows; keep only its last row in the file.
    frame = frame.drop_duplicates(subset='video_id', keep="last")
    frame["region"] = country[:2]
    df_dict[country] = frame
    # Quick per-region sanity report: row/column counts and total views.
    print(country, frame.shape)
    print("views:", frame['views'].sum())
    print()

CAvideos (24427, 17)
views: 20410495101

DEvideos (29627, 17)
views: 14237965000

FRvideos (30581, 17)
views: 10575960316

GBvideos (3272, 17)
views: 15733754312

INvideos (16307, 17)
views: 11146317224

JPvideos (12912, 17)
views: 2912931256

KRvideos (15876, 17)
views: 5924564502

MXvideos (33513, 17)
views: 9302980511

RUvideos (34282, 17)
views: 7045179834

USvideos (6351, 17)
views: 12461406596



In [2]:
# Number of distinct channel titles in each regional frame.
for country, region_frame in df_dict.items():
    print("Unique channels in",country,':',len(set(region_frame['channel_title'])))

# Stack the regional frames into one table. ignore_index=True renumbers the rows
# 0..n-1; without it every region keeps its own row labels, so the combined
# frame would contain duplicate index labels and label-based lookups or
# alignment on `df` would be ambiguous.
df = pd.concat(list(df_dict.values()), ignore_index=True)
df.shape


Unique channels in CAvideos : 5065
Unique channels in DEvideos : 6079
Unique channels in FRvideos : 6670
Unique channels in GBvideos : 1611
Unique channels in INvideos : 1422
Unique channels in JPvideos : 4616
Unique channels in KRvideos : 3982
Unique channels in MXvideos : 6921
Unique channels in RUvideos : 6866
Unique channels in USvideos : 2198


(207148, 17)

In [3]:
# Show the column labels of the concatenated frame (the per-region columns plus 'region').
df.columns

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'description', 'region'],
      dtype='object')

# Category

In [4]:
import json
# One category-definition JSON file per region; the first two letters are the region code.
categories = ['CA_category_id','DE_category_id','FR_category_id','GB_category_id','IN_category_id','JP_category_id','KR_category_id','MX_category_id','RU_category_id','US_category_id']
categories_json={}
for category in categories:
    # `with` guarantees the handle is closed even if json.load raises, and the
    # explicit encoding keeps decoding independent of the platform default.
    with open("../Youtube_data/"+category+".json", encoding="utf-8") as f:
        data = json.load(f)
    # Key the parsed document by region code, e.g. 'CA'.
    categories_json[category[:2]]=data

In [5]:
# Inspect the list of category entries loaded for the 'CA' region.
categories_json['CA']['items']

[{'kind': 'youtube#videoCategory',
  'etag': '"ld9biNPKjAjgjV7EZ4EKeEGrhao/Xy1mB4_yLrHy_BmKmPBggty2mZQ"',
  'id': '1',
  'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
   'title': 'Film & Animation',
   'assignable': True}},
 {'kind': 'youtube#videoCategory',
  'etag': '"ld9biNPKjAjgjV7EZ4EKeEGrhao/UZ1oLIIz2dxIhO45ZTFR3a3NyTA"',
  'id': '2',
  'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
   'title': 'Autos & Vehicles',
   'assignable': True}},
 {'kind': 'youtube#videoCategory',
  'etag': '"ld9biNPKjAjgjV7EZ4EKeEGrhao/nqRIq97-xe5XRZTxbknKFVe5Lmg"',
  'id': '10',
  'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
   'title': 'Music',
   'assignable': True}},
 {'kind': 'youtube#videoCategory',
  'etag': '"ld9biNPKjAjgjV7EZ4EKeEGrhao/HwXKamM1Q20q9BN-oBJavSGkfDI"',
  'id': '15',
  'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
   'title': 'Pets & Animals',
   'assignable': True}},
 {'kind': 'youtube#videoCategory',
  'etag': '"ld9biNPKjAjgjV7EZ4EKeEGrhao/9GQMSRjrZdHe

In [6]:
def catIDToString(cat, reg):
    """Return the category title for a category id in a given region.

    Parameters
    ----------
    cat : value whose ``str()`` form is compared with each item's ``"id"``.
    reg : key into the module-level ``categories_json`` mapping.

    Returns
    -------
    str
        ``item['snippet']['title']`` of the first matching item. When the
        region is unknown, the entry is malformed, or no item matches, the id
        itself is returned as a string so callers always get a usable label.
    """
    cat_id = str(cat)
    try:
        for item in categories_json[reg]['items']:
            if item["id"] == cat_id:
                return item['snippet']['title']
    except (KeyError, TypeError):
        # Only lookup problems fall back to the raw id. Anything else (for
        # example categories_json not being defined) is allowed to surface
        # instead of silently mislabelling every row.
        pass
    return cat_id

def _category_name_for_row(row):
    """Resolve one row's category id using that row's own region."""
    return catIDToString(row['category_id'], row['region'])

# Overwrite the ids with their titles, then list the distinct values that
# result (ids without a match remain as digit strings).
df['category_id'] = df.apply(_category_name_for_row, axis=1)
set(df['category_id'])

{'29',
 'Autos & Vehicles',
 'Comedy',
 'Education',
 'Entertainment',
 'Film & Animation',
 'Gaming',
 'Howto & Style',
 'Movies',
 'Music',
 'News & Politics',
 'Nonprofits & Activism',
 'People & Blogs',
 'Pets & Animals',
 'Science & Technology',
 'Shows',
 'Sports',
 'Trailers',
 'Travel & Events'}

In [7]:
from datetime import datetime
import numpy as np
# Parse the publish timestamps in a single vectorized call instead of a
# row-by-row strptime. The explicit format matches the same layout, and the
# call can be re-run safely once the column already holds datetimes.
df['publish_time'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%dT%H:%M:%S.%fZ')

# Rank channels by their summed views across all rows (rank 1 = most views;
# 'dense' leaves no gaps between ranks). The string alias 'sum' selects the
# built-in grouped sum; passing np.sum to transform is deprecated in recent
# pandas releases.
df['channel_rank'] = (
    df.groupby('channel_title')['views']
      .transform('sum')
      .rank(ascending=False, method='dense')
)

In [9]:
# Persist the combined frame; index=False keeps the row index out of the file.
df.to_csv('youtube_all_district.csv', index=False)

# Tags for word cloud

In [7]:
import pandas as pd
# Re-read 'youtube_all_district.csv' from disk; this rebinds `df`.
# NOTE(review): no parse_dates is passed, so publish_time is presumably loaded
# as text here — confirm whether datetimes are needed in this section.
df=pd.read_csv('youtube_all_district.csv')
# Preview the first two rows.
df.head(2)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,region,channel_rank
0,0yIWz1XEeyc,17.14.11,Jake Paul Says Alissa Violet CHEATED with LOGA...,DramaAlert,News & Politics,2017-11-13 07:37:51,"#DramaAlert|""Drama""|""Alert""|""DramaAlert""|""keem...",1309699,103755,4613,12143,https://i.ytimg.com/vi/0yIWz1XEeyc/default.jpg,False,False,False,► Follow for News! - https://twitter.com/KEEMS...,CA,154.0
1,FyZMnhUtLfE,17.14.11,猎场 | Game Of Hunting 12【TV版】（胡歌、張嘉譯、祖峰等主演）,大劇獨播,Film & Animation,2017-11-12 16:00:01,"電視劇|""大陸電視劇""|""猎场""|""职场""|""商战""|""爱情""|""都市""|""胡歌""|""陈龙""...",158815,218,30,186,https://i.ytimg.com/vi/FyZMnhUtLfE/default.jpg,False,False,False,Thanks for watching the drama! Help more peopl...,CA,767.0


In [51]:

def tagsList(tagString, region, category):
    """Split a '|'-separated tag string into one record per tag.

    Parameters
    ----------
    tagString : str
        Raw tags with entries separated by ``|``; double quotes around
        entries are removed. Any non-string value (e.g. NaN for a missing
        cell) yields an empty list.
    region, category :
        Values copied unchanged into every record.

    Returns
    -------
    list of dict
        One ``{'tag', 'region', 'category'}`` dict per tag, in input order.
    """
    # Explicit type check instead of a bare `except:` so that only the
    # "not a string" case is treated as "no tags" and real errors stay visible.
    if not isinstance(tagString, str):
        return []
    # NOTE(review): no filtering is applied, so placeholder values such as
    # '[none]' are emitted as ordinary tags — confirm that is intended.
    return [
        {'tag': tag, 'region': region, 'category': category}
        for tag in tagString.replace('"', '').split("|")
    ]

def _tag_records_for_row(row):
    """Build the tag records for one video row."""
    return tagsList(row['tags'], row['region'], row['category_id'])

# One list of tag records per video row.
df_tag = df.apply(_tag_records_for_row, axis=1)
df_tag

0         [{'tag': '#DramaAlert', 'region': 'CA', 'categ...
1         [{'tag': '電視劇', 'region': 'CA', 'category': 'F...
2         [{'tag': 'punjabi songs', 'region': 'CA', 'cat...
3         [{'tag': 'prank', 'region': 'CA', 'category': ...
4         [{'tag': 'Graham Norton', 'region': 'CA', 'cat...
                                ...                        
207143    [{'tag': 'aarons animals', 'region': 'US', 'ca...
207144    [{'tag': '[none]', 'region': 'US', 'category':...
207145    [{'tag': 'I gave safiya nygaard a perfect hair...
207146    [{'tag': 'Black Panther', 'region': 'US', 'cat...
207147    [{'tag': 'call of duty', 'region': 'US', 'cate...
Length: 207148, dtype: object

In [52]:
# Flatten the per-row lists into one long list of tag records, keeping row order.
tagDict = []
for row_records in df_tag.tolist():
    tagDict.extend(row_records)

# Write one CSV row per tag record, without the row index.
pd.DataFrame(tagDict).to_csv("tags.csv", index=False)

In [53]:
# Read the flattened tag table back from 'tags.csv' and display it.
# NOTE(review): read_csv is called with its default NA handling, so tags such
# as 'NA' or empty strings may come back as NaN — confirm whether
# keep_default_na=False is wanted before building the word cloud.
df_tags = pd.read_csv("tags.csv")
df_tags

Unnamed: 0,tag,region,category
0,#DramaAlert,CA,News & Politics
1,Drama,CA,News & Politics
2,Alert,CA,News & Politics
3,DramaAlert,CA,News & Politics
4,keemstar,CA,News & Politics
...,...,...,...
3445696,ending explained,US,Film & Animation
3445697,call of duty,US,Gaming
3445698,cod,US,Gaming
3445699,activision,US,Gaming


In [54]:
# Distinct region codes present in the tag table.
set(df_tags['region'])

{'CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US'}