In [12]:
import json
import datetime
import dateutil
import pytz
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import isodate as isodate
from langdetect import detect

def filter_video(video):
    """Return True when the video was published inside the study window.

    The window runs from 2019-10-25 00:00:00 UTC (inclusive) to
    2020-01-01 00:00:00 UTC (exclusive).

    Args:
        video: Record whose publish timestamp string is stored at
            ``video["video"]["snippet"]["publishedAt"]``.

    Returns:
        bool: True if the publish time falls inside the window.
    """
    # Import the submodule explicitly: a bare ``import dateutil`` does not
    # guarantee that ``dateutil.parser`` has been loaded.
    from dateutil import parser as date_parser

    utc = datetime.timezone.utc
    window_start = datetime.datetime(2019, 10, 25, tzinfo=utc)
    window_end = datetime.datetime(2020, 1, 1, tzinfo=utc)

    published = date_parser.parse(video["video"]["snippet"]["publishedAt"])
    # A half-open interval keeps timestamps with fractional seconds in the
    # last second of Oct 24 (e.g. 23:59:59.500) out of the window.
    return window_start <= published < window_end

def get_category(channel):
    """Return the topic category names attached to a channel.

    Each entry of ``channel["topicDetails"]["topicCategories"]`` is reduced to
    the text after its last ``/``.

    Args:
        channel: Channel record (a dict). ``topicDetails`` and the nested
            ``topicCategories`` list are both optional.

    Returns:
        list[str]: The category names, or an empty list when none are present.
    """
    # ``topicDetails`` may be present without ``topicCategories``; treat both
    # levels as optional instead of raising KeyError.
    topic_details = channel.get("topicDetails") or {}
    topic_urls = topic_details.get("topicCategories") or []
    return [url.split('/')[-1] for url in topic_urls]

def get_comment_list(comments):
    """Reduce comment-thread records to ``{"text", "like"}`` dicts.

    Args:
        comments: Iterable of comment-thread records. An element equal to the
            string ``"error"`` aborts processing.

    Returns:
        list[dict]: One dict per thread with the top-level comment's display
        text and like count, or an empty list as soon as an ``"error"``
        element is encountered (anything collected so far is discarded).
    """
    collected = []
    for thread in comments:
        if thread == "error":
            return []
        top_level = thread["snippet"]["topLevelComment"]["snippet"]
        entry = {
            "text": top_level["textDisplay"],
            "like": top_level["likeCount"],
        }
        collected.append(entry)
    return collected

def get_caption_list(captions):
    """Join caption segments into one string.

    Every segment's ``"text"`` is followed by a single space, and newlines in
    the joined text are replaced by ``"."``.

    Args:
        captions: Iterable of caption segments (dicts with a ``"text"`` key).
            An element equal to the string ``"error"`` stops processing.

    Returns:
        str: The normalised caption text. When an ``"error"`` element is hit,
        the text collected before it is returned, normalised the same way as
        a complete caption.
    """
    pieces = []
    for segment in captions:
        if segment == "error":
            # Stop here but still normalise what was collected, so partial
            # captions do not keep raw newlines.
            break
        pieces.append(segment["text"] + " ")
    # A single join avoids rebuilding the string once per segment.
    return "".join(pieces).replace("\n", ".")

def get_duration(duration):
    """Parse a duration string with ``isodate`` and return its total seconds.

    Args:
        duration: Duration string accepted by ``isodate.parse_duration``.

    Returns:
        The value of ``total_seconds()`` on the parsed duration.
    """
    return isodate.parse_duration(duration).total_seconds()

# NOTE(review): the original statement was truncated (an empty ``with`` body
# followed by a dangling ``US_CATE=``), which is a syntax error. It has been
# reconstructed from how get_video_category consumes US_CATE: a list of entries
# carrying "id" and ["snippet"]["title"]. Confirm against the real layout of
# KEY/US_CATE.json.
with open("KEY/US_CATE.json", 'r') as fp:
    _us_cate_payload = json.load(fp)

# Accept either a bare list of category entries or an object that wraps them
# under "items" (the shape channel["videoCategories"] has).
if isinstance(_us_cate_payload, dict):
    US_CATE = _us_cate_payload["items"]
else:
    US_CATE = _us_cate_payload

def _find_category(categories, category_id):
    """Return the first entry of ``categories`` whose "id" equals ``category_id``, else None."""
    return next((cate for cate in categories if cate["id"] == category_id), None)


def get_video_category(channel, categoryId):
    """Resolve a video category id to its title.

    The channel's own ``videoCategories`` list is consulted first when it is
    present; the module-level ``US_CATE`` list is the fallback.

    Args:
        channel: Channel record; may carry ``channel["videoCategories"]["items"]``.
        categoryId: Category id to look up (compared with ``==`` against each
            entry's ``"id"``).

    Returns:
        The matching entry's ``["snippet"]["title"]``, or ``None`` when the id
        is found in neither list.
    """
    if "videoCategories" in channel:
        match = _find_category(channel["videoCategories"]["items"], categoryId)
        if match is not None:
            return match["snippet"]["title"]

    # Fallback list; it is scanned only once, even when the channel carries no
    # category list of its own.
    match = _find_category(US_CATE, categoryId)
    if match is not None:
        return match["snippet"]["title"]
    return None

all_data_path="data.json"


def _build_youtuber(data):
    """Build the channel-level node record from one parsed line of the data file."""
    channel = data["channel"]
    snippet = channel["snippet"]
    statistics = channel["statistics"]
    return {
        "channel_id": channel["id"],
        "channel_name": snippet["localized"]["title"],
        # The misspelled key is kept on purpose: it is part of the emitted JSON.
        "discription": snippet["localized"]["description"],
        # The original literal listed "country" twice; only the second value
        # ("unknown" as the default) ever took effect, so that one is kept.
        "country": snippet["country"] if "country" in snippet else "unknown",
        "subscriber_count": int(statistics["subscriberCount"]),
        "publish_at": snippet["publishedAt"],
        "video_count": int(statistics["videoCount"]),
        "view_count": int(statistics["viewCount"]),
        "category": get_category(channel),
        # Raw subscription payload; replaced by "network_subscriptions" later.
        "subscription": data["subscription"],
        "network_subscriptions": [],
        "network_tag_mentioning": [],
        "network_caption_mentioning": [],
        "network_description_mentioning": [],
        "network_subscribers": [],
        "network_tag_mentioners": [],
        "network_caption_mentioners": [],
        "network_description_mentioners": [],
        "video_list": [],
    }


# First pass: one record per channel that has at least one video accepted by
# filter_video, keyed by channel id. The file holds one JSON document per line.
youtuber_list = {}
with open(all_data_path, 'r') as fp:
    for line in fp:
        if not line.strip():
            # Tolerate blank lines instead of failing in json.loads.
            continue
        data = json.loads(line)
        if not filter_video(data):
            continue
        channel_id = data["channel"]["id"]
        # The first record seen for a channel wins; skip the work for repeats.
        if channel_id not in youtuber_list:
            youtuber_list[channel_id] = _build_youtuber(data)
# Second pass: attach every video accepted by filter_video to the "video_list"
# of its channel.
def _build_video(data):
    """Build the per-video record from one parsed line of the data file."""
    video = data["video"]
    snippet = video["snippet"]
    statistics = video["statistics"]
    title = snippet["localized"]["title"]
    category_id = snippet["categoryId"]
    return {
        "video_id": video["id"],
        "title": title,
        "video_length": get_duration(video["contentDetails"]["duration"]),
        "publish_at": snippet["publishedAt"],
        "category_id": category_id,
        "category": get_video_category(data["channel"], category_id),
        # NOTE(review): langdetect's detect() may raise on titles it cannot
        # classify; confirm whether a fallback value is wanted here.
        "language": detect(title),
        "live_content": snippet["liveBroadcastContent"],
        "description": snippet["localized"]["description"],
        "tags": snippet["tags"] if "tags" in snippet else [],
        # Every counter is optional and defaults to 0; "viewCount" used to be
        # the only one that raised KeyError when absent.
        "comment_count": int(statistics.get("commentCount", 0)),
        "view_count": int(statistics.get("viewCount", 0)),
        "like_count": int(statistics.get("likeCount", 0)),
        "dislike_count": int(statistics.get("dislikeCount", 0)),
        "comment_list": get_comment_list(data["comments"]),
        "caption": get_caption_list(data["caption"]),
    }


with open(all_data_path, 'r') as fp:
    for line in fp:
        if not line.strip():
            # Tolerate blank lines instead of failing in json.loads.
            continue
        data = json.loads(line)
        if filter_video(data):
            channel_id = data["channel"]["id"]
            youtuber_list[channel_id]["video_list"].append(_build_video(data))
        
        
def get_subscriptions(subscriptions):
    """Return the subscribed channel ids that are keys of ``youtuber_list``.

    Args:
        subscriptions: Sequence of subscription records. An empty sequence, or
            one whose first element is the string ``"error"``, yields no ids
            (presumably the marker is written when fetching failed — verify
            against the producer of the data file).

    Returns:
        list[str]: Channel ids in input order, duplicates included.
    """
    if len(subscriptions) == 0:
        return []
    if subscriptions[0] == "error":
        return []

    subscribed_ids = (
        entry["snippet"]["resourceId"]["channelId"] for entry in subscriptions
    )
    return [cid for cid in subscribed_ids if cid in youtuber_list]

def _mentioned_channels(you_channel, texts, youtubers=None):
    """Return ids of channels whose name occurs, case-insensitively, in any of ``texts``.

    Channels whose name equals ``you_channel`` (ignoring case) are skipped, as
    are channels with an empty name, because an empty string is a substring of
    every text and would always count as mentioned.
    """
    if youtubers is None:
        youtubers = youtuber_list.values()

    own_name = you_channel.lower()
    # Lower-case the texts once instead of once per candidate channel.
    lowered_texts = [text.lower() for text in texts]

    mentioned = []
    seen_ids = set()
    for you in youtubers:
        name = you["channel_name"].lower()
        if not name or name == own_name:
            continue
        channel_id = you["channel_id"]
        if channel_id in seen_ids:
            continue
        if any(name in text for text in lowered_texts):
            seen_ids.add(channel_id)
            mentioned.append(channel_id)
    return mentioned


def tag_mentioning_list(you_channel, tag_list, youtubers=None):
    """Return ids of the channels whose name appears inside any tag.

    Args:
        you_channel: Name of the channel that owns the tags; channels with the
            same name (ignoring case) are never reported.
        tag_list: Tag strings to search.
        youtubers: Optional iterable of channel records (dicts with
            ``"channel_name"`` and ``"channel_id"``). Defaults to the values of
            the module-level ``youtuber_list``.

    Returns:
        list[str]: Channel ids, each at most once, in ``youtubers`` order.
    """
    return _mentioned_channels(you_channel, tag_list, youtubers)


def caption_mentioning_list(you_channel, caption_list, youtubers=None):
    """Return ids of the channels whose name appears inside any caption text.

    Args:
        you_channel: Name of the channel that owns the captions; channels with
            the same name (ignoring case) are never reported.
        caption_list: Caption strings to search.
        youtubers: Optional iterable of channel records (dicts with
            ``"channel_name"`` and ``"channel_id"``). Defaults to the values of
            the module-level ``youtuber_list``.

    Returns:
        list[str]: Channel ids, each at most once, in ``youtubers`` order.
    """
    return _mentioned_channels(you_channel, caption_list, youtubers)


def description_mentioning_list(you_channel, description_list, youtubers=None):
    """Return ids of the channels whose name appears inside any description.

    Args:
        you_channel: Name of the channel that owns the descriptions; channels
            with the same name (ignoring case) are never reported.
        description_list: Description strings to search.
        youtubers: Optional iterable of channel records (dicts with
            ``"channel_name"`` and ``"channel_id"``). Defaults to the values of
            the module-level ``youtuber_list``.

    Returns:
        list[str]: Channel ids, each at most once, in ``youtubers`` order.
    """
    return _mentioned_channels(you_channel, description_list, youtubers)
# Fill in each channel's outgoing edges: the channels it subscribes to and the
# channels it names in its tags, captions and descriptions.
for you in youtuber_list.values():
    own_name = you["channel_name"]
    videos = you["video_list"]

    # Subscriptions: the raw payload is replaced by the in-network channel ids.
    you["network_subscriptions"] = get_subscriptions(you["subscription"])
    del you["subscription"]

    # Tags: flatten the per-video tag lists and keep the result on the record.
    tags = [tag for v in videos for tag in v["tags"]]
    you["tags"] = tags
    you["network_tag_mentioning"] = tag_mentioning_list(own_name, tags)

    # Captions: one caption string per video.
    captions = [v["caption"] for v in videos]
    you["network_caption_mentioning"] = caption_mentioning_list(own_name, captions)

    # Descriptions: one description string per video.
    descriptions = [v["description"] for v in videos]
    you["network_description_mentioning"] = description_mentioning_list(own_name, descriptions)


# Mirror every outgoing edge onto its target: if channel A lists channel B under
# an outgoing key, A's id is recorded under B's matching incoming key.
_REVERSE_EDGE_KEYS = (
    ("network_subscriptions", "network_subscribers"),
    ("network_tag_mentioning", "network_tag_mentioners"),
    ("network_caption_mentioning", "network_caption_mentioners"),
    ("network_description_mentioning", "network_description_mentioners"),
)

for you in youtuber_list.values():
    source_id = you["channel_id"]
    for outgoing_key, incoming_key in _REVERSE_EDGE_KEYS:
        for target_id in you[outgoing_key]:
            incoming = youtuber_list[target_id][incoming_key]
            # De-duplicate on the SOURCE id. The previous check tested the
            # target's own id, so duplicates slipped through and a channel
            # listing itself blocked every later addition.
            if source_id not in incoming:
                incoming.append(source_id)
            
# temp=[(you["channel_name"],you["network_description_mentioners"]) for you in youtuber_list.values()]
# print(json.dumps(temp, indent=4))
            
# Flatten every incoming edge into one link list. For each target channel the
# edges are emitted in the order subscribers, tag, caption, description, all
# with weight 1.
subscribe_link = []
for y in youtuber_list.values():
    for incoming_key in (
        "network_subscribers",
        "network_tag_mentioners",
        "network_caption_mentioners",
        "network_description_mentioners",
    ):
        for s in y[incoming_key]:
            link = {"source": s, "target": y["channel_id"], "weight": 1}
            subscribe_link.append(link)

# Assemble the graph document: channel records as nodes, flattened edges as links.
nodes = [*youtuber_list.values()]
data = {"nodes": nodes, "links": subscribe_link}

# Write the whole graph as a single JSON document.
with open("subscriber_network.json", 'w+') as fp:
    serialized = json.dumps(data)
    fp.write(serialized)

# Summary: number of channels and total number of videos attached to them.
total_videos = sum(len(d["video_list"]) for d in nodes)
print("youtuber count:{}".format(len(nodes)))
print("video count:{}".format(total_videos))

youtuber count:473
video count:518
