In [39]:
# visualize and analyze popularity
import json
import datetime
import dateutil
import pytz
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import csv
from datetime import timedelta
import dateutil.parser
from collections import defaultdict
from matplotlib.pyplot import figure
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
import pandas as pd
import scipy.stats as stats
from statistics import mean
from bs4 import BeautifulSoup
import gensim


# Path of the JSON file opened by the analysis script below; the script reads
# the youtuber records from its top-level "nodes" entry.
youtuber_list_path="subscriber_network.json"

def get_status(sub):
    """Map a subscriber count to a social-status label.

    Returns "celebrity" for counts of at least 1,000,000, "professional"
    for counts of at least 10,000, and "amateur" for everything else.
    """
    # Ordered from the highest threshold down; the first one reached wins.
    tiers = (
        (1000000, "celebrity"),
        (10000, "professional"),
    )
    for threshold, label in tiers:
        if sub >= threshold:
            return label
    return "amateur"

def get_like_ratio(youtuber):
    """Return the share of likes among all likes and dislikes of a youtuber.

    Args:
        youtuber: mapping with a "video_list" whose items carry
            "like_count" and "dislike_count".

    Returns:
        like_sum / (like_sum + dislike_sum) summed over every video, or 0
        when the videos have no likes and no dislikes at all (including
        an empty video list).
    """
    # The original bound this list to `v` but then iterated over an
    # undefined name `video_list`; read the youtuber's own list instead.
    videos = youtuber["video_list"]
    like_sum = sum(video["like_count"] for video in videos)
    dislike_sum = sum(video["dislike_count"] for video in videos)
    total = like_sum + dislike_sum
    if total == 0:
        # Avoid dividing by zero when there are no reactions.
        return 0
    return like_sum / total

def get_out_degree(youtuber):
    """Count the distinct entries across the youtuber's outgoing-edge lists.

    The subscriptions and the tag/caption/description "mentioning"
    collections are merged into one set, so an entry that appears in
    several of them is counted once.
    """
    outgoing_keys = (
        "network_subscriptions",
        "network_tag_mentioning",
        "network_caption_mentioning",
        "network_description_mentioning",
    )
    return len(set().union(*(youtuber[key] for key in outgoing_keys)))

def get_in_degree(youtuber):
    """Count the distinct entries across the youtuber's incoming-edge lists.

    The subscribers and the tag/caption/description "mentioners"
    collections are merged into one set, so an entry that appears in
    several of them is counted once.
    """
    incoming_keys = (
        "network_subscribers",
        "network_tag_mentioners",
        "network_caption_mentioners",
        "network_description_mentioners",
    )
    return len(set().union(*(youtuber[key] for key in incoming_keys)))


def get_video_length(youtuber):
    """Return the mean "video_length" over the youtuber's "video_list".

    Raises statistics.StatisticsError when the video list is empty, since
    statistics.mean rejects empty input.
    """
    lengths = []
    for video in youtuber["video_list"]:
        lengths.append(video["video_length"])
    return mean(lengths)

def get_publish_date(youtuber):
    """Return the mean publish time of the youtuber's videos, in days.

    Each video's "publish_at" string is parsed with dateutil and expressed
    as a (possibly fractional) number of days relative to
    2019-10-25 00:00:00 UTC; the mean of those offsets is returned.
    """
    reference = datetime.datetime(2019, 10, 25, tzinfo=pytz.UTC)
    seconds_per_day = 86400
    day_offsets = []
    for video in youtuber["video_list"]:
        published = dateutil.parser.parse(video["publish_at"])
        elapsed = published - reference
        day_offsets.append(elapsed.total_seconds() / seconds_per_day)
    return mean(day_offsets)



def get_comment_awareness(youtuber):
    """Count the comments on a youtuber's videos that mention climate keywords.

    Every comment's "text" is stripped of HTML markup with BeautifulSoup
    and passed to check_word. A per-video tally is printed, and the total
    over all videos is returned.

    Args:
        youtuber: mapping with a "video_list"; each video needs "title"
            and "comment_list", and each comment needs "text".

    Returns:
        Total number of matching comments across all of the videos.
    """
    comment_count = 0
    for vi in youtuber["video_list"]:
        # Per-video tally; the original printed `counter` without ever
        # defining or updating it.
        counter = 0
        for comment in vi["comment_list"]:
            cleantext = BeautifulSoup(comment["text"], "html.parser").text
            if check_word(cleantext):
                counter += 1

        print("video: {}, count:{}".format(vi["title"], counter))
        comment_count += counter
    return comment_count


def check_word(comment):
    """Return True when the comment text contains a climate-related keyword.

    The two key phrases ("global warming", "climate change") are matched
    as case-insensitive substrings. Otherwise the text is tokenized with
    gensim's simple_preprocess and compared against both keyword lists.

    Args:
        comment: the comment text (converted with str() before matching).

    Returns:
        True if any phrase or keyword matches, False otherwise.
    """
    # NOTE(review): several keywords are very generic ('time', 'people',
    # 'make') and 'ow' looks like a truncated token - confirm the lists.
    global_warming=['global warming', 'climate change', 'ice', 'years', 'snow', 'arctic', 'scientist', 
                'sea', 'cause', 'ow', 'time', 'show', 'report', 'science', 'data', 'world', 
                'earth', 'environment', 'coverage', 'percent', 'human', 'study', 'satellite', 
                'ipcc', 'epa', 'expert', 'stop', 'fight', 'million', 'people']

    climate_change=['climate change', 'global warming', 'ow', 'report', 'ipcc', 'world', 
                    'science', 'environment', 'scientist', 'help', 'action', 'impacts', 'arctic', 
                    'time', 'australia', 'study', 'caused', 'talk', 'human', 'need', 'people', 'deniers', 
                    'huff', 'risk', 'fight', 'years', 'make', 'politics', 'nations', 'carbon']
    # The original read an undefined global `cleantext` here and ignored
    # its own parameter; work on the argument instead.
    text = str(comment)
    # Multi-word phrases never survive tokenization, so match them as
    # substrings; lower-case first so the check is case-insensitive like
    # the token check below.
    lowered = text.lower()
    if global_warming[0] in lowered or global_warming[1] in lowered:
        return True
    words = set(gensim.utils.simple_preprocess(text, deacc=True))
    return any(keyword in words for keyword in global_warming + climate_change)
            
# Load the youtuber nodes and tag each one with its subscriber-based tier.
with open(youtuber_list_path, 'r') as fp:
    youtubers = json.load(fp)["nodes"]
for you in youtubers:
    you["social_status"] = get_status(you["subscriber_count"])

# Split the youtubers into one list per tier.
celebrity, professional, amateur = (
    [node for node in youtubers if node["social_status"] == tier]
    for tier in ("celebrity", "professional", "amateur")
)

print("c:{}, p:{}, a:{}".format(len(celebrity), len(professional), len(amateur)))
# # figure(num=None, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
# # plt.plot(x,y)
# # plt.show()

# One row per youtuber: view counts are summed over the videos, video
# length and release date are per-channel means, degrees count distinct
# network entries.
data = {
    "social_status": [node["social_status"] for node in youtubers],
    "view_count": [
        sum(video["view_count"] for video in node["video_list"])
        for node in youtubers
    ],
    "like_ratio": list(map(get_like_ratio, youtubers)),
    "following_count": list(map(get_out_degree, youtubers)),
    "follower_count": list(map(get_in_degree, youtubers)),
    "video_count": [len(node["video_list"]) for node in youtubers],
    "video_length": list(map(get_video_length, youtubers)),
    "release_date": list(map(get_publish_date, youtubers)),
}

# Column order follows the insertion order of the keys above.
df = pd.DataFrame(data, columns=list(data))
print(df)


# Fit video_count against the categorical social_status and print the
# type-2 ANOVA table for that model.
results = ols('video_count ~ C(social_status)', data=df).fit()
print(results.summary())
table = sm.stats.anova_lm(results, typ=2)
print(table)
# Bare expression: its value is only shown when this is the last statement
# of a notebook cell.
df.groupby(["social_status"])["video_count"].mean()

c:75, p:204, a:194
    social_status  view_count  like_ratio  following_count  follower_count  \
0       celebrity    57217606    0.990721                1             233   
1       celebrity     7065392    0.989709                5              20   
2       celebrity     1630898    0.985566                4              12   
3       celebrity     1738194    0.995110                9              12   
4       celebrity     9512986    0.987144                1             140   
..            ...         ...         ...              ...             ...   
468       amateur          41    1.000000                1               0   
469       amateur        2260    0.901408                0               0   
470  professional         862    0.838710                1               0   
471       amateur         553    0.132353                1               0   
472  professional       32720    0.888231                2               0   

     video_count  video_length  release_date

social_status
amateur         1.108247
celebrity       1.040000
professional    1.102941
Name: video_count, dtype: float64