In [1]:
from __future__ import print_function, division
import os, bz2, json, time
from datetime import timedelta

import math
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt


import sys
sys.path.insert(0, '../')
from pyhip import HIP

In [21]:
# Input: bz2-compressed JSON file holding the ACTIVE dataset (opened with bz2 + json below).
dataset_path = "../data/active-dataset.json.bz2"
# Output: JSON file that receives the per-video HIP results collected in this notebook.
output_path = "../output/active_res.json"

In [2]:
# Names of the per-video fields; presumably the keys available on every record
# of the ACTIVE dataset -- TODO confirm against the dataset description.
attributes = [
    "YoutubeID", "numTweet", "numShare", "numSubscriber", "watchTime", "dailyViewcount",
    "description", "title", "channelId", "channelTitle", "category", "uploadDate", "duration",
    "definition", "dimension", "caption", "regionRestriction.blocked",
    "regionRestriction.allowed", "topicIds", "relevantTopicIds", "totalShare",
    "totalViewcount", "totalTweet", "dailyTweets",
]

# load ACTIVE from json
# Only the first line of the compressed file is read and parsed as JSON.
with bz2.BZ2File(dataset_path) as f:
    dataset = json.loads(f.readline())

In [3]:
# prepare dataset for HIP train and test
active_videos = {}
for video in dataset:
    active_videos[video['YoutubeID']] = (video['numShare'], video['dailyViewcount'], video['watchTime'],\
                                         video['category'])
print("Dataset size:", len(active_videos))

Dataset size: 14041


In [4]:
# define train-test params
num_train = 90
num_test = 30
num_initialization = 25
size = 30

In [26]:
# Fit one HIP model per video for the first `size` video ids and collect the
# results in `ress`, keyed by video id, ready to be serialized as JSON.
ress = {}
vids = list(active_videos.keys())
for test_vid in vids[:size]:
    # active_videos values are (numShare, dailyViewcount, watchTime, category);
    # daily_watch is unpacked but not used in this cell.
    daily_share, daily_view, daily_watch, cate = active_videos[test_vid]
    # A fresh model per video: configure it with the share/view series and the
    # train/test split sizes, then fit.
    # NOTE(review): the progress/RMSE log lines in the output appear to come
    # from pyhip itself -- confirm.
    hip_model = HIP()
    hip_model.initial(daily_share, daily_view, num_train, num_test, num_initialization)
    hip_model.fit_with_bfgs()

    hip_params = hip_model.get_parameters()
    # Predict over the first num_train + num_test entries of the model's x.
    hip_predict = hip_model.predict(hip_model.get_parameters_abbr(), hip_model.x[:num_train+num_test])
    # .tolist() turns the parameter and prediction objects into plain lists
    # (presumably they are numpy arrays -- TODO confirm) so json can encode them.
    res = {
        "category":cate,
        "hipParams":hip_params.tolist(), 
        "predictViewcount":hip_predict.tolist(),
        # presumably the observed daily views held by the model -- TODO confirm
        "dailyViewcount":hip_model.y[:num_train+num_test],
    }
    ress[test_vid] = res

--- Finish initialization set 5...
--- Finish initialization set 10...
--- Finish initialization set 15...
--- Finish initialization set 20...
--- Finish initialization set 25...
--- Model fitting RMSE: 14209.49
--- Model forecast RMSE: 417.63
--- Finish initialization set 5...
--- Finish initialization set 10...
--- Finish initialization set 15...
--- Finish initialization set 20...
--- Finish initialization set 25...
--- Model fitting RMSE: 444.82
--- Model forecast RMSE: 229.09
--- Finish initialization set 5...
--- Finish initialization set 10...
--- Finish initialization set 15...
--- Finish initialization set 20...
--- Finish initialization set 25...
--- Model fitting RMSE: 1564.95
--- Model forecast RMSE: 1068.85
--- Finish initialization set 5...
--- Finish initialization set 10...
--- Finish initialization set 15...
--- Finish initialization set 20...
--- Finish initialization set 25...
--- Model fitting RMSE: 957.68
--- Model forecast RMSE: 1176.44
--- Finish initialization s

In [27]:
# Serialize every fitted result into one JSON document and write it out.
with open(output_path, "w") as outfile:
    outfile.write(json.dumps(ress))

In [31]:
# Reload the saved results to check that they round-trip through JSON.
# json.load parses the whole file, so this keeps working even if the file is
# ever written across several lines (e.g. with indentation) instead of one.
with open(output_path, "r") as infile:
    mini_dataset = json.load(infile)
print("Size:", len(mini_dataset))

# Show one record as a sanity check; the key is looked up once, and an empty
# result set is skipped rather than raising.
first_vid = next(iter(mini_dataset), None)
if first_vid is not None:
    print(first_vid + ":", mini_dataset[first_vid])

Size: 30
00-6OyXVA0M: {'category': 'Entertainment', 'hipParams': [436.9281938418154, 18.844440422514428, 0.6628811595853944, 0.12189603874566941, 0.0, 0.0, 1.0725384479956808, 468.6222869086567], 'predictViewcount': [110979.76123582112, 618768.356830546, 257254.33491369078, 123135.45332787723, 84790.42072653436, 100111.08568470608, 169308.0469778105, 329971.3633720906, 155579.84971915412, 77809.23275140418, 38032.063802530974, 32283.329101874246, 28399.09488346232, 21145.54293278832, 20654.967872073397, 20621.78758582844, 18434.902450878588, 11733.075520820365, 11716.743441887438, 13026.421988828066, 7434.932003282392, 7930.623491971278, 8401.075211599475, 6685.180263266557, 7442.986875702161, 7057.310493052216, 3972.729130053046, 5511.824568924665, 4305.131840757392, 728.0952512042769, 1360.0282568410535, 2276.6229321559294, 3649.3990332997973, 1557.6022333884562, 3163.842332843857, 3272.4755485692267, 1532.1102340391649, 1851.3335874774584, 1872.923010061506, 563.5986395888633, 1348.

In [45]:
def pop_perc_at(days, viewcounts, vids, num_train):
    """Rank videos by cumulative views at several evaluation days.

    For each evaluation day ``d`` every series is summed over its first
    ``num_train + d`` entries. Videos are then sorted by that total in
    ascending order and given evenly spaced percentiles from 0 (smallest
    total) to 1 (largest total).

    :param days: list of int, offsets past ``num_train`` at which to evaluate
    :param viewcounts: list of per-video sequences of daily view counts,
        in the same order as ``vids``
    :param vids: list of video identifiers, used as the row index
    :param num_train: int, number of leading entries always included in the sum
    :return: list of single-column DataFrames, one per entry of ``days`` and in
        the same order; each is indexed by video id in ascending order of
        cumulative views, with the column named after the day and holding the
        percentile. An empty list is returned when ``days`` is empty.
    """
    days = list(days)
    if not days:
        # Nothing to evaluate; avoids indexing days[0] on an empty list.
        return []

    # One row per video, one column per evaluation day.
    viewcount_eval = [
        [np.sum(series[:num_train + day]) for day in days]
        for series in viewcounts
    ]
    df_viewcount_eval = pd.DataFrame(viewcount_eval, columns=days, index=vids)

    # The same percentile grid is reused for every day: it depends only on the
    # number of videos, not on the values being ranked.
    pop_perc_col = np.linspace(0, 1, len(df_viewcount_eval))

    pop_percs = []
    for day in days:
        # mergesort is stable, so videos with equal totals keep their input
        # order and the resulting percentiles are reproducible.
        df_viewcount_sort = df_viewcount_eval[[day]].sort_values(by=[day], kind="mergesort")
        # Replace the sorted totals with the percentile of each position.
        df_viewcount_sort[day] = pop_perc_col
        pop_percs.append(df_viewcount_sort)
    return pop_percs

In [49]:
# Collect predicted and observed view series in one fixed video order, then
# preview the predicted popularity percentiles at the first evaluation day.
vids = list(mini_dataset)
viewcounts_pred = [mini_dataset[vid]["predictViewcount"] for vid in vids]
viewcounts_gt = [mini_dataset[vid]["dailyViewcount"] for vid in vids]
pred_perc_day15, pred_perc_day30 = pop_perc_at([15, 30], viewcounts_pred, vids, num_train)
pred_perc_day15.head()

Unnamed: 0,15
00RvR7OFZ7U,0.0
024ZHpwVQwo,0.034483
02iQhtfH6xU,0.068966
022iSZeLZLk,0.103448
00Jyvij2QOE,0.137931
