In [1]:
import subprocess

import bz2, os, sys, glob
import json, datetime
import pickle
from joblib import Parallel, delayed

In [2]:
segfit_path = "./segfit"
engage_dataset_base = "/localdata/u6314203/dataset_engage16/tweeted_videos"
output_base = "/localdata/u6314203/segfit_res_engage16"

categories_eligible = ["autos", "comedy","education","entertainment", \
                       "film","gaming","howto","music",\
                       "news","people","science","sports",\
                       "travel"]

In [3]:
def engage_dataset(test_category, eval_days = [90, 135, 180, 225, 270]):
    # read days info in engage16
    with open(os.path.join(engage_dataset_base, test_category+".json"), "r") as f:
        dataset_json = f.readlines()

    # dict-like dataset
    dataset = {}
    # remove duplication
    vids_set = set()

    for line in dataset_json:
        record = json.loads(line)
        try:        
            day = [int(x) for x in record['insights']['days'].split(",")]
            day_zero = record['insights']['startDate']
            views = [x for x in record['insights']['dailyView'].split(",")]

            """!!!"""
            if len(day) < eval_days[-1]: continue
        except:
            continue
        dataset[record['id']] = (day_zero, day, views)

        if record['id'] in vids_set:
            continue
        vids_set.add(record['id'])
        
    return dataset

In [4]:
def write_segfit_result(views_str, length_str, index, res):
    # if sequential: 6 seconds/video
    cp = subprocess.run([segfit_path,"-s", views_str, "-l", length_str], 
                        universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    phases = [line.split("\t") for line in cp.stdout.strip("\n").split("\n")]
    
    res['res_segfit'][index] = cp.stdout    
    res['end_first_phase'][index] = int(phases[0][1])
    res['num_phases'][index] = len(phases)

In [None]:
def get_segfit_input(dataset, vids):
    sequences = []
    ls = []    
    
    for vid in vids:
        values = dataset[vid]
        views_list = values[2]
        views_str = ",".join(views_list)
        length = len(views_list)
        sequences.append(views_str)
        ls.append(str(length))
        
    return sequences, ls

In [5]:
cat_num_samples = 2000
njobs = 20

for category in categories_eligible[:2]:
    dataset_partial = engage_dataset(category)
    
    vids = list(dataset_partial.keys())[:cat_num_samples]
    sequences, ls = get_segfit_input(dataset_partial, vids)
        
    res = {
        'vid': vids,
        'end_first_phase': [None]*cat_num_samples,
        'num_phases': [None]*cat_num_samples, 
        'res_segfit': [None]*cat_num_samples,
    }

    t1 = datetime.datetime.now()
    Parallel(n_jobs=njobs, backend="threading")(delayed(write_segfit_result)(sequences[x], ls[x], x, res) for x in range(cat_num_samples))
    t2 = datetime.datetime.now()
    print("Total time({}):{} seconds".format(category, (t2-t1).total_seconds()))    
    
    outfile = os.path.join(output_base, "{}.json".format(category))
    with open(outfile, "w") as f:
        json.dump(res, f)

Total time(autos):1417.817811 seconds
Total time(comedy):1384.264091 seconds


In [12]:
print(res_all_cats['autos']['vid'])
print(res_all_cats['autos']['end_first_phase'])
print(res_all_cats['autos']['num_phases'])
print(res_all_cats['autos']['res_segfit'])

['nw6iUh-MxBk', 'LUu6lxQy36w', '8iIKb8oYT5k', '5PF88VYGBBg', 'kdSNY2_gYuk', 'KO8W2IVfkSw', 'DPcM-ABFs4Y', '0FCfYjYQR8g', 'gkPFdbYsmB4', 'Ug4xmp8PuSs', '8O9nmC1YbXM', 'D-DJrrMbLC4', 'VHG-6SunwgM', '3rKLiaT_s8Q', 'kxSZa3PM9Tk', 'Ds0SsyJWAqA', 'EEPhAkN5_E0', 'MRtLuUWTyhc', 'WSv6tuXsaOI', 'hBw8_hHSW-I']
[3, 4, 273, 7, 287, 20, 277, 302, 285, 304, 3, 117, 25, 286, 38, 9, 3, 34, 281, 271]
[2, 5, 1, 4, 1, 8, 1, 1, 1, 1, 2, 3, 4, 1, 3, 2, 2, 8, 1, 1]
['0\t3\t-23.7617\t-6\t83.4374\t1\n4\t284\t100.068\t-1.28551\t-0.285318\t0\n', '0\t4\t52.6905\t-6\t47.2795\t0\n5\t18\t6.32224e-06\t6\t33.623\t1\n19\t44\t-0.17149\t1.72092\t66.5593\t0\n45\t170\t-1.14795e-07\t3.89074\t32.354\t0\n171\t303\t0.00478004\t1.74676\t7.67715\t1\n', '0\t273\t98.4606\t-1.66298\t1.97595\t0\n', '0\t7\t76.7054\t-2.11949\t5.15632\t0\n8\t17\t73.334\t-1.96977\t27.3855\t0\n18\t22\t0.265148\t3.17322\t18.5496\t1\n23\t296\t55.1285\t-3.20388\t1.31521\t0\n', '0\t287\t96.1963\t-1.2455\t0.831191\t0\n', '0\t20\t0.866648\t1.32242\t8.8567\t0\n

In [13]:
print(res_all_cats['comedy']['vid'])
print(res_all_cats['comedy']['end_first_phase'])
print(res_all_cats['comedy']['num_phases'])
print(res_all_cats['comedy']['res_segfit'])

['uElpUPDGi9E', 'm5FPWgloFpo', 'e-3wUlcaTZA', '2yo0QImaHN0', 'J8AKJrWJqgM', 'KnjCURfP3wE', 'I_gfHPHeDgw', 'KsPNsE_VYV8', 'AhQxU8-aFTY', 'srHbB3SdIgw', '1EnB1A25nDA', 'ntB5M11SxB8', '5_q_K8aweo4', 'G7X3QWOlRas', '93AikFYlcZI', '4bHq5EuKh8Q', 'nu9MhUoZV-k', 'ByVHEdvamy0', 'o1GDjFf_W3k', 'WITKr-qTHeE']
[271, 6, 296, 118, 2, 303, 286, 19, 273, 2, 290, 301, 31, 26, 276, 277, 16, 272, 292, 271]
[1, 3, 1, 2, 4, 1, 1, 3, 1, 4, 1, 1, 3, 3, 1, 1, 2, 1, 1, 1]
['0\t271\t99.941\t-1.55237\t0.406964\t0\n', '0\t6\t102.783\t-1.43177\t-3.35362\t1\n7\t18\t2.28151\t1.30833\t5.82766\t1\n19\t286\t10.429\t-0.80499\t0.0522077\t0\n', '0\t296\t99.4817\t-1.82751\t0.463157\t0\n', '0\t118\t94.7201\t-1.07998\t2.38434\t0\n119\t320\t-1.37845e-13\t6\t11.1201\t0\n', '0\t2\t-39.1439\t-6\t91.2368\t1\n3\t9\t-0.000194275\t6\t38.7386\t1\n10\t14\t20.6124\t-6\t74.8181\t0\n15\t276\t83.5457\t-0.578984\t-3.81343\t0\n', '0\t303\t113.006\t-0.810838\t-0.847306\t0\n', '0\t286\t99.0284\t-1.56529\t0.648779\t0\n', '0\t19\t6.74287e-06\t

##### Backup

In [3]:
# # experiment
# example = "70,11,15,7,7,8,11,4,7,2,2,1,6,6,3,2,2,2,1,1,4,1,1,1,1,2,3,1,1,1,1,3,1,2,2,1,1,1,1"
# length = len(example.split(","))
# cp = subprocess.run([segfit_path,"-s", example, "-l", str(length)], 
#                     universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# print(cp.stdout)
# print([line.split("\t") for line in cp.stdout.strip("\n").split("\n")])

In [20]:
print("Parallel Total time:73.261049 seconds")
print(res['vid'][:100])
print(res['end_first_phase'][:100])
print(res['res_segfit'][:100])

Parallel Total time:73.261049 seconds
['nw6iUh-MxBk', 'LUu6lxQy36w', '8iIKb8oYT5k', '5PF88VYGBBg', 'kdSNY2_gYuk', 'KO8W2IVfkSw', 'DPcM-ABFs4Y', '0FCfYjYQR8g', 'gkPFdbYsmB4', 'Ug4xmp8PuSs', '8O9nmC1YbXM', 'D-DJrrMbLC4', 'VHG-6SunwgM', '3rKLiaT_s8Q', 'kxSZa3PM9Tk', 'Ds0SsyJWAqA', 'EEPhAkN5_E0', 'MRtLuUWTyhc', 'WSv6tuXsaOI', 'hBw8_hHSW-I', 'FDb5TMHiR_o', 'TpVAyIH1Yk8', 'FjbaxkR_Jrc', 'ZPps-EvMBoI', 'PwPLk0sg5CY', 'Az-BnZ7U7p4', '3jj7IbiE96g', 'iqIYO6Yxq1Q', 'ZT3CdNgA37M', 'NizwJ9Mi450', 'ZDXEYpeJ5UY', 'U0J-bwufGak', '0zbiUov78y0', '3YOQ8PX4vT8', 'xO3ECEzRAZM', 'ANMlnqM_psE', 'HkLCORjlfyI', 'dSmQtpIML_o', 'zevErMEjYpQ', 'tkwkZrSsvj0', 'pROpGaJul8g', '_2dRWmVvrDg', 'KhHLCc33614', '8XR1NNblhZo', 'v8Wi7R37fa4', '6HXbDEwzRKo', 'JoxVEz-iyA4', 'MNdKf3NT6P8', 'wlXJ4uWCIo4', 'UkuJ6awlgDU', 'f4OqM5C_5ck', '1Y1Z4Ep1tKg', 'Ufz-cdfEm5Q', 'G9UQC8CdkOI', 'XmKZNITfNXw', 'XkmhCg2cTYo', 'usw1pBsXP3I', 'zLKwUD9BGLM', '9oCe0_wTnFA', 'zX4cLgb4t4U', 'CgG4OSgfoJU', 'S2S9luhDYGs', 'clxSFVZkscg', 'Vzq2_wl3zrY', '

In [13]:
print("Sequential Total time:657.274948 seconds")
print(res['vid'][:100])
print(res['end_first_phase'][:100])
print(res['res_segfit'][:100])

Total time:657.274948 seconds
['nw6iUh-MxBk', 'LUu6lxQy36w', '8iIKb8oYT5k', '5PF88VYGBBg', 'kdSNY2_gYuk', 'KO8W2IVfkSw', 'DPcM-ABFs4Y', '0FCfYjYQR8g', 'gkPFdbYsmB4', 'Ug4xmp8PuSs', '8O9nmC1YbXM', 'D-DJrrMbLC4', 'VHG-6SunwgM', '3rKLiaT_s8Q', 'kxSZa3PM9Tk', 'Ds0SsyJWAqA', 'EEPhAkN5_E0', 'MRtLuUWTyhc', 'WSv6tuXsaOI', 'hBw8_hHSW-I', 'FDb5TMHiR_o', 'TpVAyIH1Yk8', 'FjbaxkR_Jrc', 'ZPps-EvMBoI', 'PwPLk0sg5CY', 'Az-BnZ7U7p4', '3jj7IbiE96g', 'iqIYO6Yxq1Q', 'ZT3CdNgA37M', 'NizwJ9Mi450', 'ZDXEYpeJ5UY', 'U0J-bwufGak', '0zbiUov78y0', '3YOQ8PX4vT8', 'xO3ECEzRAZM', 'ANMlnqM_psE', 'HkLCORjlfyI', 'dSmQtpIML_o', 'zevErMEjYpQ', 'tkwkZrSsvj0', 'pROpGaJul8g', '_2dRWmVvrDg', 'KhHLCc33614', '8XR1NNblhZo', 'v8Wi7R37fa4', '6HXbDEwzRKo', 'JoxVEz-iyA4', 'MNdKf3NT6P8', 'wlXJ4uWCIo4', 'UkuJ6awlgDU', 'f4OqM5C_5ck', '1Y1Z4Ep1tKg', 'Ufz-cdfEm5Q', 'G9UQC8CdkOI', 'XmKZNITfNXw', 'XkmhCg2cTYo', 'usw1pBsXP3I', 'zLKwUD9BGLM', '9oCe0_wTnFA', 'zX4cLgb4t4U', 'CgG4OSgfoJU', 'S2S9luhDYGs', 'clxSFVZkscg', 'Vzq2_wl3zrY', 'xBKav9a9