In [None]:
import pandas as pd
import numpy as np
import json
import datetime
import time
from load import load_data
import random
import pickle
from tqdm import tqdm
from IPython.display import display
from collections import defaultdict
from joblib import Parallel, delayed
from math import ceil

# Evaluation information IDs, one per line in the file. The last two
# newline-separated entries of the file are dropped before building the set.
with open('/data/leidos_extracted/2021CP5/cp5_eval_nodes.txt', 'r') as f:
    nodeList = set(f.read().split('\n')[:-2])

# with open('/data/leidos_extracted/2021CP5/prev_user_to_6_29.pkl', 'rb') as infile:
#     user_tuple = pickle.load(infile)
#     user_tuple = user_tuple[user_tuple.nodeUserID.notnull()]

In [None]:
# Event-table columns kept from every loaded dataset (order is preserved downstream).
columns = [
    'nodeID',
    'nodeUserID',
    'parentID',
    'rootID',
    'actionType',
    'nodeTime',
    'platform',
    'rootUserID',
    'parentUserID',
    'informationID',
]

# Load data

In [None]:
# Read the two Twitter dumps and the YouTube dump with identical loader
# options, keeping only the shared `columns` subset of each frame.
t0_data, t1_data, y0_data = (
    load_data(path, ignore_first_line=False, verbose=False)[columns]
    for path in (
        '/data/leidos_extracted/2021CP5/cp5_twitter_2020-03-30_2020-06-29.json',
        '/data/leidos_extracted/2021CP5/cp5_twitter_2020-06-30_2020-07-06.json',
        '/data/leidos_extracted/2021CP5/cp5_youtube_to_7_06.json',
    )
)

In [None]:
# Stack the per-platform dumps and order events chronologically
# (nodeID breaks ties), with a fresh 0..n-1 index.
twitter_history = (
    pd.concat([t0_data, t1_data], ignore_index=True)
    .sort_values(['nodeTime', 'nodeID'])
    .reset_index(drop=True)
)
youtube_history = (
    pd.concat([y0_data], ignore_index=True)
    .sort_values(['nodeTime', 'nodeID'])
    .reset_index(drop=True)
)

In [None]:
# Drop events without an author and mark unknown parent/root authors with '?'.
# The filter and the fill are chained so that each history frame is rebuilt as a
# new object: assigning through `.loc` on a freshly filtered frame (as was done
# before) is the pattern pandas flags with SettingWithCopyWarning.
twitter_history = (
    twitter_history[twitter_history.nodeUserID.notnull()]
    .fillna({'parentUserID': '?', 'rootUserID': '?'})
)
youtube_history = (
    youtube_history[youtube_history.nodeUserID.notnull()]
    .fillna({'parentUserID': '?', 'rootUserID': '?'})
)

In [None]:
# Keep only events strictly before 2020-06-09 (the first day of `idx`, the
# simulated range defined further down).
twitter_history = twitter_history.loc[twitter_history.nodeTime.lt(pd.to_datetime('2020-06-09'))]
youtube_history = youtube_history.loc[youtube_history.nodeTime.lt(pd.to_datetime('2020-06-09'))]

In [None]:
# Suffix inserted into the probability-table file names loaded below.
append = "to_7_06"

In [None]:
# Twitter probability table; networkFilling reads its 'prob' and 'max' entries.
twitter_prob_path = '/data/leidos_extracted/2021CP5/twitter_prob_{}.json'.format(append)
with open(twitter_prob_path, 'r') as f:
    twitter_prob = json.loads(f.read())

In [None]:
# YouTube probability table; networkFilling reads its 'prob' and 'max' entries.
youtube_prob_path = '/data/leidos_extracted/2021CP5/youtube_prob_{}.json'.format(append)
with open(youtube_prob_path, 'r') as f:
    youtube_prob = json.loads(f.read())

# Algorithm

In [None]:
# Daily index of the simulated period, both end points included.
idx = pd.date_range(
    start=pd.to_datetime('2020-06-09'),
    end=pd.to_datetime('2020-07-06'),
)

In [None]:
def networkFilling(history, pred_path, prob):
    """Build a synthetic event table that matches predicted daily event counts.

    For every information ID in the prediction file, historical events of that
    ID are recycled (suffixed copies are prepended until there are enough rows),
    trimmed to the predicted total, re-timed so each day of the module-level
    `idx` range receives exactly its predicted `EventCount`, and a subset of
    users per day is renamed with a trailing '_' according to `NewUserCount`.

    Parameters
    ----------
    history : pandas.DataFrame
        Past events with (at least) the module-level `columns`.
    pred_path : str
        Path to a JSON file mapping information ID -> JSON-encoded table.
        Each table is read for its `EventCount` and `NewUserCount` columns.
    prob : dict
        `prob['prob'][infoID]` is turned into a per-user table with columns
        c0..c3; `prob['max'][infoID]` is the volume used to pick c0..c3 per day.
        # NOTE(review): the exact meaning of c0..c3 is not visible here - confirm.

    Returns
    -------
    (dict, pandas.DataFrame)
        The per-ID prediction tables (reindexed on `idx`, missing days = 0) and
        the concatenation of all generated events.

    Raises
    ------
    ValueError
        If neither the information ID nor the fallback ID has any history.
    """
    with open(pred_path, 'r') as f:
        d = json.loads(f.read())
    data = {k: pd.read_json(v, orient='columns').reindex(idx, fill_value=0) for k, v in d.items()}

    # History / activity table borrowed for an information ID without history
    # (original note: china/naval in youtube has no history).
    fallback_id = 'controversies/china/funding'

    def inner(infoID):
        content = data[infoID]
        # Plain int so it can be used safely as a positional slice bound.
        totalEvent = int(content.EventCount.sum())
        if totalEvent == 0:
            return pd.DataFrame(columns=columns)

        duplicate = history[history.informationID == infoID]
        if len(duplicate) == 0:
            duplicate = history[history.informationID == fallback_id].copy()
            # Relabel with the ID actually being filled (previously hard-coded
            # to 'controversies/china/naval', which mislabelled any other ID).
            duplicate.loc[:, 'informationID'] = infoID
            active_level = pd.DataFrame(prob['prob'][fallback_id]).T.reset_index().replace(0, np.inf)
        else:
            active_level = pd.DataFrame(prob['prob'][infoID]).T.reset_index().replace(0, np.inf)
        if len(duplicate) == 0:
            raise ValueError('no history available for {!r} or fallback {!r}'.format(infoID, fallback_id))

        n_copies = int(ceil(totalEvent / len(duplicate)))
        print(infoID, len(duplicate), n_copies)

        def mod(df, cnt):
            # Copy of `df` whose IDs carry a `_<cnt>` suffix; '?' placeholders
            # in the parent/root columns are left untouched.
            suffix = '_' + str(cnt)
            tmp = df.copy()
            for col in ('nodeID', 'nodeUserID'):
                tmp[col] = tmp[col].apply(lambda x: x + suffix)
            for col in ('parentID', 'rootID', 'parentUserID', 'rootUserID'):
                tmp[col] = tmp[col].apply(lambda x: x if x == '?' else x + suffix)
            return tmp

        # Highest-numbered copy first, unmodified history last; then keep the
        # last `totalEvent` rows.
        copies = [mod(duplicate, i) for i in range(1, n_copies)]
        duplicate = pd.concat(list(reversed(copies)) + [duplicate], axis=0)
        display(totalEvent)
        # Explicit copy: the rows are edited in place below.
        duplicate = duplicate.iloc[-totalEvent:].copy()

        # Column positions for positional writes. The row index contains
        # repeated labels after the concat, so label-based `.loc` is not usable,
        # and chained `duplicate.col.iloc[...] = ...` does not write through
        # when pandas copy-on-write is active.
        time_pos = duplicate.columns.get_loc('nodeTime')
        node_pos = duplicate.columns.get_loc('nodeID')
        parent_pos = duplicate.columns.get_loc('parentID')
        root_pos = duplicate.columns.get_loc('rootID')

        counter = 0
        maxEvent = prob['max'][infoID]
        sortCols = {}
        for index, (timestamp, volume) in enumerate(content.EventCount.items()):
            volume = int(volume)
            # Pick the activity column from the day's volume relative to maxEvent.
            if volume < maxEvent / 4:
                sortCols[timestamp] = 'c0'
            elif volume < maxEvent * 2 / 4:
                sortCols[timestamp] = 'c1'
            elif volume < maxEvent * 3 / 4:
                sortCols[timestamp] = 'c2'
            else:
                sortCols[timestamp] = 'c3'

            print (index, infoID, timestamp, volume, sortCols[timestamp])
            if volume == 0: continue

            day_rows = slice(counter, counter + volume)

            # NodeTime Shift: spread the day's events evenly over 00:00:01-23:59:59
            duplicate.iloc[day_rows, time_pos] = list(pd.Series(pd.date_range(timestamp + pd.Timedelta(seconds=1), timestamp + pd.Timedelta(hours=23, minutes=59, seconds=59), periods=volume)).astype('datetime64[s]'))
            # NodeID Shift: append '_' to the IDs ('?' parent/root stay as is)
            duplicate.iloc[day_rows, node_pos] = [x + '_' for x in duplicate.iloc[day_rows, node_pos]]
            for pos in (parent_pos, root_pos):
                duplicate.iloc[day_rows, pos] = [x if x == '?' else x + '_' for x in duplicate.iloc[day_rows, pos]]

            counter += volume

        involved_users = []
        # One row per user at the first day they appear; `nodeID` now holds the
        # number of distinct nodes of that user on that day.
        grouped_data = duplicate[['nodeID', 'nodeUserID', 'nodeTime']].set_index('nodeTime').groupby([pd.Grouper(freq='1D'), 'nodeUserID'])\
            .nodeID.nunique().to_frame().reset_index().sort_values(['nodeTime', 'nodeID'], ascending=[True, True])\
            .drop_duplicates('nodeUserID')
        # First 22 characters are used as the join key against the activity table.
        # NOTE(review): presumably this strips the copy suffix from a 22-char user ID - confirm.
        grouped_data['_nodeUserID'] = grouped_data.nodeUserID.apply(lambda x: x[:22])
        active_level.columns = ['_nodeUserID', 'c0', 'c1', 'c2', 'c3']
        grouped_data = grouped_data.merge(active_level, how='left', on='_nodeUserID').set_index('nodeTime').groupby([pd.Grouper(freq='1D')])

        for name, group in grouped_data:
            group = group.sort_values(['nodeID', sortCols[name]])
            # Take the day's first NewUserCount users in that order.
            new_user_count = int(content.loc[name, 'NewUserCount'])
            involved_users.extend(list(group.head(new_user_count).nodeUserID.values))
        involved_users = set(involved_users)

        # Rename the selected users everywhere they occur.
        duplicate['nodeUserID'] = duplicate['nodeUserID'].apply(lambda x: x + '_' if x in involved_users else x)
        for col in ('parentUserID', 'rootUserID'):
            duplicate[col] = duplicate[col].apply(lambda x: x if x == '?' else (x + '_' if x in involved_users else x))

        return duplicate

    final = Parallel(n_jobs=8, verbose=50)(delayed(inner)(infoID) for infoID in data.keys())
    # Leave out the empty placeholder frames: they are all-object dtype and may
    # otherwise influence the dtype of the concatenated columns (e.g. nodeTime).
    filled = [frame for frame in final if len(frame) > 0]
    final = pd.concat(filled) if filled else pd.DataFrame(columns=columns)
    return data, final

In [None]:
def check(data, df):
    """Print whether the generated events reproduce the predicted daily counts.

    For each information ID in `data` one line is printed:
    'Pass <id>' when the predicted `EventCount` sums to zero, otherwise
    '<True|False> <id>' telling whether the per-day number of rows of `df`
    with that `informationID` equals `EventCount` on every day.

    Parameters
    ----------
    data : dict
        Information ID -> DataFrame with an `EventCount` column on a daily index.
    df : pandas.DataFrame
        Generated events with `informationID` and datetime `nodeTime` columns.
    """
    for infoID, content in data.items():
        if content.EventCount.sum() == 0:
            print('Pass', infoID)
            continue
        # Align on this ID's own index rather than on data['empty'].index, so
        # the check does not depend on an 'empty' entry being present.
        generated = (
            df[df.informationID == infoID]
            .nodeTime.value_counts()
            .resample('D').sum()
            .reindex(content.index, fill_value=0)
        )
        print(np.all(content.EventCount == generated), infoID)

In [None]:
# Model name -> directory holding that model's per-platform prediction files.
models = dict([
    ('UIUC_ML_LASSO_2_CORR_LEIDOS', '/home/michael/local_eval/'),
])
# Directory prefix for the generated submission files.
output_path = '/home/dachun/cp5/local0706/'

In [None]:
# Load the Twitter predictions of one model: the file maps each key to a
# JSON-encoded table, decoded here into one DataFrame per key.
with open(models['UIUC_ML_LASSO_2_CORR_LEIDOS'] + 'twitter_UIUC_ML_LASSO_2_CORR_LEIDOS.json', 'r') as f:
    d = json.load(f)
data = {info_id: pd.read_json(payload, orient='columns') for info_id, payload in d.items()}

In [None]:
# For every model: fill YouTube and then Twitter, verify the daily counts,
# merge both platforms and write one JSON object per line after a header line.
for model_name, model_path in models.items():
    print(model_name)

    youtube_pred_path = model_path + 'youtube_{}.json'.format(model_name)
    ydata, ydf = networkFilling(youtube_history, youtube_pred_path, youtube_prob)
    check(ydata, ydf)

    twitter_pred_path = model_path + 'twitter_{}.json'.format(model_name)
    tdata, tdf = networkFilling(twitter_history, twitter_pred_path, twitter_prob)
    check(tdata, tdf)

    final = pd.concat([tdf, ydf], axis=0)
    final = final.sort_values(by='nodeTime').reset_index(drop=True)
    # Any information ID outside the evaluation list is reported as 'other'.
    not_evaluated = ~final.informationID.isin(nodeList)
    final.loc[not_evaluated, 'informationID'] = 'other'
    # Element-wise formatting is kept on purpose: it only needs each value to
    # provide strftime, whatever the dtype of the column is.
    final['nodeTime'] = final['nodeTime'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%SZ'))

    identifier = {"team": "uiuc", "model_identifier": model_name, "simulation_period": "june9-july6"}

    with open(output_path + model_name + '.json', 'w') as f:
        f.write(json.dumps(identifier) + '\n')
        for row in tqdm(final.iterrows(), total=len(final)):
            f.write(row[1].to_json() + '\n')