In [None]:
import glob
import gzip
import json
import os
import pickle as pkl
import subprocess
from collections import OrderedDict, defaultdict, Counter
from datetime import datetime
from functools import reduce
from io import StringIO
from itertools import chain
from math import ceil

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil import parser
from matplotlib.patches import Rectangle
from matplotlib.ticker import MaxNLocator
from mpl_toolkits.mplot3d import Axes3D
from pandas.plotting import register_matplotlib_converters
from scipy.stats import pearsonr, spearmanr, linregress
from tqdm import tqdm

import socialsim as ss
# Register pandas' datetime converters with matplotlib so date-indexed series plot correctly.
register_matplotlib_converters()
# Apply seaborn's default theme to every figure drawn below.
sns.set()

In [None]:
# Qualitative palette; cmap(0) and cmap(1) colour the Twitter / YouTube curves in the figure below.
cmap = plt.get_cmap(name='tab10')

In [None]:
# Suffix embedded in the names of the output files written below
# (time series, prob and corrmat JSON). Earlier snapshot suffixes are kept
# commented out for reference; `idx` in the next cell is expected to end on
# the matching day.
# append = 'to_5_31'
# append = 'to_6_29'
# append = 'to_6_08'
# append = 'to_7_06'
# append = 'to_7_13'
# append = 'to_7_20'
# append = 'to_7_27'
append = 'to_8_03'

In [None]:
# Daily index covering the snapshot window: 2020-03-30 up to the day named by
# `append` ('to_<month>_<day>', year 2020). Deriving the end date from `append`
# removes the second hand-edited copy of the date, so the two settings can no
# longer drift apart. A suffix that does not have exactly this shape fails
# loudly here (ValueError) rather than silently producing a wrong window.
_, end_month, end_day = append.split('_')
idx = pd.date_range(
    pd.to_datetime('2020-03-30'),
    pd.Timestamp(year=2020, month=int(end_month), day=int(end_day)),
)

In [None]:
# Evaluation narratives, one per line.
# NOTE(review): `[:-2]` drops the last two split entries — presumably the file
# ends with a blank line plus the trailing newline; confirm that no real node
# is being discarded.
with open('/data/leidos_extracted/2021CP5/cp5_eval_nodes.txt', 'r') as f:
    evalNodes = f.read().split('\n')[:-2]

# `nodes` = evaluation nodes + the remaining nodes + the literal 'empty' entry.
# It starts as an independent copy so that `evalNodes` is not mutated.
nodes = list(evalNodes)
with open('/data/leidos_extracted/2021CP5/cp5_other_nodes.txt', 'r') as f:
    nodes.extend(f.read().split('\n')[:-1])
nodes.append('empty')
nodes

# GDELT (full)

In [None]:
# The GDELT export is JSON Lines: one event object per line.
with open('/data/leidos/2021CP5/CPEC/Exogenous/GDELT/cp5-cpec.exogenous.gdelt.events.v1.json', 'r') as f:
    records = [json.loads(l) for l in f]

In [None]:
# Sorted list of the distinct event codes seen in the raw records.
alleventcodes = sorted({r['EventCode'] for r in records})

# Event table ordered by day.
# NOTE(review): assumes `day` holds date strings pandas can parse; integer
# YYYYMMDD values would be interpreted as nanoseconds — confirm.
df = pd.DataFrame(records)
df = df.assign(day=pd.to_datetime(df.day)).sort_values('day')

In [None]:
# Daily event counts per GDELT event code on a fixed 2020-03-30..2020-08-31 grid.
# The grid is built once (it does not depend on the code), and rows are selected
# with a boolean mask instead of a string-formatted `query`, which would break
# on values containing a double quote.
idxx = pd.date_range(pd.to_datetime('2020-03-30'), pd.to_datetime('2020-08-31'))
eventCountTimeseries = {}
for code in alleventcodes:
    tmp = df[df.EventCode == code]
    # value_counts gives events per distinct day; resample puts them on a daily grid.
    counts = tmp.day.value_counts().resample('D').sum()
    # reindex keeps exactly the window's days and fills event-free days with 0,
    # so no separate date slice is needed beforehand.
    counts = counts.reindex(idxx, fill_value=0)
    eventCountTimeseries[code] = counts

In [None]:
# Serialise each per-code series to its own JSON string.
eventCountTimeseries_json = {
    code: series.to_json()
    for code, series in eventCountTimeseries.items()
}

In [None]:
# Persist the GDELT series (code -> JSON string) as a single JSON document.
with open('/data/leidos_extracted/2021CP5/gdelt_time_series.json', 'w') as f:
    json.dump(eventCountTimeseries_json, f)

# ACLED (full)

In [None]:
# Raw ACLED training export.
acled = pd.read_csv(
    '/data/leidos/2021CP5/CPEC/Exogenous/ACLED/cp5-cpec.exogenous.training.acled.v1.csv'
)

In [None]:
# Parse event dates and order the table chronologically.
acled = (
    acled
    .assign(event_date=pd.to_datetime(acled.event_date))
    .sort_values('event_date')
)

In [None]:
# Daily event counts per ACLED event type on a fixed 2020-03-30..2020-08-31 grid.
# Keys are the event type with whitespace replaced by underscores.
# The grid is built once, rows are selected with a boolean mask rather than a
# string-formatted `query` (fragile for values containing quotes), and missing
# event types are skipped because they cannot be turned into a key.
idxx = pd.date_range(pd.to_datetime('2020-03-30'), pd.to_datetime('2020-08-31'))
acledCountTimeseries = {}
for code in acled.event_type.dropna().unique():
    tmp = acled[acled.event_type == code]
    counts = tmp.event_date.value_counts().resample('D').sum()
    # reindex keeps exactly the window's days and fills event-free days with 0.
    counts = counts.reindex(idxx, fill_value=0)
    acledCountTimeseries['_'.join(code.split())] = counts

In [None]:
# Serialise each per-type series to its own JSON string.
acledCountTimeseries_json = {
    event_type: series.to_json()
    for event_type, series in acledCountTimeseries.items()
}

In [None]:
# Persist the ACLED series (event type -> JSON string) as a single JSON document.
with open('/data/leidos_extracted/2021CP5/acled_time_series.json', 'w') as f:
    json.dump(acledCountTimeseries_json, f)

# Twitter (partial)

#### infoID -> date -> (event count, user set, first-time user set)

In [None]:
# Twitter accumulators, all keyed by informationID.
# tdict_count:            infoID -> date -> number of events
# tdict_userset:          infoID -> date -> users active that day
# tdict_activateduserset: infoID -> date -> users first seen for that infoID that day
# t_userset:              infoID -> every user seen so far
tdict_count = defaultdict(lambda: defaultdict(int))
tdict_userset, tdict_activateduserset = (
    defaultdict(lambda: defaultdict(set)) for _ in range(2)
)
t_userset = defaultdict(set)

In [None]:
# Stream the Twitter JSON-lines dump and update the accumulators.
# NOTE(review): this file name ends at 2020-06-29 while `append`/`idx` run to a
# later date — presumably further files are meant to be added via the template
# cell below; confirm.
datapath = '/data/leidos_extracted/2021CP5/cp5_twitter_2020-03-30_2020-06-29.json'
with open(datapath, 'rb') as f:
    # `wc -l` supplies the line count so tqdm can show a proper progress bar.
    n_lines = int(subprocess.check_output(["wc", "-l", datapath]).decode("utf8").split()[0])
    for line in tqdm(f, total=n_lines):
        tmp = json.loads(line)
        date = pd.to_datetime(tmp['nodeTime']).date()
        user = tmp['nodeUserID']
        infoId = tmp['informationID']

        tdict_count[infoId][date] += 1
        tdict_userset[infoId][date].add(user)
        # A user is "activated" on the first record in which they appear for this
        # infoID (file order — presumably chronological; verify).
        seen_users = t_userset[infoId]
        if user not in seen_users:
            tdict_activateduserset[infoId][date].add(user)
            seen_users.add(user)

In [None]:
#INSERT TEMPLATE
# datapath = ''
# with open(datapath, 'rb') as f:
#     for line in tqdm(f, total=int(subprocess.check_output(["wc", "-l", datapath]).decode("utf8").split()[0])):
#         tmp = json.loads(line)
#         date = pd.to_datetime(tmp['nodeTime']).date()
#         user = tmp['nodeUserID']
#         infoId = tmp['informationID']
        
#         tdict_count[infoId][date] += 1
#         tdict_userset[infoId][date].add(user)
#         if user not in t_userset[infoId]:
#             tdict_activateduserset[infoId][date].add(user)
#         t_userset[infoId].add(user)

In [None]:
# Collapse the per-day user sets into per-day counts (plain dicts, not defaultdicts).
tdict_usercount = {
    info_id: {day: len(users) for day, users in per_day.items()}
    for info_id, per_day in tdict_userset.items()
}
tdict_activateduser = {
    info_id: {day: len(users) for day, users in per_day.items()}
    for info_id, per_day in tdict_activateduserset.items()
}

In [None]:
# infoID -> date -> (event count, active-user count, first-time-user count)
tmerge = defaultdict(dict)
for info_id, daily_events in tdict_count.items():
    for day, n_events in daily_events.items():
        # Days without first-time users are recorded as 0 (also written back
        # into tdict_activateduser).
        n_new = tdict_activateduser[info_id].setdefault(day, 0)
        tmerge[info_id][day] = (n_events, tdict_usercount[info_id][day], n_new)

In [None]:
# One frame per infoID, aligned to the daily index `idx`; days without activity become 0.
tmerge_df = {
    info_id: pd.DataFrame.from_dict(
        per_day,
        orient='index',
        columns=['EventCount', 'UserCount', 'NewUserCount'],
    ).reindex(idx, fill_value=0)
    for info_id, per_day in tmerge.items()
}

In [None]:
# Sanity check: do the infoIDs found in the Twitter data match the node list exactly?
set(tmerge) == set(nodes)

In [None]:
# Serialise each per-infoID frame to its own JSON string.
tmerge_df_json = {
    info_id: frame.to_json()
    for info_id, frame in tmerge_df.items()
}

In [None]:
# Persist the Twitter series; the snapshot suffix `append` goes into the file name.
with open('/data/leidos_extracted/2021CP5/twitter_time_series_{}.json'.format(append), 'w') as f:
    json.dump(tmerge_df_json, f)

In [None]:
# Per-user activity histogram for Twitter.
# tprob[node][user] is a 4-slot list; slot b counts the days on which the user
# was active while the node's daily EventCount fell into volume band b:
#   0: < 1/4 of the node's peak, 1: < 2/4, 2: < 3/4, 3: everything else.
# maxes[node] is the node's peak daily EventCount over `idx`.
tprob = defaultdict(lambda: defaultdict(lambda: [0, 0, 0, 0]))
maxes = {}
for node in nodes:
    if node not in tmerge_df:
        # Node has no Twitter data at all (same handling as the YouTube cell);
        # previously this raised KeyError.
        maxes[node] = 0
        tprob[node] = {}
        continue
    event_counts = tmerge_df[node].EventCount
    peak = max(event_counts)
    maxes[node] = peak
    # Touch the entry so every node appears in the output, even without users.
    node_prob = tprob[node]
    # .get() avoids inserting empty entries into the accumulator defaultdicts.
    users_by_day = tdict_userset.get(node, {})
    for date, cur in event_counts.items():
        # Number of quarter-thresholds reached == band index
        # (identical to the original if/elif chain, including peak == 0 -> 3).
        bucket = sum(cur >= peak * q / 4 for q in (1, 2, 3))
        for u in users_by_day.get(date.date(), ()):
            node_prob[u][bucket] += 1

In [None]:
# Persist the Twitter histogram together with the window length and per-node peaks.
twitter_prob_payload = {'span': len(idx), 'max': maxes, 'prob': tprob}
with open('/data/leidos_extracted/2021CP5/twitter_prob_{}.json'.format(append), 'w') as f:
    json.dump(twitter_prob_payload, f)

# YouTube (partial)

In [None]:
# YouTube accumulators, all keyed by informationID.
# ydict_count:            infoID -> date -> number of events
# ydict_userset:          infoID -> date -> users active that day
# ydict_activateduserset: infoID -> date -> users first seen for that infoID that day
# y_userset:              infoID -> every user seen so far
ydict_count = defaultdict(lambda: defaultdict(int))
ydict_userset, ydict_activateduserset = (
    defaultdict(lambda: defaultdict(set)) for _ in range(2)
)
y_userset = defaultdict(set)

In [None]:
# Stream the YouTube JSON-lines dump and update the accumulators.
# NOTE(review): the 'to_8_03' suffix is hard-coded here rather than taken from
# `append` — keep the two in step when switching snapshots.
datapath = '/data/leidos_extracted/2021CP5/cp5_youtube_to_8_03.json'
with open(datapath, 'rb') as f:
    # `wc -l` supplies the line count so tqdm can show a proper progress bar.
    n_lines = int(subprocess.check_output(["wc", "-l", datapath]).decode("utf8").split()[0])
    for line in tqdm(f, total=n_lines):
        tmp = json.loads(line)
        date = pd.to_datetime(tmp['nodeTime']).date()
        user = tmp['nodeUserID']
        infoId = tmp['informationID']

        ydict_count[infoId][date] += 1
        ydict_userset[infoId][date].add(user)
        # A user is "activated" on the first record in which they appear for this
        # infoID (file order — presumably chronological; verify).
        seen_users = y_userset[infoId]
        if user not in seen_users:
            ydict_activateduserset[infoId][date].add(user)
            seen_users.add(user)

In [None]:
#INSERT TEMPLATE
# datapath = ''
# with open(datapath, 'rb') as f:
#     for line in tqdm(f, total=int(subprocess.check_output(["wc", "-l", datapath]).decode("utf8").split()[0])):
#         tmp = json.loads(line)
#         date = pd.to_datetime(tmp['nodeTime']).date()
#         user = tmp['nodeUserID']
#         infoId = tmp['informationID']
        
#         ydict_count[infoId][date] += 1
#         ydict_userset[infoId][date].add(user)
#         if user not in y_userset[infoId]:
#             ydict_activateduserset[infoId][date].add(user)
#         y_userset[infoId].add(user)

In [None]:
# Collapse the per-day user sets into per-day counts (plain dicts, not defaultdicts).
ydict_usercount = {
    info_id: {day: len(users) for day, users in per_day.items()}
    for info_id, per_day in ydict_userset.items()
}
ydict_activateduser = {
    info_id: {day: len(users) for day, users in per_day.items()}
    for info_id, per_day in ydict_activateduserset.items()
}

In [None]:
# infoID -> date -> (event count, active-user count, first-time-user count)
ymerge = defaultdict(dict)
for info_id, daily_events in ydict_count.items():
    for day, n_events in daily_events.items():
        # Days without first-time users are recorded as 0 (also written back
        # into ydict_activateduser).
        n_new = ydict_activateduser[info_id].setdefault(day, 0)
        ymerge[info_id][day] = (n_events, ydict_usercount[info_id][day], n_new)

In [None]:
# One frame per infoID, aligned to the daily index `idx`; days without activity become 0.
ymerge_df = {
    info_id: pd.DataFrame.from_dict(
        per_day,
        orient='index',
        columns=['EventCount', 'UserCount', 'NewUserCount'],
    ).reindex(idx, fill_value=0)
    for info_id, per_day in ymerge.items()
}

In [None]:
# Nodes with no YouTube activity get an all-zero frame so that downstream cells
# can index `ymerge_df` by any node.
# The frame is built directly as integer zeros: reindexing an empty frame would
# leave object-dtype columns, unlike the int columns of the populated frames.
# Sorting the missing nodes keeps the printed list and the key order of the
# JSON written below reproducible between runs.
for k in sorted(set(nodes) - set(ymerge.keys())):
    print(k)
    ymerge_df[k] = pd.DataFrame(0, index=idx, columns=['EventCount', 'UserCount', 'NewUserCount'])

In [None]:
# Serialise each per-infoID frame to its own JSON string.
ymerge_df_json = {
    info_id: frame.to_json()
    for info_id, frame in ymerge_df.items()
}

In [None]:
# Persist the YouTube series; the snapshot suffix `append` goes into the file name.
with open('/data/leidos_extracted/2021CP5/youtube_time_series_{}.json'.format(append), 'w') as f:
    json.dump(ymerge_df_json, f)

In [None]:
# Per-user activity histogram for YouTube.
# yprob[node][user] is a 4-slot list; slot b counts the days on which the user
# was active while the node's daily EventCount fell into volume band b:
#   0: < 1/4 of the node's peak, 1: < 2/4, 2: < 3/4, 3: everything else.
# maxes[node] is the node's peak daily EventCount over `idx`.
yprob = defaultdict(lambda: defaultdict(lambda: [0, 0, 0, 0]))
maxes = {}
for node in nodes:
    if node not in ymerge_df:
        # Node has no YouTube frame at all.
        maxes[node] = 0
        yprob[node] = {}
        continue
    event_counts = ymerge_df[node].EventCount
    peak = max(event_counts)
    maxes[node] = peak
    # Touch the entry so every node appears in the output — including nodes whose
    # frame is all zeros, which previously got no `prob` entry at all.
    node_prob = yprob[node]
    # .get() avoids inserting empty entries into the accumulator defaultdicts.
    users_by_day = ydict_userset.get(node, {})
    for date, cur in event_counts.items():
        # Number of quarter-thresholds reached == band index
        # (identical to the original if/elif chain, including peak == 0 -> 3).
        bucket = sum(cur >= peak * q / 4 for q in (1, 2, 3))
        for u in users_by_day.get(date.date(), ()):
            node_prob[u][bucket] += 1

In [None]:
# Persist the YouTube histogram together with the window length and per-node peaks.
youtube_prob_payload = {'span': len(idx), 'max': maxes, 'prob': yprob}
with open('/data/leidos_extracted/2021CP5/youtube_prob_{}.json'.format(append), 'w') as f:
    json.dump(youtube_prob_payload, f)

In [None]:
# with open('youtube_time_series.json', 'r') as f:
#     d = json.loads(f.read())

In [None]:
# dd = {k: pd.read_json(v, orient='columns') for k, v in d.items()}

In [None]:
# Grid of per-narrative panels: Twitter EventCount on the left y-axis and
# YouTube EventCount on a twin right y-axis, three panels per row.
n_cols = 3
n_rows = int(ceil(len(nodes) / n_cols))
# sharex/sharey/squeeze are passed by keyword (positional use is rejected by
# current matplotlib); squeeze=False keeps `axs` 2-D even for a single row.
fig, axs = plt.subplots(n_rows, n_cols, sharex=True, sharey=False, squeeze=False,
                        figsize=(20, 17), dpi=200)
legend_lines = {}  # label -> first line drawn with that label
for pos, ax in enumerate(axs.flat):  # row-major, same order as nodes[i * 3 + j]
    if pos >= len(nodes):
        continue  # trailing panels of the last row stay empty
    nar = nodes[pos]
    if nar in tmerge_df:
        line, = ax.plot(tmerge_df[nar].EventCount, color=cmap(0), linewidth=2, label='Twitter')
        legend_lines.setdefault('Twitter', line)
    ax.set_ylabel('', color=cmap(0))
    ax.tick_params(axis='y', labelcolor=cmap(0))

    ax2 = ax.twinx()
    if nar in ymerge_df:
        line, = ax2.plot(ymerge_df[nar].EventCount, color=cmap(1), linewidth=2, label='YouTube')
        legend_lines.setdefault('YouTube', line)
    ax2.set_ylabel('', color=cmap(1))
    ax2.tick_params(axis='y', labelcolor=cmap(1))

    ax.set_title(nar)
# One shared legend, built from the lines actually drawn instead of a
# hard-coded panel position (axs[2, 2] need not exist for short node lists).
fig.legend(list(legend_lines.values()), list(legend_lines.keys()),
           loc='center left', bbox_to_anchor=(1, 0.5))
fig.autofmt_xdate()
fig.suptitle("Time series", fontsize=30)
plt.tight_layout(rect=[0, 0.03, 1, 0.93])
plt.show()

In [None]:
# Reload the GDELT series written earlier (event code -> JSON-encoded Series).
# Each payload is wrapped in StringIO because passing a literal JSON string to
# pd.read_json is deprecated in recent pandas.
with open('/data/leidos_extracted/2021CP5/gdelt_time_series.json', 'r') as f:
    eventCountTimeseries = {
        code: pd.read_json(StringIO(payload), typ='series')
        for code, payload in json.load(f).items()
    }

In [None]:
# Sorted union of the narratives that have a Twitter or a YouTube frame.
allnar = sorted(set(tmerge_df).union(ymerge_df))

In [None]:
# Row index per event code and column index per narrative for the correlation matrices.
ecmap = dict(zip(alleventcodes, range(len(alleventcodes))))
narmap = dict(zip(nodes, range(len(nodes))))

In [None]:
# Pearson correlation between every GDELT event-code series (rows) and every
# narrative's daily EventCount (columns), for Twitter (tgmat) and YouTube (ygmat).
# Constant or all-zero inputs yield NaN, which a later cell replaces with -2.
tgmat = np.zeros((len(alleventcodes), len(nodes)))
ygmat = np.zeros((len(alleventcodes), len(nodes)))


def _unit_norm(values):
    """Scale `values` to unit L2 norm; an all-zero vector is returned unchanged."""
    norm = np.linalg.norm(values)
    return values / norm if norm > 0 else values


# Each series is normalised once up front instead of once per (code, narrative)
# pair. GDELT series get one zero of padding on each side so they can be sliced
# at different offsets below.
gdelt_unit = {
    code: _unit_norm(np.pad(np.array(eventCountTimeseries[code].to_list()), 1, mode='constant'))
    for code in alleventcodes
}
# Narratives without a frame fall back to an all-zero series (this replaces the
# deprecated dtype-less `pd.Series()` construction and the unguarded lookup).
twitter_unit = {
    nar: _unit_norm(np.array(tmerge_df[nar].EventCount.to_list())) if nar in tmerge_df
    else np.zeros(len(idx))
    for nar in nodes
}
youtube_unit = {
    nar: _unit_norm(np.array(ymerge_df[nar].EventCount.to_list())) if nar in ymerge_df
    else np.zeros(len(idx))
    for nar in nodes
}

for k1 in alleventcodes:
    x = gdelt_unit[k1]
    for k2 in nodes:
        y = twitter_unit[k2]
        z = youtube_unit[k2]
        # NOTE(review): Twitter uses offset 1 into the padded series (same-day
        # alignment) while YouTube uses offset 0 (GDELT shifted by one day).
        # Kept as in the original — confirm the asymmetry is intentional.
        tgmat[ecmap[k1], narmap[k2]] = pearsonr(x[1:1 + len(y)], y)[0]
        ygmat[ecmap[k1], narmap[k2]] = pearsonr(x[0:0 + len(z)], z)[0]

In [None]:
# JSON has no NaN, so undefined correlations are stored in place as the sentinel -2
# (outside Pearson's [-1, 1] range).
for mat in (tgmat, ygmat):
    mat[np.isnan(mat)] = -2

In [None]:
# JSON-ready bundle: the two index maps plus both matrices as nested lists.
corr = dict(
    eventCodeMap=ecmap,
    narrativeMap=narmap,
    twitterGdeltMat=tgmat.tolist(),
    youtubeGdeltMat=ygmat.tolist(),
)

In [None]:
# Quick look at the YouTube/GDELT matrix (the -2 sentinel values are still present here).
sns.heatmap(data=ygmat)

In [None]:
# np.array(corr['youtubeGdeltMat']) == -2

In [None]:
# Persist the correlation bundle; the snapshot suffix `append` goes into the file name.
with open('/data/leidos_extracted/2021CP5/corrmat_{}.json'.format(append), 'w') as f:
    json.dump(corr, f)

In [None]:
# Reload the correlation bundle that was just written for this snapshot.
# The original path pointed at a 2020CP4 file ('corrmat_to_2_14.json'), so the
# event-code / narrative maps taken from `d` below would not match the CP5
# matrices in `corr` that the following cells inspect.
# NOTE(review): confirm that the CP4 file was not being loaded on purpose.
with open('/data/leidos_extracted/2021CP5/corrmat_{}.json'.format(append), 'r') as f:
    d = json.load(f)

In [None]:
# Index maps as stored in the reloaded bundle `d`.
eventCodeMap, narrativeMap = d['eventCodeMap'], d['narrativeMap']

In [None]:
# Back to an array, with the -2 sentinel turned back into NaN.
twitterGdeltMat = np.array(corr['twitterGdeltMat'])
twitterGdeltMat[twitterGdeltMat == -2] = np.nan

In [None]:
# Back to an array, with the -2 sentinel turned back into NaN.
youtubeGdeltMat = np.array(corr['youtubeGdeltMat'])
youtubeGdeltMat[youtubeGdeltMat == -2] = np.nan

In [None]:
# Same matrix with NaN in place of the sentinel, so undefined cells are left blank.
sns.heatmap(data=youtubeGdeltMat)

In [None]:
# Display the narrative -> column-index map taken from the reloaded bundle `d`.
narrativeMap