In [None]:
import pickle

import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import beta
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import rankdata, kendalltau, spearmanr

import plotly.graph_objects as go
from tqdm.notebook import tqdm
from pathlib import Path

# Plot styling for the whole notebook: "paper" context with the "white" style.
sns.set_theme(context="paper", style="white")

In [None]:
# Active users table; the first CSV column becomes the index.
df_users = pd.read_csv(
    "./DSock/Active_Users.csv",
    index_col=0,
)
display(df_users.head())

# Six UTC timestamps at pandas' default frequency, starting from the given instant.
timeline = pd.date_range(start="2020-09-03-03:59:59", periods=6, tz="utc")

In [None]:
# Topic letters used as column prefixes in the attitude-tracking table.
topics = list("abcdefgh")
# NOTE(review): `nums` holds nine values for eight topics; `zip` stops at the
# shorter sequence, so 8 never becomes a key below. The value is kept as-is in
# case other cells rely on it.
nums = list(range(9))
num2topic = dict(zip(nums, topics))
topic2num = dict(zip(topics, nums))
# Fixed: the original printed `topic_num` / `num_topic`, which are undefined
# names (NameError); the mappings are called `topic2num` / `num2topic`.
print(f"{topic2num} {num2topic}")
# Suffixes of the "<topic>_delta_<suffix>" columns read by `get_change`.
deltas = ["10", "21", "32", "43", "54"]

In [None]:
# Attitude-tracking answers, left-joined onto the user table so that every
# active user keeps a row even without a matching evaluation.
df_evals = pd.read_csv("./DSock/human_eval_analysis/attitude_tracking.csv")
df_evals = df_users.merge(
    df_evals,
    how="left",
    left_on="MTurk_ID",
    right_on="id",
).rename(columns={"id_x": "id"})
# Index rows by the same "u<id>" labels used for user nodes in the graph.
df_evals.index = [f"u{uid}" for uid in df_evals["id"]]

display(df_evals.head())


def get_change(u, t):
    """Return the `<t>_delta_<d>` values of row `u`, one per entry of `deltas`."""
    delta_columns = [f"{t}_delta_{d}" for d in deltas]
    return df_evals.loc[u][delta_columns].tolist()


print(get_change("u2337", num2topic[0]))

In [None]:
# Load the raw interaction tables. Rows without both ids are dropped for
# comments, views and likes because they cannot be attached to the graph.
df_posts = pd.read_csv("./DSock/direct_influence/posts_with_senti_and_topic.csv", parse_dates=["createdAt"])
df_comts = pd.read_csv("./DSock/direct_influence/comments_with_senti_and_topic.csv", delimiter=",", parse_dates=["createdAt"]).dropna(subset=["CommenterId", "PostId"])
df_views = pd.read_csv("./DSock/postViews.csv", delimiter="|", parse_dates=["createdAt"]).dropna(subset=["UserId", "PostId"])
df_likes = pd.read_csv("./DSock/Likes.csv", delimiter="|", parse_dates=["createdAt"]).dropna(subset=["UserId", "PostId"])
df_repts = pd.read_csv("./DSock/ReportUsers.csv", delimiter="|", parse_dates=["createdAt"])

print(f"posts: {df_posts.shape}, comments: {df_comts.shape}, views: {df_views.shape}, likes: {df_likes.shape}, reports: {df_repts.shape}")

# A column that contained NaN is read as float and stays float after dropna,
# so f"u{u}" would render as "u12.0" and never match a graph node name.
# Cast every id column that later feeds an f-string node name.
df_likes["UserId"] = df_likes["UserId"].astype(int)
df_likes["PostId"] = df_likes["PostId"].astype(int)

df_comts["CommenterId"] = df_comts["CommenterId"].astype(int)
df_comts["PostId"] = df_comts["PostId"].astype(int)

# Fixed: views received the same dropna treatment but were never cast, so
# float ids would silently drop every view when the graph is built.
df_views["UserId"] = df_views["UserId"].astype(int)
df_views["PostId"] = df_views["PostId"].astype(int)

# Attach the majority annotation labels. `merge` defaults to an inner join,
# so posts/comments without an annotation row are dropped here.
df_annot_posts = pd.read_csv("./DSock/posts_anno_labels.csv")
df_posts = df_posts.merge(df_annot_posts[["post_id", "majority_topic_label", "majority_sent_label"]], left_on="PostID", right_on="post_id")

df_annot_comts = pd.read_csv("./DSock/comments_anno_labels.csv")
df_comts = df_comts.merge(df_annot_comts[["comment_id", "majority_topic_label", "majority_sent_label"]], left_on="id", right_on="comment_id")

In [None]:
# Build the interaction graph G. Node names are a one-letter kind prefix plus
# the table id: u=user, p=post, c=comment, l=like, v=view.

# Map each post node name to the names of the comments written under it.
post_comments = {f"p{p}": [] for p in df_posts["PostID"].values}
for c, p in df_comts[["id", "PostId"]].values:
    if f"p{p}" in post_comments:
        post_comments[f"p{p}"].append(f"c{c}")
G = nx.DiGraph()
# User nodes. The "color" label gives observers precedence over sock puppets;
# everyone else is labelled "part".
for u, uname, obsr, sock in df_users[["id", "username", "isObserver", "isPuppet"]].values:
    G.add_node(f"u{u}", id=u, kind="user", name=uname, observer=(obsr == "t"), sock=(sock == "t"))
    if obsr == "t":
        G.nodes[f"u{u}"]["color"] = "obsr"
    elif sock == "t":
        G.nodes[f"u{u}"]["color"] = "sock"
    else:
        G.nodes[f"u{u}"]["color"] = "part"

# Post nodes, only for authors that exist as user nodes. Note that `user`
# stores the raw id here (not the "u"-prefixed name).
for p, u, t, polar, score, topic in df_posts[["PostID", "AuthorId", "createdAt", "majority_sent_label", "scores", "majority_topic_label"]].values:
    if f"u{u}" in G.nodes:
        G.add_node(f"p{p}", id=p, kind="post", time=t, user=u, polar=polar, score=score, topic=topic)

# Comment nodes, only when both the commenter and the post are in the graph.
# Each comment points at its post; `user` is again the raw id.
for c, u, p, t, polar, score, topic in df_comts[["id", "CommenterId", "PostId", "createdAt", "majority_sent_label", "scores", "majority_topic_label"]].values:
    if f"u{u}" in G.nodes and f"p{p}" in G.nodes:
        G.add_node(f"c{c}", id=c, kind="comt", time=t, user=u, polar=polar, score=score, topic=topic)
        G.add_edge(f"c{c}", f"p{p}", kind="known", time=t, weight=1)
        # G.add_edge(f"p{p}", f"c{c}", kind="known", time=t, weight=1)

# Like nodes sit between the user and the liked post (user -> like -> post).
# They copy the post's topic/polar and use a fixed score of 1. Unlike posts
# and comments, `user` holds the prefixed node name.
for l, u, p, t in df_likes[["id", "UserId", "PostId", "createdAt"]].values:
    if f"u{u}" in G.nodes and f"p{p}" in G.nodes:
        G.add_node(f"l{l}", user=f"u{u}", post=f"p{p}", time=t, id=l, kind="like",
                   topic=G.nodes[f"p{p}"]["topic"], polar=G.nodes[f"p{p}"]["polar"], score=1)
        G.add_edge(f"u{u}", f"l{l}", time=t, weight=1, kind="infer")
        G.add_edge(f"l{l}", f"p{p}", time=t, weight=1, kind="known")

# View nodes follow the same user -> view -> post shape. When the row's
# singleView flag is "t", the view is additionally linked to every comment of
# that post that made it into the graph.
for u, p, t, v, s in df_views[["UserId", "PostId", "createdAt", "id", "singleView"]].values:
    if f"u{u}" in G.nodes and f"p{p}" in G.nodes:
        G.add_node(f"v{v}", user=f"u{u}", post=f"p{p}", time=t, id=v, kind="view",
                   topic=G.nodes[f"p{p}"]["topic"], polar=G.nodes[f"p{p}"]["polar"], score=1)
        G.add_edge(f"u{u}", f"v{v}", time=t, weight=1, kind="infer")
        G.add_edge(f"v{v}", f"p{p}", time=t, weight=1, kind="infer")
        if s == "t":
            for c in post_comments[f"p{p}"]:
                if c in G.nodes:
                    G.add_edge(f"v{v}", c, time=t, weight=1, kind="infer")
# Per-user sets of view node names whose singleView flag is not "t".
user_view = {u: set() for u in G if u[0] == "u"}
for u, p, t, v, s in df_views[["UserId", "PostId", "createdAt", "id", "singleView"]].values:
    if f"u{u}" in G.nodes and f"p{p}" in G.nodes and s != "t":
        user_view[f"u{u}"].add(f"v{v}")

# Per-user sets of view node names whose singleView flag is "t".
single_view = {u: set() for u in G if u[0] == "u"}
for u, p, t, v, s in df_views[["UserId", "PostId", "createdAt", "id", "singleView"]].values:
    if f"u{u}" in G.nodes and f"p{p}" in G.nodes and s == "t":
        single_view[f"u{u}"].add(f"v{v}")

print(f"user view: {sum([len(user_view[u]) for u in user_view])}")
print(f"single view: {sum([len(single_view[u]) for u in single_view])}")

# Link every post/comment to each view its author made at or before the
# content's timestamp; the edge carries the view's time. Only edges between
# existing nodes are added, so the node set does not change during iteration.
for p in tqdm(G):
    if p[0] in ["p", "c"]:
        # Posts/comments store the raw user id, so rebuild the node name.
        u = f"u{G.nodes[p]['user']}"
        for v in user_view[u] | single_view[u]:
            if G.nodes[v]["time"] <= G.nodes[p]["time"]:
                G.add_edge(p, v, time=G.nodes[v]["time"], weight=1, kind="infer")

In [None]:
# Precomputed results for the 0.85-0.5-0.5 parameter combination.
# NOTE(review): pickle executes arbitrary code on load; only open trusted files.
with Path("res/pagerank/0.85-0.5-0.5.pkl").open("rb") as fp:
    d = pickle.load(fp)

In [None]:
# Show the row(s) of the user whose id is 2340.
df_users.loc[df_users["id"] == 2340]

In [None]:
# Ids of every user in the table, in table order.
# Alternative (non-observers only):
# user_list = df_users[df_users["isObserver"] == "f"]["id"].tolist()
user_list = df_users["id"].to_list()

In [None]:
# Node names ("u<id>") of the users flagged as observers.
obsr_list = [f"u{uid}" for uid in df_users.loc[df_users["isObserver"] == "t", "id"]]
pr_alpha = 0.85
beta_a = 0.5
beta_b = 0.5

# Every (observer, t, topic, polar) combination for this parameter setting.
keyq = [
    (node_name, t, topic, polar, pr_alpha, beta_a, beta_b)
    for node_name in obsr_list
    for t in range(1, 6)
    for topic in range(8)
    for polar in (0, 2)
]

# Keep a key only when the observer's change at position t-1 agrees with the
# polarity: a positive change for polar 0, a negative change for polar 2.
keyq = [
    key
    for key in tqdm(keyq)
    if get_change(key[0], num2topic[key[2]])[key[1] - 1] * (1 - key[3]) > 0
]

In [None]:
# Pull the "full_graph" and "exps_graph" entries of every selected key.
property_full = {key: d[key]["full_graph"] for key in tqdm(keyq)}
property_exps = {key: d[key]["exps_graph"] for key in tqdm(keyq)}

# One row per key, one column per field of the entry.
df_full = pd.DataFrame.from_dict(data=property_full, orient="index")
df_exps = pd.DataFrame.from_dict(data=property_exps, orient="index")

In [None]:
# Colour palette for the figures; the bare name below displays it.
use_color = sns.color_palette(palette="Set1")
use_color

In [None]:
# Column-wise mean and standard deviation of both property tables, shown with
# one row per (graph, statistic) pair.
pd.DataFrame(
    {
        (graph_label, stat): getattr(frame, stat)(axis=0)
        for graph_label, frame in (
            ("Full Influence Graph", df_full),
            ("Expression only Influence Graph", df_exps),
        )
        for stat in ("mean", "std")
    }
).T

In [None]:
# Grid of (p, a, b) parameter combinations; the last element varies fastest.
pab = list(
    itertools.product(
        [0.85, 0.9, 0.7, 0.5, 0.3, 0.1],
        [0.5, 0.9, 0.7, 0.3, 0.1],
        [0.5, 0.9, 0.7, 0.3, 0.1],
    )
)
# Node names ("u<id>") of the users flagged as observers.
obsr_list = [f"u{uid}" for uid in df_users.loc[df_users["isObserver"] == "t", "id"]]

In [None]:
def pr2rank(pr_value):
    """Turn per-node scores into a per-user rank.

    Scores of post ("p...") and comment ("c...") nodes are summed per author,
    using the `user` attribute stored on the graph node. Users are then ranked
    on the negated totals, so the largest total gets the smallest rank; ties
    receive the highest rank of their group (`rankdata` method "max").

    Args:
        pr_value: mapping from node name to score.

    Returns:
        dict mapping every id in `user_list` to its rank.
    """
    totals = dict.fromkeys(user_list, 0)
    for node in pr_value:
        if node[0] not in ("p", "c"):
            continue
        author = G.nodes[node]["user"]
        totals[author] += pr_value[node]
    negated_totals = [-totals[uid] for uid in user_list]
    ranks = rankdata(negated_totals, "max")
    return dict(zip(user_list, ranks))

def get_params(p, a, b):
    """Load the results for one (p, a, b) combination and rank users per key.

    Args:
        p, a, b: parameter values; they select `res/pagerank/{p}-{a}-{b}.pkl`
            and form the last three elements of every key.

    Returns:
        A list of `(key, ranks)` pairs, where `ranks` is the output of
        `pr2rank` for that key's "pr" entry, or None when the file is missing.
    """
    pkl_path = Path(f"res/pagerank/{p}-{a}-{b}.pkl")
    if not pkl_path.exists():
        return None
    # NOTE(review): pickle executes arbitrary code on load; only open trusted files.
    with pkl_path.open("rb") as handle:
        results = pickle.load(handle)

    # Keep a key only when the observer's change at position t-1 agrees with
    # the polarity: positive for polar 0, negative for polar 2.
    selected = [
        (node_name, t, topic, polar, p, a, b)
        for node_name in obsr_list
        for t in range(1, 6)
        for topic in range(8)
        for polar in (0, 2)
        if get_change(node_name, num2topic[topic])[t - 1] * (1 - polar) > 0
    ]
    return [(key, pr2rank(results[key]["pr"])) for key in selected]

# One entry per parameter combination; None where no result file exists.
ret = [get_params(p, a, b) for p, a, b in tqdm(pab)]

In [None]:
# Flatten the per-combination results, skipping combinations without a file.
keys = [key for result in ret if result is not None for key, _ranks in result]
values = [ranks for result in ret if result is not None for _key, ranks in result]
rank_dict = dict(zip(keys, values))

# One row per key tuple, one column per user id.
df_inf = pd.DataFrame.from_dict(data=rank_dict, orient="index")
display(df_inf.head())

In [None]:
# Average the ranks over observer, topic and polarity, keeping index levels
# 1, 4, 5, 6 of the key tuple, i.e. (t, p, a, b).
# Fixed: `DataFrame.mean(level=...)` was deprecated in pandas 1.3 and removed
# in 2.0; `groupby(level=...).mean()` is the documented replacement.
# `sort=False` keeps groups in first-appearance order, as the old call did.
avg_inf = df_inf.groupby(level=[1, 4, 5, 6], sort=False).mean()
display(avg_inf.head(10))

In [None]:
# Parameter combinations that actually produced results (the tail of each
# key), listed in the order they appear in `pab`.
pab_done = {key[4:] for key in keys}
pab_done = [combo for combo in pab if combo in pab_done]

In [None]:
# Influence ranks expressed by users in the survey: once for all users and
# once for the active-users file.
df_user_express = pd.read_csv(
    "./DSock/User_Expressed_Influence_Rank.csv"
)
display(df_user_express.head())

df_active_user_express = pd.read_csv(
    "./DSock/Active_User_Expressed_Influence_Rank.csv"
)
display(df_active_user_express.head())

In [None]:
def jaccard(col1, col2, num=10):
    """Jaccard similarity between the top-`num` entries of two Series.

    Args:
        col1, col2: pandas Series; the index labels of their `num` largest
            values are compared.
        num: how many of the largest entries to take from each Series.

    Returns:
        |A & B| / |A | B| for the two label sets, or NaN when both are empty.
    """
    top1 = set(col1.nlargest(num).index)
    top2 = set(col2.nlargest(num).index)
    union = top1 | top2
    if not union:
        return np.nan
    # Fixed: the original ended with a bare `return`, so the computed
    # similarity was discarded and every caller received None.
    return len(top1 & top2) / len(union)

def rank_metrics(col1, col2):
    """Compare two aligned columns with rank correlations and top-k overlap.

    Returns:
        dict keyed by metric name ("spearman", "kendaltau", "jaccard-<k>"),
        each value a dict with "correlation" and "pvalue" entries. The Jaccard
        entries carry no p-value (None).
    """
    metrics = {}
    # Both scipy results are zipped against the two field names.
    for label, stat_fn in (("spearman", spearmanr), ("kendaltau", kendalltau)):
        metrics[label] = dict(zip(["correlation", "pvalue"], stat_fn(col1, col2)))

    for top_k in (10, 20, 30, 40, 50, 100):
        metrics[f"jaccard-{top_k}"] = {
            "correlation": jaccard(col1, col2, top_k),
            "pvalue": None,
        }

    return metrics

# print(jaccard(df_user_express["per_rank"], df_cor[(0.85, 0.5,0.5)].rank(ascending=False, pct=True, method="max")))
# print(rank_metrics(-df_user_express["per_rank"], df_cor[(0.85, 0.5, 0.5)]))

In [None]:
# Show floats with three decimals in displayed frames.
# Fixed: option names are matched as patterns, and the bare "precision" no
# longer identifies a single option on current pandas (the Styler has its own
# precision option), so the full key is required.
pd.set_option("display.precision", 3)

In [None]:
# For every finished (p, a, b) combination, look up the averaged rank of each
# surveyed (user id, survey round) pair; pairs missing from `avg_inf` become None.
df_cor = {
    combo: [
        avg_inf.loc[(survey, *combo), uid] if (survey, *combo) in avg_inf.index else None
        for uid, survey in df_user_express[["id", "survey"]].values
    ]
    for combo in tqdm(pab_done)
}
df_cor = pd.DataFrame(df_cor)
# Put the survey columns next to the looked-up ranks and drop incomplete rows.
df_cor = pd.concat([df_cor, df_user_express], axis=1).dropna()

# Agreement between the survey's `per_rank` and the percentile rank of the
# computed score, per parameter combination.
res_all = {
    combo: rank_metrics(
        df_cor["per_rank"],
        df_cor[combo].rank(ascending=False, pct=True, method="max"),
    )
    for combo in tqdm(pab_done)
}
df_res = pd.concat([pd.DataFrame(res_all[combo]) for combo in tqdm(pab_done)], keys=pab_done)

# Best "correlation" value per metric across all combinations.
print(df_res.loc[(slice(None), slice(None), slice(None), "correlation")].max())
display(df_res)

# Which combination achieves that best value, per metric.
idxmax = df_res.loc[(slice(None), slice(None), slice(None), "correlation")].idxmax()
print(idxmax)

# Full result rows of every winning combination.
display(pd.concat([df_res.loc[(*best, slice(None)), :] for best in set(idxmax)]))

In [None]:
# Same comparison as for `per_rank`, but against the active-user survey file
# and its `mentions` column.
# For every finished (p, a, b) combination, look up the averaged rank of each
# (user id, survey round) pair; pairs missing from `avg_inf` become None.
df_cor = {tup: [avg_inf.loc[(t, *tup), n] if (t, *tup) in avg_inf.index else None for n, t in df_active_user_express[["id", "survey"]].values] for tup in tqdm(pab_done)}

df_cor = pd.DataFrame.from_dict(df_cor)
# Fixed: the rows above are built from `df_active_user_express`, so that is
# the frame they must be concatenated with. The original joined them to
# `df_user_express`, pairing each computed rank with another table's row.
# NOTE(review): assumes `mentions` is a column of the active-users CSV — confirm.
df_cor = pd.concat([df_cor, df_active_user_express], axis=1).dropna(axis=0)

# Agreement between `mentions` and the percentile rank of the computed score.
res_all = {tup: rank_metrics(df_cor["mentions"], df_cor[tup].rank(ascending=False, pct=True, method="max")) for tup in tqdm(pab_done)}
df_res = pd.concat([pd.DataFrame(res_all[tup]) for tup in tqdm(pab_done)], keys=pab_done)
# Best "correlation" value per metric across all combinations.
print(df_res.loc[(slice(None), slice(None), slice(None), "correlation")].max())
display(df_res)

# Which combination achieves that best value, per metric.
idxmax = df_res.loc[(slice(None), slice(None), slice(None), "correlation")].idxmax()
print(idxmax)

# Full result rows of every winning combination.
display(pd.concat([df_res.loc[(*ind, slice(None)), :] for ind in set(idxmax)], axis=0))