In [1]:
from notebook_utils import setup
import pandas as pd
import networkx as nx
from collections import defaultdict
import time
import networkit

setup()

import matplotlib.style as style

In [2]:
DATE = "16-dec"
DATA_DIR = "../data/{}/".format(DATE)
EXPORT_DIR = "../data/dataframes/{}/".format(DATE)

In [2]:
import pickle 
with open("./cached_graph.pickle", "rb") as f:
    cached_graph = pickle.load(f)

In [3]:
cached_graph.keys()

dict_keys(['all_user_clusters', 'graph', 'node_id_map', 'clustering_directed_unweighted', 'clustering_directed_weighted', 'clustering_undirected_unweighted'])

In [4]:
node_id_map = cached_graph["node_id_map"]

In [5]:
cached_graph["graph"].nodes[node_id_map["240454812"]]

{'user_id': '240454812', 'label': 'GenFlynn', 'followers': 950567}

In [7]:
print("Total nodes in graph:", len(cached_graph["graph"].nodes))
print("Clustered users:", len(cached_graph["clustering_directed_unweighted"][0]))

Total nodes in graph: 1887736
Clustered users: 1885600


In [6]:
node_community_map = cached_graph["clustering_directed_weighted"][0]

def get_graph_with_communities(G, node_community_map):
    nodes_to_keep = []
    '''Add community to node attributes'''
    for n, c in node_community_map.items():
        if (c < 5):
            G.nodes[n]['community'] = c
            nodes_to_keep.append(n)
    return G.subgraph(nodes_to_keep)

graph_with_communities = get_graph_with_communities(cached_graph["graph"], node_community_map).copy
print(len(graph_with_communities.nodes))
print(graph_with_communities.nodes[0])

1697944
{'user_id': '1097618307778150400', 'label': 'PacificReports', 'followers': 10043, 'community': 2}


In [9]:
import pickle 
with open("./graph_with_communities.pickle", "wb") as f:
    pickle.dump(graph_with_communities.copy(), f)

In [2]:
import pickle 
with open("./graph_with_communities.pickle", "rb") as f:
    graph_with_communities = pickle.load(f)

In [31]:
def get_largest_subgraph(graph):
    undirected_graph = graph.to_undirected()
    connected_components = [graph.subgraph(c) for c in nx.connected_components(undirected_graph)]
    largest_component = max(connected_components, key=len)
    return graph.subgraph(largest_component.nodes)

In [3]:
nx_to_nk = {}

def to_networkit_graph(nx_graph):
    id_map = dict((idx, node) for (idx, node) in enumerate(nx_graph.nodes()))

    if (nx_graph not in nx_to_nk):
        nx_to_nk[nx_graph] = networkit.nxadapter.nx2nk(nx_graph, weightAttr="weight")

    return nx_to_nk[nx_graph], id_map

In [4]:
def split_community_graphs (G):
    l_nodes = []
    r_nodes = []

    for n, data in G.nodes(data=True):
        if (data["community"] == 0):
            l_nodes.append(n)
        else:
            r_nodes.append(n)
    return G.subgraph(l_nodes), G.subgraph(r_nodes)

l_graph, r_graph = split_community_graphs(graph_with_communities)

In [5]:
print("Left graph")
print("Nodes: {}, edges: {}".format(len(l_graph.nodes), len(l_graph.edges)))
print("Right graph")
print("Nodes: {}, edges: {}".format(len(r_graph.nodes), len(r_graph.edges)))

Left graph
Nodes: 860976, edges: 3247717
Right graph
Nodes: 836968, edges: 11342929


In [6]:
def top_closeness(nx_graph, k=15):
    nk_graph, id_map = to_networkit_graph(nx_graph)

    start = time.time()
    top_closeness = networkit.centrality.TopCloseness(nk_graph, k, True, True)
    top_closeness.run()
    end = time.time()


    top_scores_by_node_id = {}
    for idx, score in zip(top_closeness.topkNodesList(), top_closeness.topkScoresList()):
        top_scores_by_node_id[id_map[idx]] = score
    print("Time elapsed: {}s".format(end - start))
    return top_scores_by_node_id

In [7]:
l_closeness = top_closeness(l_graph, 10000)
l_closeness

Time elapsed: 572.6662971973419s


{119: 0.18513846984211368,
 204: 0.17855526132884464,
 271: 0.17735512015740376,
 284: 0.1768810406179521,
 449: 0.17315885302412262,
 1626: 0.17294491445814608,
 2146: 0.17221641804619442,
 263: 0.17096122428456217,
 431: 0.16922017598390288,
 201: 0.16726690365648858,
 2397: 0.1671504173427108,
 319: 0.16657699655790767,
 1811: 0.16470459149670305,
 356: 0.16418341423327304,
 3046: 0.1637922081248099,
 340: 0.16293364842475336,
 250: 0.1629217287035964,
 172: 0.16259234214481713,
 294: 0.16224374020989257,
 2298: 0.16200465652431606,
 2333: 0.16188764824036156,
 437: 0.16154742813322914,
 287: 0.16022205083991742,
 799: 0.16004895235635697,
 1615: 0.1599876526582014,
 2021: 0.15980511579931334,
 167: 0.1596560175270101,
 2734: 0.15925934506481776,
 3787: 0.15811690232281034,
 634: 0.1580303475907022,
 351: 0.1577513741338133,
 2266: 0.15722078638578585,
 260: 0.156891984327749,
 288: 0.15649500654094944,
 3931: 0.1560967501773229,
 544: 0.15604526548274592,
 3039: 0.15549047211384104

In [8]:
r_closeness = top_closeness(r_graph, 10000)
r_closeness

Time elapsed: 2367.8155460357666s


{4: 0.45482667262739335,
 78: 0.3980714661306678,
 19: 0.3670284768322196,
 18: 0.36400116249653847,
 28: 0.36090868944206167,
 95: 0.36051767491195835,
 32: 0.35038338195855334,
 17: 0.3477682896684846,
 141: 0.345512206298572,
 11: 0.34160602684245495,
 41: 0.33869387414683777,
 981: 0.3364463048861217,
 9: 0.3363739748410226,
 650: 0.3362558712483766,
 14: 0.33414765369322513,
 31: 0.3336322420932986,
 58: 0.3310384443714837,
 27: 0.33047216484411435,
 20: 0.3302012048691551,
 613: 0.32991163074105995,
 598: 0.3289625574382371,
 612: 0.32660085881512024,
 608: 0.32572018344010956,
 605: 0.32472738355581404,
 473: 0.32440942305153836,
 2283: 0.3239820263899757,
 30: 0.3227745466797134,
 23: 0.3219139051892004,
 21: 0.3218865825340888,
 715: 0.3203937017887462,
 767: 0.32019856540004293,
 3082: 0.3195981580688716,
 1101: 0.31833293866033724,
 148: 0.3167387708080374,
 15: 0.31613009822801325,
 26: 0.3157441796190069,
 13: 0.3153183662488026,
 98: 0.3149142193295207,
 25: 0.31420469589

In [11]:
import pickle 
with open("./closeness.pickle", "wb") as f:
    pickle.dump({
        "l_closeness": l_closeness,
        "r_closeness": r_closeness
    }, f)

In [13]:
import pickle 
with open("./closeness.pickle", "rb") as f:
    closeness = pickle.load(f)

In [10]:
df_users_with_clustering = pd.read_pickle("./df_users_with_clustering.pickle")
df_users_with_clustering.head()

Unnamed: 0_level_0,protected,friends,created_at,name,friends_count,verified,followers_count,location,followed_cnts,handle,url,clustering_directed_unweighted,clustering_directed_weighted,clustering_undirected_unweighted
datastore_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1334881524664193027,False,,2020-12-04T15:25:47Z,venton talley,8,False,3,,0,TalleyVenton,,-1,-1,-1
1334881805024043010,False,,2020-12-04T15:28:30Z,🇺🇸Freedom Fighter🇺🇸,1028,False,41,MAGA Country,0,RiSeAgAiN888,,35043,2,0
1334882729859682310,False,,2020-12-04T15:30:26Z,Hypocritical Liberal Wannabe TB Aidan,0,False,2,,0,LiberalTb,,-1,-1,-1
1334883015265226752,False,,2020-12-04T15:32:10Z,Conversing Post,128,False,2,,0,ConversingPost,,-1,-1,-1
1334883045988474886,False,,2020-12-04T15:31:56Z,Ajmira sultana,62,False,17,,0,SultanaAjmira,,-1,-1,-1


In [160]:
for n, score in r_closeness.items():
    print(graph_with_communities.nodes[n])

{'user_id': '25073877', 'label': 'realDonaldTrump', 'followers': 87364085, 'community': 1}
{'user_id': '187680645', 'label': 'LLinWood', 'followers': 216826, 'community': 1}
{'user_id': '770781940341288960', 'label': 'RudyGiuliani', 'followers': 849467, 'community': 1}
{'user_id': '26487169', 'label': 'LouDobbs', 'followers': 2263546, 'community': 1}
{'user_id': '4041824789', 'label': 'RSBNetwork', 'followers': 181041, 'community': 1}
{'user_id': '240454812', 'label': 'GenFlynn', 'followers': 950567, 'community': 1}
{'user_id': '586707638', 'label': 'SidneyPowell1', 'followers': 482623, 'community': 1}
{'user_id': '2853461537', 'label': 'ScottAdamsSays', 'followers': 596514, 'community': 1}
{'user_id': '16989178', 'label': 'JamesOKeefeIII', 'followers': 787461, 'community': 1}
{'user_id': '18266688', 'label': 'TomFitton', 'followers': 1210705, 'community': 1}


In [161]:
for n, score in l_closeness.items():
    print(graph_with_communities.nodes[n])

{'user_id': '15952856', 'label': 'AriBerman', 'followers': 160928, 'community': 0}
{'user_id': '3622368202', 'label': 'JohnFetterman', 'followers': 73038, 'community': 0}
{'user_id': '14529929', 'label': 'jaketapper', 'followers': 2793144, 'community': 0}
{'user_id': '32871086', 'label': 'kylegriffin1', 'followers': 933613, 'community': 0}
{'user_id': '233842454', 'label': 'justinamash', 'followers': 462845, 'community': 0}
{'user_id': '184860130', 'label': 'TimAlberta', 'followers': 103413, 'community': 0}
{'user_id': '39155029', 'label': 'mkraju', 'followers': 462443, 'community': 0}
{'user_id': '20118080', 'label': 'BrendanKeefe', 'followers': 5449, 'community': 0}
{'user_id': '32551884', 'label': 'alanfeuer', 'followers': 22760, 'community': 0}
{'user_id': '22129280', 'label': 'jimsciutto', 'followers': 426186, 'community': 0}
{'user_id': '16563015', 'label': 'bluestein', 'followers': 62521, 'community': 0}
{'user_id': '807095', 'label': 'nytimes', 'followers': 47663855, 'community

In [13]:
fast_graphs = [networkit.nxadapter.nx2nk(g, weightAttr="weight") for g in community_subgraphs]

In [49]:
community_subgraphs[4].nodes()[2336]

{'user_id': '98294131', 'label': '1VAFI', 'followers': 2240, 'community': 4}

In [72]:
centralities = nx.algorithms.centrality.closeness_centrality(community_subgraphs[4].reverse(), wf_improved=True)

In [74]:
max_cent = 0
for n, c in centralities.items():
    if (c > max_cent):
        max_cent = c
max_cent



0.1650349316983805

In [77]:
centralities[id_map[460]]

0.1650349316983805

In [67]:
len(community_subgraphs[4].nodes)

23414

In [81]:
start = time.time()
Closeness = networkit.centrality.Closeness(fast_graphs[2], True, 1)
Closeness.run()
# Closeness.ranking()[:10]
end = time.time()
print("Time taken", end - start)

In [37]:
community_graphs[1].nodes[node_id_map["25073877"]]

{'user_id': '25073877',
 'label': 'realDonaldTrump',
 'followers': 87364085,
 'community': 1}

In [15]:
def time_func(func):
    def timed(*args):
        start = time.time()
        ret = func(*args)
        end = time.time()
        print("Time taken", end - start)
        return ret
    return timed

In [16]:
timed_centrality = time_func(nx.algorithms.centrality.closeness_centrality)

In [72]:
timed_centrality(community_graphs[4], 1014567)

Time taken 0.5843698978424072


0.0002485021918669892

In [17]:
print("Nodes:", len(community_graphs[4].nodes))
print("Edges", len(community_graphs[4].edges))
print("Normal")
commmunity_graphs_4_closeness = timed_centrality(community_graphs[4])

NameError: name 'community_graphs' is not defined

In [76]:
print(len(community_graphs[4].nodes), len(community_graphs[4].edges))

23414 47261


In [77]:
commmunity_graphs_3_closeness = timed_centrality(community_graphs[3])

Time taken 12.953757762908936


In [78]:
print(len(community_graphs[3].nodes), len(community_graphs[3].edges))

33587 104176


In [79]:
print("Nodes:", len(community_graphs[2].nodes))
print("Edges", len(community_graphs[2].edges))
commmunity_graphs_2_closeness = timed_centrality(community_graphs[2])

Nodes: 342184
Edges 3759790


KeyboardInterrupt: 

In [56]:
node_closeness = nx.algorithms.centrality.closeness_centrality(community_graphs[4], 1014567)
node_closeness

0.0002485021918669892

In [44]:
node_closeness = nx.algorithms.centrality.closeness_centrality(community_graphs[1], node_id_map["25073877"])
node_closeness

0.0013523511882467215

In [None]:
import numpy as np
import scipy.sparse
import scipy.sparse.csgraph

In [30]:
def get_largest_subgraph(graph):
    undirected_graph = graph.to_undirected()
    connected_components = [graph.subgraph(c) for c in nx.connected_components(undirected_graph)]
    largest_component = max(connected_components, key=len)
    return graph.subgraph(largest_component.nodes)

In [12]:
largest_subgraph = get_largest_subgraph(cached_graph["graph"].to_undirected())

In [13]:
print("Nodes in largest subgraph:", len(largest_subgraph.nodes))

Nodes in largest subgraph: 1854896


In [6]:
def print_cluster_stats(df_users, cluster_type, N=5, top_users=10):
    community_node_map = cached_graph[cluster_type][1]
    for c in range(N): 
        cluster_users = df_users[df_users[cluster_type] == c]
        cluster_nodes = len(community_node_map[c])
        print("Cluster {} (user ids in cluster: {} ({:,.1f}%), identified users: {})".format(
            c, 
            cluster_nodes, 
            (cluster_nodes / len(cached_graph["graph"].nodes)) * 100,
            cluster_users.shape[0])
        )
        print("Top users")
        print(cluster_users[["handle", "followers_count"]][:top_users])
        print("Follower count:")
        print("- mean {:,.1f}".format(cluster_users["followers_count"].mean()))
        print("- min {:,}".format(cluster_users["followers_count"].min()))
        print("- max {:,}".format(cluster_users["followers_count"].max()))
        print("- median {:,.0f}".format(cluster_users["followers_count"].median()))
        print()

def cluster_users(df_users, node_id_map, node_community_map):
    clusters = []
    missing_in_graph = 0
    missing_assignment = 0
    for user_id, user in df_users.iterrows():
        if user_id in node_id_map:
            node_id = node_id_map[user_id]
            if node_id in node_community_map:
                cluster = node_community_map[node_id]
                clusters.append(cluster)
            else:
                missing_assignment += 1
                clusters.append(-1)    
        else:
            missing_in_graph += 1
            clusters.append(-1)
    print("{} missing assignment".format(missing_assignment))
    print("{} missing in graph".format(missing_in_graph))
    return clusters

In [20]:
df_all_users = pd.read_pickle(EXPORT_DIR + 'df_users.pickle')

In [21]:
# Cluster users
df_all_users["clustering_directed_unweighted"] = cluster_users(df_all_users, node_id_map, cached_graph["clustering_directed_weighted"][0])
df_all_users["clustering_directed_weighted"] = cluster_users(df_all_users, node_id_map, cached_graph["clustering_directed_unweighted"][0])

2122 missing assignment
671282 missing in graph
2122 missing assignment
671282 missing in graph


In [28]:
df_all_users["clustering_undirected_unweighted"] = cluster_users(df_all_users, node_id_map, cached_graph["clustering_undirected_unweighted"][0])

2122 missing assignment
671282 missing in graph


In [7]:
df_all_users_with_clustering = pd.read_pickle("./df_users_with_clustering.pickle")

In [42]:
def get_all_users_by_cluster(cluster_type):
    # All users
    all_users_by_cluster = df_all_users.sort_values([cluster_type, "followers_count"], ascending=False).reset_index().set_index(cluster_type)
    all_users_by_cluster["user"] = all_users_by_cluster["datastore_id"]
    return all_users_by_cluster

all_users_by_cluster = get_all_users_by_cluster("cluster_directed_unweighted")

In [117]:
# Top users
df_weekly_top_users = pd.read_pickle(EXPORT_DIR + 'df_weekly_top_users.pickle')
df_top_users = df_weekly_top_users.groupby(["user", "handle", "name", "followers_count", "verified"]).sum().reset_index().set_index("user")
df_top_users["cluster"] = cluster_users(df_top_users, node_id_map, node_community_map)
top_users_by_cluster = df_top_users.sort_values(["cluster", "followers_count"], ascending=[True, False]).reset_index().set_index("cluster")

18 missing assignment
21 missing in graph


In [8]:
print_cluster_stats(df_all_users_with_clustering.sort_values("followers_count", ascending=False), "clustering_directed_weighted", 5, 25)

Cluster 0 (user ids in cluster: 860976 (45.6%), identified users: 253438)
Top users
                       handle  followers_count
datastore_id                                  
428333                 cnnbrk         59420316
759251                    CNN         50395181
807095                nytimes         47663855
5402612           BBCBreaking         46468749
1339835893     HillaryClinton         29737638
742143               BBCWorld         29485728
5988062          TheEconomist         25123658
1652541               Reuters         22518102
16303106        StephenAtHome         19138480
3108351                   WSJ         18173551
14293310                 TIME         17528521
2467791        washingtonpost         16493705
91478624               Forbes         16368043
28785486                  ABC         16013155
51241574                   AP         14507714
18948541       SethMacFarlane         13985869
37034483                 ndtv         13913844
95023423            Ube

In [67]:
df_all_users.to_pickle("./df_users_with_clustering.pickle")

In [138]:
df_recent_tweets = pd.read_pickle(EXPORT_DIR + 'df_recent_tweets.pickle')

In [None]:
df_recent_tweets["community"] = df_recent_tweets.apply(lambda x: )

In [172]:
cached_graph["graph"].edges(node_id_map["90573676"])

InEdgeDataView([(185, 423932), (3019, 423932), (4205, 423932), (211, 423932), (78, 423932), (6600, 423932), (271, 423932)])

In [19]:
node_id_map["90573676"]

423932

In [7]:
def print_node_edges(user_id):
    out_edges = defaultdict(lambda: 0)
    in_edges = defaultdict(lambda: 0)
    print("Out edges")
    for (source, target) in cached_graph["graph"].out_edges(node_id_map[user_id]):
        cluster = node_community_map[target]
        if (cluster < 5):
            #print(cached_graph["graph"].nodes[target])
            out_edges[cluster] += 1
        else:
            out_edges["fringe"] += 1
    print(dict(out_edges))
    print("In edges")
    for (source, target) in cached_graph["graph"].in_edges(node_id_map[user_id]):
        cluster = node_community_map[source]
        if (cluster < 5):
            #print(cached_graph["graph"].nodes[target])
            in_edges[cluster] += 1
        else:
            in_edges["fringe"] += 1
    print(dict(in_edges))

In [65]:
# Jerry Saltz (Left-leaning)
print_node_edges("90573676")

Out edges
{0: 3, 2: 4}
In edges
{0: 6, 1: 1}


In [66]:
# tveitdal (Left-leaning)
print_node_edges("75742264")

Out edges
{0: 3, 2: 4, 'fringe': 1}
In edges
{2: 1}


In [42]:
# Press Sec (Right-leaning)
print_node_edges("818927131883356161")

Out edges
{2: 2, 470: 1}
In edges
{}


In [61]:
# Jim Jordan (Cluster 2, right-leaning)
print_node_edges("18166778")

Out edges
{2: 1}
In edges
{}


In [63]:
# Toby Turner (Cluster 2, right-leaning)
print_node_edges("6054912")

Out edges
{2: 2}
In edges
{}


In [None]:
# Press Sec (Cluster 2, right-leaning)
print_node_edges("")

In [10]:
def sample_prominent_nodes(graph):
    high_follower_nodes = [x for x,y in graph.nodes(data=True) if 'followers' in y and y['followers']>25000]
    high_follower_graph = graph.subgraph(high_follower_nodes)

    prominent_nodes = [node for (node, degrees) in graph_with_communities.degree() if degrees > 100]

    return high_follower_graph.subgraph(prominent_nodes)

In [11]:
prominent_graph = sample_prominent_nodes(graph_with_communities)

In [12]:
print(len(prominent_graph.nodes))
print(len(prominent_graph.edges))

4324
156723


In [29]:
print(len(prominent_graph.nodes))
print(len(prominent_graph.edges))

8630
186116


In [25]:
import random

def sample_connected_graph(graph, k = 100000):
    sampled_nodes = random.sample(graph_with_communities.nodes, k)
    sampled_graph = graph_with_communities.subgraph(sampled_nodes + list(prominent_graph.nodes())))

    connected_nodes_sample = get_largest_subgraph(sampled_graph.to_undirected())

    return graph_with_communities.subgraph(connected_nodes_sample)

In [26]:
sampled_graph = sample_connected_graph(graph_with_communities, 50000)
print(len(sampled_graph.nodes))
print(len(sampled_graph.edges))

44474
456372


In [32]:
sampled_graph = sample_connected_graph(graph_with_communities, 150000)
print(len(sampled_graph.nodes))
print(len(sampled_graph.edges))

58854
980496


In [1]:
sampled_graph.nodes[node_id_map["25073877"]]

NameError: name 'sampled_graph' is not defined

In [27]:
filename = "../data/graphs/08-jan-sampled-graph-with-communities.gexf"
nx.write_gexf(sampled_graph, filename)
print("Exported to {}".format(filename))

Exported to ../data/graphs/08-jan-sampled-graph-with-communities-smaller.gexf
