In [1]:
from notebook_utils import setup, load_tweet_df, load_media_df
import numpy as np
from scipy import stats
from operator import itemgetter

setup()

In [2]:
from data_tools import load_crawled_terms, load_parsed_data

crawled_terms, crawled_hashtags, crawled_phrases = load_crawled_terms("../keywords-3nov.txt", split_hashtags=True)

cast_cols = {
    "tweet_count": "int32",
    "quote_count": "int32" 
}
for term in crawled_terms:
    cast_cols[term] = "Sparse[int8]"

tweet_df = load_parsed_data('../data/14-nov/parsed_tweets.json', exclude_cols={
    "cleaned_text", 
    "entities",
    "replyTo",
    "replyTo_user",
    "text", 
    "last_retweeted", 
    "place", 
    "processed",
    "media", 
    "isDeleted"
}, cast_cols=cast_cols, verbose=True)

tweet_df.info()

Loading 2696807 json lines
(4%): 100000 lines in ../data/14-nov/parsed_tweets.json processed (2.5799708366394043 sec)
(7%): 200000 lines in ../data/14-nov/parsed_tweets.json processed (3.266613006591797 sec)
(11%): 300000 lines in ../data/14-nov/parsed_tweets.json processed (2.7051432132720947 sec)
(15%): 400000 lines in ../data/14-nov/parsed_tweets.json processed (3.0368881225585938 sec)
(19%): 500000 lines in ../data/14-nov/parsed_tweets.json processed (3.1282520294189453 sec)
(22%): 600000 lines in ../data/14-nov/parsed_tweets.json processed (3.4758729934692383 sec)
(26%): 700000 lines in ../data/14-nov/parsed_tweets.json processed (3.7959330081939697 sec)
(30%): 800000 lines in ../data/14-nov/parsed_tweets.json processed (2.4272072315216064 sec)
(33%): 900000 lines in ../data/14-nov/parsed_tweets.json processed (4.138207912445068 sec)
(37%): 1000000 lines in ../data/14-nov/parsed_tweets.json processed (2.437023878097534 sec)
(41%): 1100000 lines in ../data/14-nov/parsed_tweets.json

In [3]:
retweet_df = load_parsed_data('../data/14-nov/parsed_retweets.json')

retweet_df.info()

Loading 8044982 json lines
(1%): 100000 lines in ../data/14-nov/parsed_retweets.json processed (0.41941189765930176 sec)
(2%): 200000 lines in ../data/14-nov/parsed_retweets.json processed (0.6255569458007812 sec)
(4%): 300000 lines in ../data/14-nov/parsed_retweets.json processed (0.43156933784484863 sec)
(5%): 400000 lines in ../data/14-nov/parsed_retweets.json processed (0.4770238399505615 sec)
(6%): 500000 lines in ../data/14-nov/parsed_retweets.json processed (0.4291038513183594 sec)
(7%): 600000 lines in ../data/14-nov/parsed_retweets.json processed (0.4238567352294922 sec)
(9%): 700000 lines in ../data/14-nov/parsed_retweets.json processed (0.3710439205169678 sec)
(10%): 800000 lines in ../data/14-nov/parsed_retweets.json processed (0.37425804138183594 sec)
(11%): 900000 lines in ../data/14-nov/parsed_retweets.json processed (0.46216392517089844 sec)
(12%): 1000000 lines in ../data/14-nov/parsed_retweets.json processed (0.4916369915008545 sec)
(14%): 1100000 lines in ../data/14-

In [4]:
cast_cols = {
    "followed_cnts": "int32",
    "friends_count": "int32",
    "followers_count": "int32" 
}

user_df = load_parsed_data('../data/14-nov/parsed_users.json', cast_cols=cast_cols, exclude_cols={"description"}, index_col="datastore_id")

user_df.info()

Loading 806800 json lines
(12%): 100000 lines in ../data/14-nov/parsed_users.json processed (1.343425989151001 sec)
(25%): 200000 lines in ../data/14-nov/parsed_users.json processed (1.1430909633636475 sec)
(37%): 300000 lines in ../data/14-nov/parsed_users.json processed (1.0825738906860352 sec)
(50%): 400000 lines in ../data/14-nov/parsed_users.json processed (1.0962812900543213 sec)
(62%): 500000 lines in ../data/14-nov/parsed_users.json processed (1.0596449375152588 sec)
(74%): 600000 lines in ../data/14-nov/parsed_users.json processed (1.1406810283660889 sec)
(87%): 700000 lines in ../data/14-nov/parsed_users.json processed (1.1018080711364746 sec)
(99%): 800000 lines in ../data/14-nov/parsed_users.json processed (1.0810511112213135 sec)
Done loading ../data/14-nov/parsed_users.json
806800 lines in ../data/14-nov/parsed_users.json processed (9.147326946258545 sec)
<class 'pandas.core.frame.DataFrame'>
Index: 806800 entries, 1199502560928907265 to 147207892
Data columns (total 11 c

# Build Retweet graph


In [345]:
import networkx as nx



def add_user_to_graph(graph, user_id):
    graph.add_node(
        user_id,
        label=user_df.at[user_id, "handle"],
        followers=user_df.at[user_id, "followers_count"]
    )

def build_graph(N = 10000, directed=False):
    graph = nx.DiGraph() if directed else nx.Graph()
    print("Building graph...")
    known = 0
    unknown = 0
    dup_edges = set()
    dups = 0
    i = 0

    retweet_rows = zip(retweet_df["retweetedFrom_user"][:N], retweet_df["user"])

    for retweet_author, retweet_user in retweet_rows:
        if not graph.has_node(retweet_author):
            if retweet_author in user_df.index:
                add_user_to_graph(graph, retweet_author)
                known += 1
            else:
                graph.add_node(retweet_author)
                unknown += 1
        if not graph.has_node(retweet_user):
            if retweet_user in user_df.index:
                add_user_to_graph(graph, retweet_user)
                known += 1
            else:
                graph.add_node(retweet_user)
                unknown += 1
        dup_key = str(retweet_user) + str(retweet_author)
        if (dup_key not in dup_edges):
            graph.add_edge(retweet_user, retweet_author)
            dup_edges.add(dup_key)
        else: 
            dups += 1
        
        if (N > 200000 and i % 100000 == 0):
            print("{}/{}".format(i, N))
        i += 1

    print("{} known users".format(known))
    print("{} unknown users".format(unknown))
    print("{} duplicate edges".format(dups))
    print("Done building graph")
    return graph

In [269]:
from cdlib.algorithms import infomap
def get_largest_subgraph(graph):
    connected_components = [graph.subgraph(c) for c in nx.connected_components(graph)]
    return max(connected_components, key=len)

def print_graph_stats(graph):
    print(f"There are {graph.number_of_nodes()} nodes and {graph.number_of_edges()} \
edges in the Graph")

def detect_communities(graph):
    clustering = infomap(graph)
    print("Found {} communities".format(len(clustering.communities)))
    return clustering.to_node_community_map(), clustering.communities


def set_node_community(G, communities):
    '''Add community to node attributes'''
    for c, v_c in enumerate(communities):
        for v in v_c:
            # Add 1 to save 0 for external edges
            G.nodes[v]['community'] = c + 1

def set_edge_community(G):
    '''Find internal edges and add their community to their attributes'''
    for v, w, in G.edges:
        if G.nodes[v]['community'] == G.nodes[w]['community']:
            # Internal edge, mark with community
            G.edges[v, w]['community'] = G.nodes[v]['community']
        else:
            # External edge, mark as 0
            G.edges[v, w]['community'] = 0

def get_color(i, r_off=1, g_off=1, b_off=1):
    '''Assign a color to a vertex.'''
    r0, g0, b0 = 0, 0, 0
    n = 16
    low, high = 0.1, 0.9
    span = high - low
    r = low + span * (((i + r_off) * 3) % n) / (n - 1)
    g = low + span * (((i + g_off) * 5) % n) / (n - 1)
    b = low + span * (((i + b_off) * 7) % n) / (n - 1)
    return (r, g, b)



def add_communities_to_graph(graph, communities):
    set_node_community(graph, communities)
    set_edge_community(graph)

def plot_graph_with_communities(graph):
    '''Should only be done for a smaller graph'''
    plt.rcParams.update(plt.rcParamsDefault)
    plt.rcParams.update({'figure.figsize': (15, 10)})
    plt.style.use('default')
    pos = nx.spring_layout(graph, k=0.01, iterations=50)

    # Set community color for internal edges
    external = [(v, w) for v, w in graph.edges if graph.edges[v, w]['community'] == 0]
    internal = [(v, w) for v, w in graph.edges if graph.edges[v, w]['community'] > 0]
    internal_color = ["black" for e in internal]
    node_color = [get_color(graph.nodes[v]['community']) for v in graph.nodes]
    # external edges
    nx.draw_networkx(
        graph, 
        pos=pos, 
        node_size=0, 
        edgelist=external, 
        edge_color="silver",
        node_color=node_color,
        alpha=0.8, 
        with_labels=False)
    # internal edges
    nx.draw_networkx(
        graph, pos=pos, 
        edgelist=internal, 
        edge_color=internal_color,
        node_color=node_color,
        alpha=0.05, 
        with_labels=False)

In [238]:
large_graph = build_graph(200000)
undirected = large_graph.to_undirected()
print_graph_stats(large_graph)
print("...largest subgraph")
largest_subgraph = get_largest_subgraph(undirected)
print_graph_stats(largest_subgraph)

Building graph...
284461 known users
61853 unknown users
34213 duplicate edges
Done building graph
There are 53598 nodes and 165787 edges in the Graph
...largest subgraph
There are 48909 nodes and 163129 edges in the Graph


In [243]:
add_communities_to_graph(largest_subgraph, communities)

In [248]:
# Filter down graph to two largest communities
community_graph = largest_subgraph.copy()
for node, v in community_map.items():
    if (v[0] > 1):
        community_graph.remove_node(node)

print_graph_stats(community_graph)

There are 46522 nodes and 159670 edges in the Graph


# Export Partial Graph for Gephi Drawing

In [249]:
filename = "../data/graphs/graph-with-communities.gexf"
nx.write_gexf(community_graph, filename)
print("Exported to {}".format(filename))

Exported to ../data/graphs/graph-with-communities.gexf


# Cluster Full Graph

In [347]:
size = retweet_df.shape[0]
print("Building graph out of {} retweets".format(size))
full_graph = build_graph(size)
print_graph_stats(full_graph)

Building graph out of 8044982 retweets
Building graph...
0/8044982
100000/8044982
200000/8044982
300000/8044982
400000/8044982
500000/8044982
600000/8044982
700000/8044982
800000/8044982
900000/8044982
1000000/8044982
1100000/8044982
1200000/8044982
1300000/8044982
1400000/8044982
1500000/8044982
1600000/8044982
1700000/8044982
1800000/8044982
1900000/8044982
2000000/8044982
2100000/8044982
2200000/8044982
2300000/8044982
2400000/8044982
2500000/8044982
2600000/8044982
2700000/8044982
2800000/8044982
2900000/8044982
3000000/8044982
3100000/8044982
3200000/8044982
3300000/8044982
3400000/8044982
3500000/8044982
3600000/8044982
3700000/8044982
3800000/8044982
3900000/8044982
4000000/8044982
4100000/8044982
4200000/8044982
4300000/8044982
4400000/8044982
4500000/8044982
4600000/8044982
4700000/8044982
4800000/8044982
4900000/8044982
5000000/8044982
5100000/8044982
5200000/8044982
5300000/8044982
5400000/8044982
5500000/8044982
5600000/8044982
5700000/8044982
5800000/8044982
5900000/804498

In [356]:
print("...largest subgraph")
largest_subgraph_community_map, largest_subgraph_communities = detect_communities(largest_subgraph)
print("Community distribution")
[len(c) for c in communities]

...largest subgraph
Found 126 communities
Community distribution


[645191,
 560052,
 22539,
 18045,
 11377,
 4353,
 4018,
 3889,
 3736,
 2754,
 1417,
 1250,
 1154,
 1154,
 1106,
 812,
 646,
 443,
 438,
 416,
 359,
 342,
 337,
 280,
 264,
 263,
 208,
 192,
 182,
 176,
 170,
 153,
 142,
 142,
 136,
 128,
 121,
 118,
 114,
 105,
 99,
 93,
 87,
 84,
 84,
 79,
 78,
 77,
 72,
 65,
 61,
 60,
 59,
 59,
 58,
 57,
 53,
 52,
 51,
 49,
 49,
 48,
 44,
 40,
 40,
 38,
 36,
 34,
 30,
 30,
 30,
 26,
 26,
 25,
 25,
 24,
 24,
 23,
 22,
 21,
 20,
 20,
 19,
 17,
 16,
 15,
 15,
 13,
 13,
 12,
 12,
 12,
 12,
 11,
 11,
 11,
 10,
 10,
 10,
 10,
 9,
 9,
 8,
 7,
 7,
 7,
 6,
 6,
 5,
 5,
 5,
 4,
 4,
 1]

In [314]:
print_graph_stats(full_graph)

There are 1321586 nodes and 6678842 edges in the Graph


In [348]:
connected_components = [full_graph.subgraph(c) for c in nx.connected_components(full_graph)]
print("There are {} connected components".format(len(connected_components)))

There are 13887 connected components


In [349]:
full_graph_community_map, full_graph_communities = detect_communities(full_graph)

Found 12630 communities


In [355]:
print("...largest subgraph")
largest_subgraph = get_largest_subgraph(full_graph)
print_graph_stats(largest_subgraph)

...largest subgraph
There are 1291086 nodes and 6660681 edges in the Graph


In [357]:
add_communities_to_graph(largest_subgraph, largest_subgraph_communities)

In [362]:
filename = "../data/graphs/full-graph.gexf"
nx.write_gexf(full_graph, filename)
print("Exported to {}".format(filename))

Exported to ../data/graphs/full-graph.gexf


In [363]:
filename = "../data/graphs/full-connected-community-graph.gexf"
nx.write_gexf(largest_subgraph, filename)
print("Exported to {}".format(filename))

Exported to ../data/graphs/full-connected-community-graph.gexf


In [358]:
user_df_with_clustering = user_df.copy()

def get_cluster(user_id):
    if str(user_id) in largest_subgraph_community_map and len(largest_subgraph_community_map[user_id]) > 0:
        return community_map[str(user_id)][0]
    else:
        return np.nan

user_df_with_clustering["infomap_cluster"] = user_df_with_clustering.apply(lambda x: get_cluster(x.name), axis=1)

## Export dataframe

In [359]:
print(user_df_with_clustering.shape)
user_df_with_clustering["infomap_cluster"].value_counts(dropna=False)

(806800, 12)


NaN      414331
0.0      257148
1.0      122868
3.0        3451
2.0        2385
          ...  
53.0          2
98.0          2
94.0          2
44.0          2
107.0         1
Name: infomap_cluster, Length: 114, dtype: int64

In [361]:
user_df_with_clustering.to_csv("../data/dataframes/user_df_with_clustering.csv", index_label="datastore_id")