Reference:

John R. Ladd, Jessica Otis, Christopher N. Warren, and Scott Weingart, "Exploring and Analyzing Network Data with Python," The Programming Historian 6 (2017), https://doi.org/10.46430/phen0064.

https://programminghistorian.org/en/lessons/exploring-and-analyzing-network-data-with-python

In [1]:
import datetime
import functools
import itertools
from operator import itemgetter
import os
from pathlib import Path
import re

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from scipy import stats
import seaborn as sns

from utils import *

%matplotlib notebook

## Build User Graphs

In [18]:
iran['user_mentions']

tweetid
1271764746983952390            []
907991739713118208             []
1277789135470768129            []
137282411095539712             []
1314271851988873251            []
                          ...    
962844249149538304     [10228272]
492029494153052161             []
544193600033013760             []
243819549844193280             []
948848104362663936     [10228272]
Name: user_mentions, Length: 560571, dtype: object

In [39]:
def get_connections(tweets):
    """Generate three sets of connections ((user1, user2), tweetid):
        1. Retweeted: ((['userid'], ['retweet_userid'])['tweetid'])
        2. Replied: ((['userid'], ['in_reply_to_userid']), ['tweetid'])
        3. Mentioned: ((['userid'], ['mentioned_userid']), ['tweetid'])
    """
    rt = tweets[['userid', 'retweet_userid']].dropna().reset_index()
    re = tweets[['userid', 'in_reply_to_userid']].dropna().reset_index()
    
    mentions = (iran['user_mentions']
                # expand usernames into columns
                .explode()
                # melt wide table into duplicated tweets
                .reset_index()
                .melt(id_vars=['tweetid'],
                      value_name='mentioned_userid')
                # clean up
                .astype({'mentioned_userid':'string'})
                .drop(columns=['variable'])
                .join(iran['userid'], on='tweetid')
                .dropna()
               )
    
    rt_graph = nx.from_pandas_edgelist(rt, 
                                       source='userid', 
                                       target='retweet_userid', 
                                       edge_attr='tweetid',
                                       create_using=nx.DiGraph(),
                                      )
    re_graph = nx.from_pandas_edgelist(re, 
                                       source='userid', 
                                       target='in_reply_to_userid', 
                                       edge_attr='tweetid',
                                       create_using=nx.DiGraph(),
                                      )
    mention_graph = nx.from_pandas_edgelist(mentions, 
                                           source='userid', 
                                           target='mentioned_userid', 
                                           edge_attr='tweetid',
                                           create_using=nx.DiGraph(),
                                           )
    return rt_graph, re_graph, mention_graph

In [3]:
def add_node_attributes(G):
    """Compute various graph metrics and add to node attributes.
    """
    # degree
    nx.set_node_attributes(G,
                           dict(G.out_degree(G.nodes())), 
                           'out_degree')
    nx.set_node_attributes(G,
                           dict(G.in_degree(G.nodes())), 
                           'in_degree')
    
    # eigenvector centrality
    nx.set_node_attributes(G, 
                           nx.eigenvector_centrality(G),
                           'eigenvector')
    
    # betweenness centrality
    nx.set_node_attributes(G,
                           nx.betweenness_centrality(G), 
                           'betweenness')
    
    # degree centrality
    nx.set_node_attributes(G,
                           nx.degree_centrality(re),
                           'degree_centrality')

In [4]:
def top_nodes(data_dict, limit=20, show=False):
    """Return and optionally print the top n nodes in an attribute
    dictionary.
    """
    nodes = sorted(data_dict.items(), 
                   key=itemgetter(1), 
                   reverse=True)[:limit]
    if show:
        print("Top {} nodes:".format(limit))
        for d in nodes:
            print(d)
        
    return nodes
        
def highest_value(attribute_dict):     
    """Find the node with largest value in an attribute dictionary.
    
    Return: 
        tuple(node, value)
        
    """     
    # Ordered tuple  
    attr_items = [(b,a) for (a,b) in attribute_dict.iteritems()]    
    # Sort in descending order     
    attr_items.sort()     
    attr_items.reverse()     
    return tuple(reversed(attr_items[0]))

In [5]:
def add_directed_attributes(G):
    # ratio of number of graph edges to number of edges if fully connected
    G.graph['density'] = nx.density(G)
    
    # diameter = maximum distance between any pair of nodes
    G.graph['diameter'] = nx.diameter(G)

    
def add_undirected_attributes(G):
    graph = graph.to_undirected()
    
    G.graph['connected_components'] = nx.number_connected_components(graph)
    #G.graph['largest_subgraph'] = max(nx.connected_component_subgraphs(graph), key=len)
    
    clustering_coeffs = nx.clustering(graph)
    G.graph['avg_clust'] = (sum(clustering_coeffs.values()) 
                             / len(clustering_coeffs))
    

### Export

In [6]:
def to_df(G):
    """Export graph to Pandas dataframe with attributes as columns."""

    return pd.DataFrame(dict(G.nodes(data=True))).T
    
def to_file(G, file):
    """Export graph to .gexf file."""
    nx.write_gexf(G, file)
    
def to_txt(G, file):
    """Export node attributes to .txt file."""
    results = [(k, bet_cen[k], clo_cen[k], eig_cen[k]) for k in range(len(nodes))]
    
    f = open(file,'w')
    for item in results:     
        f.write(','.join(map(str,item)))     
        f.write('\n')
    f.close()

def get_matrix(G):
    return nx.to_numpy_matrix(G)

In [7]:
def visualize(G, format_dict=None):
    if format_dict == None:
        format_dict = {'font_size':16,
                       'width':3,
                       'edge_color':'grey',
                       'node_color':'purple',
                       'with_labels':False,
                      }

    fig, ax = plt.subplots(figsize=(10, 8))

    pos = nx.spring_layout(G, k=2)

    nx.draw_networkx(G, 
                     pos,
                     ax=ax,
                     kwargs=format_dict,
                    )
    # Offset labels
    for key, value in pos.items():
        x, y = value[0]+.135, value[1]+.045
        ax.text(x, y,
                s=key,
                bbox=dict(facecolor='red', alpha=0.25),
                horizontalalignment='center', fontsize=13)

## Example: Iran 12/2020 Campaign

In [9]:
users = UsersData('data/users')
tweets = TweetsData('data/tweets')

In [12]:
tweets_df = tweets.df.loc[:][:]
users_df = users.df.loc[:][:]

In [13]:
iran = tweets_df[tweets_df['campaign'] == 'iran202012']
iran_users = users_df[users_df['campaign'] == 'iran202012']

In [17]:
iran.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 560571 entries, 1271764746983952390 to 948848104362663936
Data columns (total 36 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   userid                    560571 non-null  string        
 1   user_display_name         560571 non-null  string        
 2   user_screen_name          560571 non-null  string        
 3   user_reported_location    417523 non-null  string        
 4   user_profile_description  530518 non-null  string        
 5   user_profile_url          338535 non-null  string        
 6   follower_count            560571 non-null  int64         
 7   following_count           560571 non-null  int64         
 8   account_creation_date     560571 non-null  datetime64[ns]
 9   account_language          560571 non-null  string        
 10  tweet_language            444758 non-null  string        
 11  tweet_text                560571 no

In [40]:
rt, re, mentions = get_connections(iran)

In [44]:
def print_graph_properties(graph):
    out = "Nodes: {}".format(graph.number_of_nodes())
    print(out)
    out = "Edges: {}".format(graph.number_of_edges())
    print(out)

    degrees = [val for (node, val) in graph.degree()]

    out = "Maximum degree: {}".format(np.max(degrees))
    print(out)
    out = "Minimum degree: {}".format(np.min(degrees))
    print(out)
    out = "Average degree: {:.1f}".format(np.mean(degrees))
    print(out)
    out = "Mode: {}".format(stats.mode(degrees)[0][0])
    print(out)
    
print_graph_properties(rt)

Nodes: 92
Edges: 319
Maximum degree: 26
Minimum degree: 1
Average degree: 6.9
Mode: 2


In [57]:
add_node_attributes(rt)

In [None]:
def top_nodes(data_dict, limit=20, show=False):
    """Return and optionally print the top n nodes in an attribute
    dictionary.
    """
    nodes = sorted(data_dict.items(), 
                   key=itemgetter(1), 
                   reverse=True)[:limit]
    if show:
        print("Top {} nodes:".format(limit))
        for d in nodes:
            print(d)
        
    return nodes

In [80]:
print('Total retweets: {}'.format(len(iran[iran['is_retweet'] == True])))

Total retweets: 100446


In [82]:
print('Retweets with userid: {}'. format(len(iran[iran['retweet_userid'].notna()])))

Retweets with userid: 17981


In [69]:
iran.groupby([['is_retweet','userid']]).size()

userid
+JkWMulEtCyTrcFDRO2XLv9EOdGHDl0GB9cdZUWgtA=        78
+fwTi4Wv1fs5sua3wZXqtBWBMMAy5IKNd5euWlP8Kuk=       95
0gTQ2cDCHFpYXKO+G367F1HBrPLupiuPjXuvmp9UL+w=      456
0hVjtURHlBEHZhn22rNDf98r+8VUXV3gi1bxvAhrZo=      1566
0zCl5U0pYu0gEmK3JtjO5fbnxEj6pO9GUgH52Q6yg0E=    13304
                                                ...  
y3KkURpZFjT+WeW9e6BcxBYRg311F8fz1eJ647ahQc=       526
z3nCVBEHiIbcBhhxU2mOz5iWK4a7sUdGmRSPFM16G0=      7878
zFlH+vHUhiZD2qvvCLYyiU76qOha9+iYxCn1NVmzw=         41
zTUtu8WZ3RwxnwgMsYXnTU107UXsn4MQU5wrg8IDOU=         5
zk4khaX7A3XhXVndteeiXLe4ma8xR7bYMBCOhCt68j8=        1
Name: is_retweet, Length: 209, dtype: int64

In [85]:
most_retweets = top_nodes(nx.get_node_attributes(rt, 'out_degree'),limit=20)
most_retweets

[('79Tf6XH3DwjdUWGO4aQWghSj5G2esetBnePoOBB3wYM=', 15),
 ('z3nCVBEHiIbcBhhxU2mOz5iWK4a7sUdGmRSPFM16G0=', 14),
 ('BYaaZkKxjVQjhsnn9REZ0UcFoEHl+tKnzJ+0Hv+Pg=', 13),
 ('v6groR3jb3Pkm5X9ccSwgoPnmlZzKEkx5bsc1XQHb0=', 13),
 ('6qhrzJLryTiE7VlJkmY+cKE5VsITiaFwMA7s3Dr5I=', 12),
 ('rrkP3RNDiwWqLySjcoqdFlz7DrFX7RCwwDSxyDqaU=', 11),
 ('y3KkURpZFjT+WeW9e6BcxBYRg311F8fz1eJ647ahQc=', 10),
 ('KDPuEWEH9vUzPJJYOWrnRKwAkD475uTOwGgDadl13M=', 10),
 ('qb7oI6mpyssEK1+AulE9g7rOChgVTfVOIDhXM8tA=', 10),
 ('hkHAOHxXGt95jKyclLf70qjOMCdw3E7RyP1EYC0pYJU=', 10),
 ('Bl9IUxp6GStTkiNGbhfBJM9xu85e3Y9BQdNd97GFAI=', 9),
 ('Hz3n1y2y3sLOknEGxPWczYyCSeZrz15JLPTcJAd6oAo=', 9),
 ('QXyUcPxvoQ+uzkqCj6O+AT9nUf3Avh0JTvwTG9GncQk=', 9),
 ('ihPCMQ32xEzpD35Et9IH4HO21XKiWdSJreVg+pHT5o=', 9),
 ('A2ufeZxtC32uuqgSKfm1EiJPs4uMP3aM7AfHUMsTM=', 8),
 ('etDaWEjMPleueDrpkatPUSCApc6yU8W95+yZYWzVxSY=', 8),
 ('LubegkQHPmV20BdQonItq03iWY5unAkvlI2M7PNKvjI=', 8),
 ('1099221870530961408', 8),
 ('1067814896706994176', 7),
 ('mpY+AZI80Nu61VS3o4Zm+UXTjaxr

In [84]:
most_retweeted = top_nodes(nx.get_node_attributes(rt, 'in_degree'),limit=20)
most_retweeted

[('Bl9IUxp6GStTkiNGbhfBJM9xu85e3Y9BQdNd97GFAI=', 16),
 ('etDaWEjMPleueDrpkatPUSCApc6yU8W95+yZYWzVxSY=', 16),
 ('1067814896706994176', 15),
 ('BYaaZkKxjVQjhsnn9REZ0UcFoEHl+tKnzJ+0Hv+Pg=', 13),
 ('rrkP3RNDiwWqLySjcoqdFlz7DrFX7RCwwDSxyDqaU=', 13),
 ('v6groR3jb3Pkm5X9ccSwgoPnmlZzKEkx5bsc1XQHb0=', 12),
 ('79Tf6XH3DwjdUWGO4aQWghSj5G2esetBnePoOBB3wYM=', 11),
 ('M4AN5Ed68gBwmVS3DcPbwuXv7cJRYDwsCvmJ84BYSB0=', 11),
 ('0gTQ2cDCHFpYXKO+G367F1HBrPLupiuPjXuvmp9UL+w=', 11),
 ('1099221870530961408', 11),
 ('mpY+AZI80Nu61VS3o4Zm+UXTjaxrkE08xU0nO3JQ=', 10),
 ('6qhrzJLryTiE7VlJkmY+cKE5VsITiaFwMA7s3Dr5I=', 10),
 ('y3KkURpZFjT+WeW9e6BcxBYRg311F8fz1eJ647ahQc=', 10),
 ('XCVp3AesS42sNbyxhBsKM62AylqUJyipi3laS53gY=', 10),
 ('EOkiPW6kT+GE5A2wU5mqPe7hiK8s7Cr0tfCybFsz2Sg=', 8),
 ('z3nCVBEHiIbcBhhxU2mOz5iWK4a7sUdGmRSPFM16G0=', 7),
 ('ihPCMQ32xEzpD35Et9IH4HO21XKiWdSJreVg+pHT5o=', 7),
 ('HcrkAj2Z5laESkAd3aQjE1dgoP6r9xDN6LLzEIHzX4=', 7),
 ('qb7oI6mpyssEK1+AulE9g7rOChgVTfVOIDhXM8tA=', 7),
 ('Llwo+0XebgvqnGE1UiwiaJqfQWQ

In [89]:
print([userid for userid in most_retweeted if userid in most_retweets])

[('BYaaZkKxjVQjhsnn9REZ0UcFoEHl+tKnzJ+0Hv+Pg=', 13), ('y3KkURpZFjT+WeW9e6BcxBYRg311F8fz1eJ647ahQc=', 10)]


### Clustering and transitivity

Measuring the tendency for nodes to cluster together or for edges to form triangles: this corresponds to measures of the extent to which the users interacting with one particular user tend to interact with each other as well. Transitivity weights nodes with a large degree higher.

The clustering coefficient is calculated as the number of triangles connected to node $i$ divided by the number of sets of two edges connected to node $i$ (node triples). 

The transitivity coefficient is calculated as 3 times the number of triangles in the network, divided by the number of connected triples of nodes in the network.

- insight into how users tend to create groups characterized by dense connections

In [20]:
out = "Average clustering coefficient: {}".format(nx.average_clustering(graph))
print(out)
out = "Transitivity: {}".format(nx.transitivity(graph))
print(out)

Average clustering coefficient: 0.3277545397909848
Transitivity: 0.676875


### Centrality

Capture the importance of a node's position in the network considering: 
- degree, on the assumption that an important node will have many connections,
- closeness, on the assumption that important nodes are close to other nodes, and
- betweenness, on the assumption that important nodes are well situated and connect other nodes.

In [23]:
graph_centrality = nx.degree_centrality(graph)
max_de = max(graph_centrality.items(), key=itemgetter(1))
graph_closeness = nx.closeness_centrality(graph)
max_clo = max(graph_closeness.items(), key=itemgetter(1))
graph_betweenness = nx.betweenness_centrality(graph, normalized=True, endpoints=False)
max_bet = max(graph_betweenness.items(), key=itemgetter(1))

out = "Max degree centrality: {:.2f}, for node {}".format(max_de[1], max_de[0])
print(out)
out = "Max closeness centrality: {:.2f}, for node {}".format(max_clo[1], max_clo[0])
print(out)
out = "Max betweenness centrality: {:.2f}, for node {}".format(max_bet[1], max_bet[0])
print(out)

Max degree centrality: 0.21, for node v6groR3jb3Pkm5X9ccSwgoPnmlZzKEkx5bsc1XQHb0=
Max closeness centrality: 0.22, for node y3KkURpZFjT+WeW9e6BcxBYRg311F8fz1eJ647ahQc=
Max betweenness centrality: 0.16, for node y3KkURpZFjT+WeW9e6BcxBYRg311F8fz1eJ647ahQc=
