Reference:

John R. Ladd, Jessica Otis, Christopher N. Warren, and Scott Weingart, "Exploring and Analyzing Network Data with Python," The Programming Historian 6 (2017), https://doi.org/10.46430/phen0064.

https://programminghistorian.org/en/lessons/exploring-and-analyzing-network-data-with-python

In [15]:
import datetime
import functools
import itertools
from operator import itemgetter
import os
from pathlib import Path
import re

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from scipy import stats
import seaborn as sns

from utils import *

%matplotlib notebook

## Build User Graphs

In [7]:
def get_connections(tweets):
    """Build directed retweet, reply and mention graphs from a tweets table.

    Every edge runs from the tweeting user to the user they interacted
    with and carries the tweet id in its ``tweetid`` edge attribute:

        1. Retweeted: (userid -> retweet_userid, tweetid)
        2. Replied:   (userid -> in_reply_to_userid, tweetid)
        3. Mentioned: (userid -> mentioned_userid, tweetid)

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must provide the columns 'userid', 'retweet_userid',
        'in_reply_to_userid' and 'user_mentions'.  The index has to be
        named 'tweetid', because ``reset_index()`` is what turns it into
        the 'tweetid' column used for the edge attribute.

    Returns
    -------
    tuple
        ``(rt_graph, re_graph, mention_graph)``, each an ``nx.DiGraph``.
    """
    def build_graph(edges, target):
        # All three graphs share the same source column and edge attribute.
        return nx.from_pandas_edgelist(edges,
                                       source='userid',
                                       target=target,
                                       edge_attr='tweetid',
                                       create_using=nx.DiGraph(),
                                       )

    # Local names deliberately avoid `re`, which is the regex module
    # imported at the top of the notebook.
    retweets = tweets[['userid', 'retweet_userid']].dropna().reset_index()
    replies = tweets[['userid', 'in_reply_to_userid']].dropna().reset_index()

    mentions = (tweets['user_mentions']
                # replace empty braces with NaN and drop
                .replace('[]', np.nan)
                .dropna()
                # expand usernames into columns
                .str.replace(r"[\[\]\']", "", regex=True)
                .str.split(',', expand=True)
                # melt wide table into duplicated tweets
                .reset_index()
                .melt(id_vars=['tweetid'],
                      value_name='mentioned_userid')
                # clean up
                .astype({'mentioned_userid': 'string'})
                .drop(columns=['variable'])
                # Splitting "a, b" on ',' leaves " b"; strip so the same
                # user is not turned into two different nodes.
                # (assumes the column holds a list repr such as "['a', 'b']")
                .assign(mentioned_userid=lambda frame:
                        frame['mentioned_userid'].str.strip())
                .join(tweets['userid'], on='tweetid')
                .dropna()
               )

    rt_graph = build_graph(retweets, 'retweet_userid')
    re_graph = build_graph(replies, 'in_reply_to_userid')
    mention_graph = build_graph(mentions, 'mentioned_userid')
    return rt_graph, re_graph, mention_graph

### Centrality

Centrality measures capture the importance of a node's position in the network by considering:
- degree, on the assumption that an important node will have many connections,
- closeness, on the assumption that important nodes are close to other nodes, and
- betweenness, on the assumption that important nodes are well situated and connect other nodes.

In [5]:
def add_node_attributes(G):
    """Compute several node-level metrics and store them on the nodes of G.

    The graph is modified in place; nothing is returned.  Each node gets
    the attributes 'out_degree', 'in_degree', 'eigenvector',
    'betweenness' and 'degree_centrality'.

    Parameters
    ----------
    G : nx.DiGraph
        Directed graph (``out_degree``/``in_degree`` are used).
    """
    # degree
    nx.set_node_attributes(G,
                           dict(G.out_degree(G.nodes())),
                           'out_degree')
    nx.set_node_attributes(G,
                           dict(G.in_degree(G.nodes())),
                           'in_degree')

    # eigenvector centrality
    nx.set_node_attributes(G,
                           nx.eigenvector_centrality(G),
                           'eigenvector')

    # betweenness centrality
    nx.set_node_attributes(G,
                           nx.betweenness_centrality(G),
                           'betweenness')

    # degree centrality -- computed on G itself; the earlier version read
    # an unrelated global (`re`) and so annotated G with another graph's
    # values.
    nx.set_node_attributes(G,
                           nx.degree_centrality(G),
                           'degree_centrality')
    

In [1]:
def top_nodes(data_dict, limit=20, show=False):
    """Rank the entries of an attribute dictionary by value, highest first.

    Returns the first `limit` (node, value) pairs as a list.  When `show`
    is true the ranking is also printed, one pair per line.
    """
    ranked = list(data_dict.items())
    ranked.sort(key=itemgetter(1), reverse=True)
    ranked = ranked[:limit]

    if not show:
        return ranked

    print("Top {} nodes:".format(limit))
    for pair in ranked:
        print(pair)
    return ranked
        
def highest_value(attribute_dict):
    """Find the node with the largest value in an attribute dictionary.

    Ties on the value are broken in favour of the largest node key, which
    matches sorting (value, node) pairs in descending order.

    Return:
        tuple(node, value)

    Raises:
        ValueError: if `attribute_dict` is empty.
    """
    if not attribute_dict:
        raise ValueError("attribute_dict is empty")
    # Compare on (value, node) so the result is deterministic on ties.
    return max(attribute_dict.items(),
               key=lambda item: (item[1], item[0]))

### Clustering and transitivity

Measuring the tendency for nodes to cluster together or for edges to form triangles: this corresponds to measures of the extent to which the users interacting with one particular user tend to interact with each other as well. Transitivity weights nodes with a large degree higher.

The clustering coefficient is calculated as the number of triangles connected to node $i$ divided by the number of sets of two edges connected to node $i$ (node triples). 

The transitivity coefficient is calculated as 3 times the number of triangles in the network, divided by the number of connected triples of nodes in the network.

- These measures give insight into how users tend to form groups characterized by dense connections.

In [None]:
def add_directed_attributes(G):
    """Store graph-level 'density' and 'diameter' in ``G.graph`` (in place).

    NOTE(review): nx.diameter is documented to raise when some node cannot
    reach another -- confirm the graphs passed in are (strongly) connected.
    """
    graph_attrs = G.graph

    # density: existing edges relative to the edges of a fully connected graph
    graph_attrs['density'] = nx.density(G)

    # diameter: the longest shortest-path distance between any two nodes
    graph_attrs['diameter'] = nx.diameter(G)

    
def add_undirected_attributes(G):
    """Store metrics of the undirected view of G in ``G.graph`` (in place).

    Sets:
        'connected_components': number of connected components of the
            undirected copy of G.
        'avg_clust': mean clustering coefficient over all nodes of the
            undirected copy (0.0 for a graph without nodes).
    """
    # Work on an undirected copy of the argument; the earlier version read
    # an unassigned local named `graph` here and failed immediately.
    undirected = G.to_undirected()

    G.graph['connected_components'] = nx.number_connected_components(undirected)
    # TODO: optionally store the largest connected component as well
    # (the original draft had this step commented out).

    clustering_coeffs = nx.clustering(undirected)
    if clustering_coeffs:
        avg_clust = sum(clustering_coeffs.values()) / len(clustering_coeffs)
    else:
        # Empty graph: avoid dividing by zero.
        avg_clust = 0.0
    G.graph['avg_clust'] = avg_clust
    

In [None]:
def to_df(G):
    """Return the node attributes of G as a DataFrame.

    The result has one row per node and one column per attribute.
    """
    attrs_by_node = {node: attrs for node, attrs in G.nodes(data=True)}
    # Built with nodes as columns, then flipped so nodes become the rows.
    by_column = pd.DataFrame(attrs_by_node)
    return by_column.T
    
def to_file(G, file):
    """Export graph to .gexf file.

    Thin wrapper that hands `G` and `file` straight to ``nx.write_gexf``;
    nothing is returned.
    """
    nx.write_gexf(G, file)
    
def to_txt(G, file, attributes=('betweenness', 'closeness', 'eigenvector')):
    """Export node attributes to a comma-separated .txt file.

    Writes one line per node: the node id followed by the value of each
    attribute named in `attributes`, in that order.  An attribute a node
    does not have is written as an empty field.

    Parameters
    ----------
    G : graph
        Anything whose ``nodes(data=True)`` yields (node, attribute dict)
        pairs, e.g. a networkx graph.
    file : str or path-like
        Destination path; an existing file is overwritten.
    attributes : sequence of str, optional
        Attribute names to export.  The default keeps the column layout
        of the earlier draft (betweenness, closeness, eigenvector).
    """
    # The context manager closes the file even if a write fails.
    with open(file, 'w', encoding='utf-8') as f:
        for node, attrs in G.nodes(data=True):
            row = [node] + [attrs.get(name, '') for name in attributes]
            f.write(','.join(map(str, row)))
            f.write('\n')

def get_matrix(G):
    """Return the adjacency matrix of G as a ``numpy.matrix``.

    ``nx.to_numpy_matrix`` no longer exists in networkx 3.x, so the
    matrix is built from ``nx.to_numpy_array`` and wrapped with
    ``np.asmatrix`` to keep the return type callers already rely on.
    """
    return np.asmatrix(nx.to_numpy_array(G))

In [13]:
def visualize(G, format_dict=None):
    """Draw G with a spring layout and a boxed text label beside each node.

    Parameters
    ----------
    G : networkx graph
        Graph to draw.
    format_dict : dict, optional
        Keyword arguments forwarded to ``nx.draw_networkx`` (for example
        'node_color', 'edge_color', 'width').  When omitted, a default
        style is used that hides the built-in labels, because labels are
        drawn separately with an offset.
    """
    if format_dict is None:
        format_dict = {'font_size': 16,
                       'width': 3,
                       'edge_color': 'grey',
                       'node_color': 'purple',
                       'with_labels': False,
                      }

    fig, ax = plt.subplots(figsize=(10, 8))

    pos = nx.spring_layout(G, k=2)

    # Unpack the options: passing them as a single `kwargs=` keyword means
    # none of the styling is applied (and newer networkx rejects it).
    nx.draw_networkx(G,
                     pos,
                     ax=ax,
                     **format_dict,
                    )
    # Offset labels
    for key, value in pos.items():
        x, y = value[0]+.135, value[1]+.045
        ax.text(x, y,
                s=key,
                bbox=dict(facecolor='red', alpha=0.25),
                horizontalalignment='center', fontsize=13)

## Example: Iran 12/2020 Campaign

In [3]:
# Load the users and tweets datasets from their data directories.
# NOTE(review): UsersData and TweetsData presumably come from the
# `from utils import *` at the top of the notebook -- confirm.
users = UsersData('../data/users')
tweets = TweetsData('../data/tweets')

In [6]:
# Take the full tweets table and keep only the rows whose campaign is
# 'iran202012'.
df = tweets.df.loc[:][:]
tweets_df = df[df['campaign'] == 'iran202012']
# Build the retweet, reply and mention graphs for that campaign.
# NOTE(review): the name `re` rebinds the `re` module imported at the top
# of the notebook; any later regex use of `re` would hit the graph instead.
rt, re, mentions = get_connections(tweets_df)

In [26]:
# Select the retweet graph as the graph analysed in the cells below.
graph = rt

In [18]:
# Size of the selected graph.
out = "Nodes: {}".format(graph.number_of_nodes())
print(out)
out = "Edges: {}".format(graph.number_of_edges())
print(out)

# Total degree (in + out for a directed graph) of every node.
degrees = [val for (node, val) in graph.degree()]

out = "Maximum degree: {}".format(np.max(degrees))
print(out)
out = "Minimum degree: {}".format(np.min(degrees))
print(out)
out = "Average degree: {:.1f}".format(np.mean(degrees))
print(out)
# SciPy >= 1.11 returns a scalar mode for 1-D input, older releases a
# length-1 array; np.atleast_1d makes the lookup work on both.
mode_degree = np.atleast_1d(stats.mode(degrees)[0])[0]
out = "Mode: {}".format(mode_degree)
print(out)

Nodes: 92
Edges: 243
Maximum degree: 19
Minimum degree: 1
Average degree: 5.3
Mode: 2


In [20]:
# Clustering summary of the selected graph.
avg_clustering = nx.average_clustering(graph)
out = "Average clustering coefficient: {}".format(avg_clustering)
print(out)

graph_transitivity = nx.transitivity(graph)
out = "Transitivity: {}".format(graph_transitivity)
print(out)

Average clustering coefficient: 0.3277545397909848
Transitivity: 0.676875


In [23]:
# For each centrality measure, keep the full {node: score} mapping and the
# (node, score) pair with the highest score.
graph_centrality = nx.degree_centrality(graph)
max_de = max(graph_centrality.items(), key=lambda pair: pair[1])

graph_closeness = nx.closeness_centrality(graph)
max_clo = max(graph_closeness.items(), key=lambda pair: pair[1])

graph_betweenness = nx.betweenness_centrality(graph,
                                              normalized=True,
                                              endpoints=False)
max_bet = max(graph_betweenness.items(), key=lambda pair: pair[1])

# Each pair is (node, score); the messages want the score first.
out = "Max degree centrality: {:.2f}, for node {}".format(*reversed(max_de))
print(out)
out = "Max closeness centrality: {:.2f}, for node {}".format(*reversed(max_clo))
print(out)
out = "Max betweenness centrality: {:.2f}, for node {}".format(*reversed(max_bet))
print(out)

Max degree centrality: 0.21, for node v6groR3jb3Pkm5X9ccSwgoPnmlZzKEkx5bsc1XQHb0=
Max closeness centrality: 0.22, for node y3KkURpZFjT+WeW9e6BcxBYRg311F8fz1eJ647ahQc=
Max betweenness centrality: 0.16, for node y3KkURpZFjT+WeW9e6BcxBYRg311F8fz1eJ647ahQc=


In [32]:
def node_dict_scatter(dict1,
                      dict2,
                      path="",
                      ylabel="",
                      xlabel="",
                      title="",
                      line=False):
    """Plot every node's label at the position (dict1[node], dict2[node]).

    Parameters
    ----------
    dict1, dict2 : dict
        Node -> numeric value mappings giving the x and y coordinates.
        Every key of `dict1` must also be in `dict2` (KeyError otherwise).
    path : str, optional
        Where to save the figure; nothing is saved when empty.
    ylabel, xlabel, title : str, optional
        Axis labels and figure title.
    line : bool, optional
        When true (and there are at least two nodes), add a dashed
        least-squares line fitted with ``np.polyfit``.
    """
    fig = plt.figure(figsize=(7, 7))
    ax = fig.add_subplot(111)

    # Pair the two dictionaries by node, so a difference in key order or
    # key set can never silently mismatch x and y values.
    nodes = sorted(dict1)
    xdata = [dict1[node] for node in nodes]
    ydata = [dict2[node] for node in nodes]

    for node, x, y in zip(nodes, xdata, ydata):
        ax.text(x=x,
                y=y,
                s=str(node),
                color="b")

    # Text artists do not drive autoscaling, so the limits must always be
    # set -- not only when a fit line is requested.
    if xdata:
        ax.set_xlim((0.0, max(xdata)+(.15*max(xdata))))
        ax.set_ylim((0.0, max(ydata)+(.15*max(ydata))))

    if line and len(xdata) > 1:
        # use NumPy to calculate the best fit
        slope, yint = np.polyfit(xdata, ydata, 1)
        xline = np.asarray(ax.get_xlim())
        ax.plot(xline, slope*xline + yint, ls='--', color='b')

    # Add labels and save
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if path:
        fig.savefig(path)

In [None]:
# In-degree and out-degree of every node, keyed by node.
dict1 = {node: degree for node, degree in graph.in_degree(graph.nodes())}
dict2 = {node: degree for node, degree in graph.out_degree(graph.nodes())}

#node_dict_scatter(dict1, dict2)