# The Social Network

In [7]:
import pandas as pd
import numpy as np
import networkx as nx
import community
import matplotlib.pyplot as plt
from operator import itemgetter

## Building a directed network

In [8]:
# Load the raw tweet export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type DtypeWarnings on load.
# NOTE(review): the truncated output under this cell looks like the tail of
# such a warning - confirm, and consider passing explicit dtypes instead.
df = pd.read_csv('all_tweets.csv', sep=',', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Peek at the text of the second row to sanity-check the load
df['text'].iloc[1]

'RT @CanadaFP: Canada announces support for #elections and #democracy in #Ukraine. https://t.co/KntgUdxg1N https://t.co/3DUXdERHBR'

In [10]:
# Keep only the rows whose language column is exactly 'en'
is_english = df['language'] == 'en'
df = df[is_english]

In [11]:
# Time span covered by the data set.
# Series.min()/.max() are vectorised and skip missing values, whereas the
# builtin min()/max() iterate in Python and raise if NaN is mixed with strings.
print(df['date'].min())
print(df['date'].max())

2018-12-05 06:36:02
2018-12-06 20:36:00


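Subset the data into (user, referenced user) pairs — one table each for retweets, quotes and replies. These pairs become the edges of the network.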

In [12]:
# For each interaction type, keep only the rows where the referenced user is
# present, and only the two columns that describe the (source, target) pair.
has_retweet = df['retweet_user_name'].notna()
has_quote = df['quoted_user_name'].notna()
has_reply = df['reply_user_name'].notna()

nodes_retweets = df.loc[has_retweet, ['user_name', 'retweet_user_name']]
nodes_quotes = df.loc[has_quote, ['user_name', 'quoted_user_name']]
nodes_reply = df.loc[has_reply, ['user_name', 'reply_user_name']]

In [13]:
# (rows, columns) of the retweet pair table
nodes_retweets.shape

(198221, 2)

In [14]:
# Collapse repeated (retweeter, retweeted) pairs into one row each, with the
# number of occurrences stored in RT_count.
df_RTs = (
    nodes_retweets
    .groupby(['user_name', 'retweet_user_name'])
    .size()
    .reset_index(name='RT_count')
)

### Network graph connected only by retweets

In [15]:
# Directed retweet graph: one edge retweeter -> retweeted user, with the
# number of retweets stored as the edge 'weight'.
N = nx.DiGraph()
N.add_weighted_edges_from(
    zip(
        df_RTs.user_name,
        df_RTs.retweet_user_name,
        df_RTs.RT_count.tolist(),  # plain Python ints, not numpy scalars
    )
)

# remove self RTs
# Graph.selfloop_edges() was removed in networkx 2.4; use the module-level
# function. Materialise the edges first because removing them while the
# generator is still being consumed mutates the graph during iteration.
N.remove_edges_from(list(nx.selfloop_edges(N)))

### Community finding

In [16]:
# Louvain community detection on the undirected version of the retweet graph.
# The algorithm visits nodes in random order, so pin the seed to make the
# partition (and everything derived from it) reproducible across runs.
# NOTE(review): to_undirected() keeps the attributes of only one direction
# when both u->v and v->u exist, so reciprocal retweet weights are not summed.
partition = community.best_partition(N.to_undirected(), random_state=42)

In [152]:
# Build a two-column lookup table (community id, user name) from the partition
node_names = list(N.nodes())
values = [partition.get(name) for name in node_names]

# number of distinct community labels
print(len(set(values)))

temp_community = pd.DataFrame({'community': values, 'user_name': node_names})

1638


In [None]:
# joining to the original dataframe
# Drop any 'community' column left over from a previous run of this cell so
# that re-running it does not produce community_x / community_y duplicates.
df = pd.merge(
    df.drop(columns=['community'], errors='ignore'),
    temp_community,
    how='left',
    on='user_name',
)
# saving to csv
df.to_csv('all_tweets_communities.csv')

Networks with high modularity have dense connections between the nodes within modules but sparse connections between nodes in different modules.

In [153]:
# Modularity score of the detected partition on the undirected copy of N
N_undirected = N.to_undirected()
community.modularity(partition, N_undirected)

0.9563281279102909

A closely connected social community will imply a faster rate of transmission of information. Communities are defined as groups of densely interconnected nodes that are only sparsely connected with the rest of the network. 

Hence, it may be imperative to identify the communities in a network, since communities may have quite different properties — such as node degree, clustering coefficient, betweenness centrality, etc. — from those of the network as a whole.

In [80]:
pr = nx.pagerank(N)

# Attach each node's PageRank score and community id to its attribute dict
# so both are carried along when the graph is exported.
for user, attrs in N.nodes(data=True):
    attrs.update({'pagerank': pr[user], 'community': partition[user]})

In [81]:
# Export the full retweet graph (including the node attributes set above) to GEXF
nx.write_gexf(N,'RT_complete_network.gexf')

#############################################################################

### Subgraphs and Strongly Connected Components

In [154]:
# Getting the big component
N_subgraphs = sorted(nx.connected_component_subgraphs(N.to_undirected()), key=len, reverse=True)
print('Size of largest connected components:', [len(g) for g in N_subgraphs[:20]])

Size of largest connected components: [13319, 91, 84, 67, 64, 61, 58, 55, 50, 50, 43, 40, 38, 27, 24, 24, 24, 23, 23, 23]


In [155]:
# Determine which community each user is in
partition = community.best_partition(N_subgraphs[0].to_undirected())

In [156]:
# extracting community info and putting them into a dataframe
values = [partition.get(node) for node in N.nodes()]
node_names = [node for node in N.nodes()]
print(len(set(values)))
temp_community = pd.DataFrame(node_names,values).reset_index()
temp_community.columns = ['community','user_name']

70


In [157]:
# Modularity score of the partition on the largest connected component
big_component = N_subgraphs[0].to_undirected()
community.modularity(partition, big_component)

0.9334781417094957

In [158]:
# getting the part of N that has the most nodes
N_conn_comp = sorted(nx.strongly_connected_components(N),key=len, reverse=True)
print('Size of largest strongly connected components:', [len(g) for g in N_conn_comp[:20]])

Size of largest strongly connected components: [4, 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1]


In [159]:
# Create graph of largest strongly connected component
# S = nx.subgraph(N, N_conn_comp[0])
S = N_subgraphs[0]
pr = nx.pagerank(S)

# Save PageRank and Community Membership to the data dict for each node
for n, d in S.nodes(data=True):
    d['pagerank'] = pr[n]
    d['community'] = partition[n]
    #d['name'] = user_data_dict[n]['name']
    #d['description'] = user_data_dict[n]['description']
    #d['followers_count'] = user_data_dict[n]['followers_count']
    #d['total_rts'] = user_data_dict[n]['total_rts']

In [160]:
# Export S (with the pagerank and community node attributes set above) to GEXF
nx.write_gexf(S,'RT_big_component.gexf')

#############################################################################