# Community detection

In [2]:
!pip install python-louvain
!pip install scikit-network



In [1]:
from community import community_louvain
import networkx as nx
from networkx import edge_betweenness_centrality as betweenness
from networkx.algorithms import community
import pandas as pd
import re
import numpy as np
from operator import itemgetter
import itertools

In [18]:
def network_from_edges(edges):
  dict_edges = {}
  for ind in range(len(edges)):
    if edges.iloc[ind,2] > 10:
      dict_edges[(edges.iloc[ind,0], edges.iloc[ind,1])] = edges.iloc[ind,2]
  net = pd.DataFrame.from_dict(dict_edges,orient='index')
  net.columns = ["weight"]
  net.sort_values(by="weight",inplace=True, ascending=False)
  return dict_edges, net


def get_graph(network):
  G = nx.Graph()
  for edge in network:
      #we can filter edges by weight by uncommenting the next line and setting desired weight threshold
      G.add_edge(edge[0], edge[1], weight=network[edge])
  
  #G.add_weighted_edges_from(up_weighted, weight='weight')
  return G


def extract_text(df):       #extract the text from the tweets and RT
                            #works ONLY on .csv file
  list_strings = []
  for index in range(len(df)):
    text = df.loc[index]['text']                          #if it is nor trucated nor a RT  i take "text"
    string = -1
    if (df.loc[index,"truncated"] == True):                 #if it is trucated I take "extended_tweet"
        string = df.loc[index,"extended_tweet"]
    if type(df.loc[index,"retweeted_status"]) != float:     #if it is a RT I take retweeted_status
        string = df.loc[index,"retweeted_status"]
    if type(string) == str :
        if(re.search('full_text\':(.+?)https',string) != None):     #if I find "full_text"
          s = re.search('full_text\':(.+?)https',string).group(1)
        if(re.search('text\':(.+?)https',string)!= None):
          s = re.search('text\':(.+?)https',string).group(1)
        else: 
          continue
        list_strings.append(s)
    else:
      list_strings.append(text)

  return list_strings


def frequency_dictionary(df):
  unique_words = {}

  for row in df:
    for word in row.split():
      #if the word is encountered for the first time add to dict as key and set its value to 0
      unique_words.setdefault(word,0)
      #increase the value (i.e the count) of the word by 1 every time it is encountered
      unique_words[word] += 1

  return unique_words


def adj_matrix(node_list, edge_list):   

  words = sorted(list(node_list['Id']))

  A = pd.DataFrame(np.zeros((len(words),len(words))), columns = words, index = words)
  print('A shape: ', A.shape)
  for row in range(len(edge_list)):
    if row%1000==0:
      print(str(round(row/1000)) + '/' + str(round(len(edge_list)/1000)))
    word1 = edge_list.iloc[row, 0]
    word2 = edge_list.iloc[row, 1]
    weight = edge_list.iloc[row, 2]
    A[word1, word2] = weight
    A[word2, word1] = weight
  print('A shape: ', A.shape)
  return A

In [3]:
def community_check(community, dict_1, dict_2, G_1, G_2, thr):
  sum_1 = 0
  sum_2 = 0
  sum_12 = 0
  for key in community:
    if (key in list(dict_1.keys())) and (key in list(dict_2.keys())):
      if dict_1[key] / sum(list(dict_1.values())) - dict_2[key] / sum(list(dict_2.values())) > thr:
        sum_1 += 1
      elif dict_1[key] / sum(list(dict_1.values())) - dict_2[key] / sum(list(dict_2.values())) < -thr:
        sum_2 += 1  
      else:
        sum_12 += 1 
    elif key in list(G_1.nodes()):
      sum_1 += 1
    elif key in list(G_2.nodes()):
      sum_2 += 1
  return sum_1, sum_2, sum_12

# PageRank Networks

## All periods

In [4]:
period = ''
China = pd.read_csv('/content/China'+period+'.csv')
USA = pd.read_csv('/content/USA'+period+'.csv')

edges = pd.read_csv('/content/edgelist_China_USA'+period+'.csv')
# edges_China = pd.read_csv('/content/edgelist_China'+period+'.csv')
# edges_USA = pd.read_csv('/content/edgelist_USA'+period+'.csv')

nodes = pd.read_csv('/content/nodelist_China_USA'+period+'.csv')

text_China = extract_text(China)
text_USA = extract_text(USA)

freq_dict_China = frequency_dictionary(text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

net, net_df = network_from_edges(edges)
# net_China, net_df_China = network_from_edges(edges_China)
# net_USA, net_df_USA = network_from_edges(edges_USA)

G = get_graph(net)
# G_China = get_graph(net_China)
# G_USA = get_graph(net_USA)

In [5]:
print('China&USA:')
print('Nodes: ',len(G.nodes()))
print('Edges: ',len(G.edges()))
print('Is connected: ',nx.is_connected(G))
print()
'''
print('China:')
print('Nodes: ',len(G_China.nodes()))
print('Edges: ',len(G_China.edges()))
print('Is connected: ',nx.is_connected(G_China))
print()
print('USA:')
print('Nodes: ',len(G_USA.nodes()))
print('Edges: ',len(G_USA.edges()))
print('Is connected: ',nx.is_connected(G_USA))
print()
'''

China&USA:
Nodes:  509
Edges:  11982
Is connected:  True



"\nprint('China:')\nprint('Nodes: ',len(G_China.nodes()))\nprint('Edges: ',len(G_China.edges()))\nprint('Is connected: ',nx.is_connected(G_China))\nprint()\nprint('USA:')\nprint('Nodes: ',len(G_USA.nodes()))\nprint('Edges: ',len(G_USA.edges()))\nprint('Is connected: ',nx.is_connected(G_USA))\nprint()\n"

### Girvan_Newman

In [7]:
community_gn = community.girvan_newman(G,)
for c_list in community_gn:
  tuple(print(sorted(c)) for c in c_list)

['accord', 'across', 'add', 'administer', 'administration', 'adult', 'adviser', 'affect', 'africa', 'age', 'agency', 'ahead', 'aid', 'air', 'allow', 'almost', 'already', 'also', 'america', 'amid', 'among', 'analysis', 'announce', 'another', 'antibody', 'appear', 'approve', 'area', 'around', 'arrive', 'asian', 'ask', 'australia', 'authority', 'avoid', 'back', 'bad', 'ban', 'bank', 'base', 'batch', 'battle', 'become', 'begin', 'behind', 'beijing', 'benefit', 'big', 'bill', 'billion', 'blood', 'boost', 'brazil', 'break', 'breaking', 'bring', 'british', 'build', 'business', 'california', 'call', 'campaign', 'canada', 'cancel', 'candidate', 'capital', 'care', 'case', 'catch', 'cause', 'center', 'central', 'challenge', 'change', 'chief', 'child', 'china', 'citizen', 'city', 'claim', 'clinical', 'close', 'clot', 'combat', 'come', 'commission', 'committee', 'community', 'company', 'concern', 'condition', 'confidence', 'confirm', 'confirmed', 'contact', 'contain', 'continue', 'contract', 'contr

KeyboardInterrupt: ignored

In [None]:
# VERY HEAVY COMPUTATION
def most_central_edge(G):
  centrality = betweenness(G, weight="weight")
  return max(centrality, key=centrality.get)

community_gn_centrality = community.girvan_newman(G, most_valuable_edge=most_central_edge)
tuple(print(sorted(c)) for c in next(community_gn_centrality))

In [8]:
def heaviest(G):
  u, v, w = max(G.edges(data="weight"), key=itemgetter(2))
  return (u, v)

community_gn_heaviest = community.girvan_newman(G, most_valuable_edge=heaviest)
tuple(print(sorted(c)) for c in next(community_gn_heaviest))

['accord', 'across', 'add', 'administer', 'administration', 'adult', 'adviser', 'affect', 'africa', 'age', 'agency', 'ahead', 'aid', 'air', 'allow', 'almost', 'already', 'also', 'america', 'amid', 'among', 'analysis', 'announce', 'another', 'antibody', 'appear', 'approve', 'area', 'around', 'arrive', 'asian', 'ask', 'australia', 'authority', 'avoid', 'back', 'bad', 'ban', 'bank', 'base', 'batch', 'battle', 'become', 'begin', 'behind', 'beijing', 'benefit', 'big', 'bill', 'billion', 'blood', 'boost', 'brazil', 'break', 'breaking', 'bring', 'british', 'build', 'business', 'california', 'call', 'campaign', 'canada', 'cancel', 'candidate', 'capital', 'care', 'case', 'catch', 'cause', 'center', 'central', 'challenge', 'change', 'chief', 'child', 'china', 'citizen', 'city', 'claim', 'clinical', 'close', 'clot', 'combat', 'come', 'commission', 'committee', 'community', 'company', 'concern', 'condition', 'confidence', 'confirm', 'confirmed', 'contact', 'contain', 'continue', 'contract', 'contr

(None, None)

### Bipartition
Partition a graph into two blocks using the Kernighan–Lin algorithm.

In [None]:
community_bipart = community.kernighan_lin_bisection(G,weight='weight')

In [None]:
print(community_bipart[0])
print(community_bipart[1])

{'medium', 'crisis', 'read', 'major', 'human', 'evacuate', 'agency', 'korea', 'ban', 'mike', 'suspend', 'drop', 'urge', 'post', 'free', 'ahead', 'seek', 'research', 'within', 'secretary', 'without', 'push', 'visit', 'committee', 'track', 'industry', 'italy', 'challenge', 'special', 'batch', 'right', 'mark', 'california', 'move', 'already', 'group', 'breaking', 'life', 'critical', 'hard', 'growth', 'catch', 'try', 'delay', 'billion', 'school', 'street', 'force', 'want', 'student', 'win', 'nominee', 'former', 'relief', 'several', 'combat', 'talk', 'small', 'pneumonia', 'democratic', 'affect', 'diagnosis', 'reopen', 'court', 'seven', 'bank', 'today', 'slow', 'confidence', 'order', 'link', 'adviser', 'worry', 'great', 'donate', 'benefit', 'likely', 'debate', 'stand', 'federal', 'governor', 'boost', 'diagnose', 'potential', 'avoid', 'yet', 'build', 'turn', 'issue', 'decline', 'almost', 'penny', 'condition', 'scientist', 'break', 'war', 'express', 'base', 'employee', 'young', 'north', 'share

In [None]:
threshold = 0.00001
sum_China_0, sum_USA_0, sum_China_USA_0 = community_check(community_bipart[0], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
sum_China_1, sum_USA_1, sum_China_USA_1 = community_check(community_bipart[1], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)

In [None]:
print('First community:')
print('China: ', round(100 * sum_China_0 / len(community_bipart[0])),'%')
print('USA: ', round(100 * sum_USA_0 / len(community_bipart[0])),'%')
print('China&USA: ', round(100 * sum_China_USA_0 / len(community_bipart[0])),'%')
print()
print('China: ', round(100 * sum_China_1 / len(community_bipart[1])),'%')
print('USA: ', round(100 * sum_USA_1 / len(community_bipart[1])),'%')
print('China&USA: ', round(100 * sum_China_USA_1 / len(community_bipart[1])),'%')

First community:
China:  28 %
USA:  63 %
China&USA:  10 %

China:  39 %
USA:  53 %
China&USA:  7 %


### Modularity-based communities
Find communities in G using greedy modularity maximization.

In [None]:
community_mod = community.greedy_modularity_communities(G, n_communities=2, weight='weight')
# communities_naive_mod = community.naive_greedy_modularity_communities(G)

In [None]:
community_mod_list = []
for comm in community_mod:
 print(list(comm))
 community_mod_list.append(list(comm))

['medium', 'crisis', 'medical', 'disease', 'read', 'fear', 'human', 'major', 'ban', 'evacuate', 'outside', 'suspend', 'outbreak', 'visit', 'amid', 'industry', 'challenge', 'special', 'already', 'life', 'critical', 'travel', 'growth', 'catch', 'try', 'billion', 'street', 'force', 'win', 'big', 'patient', 'measure', 'novel', 'combat', 'contain', 'pneumonia', 'affect', 'government', 'bank', 'prevention', 'confidence', 'order', 'effort', 'great', 'economy', 'epidemic', 'around', 'china', 'stand', 'avoid', 'build', 'control', 'help', 'global', 'almost', 'die', 'break', 'war', 'treat', 'express', 'employee', 'call', 'prevent', 'curb', 'hospital', 'change', 'worker', 'late', 'look', 'world', 'recover', 'city', 'holiday', 'demand', 'place', 'stop', 'leave', 'support', 'old', 'treatment', 'impact', 'shut', 'development', 'coronavirus', 'full', 'financial', 'much', 'family', 'video', 'live', 'update', 'hundred', 'beijing', 'join', 'international', 'lead', 'infect', 'care', 'close', 'social', 'st

In [None]:
threshold = 0.00001
for i in range(len(community_mod_list)):
  sum_China, sum_USA, sum_China_USA = community_check(community_mod_list[i], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
  print('Community: ', i)
  print('China: ', round(100 * sum_China / len(community_mod_list[i])),'%')
  print('USA: ', round(100 * sum_USA / len(community_mod_list[i])),'%')
  print('China&USA: ', round(100 * sum_China_USA / len(community_mod_list[i])),'%')
  print()

Community:  0
China:  44 %
USA:  48 %
China&USA:  9 %

Community:  1
China:  22 %
USA:  68 %
China&USA:  11 %

Community:  2
China:  23 %
USA:  70 %
China&USA:  7 %

Community:  3
China:  46 %
USA:  46 %
China&USA:  7 %

Community:  4
China:  50 %
USA:  25 %
China&USA:  25 %



### Louvain Community Detection
Find the best partition of a graph using the Louvain Community Detection Algorithm.

In [None]:
# communities_louvain = community.louvain.louvain_communities(G)
# partitions_louvain = community.louvain.louvain_partitions(G)

community_louv = community_louvain.best_partition(G,weight='weight')

# modularity_louvain = community.modularity(communities_louvain, G) # ERROR: not a partition
# print("The modularity Q based on networkx is {}".format(modularity_louvain))

In [None]:
max = np.max(list(community_louv.values()))
community_louvain_list = []
for i in range(max+1):
  community_louvain_list.append([])
for key in list(community_louv.keys()):
  community_louvain_list[community_louv[key]].append(key)
for i in range(max+1):
  print(community_louvain_list[i])

['disease', 'prevention', 'control', 'south', 'center', 'korea', 'africa', 'north']
['china', 'coronavirus', 'novel', 'outbreak', 'global', 'continue', 'people', 'world', 'virus', 'challenge', 'face', 'expert', 'human', 'pandemic', 'medium', 'watch', 'great', 'take', 'health', 'organization', 'lead', 'international', 'cruise', 'ship', 'japan', 'live', 'worker', 'nation', 'line', 'central', 'epicenter', 'late', 'support', 'help', 'fight', 'express', 'contain', 'epidemic', 'try', 'effort', 'send', 'amid', 'foreign', 'mask', 'almost', 'work', 'service', 'industry', 'combat', 'business', 'issue', 'transmission', 'update', 'around', 'raise', 'development', 'die', 'measure', 'home', 'city', 'fund', 'battle', 'billion', 'win', 'due', 'big', 'beijing', 'pneumonia', 'economy', 'many', 'stand', 'infect', 'family', 'spread', 'protect', 'already', 'production', 'join', 'year', 'old', 'also', 'bank', 'affect', 'growth', 'far', 'evacuate', 'call', 'despite', 'become', 'full', 'keep', 'citizen', 'con

In [None]:
threshold = 0.00001
for i in range(len(community_louvain_list)):
  sum_China, sum_USA, sum_China_USA = community_check(community_louvain_list[i], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
  print('Community: ', i)
  print('China: ', round(100 * sum_China / len(community_louvain_list[i])),'%')
  print('USA: ', round(100 * sum_USA / len(community_louvain_list[i])),'%')
  print('China&USA: ', round(100 * sum_China_USA / len(community_louvain_list[i])),'%')
  print()

Community:  0
China:  75 %
USA:  12 %
China&USA:  12 %

Community:  1
China:  38 %
USA:  52 %
China&USA:  10 %

Community:  2
China:  33 %
USA:  62 %
China&USA:  5 %

Community:  3
China:  47 %
USA:  46 %
China&USA:  7 %

Community:  4
China:  20 %
USA:  69 %
China&USA:  10 %



### Clique percolation

In [9]:
cliques = nx.find_cliques(G)
# in our case cliques correspond to tweets

In [10]:
k = 7
community_clique = community.k_clique_communities(G, k, cliques)

In [None]:
c = next(community_clique)

### Dendogram

In [None]:
from sknetwork.hierarchy import Paris
paris = Paris()
adjacency = adj_matrix(nodes, edges)
dendrogram = paris.fit_transform(adjacency)

A shape:  (509, 509)
0/79
1/79
2/79
3/79
4/79
5/79
6/79
7/79
8/79


### Lukes partitioning

In [8]:
max_size = 100 
community_lukes = community.lukes_partitioning(G, max_size, edge_weight='weight')

NotATree: ignored

## JanFeb2020

In [None]:
period = '_JanFeb2020'
China = pd.read_csv('/content/China'+period+'.csv')
USA = pd.read_csv('/content/USA'+period+'.csv')

edges = pd.read_csv('/content/edgelist_China_USA'+period+'.csv')
edges_China = pd.read_csv('/content/edgelist_China'+period+'.csv')
edges_USA = pd.read_csv('/content/edgelist_USA'+period+'.csv')

text_China = extract_text(China)
text_USA = extract_text(USA)

freq_dict_China = frequency_dictionary(text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

net, net_df = network_from_edges(edges)
net_China, net_df_China = network_from_edges(edges_China)
net_USA, net_df_USA = network_from_edges(edges_USA)

G = get_graph(net)
G_China = get_graph(net_China)
G_USA = get_graph(net_USA)

In [None]:
print('China&USA:')
print('Nodes: ',len(G.nodes()))
print('Edges: ',len(G.edges()))
print('Is connected: ',nx.is_connected(G))
print()
print('China:')
print('Nodes: ',len(G_China.nodes()))
print('Edges: ',len(G_China.edges()))
print('Is connected: ',nx.is_connected(G_China))
print()
print('USA:')
print('Nodes: ',len(G_USA.nodes()))
print('Edges: ',len(G_USA.edges()))
print('Is connected: ',nx.is_connected(G_USA))
print()

China&USA:
Nodes:  454
Edges:  3254
Is connected:  True

China:
Nodes:  300
Edges:  1391
Is connected:  False

USA:
Nodes:  435
Edges:  1746
Is connected:  True



### Bipartition
Partition a graph into two blocks using the Kernighan–Lin algorithm.

In [None]:
community_bipart = community.kernighan_lin_bisection(G,weight='weight')

In [None]:
print(community_bipart[0])
print(community_bipart[1])

{'officer', 'cough', 'child', 'singapore', 'quarter', 'run', 'nearly', 'read', 'major', 'human', 'fast', 'ban', 'border', 'phone', 'evacuee', 'infected', 'charter', 'urge', 'mother', 'temporarily', 'top', 'research', 'receive', 'trial', 'visit', 'must', 'investor', 'industry', 'yuan', 'special', 'california', 'miss', 'wash', 'move', 'news', 'increase', 'plan', 'fall', 'life', 'critical', 'strand', 'restriction', 'hard', 'group', 'growth', 'try', 'billion', 'street', 'force', 'offer', 'six', 'student', 'australia', 'front', 'contract', 'several', 'talk', 'near', 'well', 'affect', 'bank', 'slow', 'today', 'order', 'university', 'wang', 'apple', 'low', 'baby', 'recently', 'wall', 'headline', 'diagnose', 'potential', 'avoid', 'turn', 'issue', 'aboard', 'robot', 'strain', 'almost', 'condition', 'scientist', 'way', 'animal', 'base', 'wish', 'share', 'away', 'situation', 'look', 'still', 'dozen', 'wear', 'nurse', 'demand', 'respiratory', 'tokyo', 'meeting', 'place', 'fly', 'trump', 'develop',

In [None]:
threshold = 0.00001
sum_China_0, sum_USA_0, sum_China_USA_0 = community_check(community_bipart[0], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
sum_China_1, sum_USA_1, sum_China_USA_1 = community_check(community_bipart[1], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)

In [None]:
print('First community:')
print('China: ', round(100 * sum_China_0 / len(community_bipart[0])),'%')
print('USA: ', round(100 * sum_USA_0 / len(community_bipart[0])),'%')
print('China&USA: ', round(100 * sum_China_USA_0 / len(community_bipart[0])),'%')
print()
print('China: ', round(100 * sum_China_1 / len(community_bipart[1])),'%')
print('USA: ', round(100 * sum_USA_1 / len(community_bipart[1])),'%')
print('China&USA: ', round(100 * sum_China_USA_1 / len(community_bipart[1])),'%')

First community:
China:  42 %
USA:  52 %
China&USA:  5 %

China:  46 %
USA:  51 %
China&USA:  3 %


### Modularity-based communities
Find communities in G using greedy modularity maximization.

In [None]:
community_mod = community.greedy_modularity_communities(G, n_communities=2, weight='weight')
# communities_naive_mod = community.naive_greedy_modularity_communities(G)

In [None]:
community_mod_list = []
for comm in community_mod:
 print(list(comm))
 community_mod_list.append(list(comm))

['officer', 'cough', 'quarter', 'singapore', 'medium', 'run', 'nearly', 'fear', 'ban', 'korea', 'evacuee', 'suspend', 'infected', 'charter', 'urge', 'temporarily', 'trial', 'investor', 'industry', 'rate', 'california', 'wash', 'move', 'news', 'plan', 'fall', 'south', 'restriction', 'strand', 'hard', 'high', 'travel', 'growth', 'try', 'see', 'billion', 'street', 'force', 'six', 'front', 'contract', 'australia', 'warn', 'big', 'several', 'near', 'well', 'affect', 'university', 'bank', 'today', 'slow', 'order', 'economy', 'apple', 'low', 'recently', 'wall', 'headline', 'diagnose', 'potential', 'avoid', 'since', 'issue', 'turn', 'strain', 'almost', 'base', 'share', 'far', 'still', 'dozen', 'wear', 'demand', 'respiratory', 'meeting', 'tokyo', 'drug', 'place', 'fly', 'come', 'asia', 'impact', 'statement', 'may', 'office', 'need', 'thailand', 'airport', 'coronavirus', 'get', 'much', 'family', 'hundred', 'arrive', 'food', 'limit', 'provide', 'close', 'hit', 'social', 'police', 'anti', 'part', 

In [None]:
threshold = 0.00001
for i in range(len(community_mod_list)):
  sum_China, sum_USA, sum_China_USA = community_check(community_mod_list[i], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
  print('Community: ', i)
  print('China: ', round(100 * sum_China / len(community_mod_list[i])),'%')
  print('USA: ', round(100 * sum_USA / len(community_mod_list[i])),'%')
  print('China&USA: ', round(100 * sum_China_USA / len(community_mod_list[i])),'%')
  print()

Community:  0
China:  26 %
USA:  69 %
China&USA:  4 %

Community:  1
China:  63 %
USA:  34 %
China&USA:  2 %

Community:  2
China:  46 %
USA:  46 %
China&USA:  8 %

Community:  3
China:  76 %
USA:  20 %
China&USA:  5 %

Community:  4
China:  47 %
USA:  47 %
China&USA:  6 %

Community:  5
China:  19 %
USA:  78 %
China&USA:  0 %

Community:  6
China:  29 %
USA:  71 %
China&USA:  0 %



### Louvain Community Detection
Find the best partition of a graph using the Louvain Community Detection Algorithm.

In [None]:
# communities_louvain = community.louvain.louvain_communities(G)
# partitions_louvain = community.louvain.louvain_partitions(G)

community_louv = community_louvain.best_partition(G,weight='weight')

# modularity_louvain = community.modularity(communities_louvain, G) # ERROR: not a partition
# print("The modularity Q based on networkx is {}".format(modularity_louvain))

In [None]:
max = np.max(list(community_louv.values()))
community_louvain_list = []
for i in range(max+1):
  community_louvain_list.append([])
for key in list(community_louv.keys()):
  community_louvain_list[community_louv[key]].append(key)
for i in range(max+1):
  print(community_louvain_list[i])

['live', 'brief', 'late', 'give', 'update', 'around', 'development', 'follow', 'beyond', 'spreading']
['talk', 'china', 'coronavirus', 'novel', 'outbreak', 'university', 'student', 'face', 'deal', 'medium', 'watch', 'make', 'take', 'hard', 'announce', 'team', 'expert', 'among', 'country', 'asia', 'state', 'information', 'office', 'across', 'line', 'since', 'medic', 'arrive', 'support', 'local', 'help', 'fight', 'cut', 'firm', 'express', 'contain', 'epidemic', 'nearly', 'risk', 'drug', 'ensure', 'trump', 'try', 'share', 'effort', 'send', 'amid', 'offer', 'system', 'foreign', 'wang', 'mask', 'donate', 'show', 'australia', 'daily', 'begin', 'back', 'work', 'service', 'industry', 'combat', 'get', 'business', 'issue', 'plan', 'possible', 'high', 'raise', 'woman', 'measure', 'south', 'leave', 'home', 'city', 'food', 'come', 'fund', 'battle', 'billion', 'dollar', 'win', 'meet', 'due', 'big', 'beijing', 'economy', 'fast', 'part', 'slow', 'many', 'find', 'aid', 'may', 'stand', 'infected', 'see'

In [None]:
threshold = 0.00001
for i in range(len(community_louvain_list)):
  sum_China, sum_USA, sum_China_USA = community_check(community_louvain_list[i], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
  print('Community: ', i)
  print('China: ', round(100 * sum_China / len(community_louvain_list[i])),'%')
  print('USA: ', round(100 * sum_USA / len(community_louvain_list[i])),'%')
  print('China&USA: ', round(100 * sum_China_USA / len(community_louvain_list[i])),'%')
  print()

Community:  0
China:  40 %
USA:  60 %
China&USA:  0 %

Community:  1
China:  42 %
USA:  54 %
China&USA:  3 %

Community:  2
China:  74 %
USA:  24 %
China&USA:  2 %

Community:  3
China:  49 %
USA:  43 %
China&USA:  9 %

Community:  4
China:  17 %
USA:  76 %
China&USA:  3 %

Community:  5
China:  43 %
USA:  50 %
China&USA:  7 %



## SeptOct2020

In [None]:
period = '_SeptOct2020'
China = pd.read_csv('/content/China'+period+'.csv')
USA = pd.read_csv('/content/USA'+period+'.csv')

edges = pd.read_csv('/content/edgelist_China_USA'+period+'.csv')
edges_China = pd.read_csv('/content/edgelist_China'+period+'.csv')
edges_USA = pd.read_csv('/content/edgelist_USA'+period+'.csv')

text_China = extract_text(China)
text_USA = extract_text(USA)

freq_dict_China = frequency_dictionary(text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

net, net_df = network_from_edges(edges)
net_China, net_df_China = network_from_edges(edges_China)
net_USA, net_df_USA = network_from_edges(edges_USA)

G = get_graph(net)
G_China = get_graph(net_China)
G_USA = get_graph(net_USA)

In [None]:
print('China&USA:')
print('Nodes: ',len(G.nodes()))
print('Edges: ',len(G.edges()))
print('Is connected: ',nx.is_connected(G))
print()
print('China:')
print('Nodes: ',len(G_China.nodes()))
print('Edges: ',len(G_China.edges()))
print('Is connected: ',nx.is_connected(G_China))
print()
print('USA:')
print('Nodes: ',len(G_USA.nodes()))
print('Edges: ',len(G_USA.edges()))
print('Is connected: ',nx.is_connected(G_USA))
print()

China&USA:
Nodes:  507
Edges:  5768
Is connected:  True

China:
Nodes:  173
Edges:  556
Is connected:  False

USA:
Nodes:  512
Edges:  4963
Is connected:  True



### Bipartition
Partition a graph into two blocks using the Kernighan–Lin algorithm.

In [None]:
community_bipart = community.kernighan_lin_bisection(G,weight='weight')

In [None]:
print(community_bipart[0])
print(community_bipart[1])

{'sen', 'medium', 'crisis', 'child', 'available', 'major', 'speaker', 'korea', 'protest', 'outside', 'urge', 'post', 'free', 'ahead', 'seek', 'within', 'research', 'receive', 'push', 'without', 'secretary', 'third', 'italy', 'industry', 'physician', 'challenge', 'boom', 'right', 'already', 'move', 'group', 'breaking', 'life', 'hard', 'might', 'catch', 'delay', 'try', 'billion', 'package', 'school', 'street', 'bar', 'six', 'offer', 'supreme', 'democrat', 'senator', 'student', 'win', 'several', 'novel', 'relief', 'vulnerable', 'contain', 'talk', 'small', 'seven', 'reopen', 'milestone', 'bank', 'slow', 'today', 'effort', 'order', 'court', 'prevention', 'voter', 'worry', 'epidemic', 'benefit', 'likely', 'pay', 'federal', 'local', 'director', 'happen', 'boost', 'message', 'avoid', 'yet', 'control', 'turn', 'gathering', 'decline', 'almost', 'impose', 'scientist', 'war', 'way', 'employee', 'winter', 'young', 'stimulus', 'nancy', 'team', 'north', 'play', 'base', 'share', 'prevent', 'change', '

In [None]:
threshold = 0.00001
sum_China_0, sum_USA_0, sum_China_USA_0 = community_check(community_bipart[0], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
sum_China_1, sum_USA_1, sum_China_USA_1 = community_check(community_bipart[1], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)

In [None]:
print('First community:')
print('China: ', round(100 * sum_China_0 / len(community_bipart[0])),'%')
print('USA: ', round(100 * sum_USA_0 / len(community_bipart[0])),'%')
print('China&USA: ', round(100 * sum_China_USA_0 / len(community_bipart[0])),'%')
print()
print('China: ', round(100 * sum_China_1 / len(community_bipart[1])),'%')
print('USA: ', round(100 * sum_USA_1 / len(community_bipart[1])),'%')
print('China&USA: ', round(100 * sum_China_USA_1 / len(community_bipart[1])),'%')

First community:
China:  29 %
USA:  64 %
China&USA:  6 %

China:  39 %
USA:  57 %
China&USA:  5 %


### Modularity-based communities
Find communities in G using greedy modularity maximization.

In [None]:
community_mod = community.greedy_modularity_communities(G, n_communities=2, weight='weight')
# communities_naive_mod = community.naive_greedy_modularity_communities(G)

In [None]:
community_mod_list = []
for comm in community_mod:
 print(list(comm))
 community_mod_list.append(list(comm))

['available', 'protest', 'post', 'authorization', 'free', 'johnson', 'russia', 'seek', 'receive', 'push', 'trial', 'third', 'industry', 'challenge', 'right', 'second', 'news', 'plan', 'group', 'life', 'catch', 'delay', 'try', 'billion', 'response', 'offer', 'win', 'want', 'patient', 'contain', 'bank', 'concern', 'worry', 'find', 'make', 'adult', 'likely', 'benefit', 'emergency', 'pay', 'covid', 'boost', 'potential', 'end', 'help', 'gathering', 'scientist', 'employee', 'young', 'play', 'share', 'worker', 'late', 'french', 'company', 'dose', 'drug', 'develop', 'support', 'old', 'open', 'development', 'enough', 'may', 'need', 'full', 'safe', 'get', 'financial', 'would', 'approval', 'party', 'institute', 'rule', 'food', 'effective', 'join', 'even', 'age', 'vaccine', 'infect', 'participant', 'provide', 'reduce', 'long', 'social', 'british', 'part', 'risk', 'show', 'fund', 'maker', 'compare', 'raise', 'protect', 'soon', 'immunity', 'use', 'facility', 'like', 'serious', 'promise', 'become', '

In [None]:
threshold = 0.00001
for i in range(len(community_mod_list)):
  sum_China, sum_USA, sum_China_USA = community_check(community_mod_list[i], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
  print('Community: ', i)
  print('China: ', round(100 * sum_China / len(community_mod_list[i])),'%')
  print('USA: ', round(100 * sum_USA / len(community_mod_list[i])),'%')
  print('China&USA: ', round(100 * sum_China_USA / len(community_mod_list[i])),'%')
  print()

Community:  0
China:  23 %
USA:  72 %
China&USA:  5 %

Community:  1
China:  30 %
USA:  65 %
China&USA:  5 %

Community:  2
China:  50 %
USA:  44 %
China&USA:  6 %

Community:  3
China:  50 %
USA:  50 %
China&USA:  0 %

Community:  4
China:  25 %
USA:  75 %
China&USA:  0 %

Community:  5
China:  0 %
USA:  100 %
China&USA:  0 %



### Louvain Community Detection
Find the best partition of a graph using the Louvain Community Detection Algorithm.

In [None]:
# communities_louvain = community.louvain.louvain_communities(G)
# partitions_louvain = community.louvain.louvain_partitions(G)

community_louv = community_louvain.best_partition(G,weight='weight')

# modularity_louvain = community.modularity(communities_louvain, G) # ERROR: not a partition
# print("The modularity Q based on networkx is {}".format(modularity_louvain))

In [None]:
max = np.max(list(community_louv.values()))
community_louvain_list = []
for i in range(max+1):
  community_louvain_list.append([])
for key in list(community_louv.keys()):
  community_louvain_list[community_louv[key]].append(key)
for i in range(max+1):
  print(community_louvain_list[i])

['daily', 'case', 'coronavirus', 'infection', 'high', 'since', 'number', 'new', 'hit', 'record', 'italy', 'week', 'report', 'health', 'time', 'increase', 'death', 'month', 'country', 'rise', 'france', 'see', 'expert', 'surpass', 'million', 'toll', 'confirm', 'one', 'resurgence', 'accord', 'ministry', 'register', 'national', 'student', 'university', 'local', 'across', 'pass', 'today', 'second', 'wave', 'surge', 'bring', 'total', 'tally', 'india', 'brazil', 'winter', 'could', 'milestone', 'outbreak', 'spread', 'measure', 'past', 'hour', 'city', 'far', 'day', 'state', 'set', 'york', 'qingdao', 'last', 'fall', 'germany', 'start', 'spike', 'two', 'authority', 'six', 'school', 'region', 'restriction', 'seven', 'nine', 'nationwide', 'three', 'decline', 'nearly', 'rate', 'australia', 'low', 'level', 'ease', 'another', 'warn', 'half', 'force', 'almost', 'research', 'trend', 'slow', 'task', 'novel', 'home', 'business', 'mayor', 'street', 'resident', 'governor', 'allow', 'previous', 'reopen']
['c

In [None]:
threshold = 0.00001
for i in range(len(community_louvain_list)):
  sum_China, sum_USA, sum_China_USA = community_check(community_louvain_list[i], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
  print('Community: ', i)
  print('China: ', round(100 * sum_China / len(community_louvain_list[i])),'%')
  print('USA: ', round(100 * sum_USA / len(community_louvain_list[i])),'%')
  print('China&USA: ', round(100 * sum_China_USA / len(community_louvain_list[i])),'%')
  print()

Community:  0
China:  54 %
USA:  39 %
China&USA:  7 %

Community:  1
China:  29 %
USA:  66 %
China&USA:  5 %

Community:  2
China:  24 %
USA:  74 %
China&USA:  2 %

Community:  3
China:  31 %
USA:  62 %
China&USA:  6 %



## MarchApril2021

In [None]:
period = '_MarchApril2021'
China = pd.read_csv('/content/China'+period+'.csv')
USA = pd.read_csv('/content/USA'+period+'.csv')

edges = pd.read_csv('/content/edgelist_China_USA'+period+'.csv')
edges_China = pd.read_csv('/content/edgelist_China'+period+'.csv')
edges_USA = pd.read_csv('/content/edgelist_USA'+period+'.csv')

text_China = extract_text(China)
text_USA = extract_text(USA)

freq_dict_China = frequency_dictionary(text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

net, net_df = network_from_edges(edges)
net_China, net_df_China = network_from_edges(edges_China)
net_USA, net_df_USA = network_from_edges(edges_USA)

G = get_graph(net)
G_China = get_graph(net_China)
G_USA = get_graph(net_USA)

In [None]:
print('China&USA:')
print('Nodes: ',len(G.nodes()))
print('Edges: ',len(G.edges()))
print('Is connected: ',nx.is_connected(G))
print()
print('China:')
print('Nodes: ',len(G_China.nodes()))
print('Edges: ',len(G_China.edges()))
print('Is connected: ',nx.is_connected(G_China))
print()
print('USA:')
print('Nodes: ',len(G_USA.nodes()))
print('Edges: ',len(G_USA.edges()))
print('Is connected: ',nx.is_connected(G_USA))
print()

China&USA:
Nodes:  500
Edges:  3018
Is connected:  True

China:
Nodes:  206
Edges:  638
Is connected:  True

USA:
Nodes:  491
Edges:  2285
Is connected:  True



### Bipartition
Partition a graph into two blocks using the Kernighan–Lin algorithm.

In [None]:
community_bipart = community.kernighan_lin_bisection(G,weight='weight')

In [None]:
print(community_bipart[0])
print(community_bipart[1])

{'child', 'number', 'disease', 'nearly', 'time', 'say', 'test', 'korea', 'agency', 'ban', 'urge', 'johnson', 'toll', 'top', 'free', 'russia', 'research', 'receive', 'trial', 'surge', 'third', 'amid', 'committee', 'track', 'rate', 'special', 'batch', 'second', 'expand', 'dos', 'south', 'group', 'increase', 'fall', 'restriction', 'clot', 'plan', 'high', 'travel', 'delay', 'see', 'response', 'school', 'death', 'six', 'offer', 'australia', 'warn', 'patient', 'variant', 'government', 'prevention', 'university', 'concern', 'link', 'order', 'find', 'could', 'make', 'adult', 'donate', 'benefit', 'outweigh', 'low', 'china', 'give', 'around', 'province', 'include', 'record', 'infection', 'emergency', 'local', 'covid', 'federal', 'blood', 'director', 'official', 'case', 'efficacy', 'eligible', 'end', 'control', 'help', 'since', 'global', 'across', 'war', 'animal', 'way', 'base', 'young', 'call', 'intensive', 'prevent', 'african', 'team', 'far', 'hospital', 'first', 'world', 'still', 'city', 'comp

In [None]:
threshold = 0.00001
sum_China_0, sum_USA_0, sum_China_USA_0 = community_check(community_bipart[0], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
sum_China_1, sum_USA_1, sum_China_USA_1 = community_check(community_bipart[1], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)

In [None]:
print('First community:')
print('China: ', round(100 * sum_China_0 / len(community_bipart[0])),'%')
print('USA: ', round(100 * sum_USA_0 / len(community_bipart[0])),'%')
print('China&USA: ', round(100 * sum_China_USA_0 / len(community_bipart[0])),'%')
print()
print('China: ', round(100 * sum_China_1 / len(community_bipart[1])),'%')
print('USA: ', round(100 * sum_USA_1 / len(community_bipart[1])),'%')
print('China&USA: ', round(100 * sum_China_USA_1 / len(community_bipart[1])),'%')

First community:
China:  39 %
USA:  59 %
China&USA:  2 %

China:  30 %
USA:  63 %
China&USA:  6 %


### Modularity-based communities
Find communities in G using greedy modularity maximization.

In [None]:
community_mod = community.greedy_modularity_communities(G, n_communities=2, weight='weight')
# communities_naive_mod = community.naive_greedy_modularity_communities(G)

In [None]:
community_mod_list = []
for comm in community_mod:
 print(list(comm))
 community_mod_list.append(list(comm))

['easter', 'quarter', 'child', 'crisis', 'number', 'nearly', 'fear', 'major', 'human', 'time', 'border', 'test', 'drop', 'outbreak', 'toll', 'ahead', 'seek', 'research', 'push', 'massive', 'third', 'visit', 'surge', 'amid', 'challenge', 'rate', 'california', 'mark', 'move', 'forecast', 'news', 'increase', 'life', 'restriction', 'hard', 'high', 'travel', 'growth', 'fourth', 'see', 'school', 'street', 'death', 'suffer', 'big', 'warn', 'patient', 'relief', 'measure', 'variant', 'affect', 'seven', 'reopen', 'today', 'worry', 'hold', 'could', 'great', 'economy', 'likely', 'province', 'every', 'low', 'record', 'beach', 'infection', 'pressure', 'pay', 'local', 'covid', 'case', 'yet', 'since', 'york', 'help', 'gathering', 'global', 'almost', 'decline', 'strain', 'die', 'impose', 'intensive', 'share', 'curb', 'hospital', 'worker', 'change', 'longer', 'less', 'recover', 'french', 'city', 'demand', 'lift', 'tokyo', 'story', 'think', 'place', 'leave', 'old', 'open', 'hour', 'report', 'week', 'anal

In [None]:
threshold = 0.00001
for i in range(len(community_mod_list)):
  sum_China, sum_USA, sum_China_USA = community_check(community_mod_list[i], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
  print('Community: ', i)
  print('China: ', round(100 * sum_China / len(community_mod_list[i])),'%')
  print('USA: ', round(100 * sum_USA / len(community_mod_list[i])),'%')
  print('China&USA: ', round(100 * sum_China_USA / len(community_mod_list[i])),'%')
  print()

Community:  0
China:  39 %
USA:  54 %
China&USA:  6 %

Community:  1
China:  40 %
USA:  57 %
China&USA:  3 %

Community:  2
China:  17 %
USA:  81 %
China&USA:  1 %

Community:  3
China:  40 %
USA:  60 %
China&USA:  0 %

Community:  4
China:  12 %
USA:  82 %
China&USA:  6 %

Community:  5
China:  15 %
USA:  77 %
China&USA:  8 %

Community:  6
China:  55 %
USA:  45 %
China&USA:  0 %

Community:  7
China:  43 %
USA:  57 %
China&USA:  0 %

Community:  8
China:  0 %
USA:  100 %
China&USA:  0 %



### Louvain Community Detection
Find the best partition of a graph using the Louvain Community Detection Algorithm.

In [None]:
# communities_louvain = community.louvain.louvain_communities(G)
# partitions_louvain = community.louvain.louvain_partitions(G)

community_louv = community_louvain.best_partition(G,weight='weight')

# modularity_louvain = community.modularity(communities_louvain, G) # ERROR: not a partition
# print("The modularity Q based on networkx is {}".format(modularity_louvain))

In [None]:
max = np.max(list(community_louv.values()))
community_louvain_list = []
for i in range(max+1):
  community_louvain_list.append([])
for key in list(community_louv.keys()):
  community_louvain_list[community_louv[key]].append(key)
for i in range(max+1):
  print(community_louvain_list[i])

['find', 'expert', 'top', 'risk', 'shot', 'give', 'drug', 'germany', 'side', 'confirm', 'recommend', 'review', 'blood', 'clot', 'link', 'rare', 'medicine', 'agency', 'possible', 'due', 'effect', 'brain', 'committee', 'benefit', 'johnson', 'single', 'six', 'investigate', 'decide', 'next', 'woman', 'federal', 'regulator', 'disorder', 'problem', 'cause', 'soon', 'pause', 'outweigh', 'infectious', 'decision']
['one', 'people', 'first', 'time', 'receive', 'least', 'dose', 'adult', 'population', 'get', 'vaccinate', 'announce', 'jab', 'extra', 'take', 'make', 'vaccination', 'office', 'mass', 'site', 'become', 'half', 'president', 'drive', 'detect', 'university', 'require', 'student', 'young', 'joe', 'eligible', 'effort', 'way', 'campaign', 'mask', 'back', 'goal', 'fully', 'among', 'free', 'come', 'target', 'fall', 'person', 'card']
['covid', 'third', 'wave', 'low', 'pandemic', 'pass', 'bring', 'could', 'response', 'black', 'america', 'lose', 'child', 'close', 'like', 'claim', 'lift', 'restric

In [None]:
threshold = 0.00001
for i in range(len(community_louvain_list)):
  sum_China, sum_USA, sum_China_USA = community_check(community_louvain_list[i], freq_dict_China, freq_dict_USA, G_China, G_USA, threshold)
  print('Community: ', i)
  print('China: ', round(100 * sum_China / len(community_louvain_list[i])),'%')
  print('USA: ', round(100 * sum_USA / len(community_louvain_list[i])),'%')
  print('China&USA: ', round(100 * sum_China_USA / len(community_louvain_list[i])),'%')
  print()

Community:  0
China:  12 %
USA:  85 %
China&USA:  2 %

Community:  1
China:  31 %
USA:  67 %
China&USA:  0 %

Community:  2
China:  29 %
USA:  62 %
China&USA:  8 %

Community:  3
China:  58 %
USA:  42 %
China&USA:  0 %

Community:  4
China:  27 %
USA:  73 %
China&USA:  0 %

Community:  5
China:  43 %
USA:  53 %
China&USA:  4 %

Community:  6
China:  43 %
USA:  57 %
China&USA:  0 %

