In [1]:
#notebook to download the csv of edges and nodes of a given network
import os
import requests 
import time
import string
import networkx as nx
import itertools
import pandas as pd
import json
import re
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords

from nltk.corpus import wordnet as wn #importing it
from nltk.stem.wordnet import WordNetLemmatizer #importing wordnet lemmatizer
from nltk import pos_tag #part-of-speech-tagger
from collections import defaultdict #defaultdict returns default value for non-existant keys you try to  access based on the function you passed in the constructor
from google.colab import files

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
def extract_text(df):       #extract the text from the tweets and RT
                            #works ONLY on .csv file
  list_strings = []
  for index in range(len(df)):
    if index % 1000 == 0:
      print(str(index)+' / '+str(len(df)))
    text = df.loc[index]['text']                          #if it is nor trucated nor a RT  i take "text"
    string = -1
    if (df.loc[index,"truncated"] == True):                 #if it is trucated I take "extended_tweet"
        string = df.loc[index,"extended_tweet"]
    if type(df.loc[index,"retweeted_status"]) != float:     #if it is a RT I take retweeted_status
        string = df.loc[index,"retweeted_status"]
    if type(string) == str :
        if(re.search('full_text\':(.+?)https',string) != None):     #if I find "full_text"
          s = re.search('full_text\':(.+?)https',string).group(1)
        if(re.search('text\':(.+?)https',string)!= None):
          s = re.search('text\':(.+?)https',string).group(1)
        else: 
          continue
        list_strings.append(s)
        #print(s)         
    else:
      list_strings.append(text)
      #print(text)
      

  return list_strings

In [3]:
# Cleaning, lemmatising and pos tagging tweets

nltk.download('words')
WORDS = set(nltk.corpus.words.words()) #the last two lines serve to download the corpus of standard English language words
nltk.download('stopwords') #downloading stopwords
STOP_WORDS = set(nltk.corpus.stopwords.words("english")) #taking the stop words from English language
nltk.download('wordnet') #downloading wordnet
nltk.download('averaged_perceptron_tagger') #downloading tagger
tag_map = defaultdict(lambda : wn.NOUN) #here we define that wn.NOUN is the default value for the dict
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

def lemma_pos_cleaner(tweet):

    tweet = re.sub("@[A-Za-z0-9]+","",tweet) # remove mentions
    tweet = re.sub("#[A-Za-z0-9]+", "",tweet) # remove hashtags
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) # remove http links
    tweet = " ".join(tweet.split())
    tweet = str.lower(tweet) #to lowercase 
    tweet = re.sub("'"," ",tweet) # remove aphostrophe

    #basically we use pos_tag function on tokens that we get by applying wordpunct tokenization
    #to tweet (it separates all the words and symbols)
    #then we pass the token along with it's wordnet pos value that we get from the tag_map dictionary (noun, adjective, verb or adverb) to the lemma function (the WordNetLemmatizer())
    lemma_function = WordNetLemmatizer()
    tweet = " ".join(lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in nltk.pos_tag(nltk.wordpunct_tokenize(tweet))) #lemmatize
  

    # francesco: I removed also all 2 letters words and added specific words, words that appears frequently but are discarded because they are not in the english language
    SPECIFIC_WORDS = ['virus', 'coronavirus', 'covid19', 'covid', 'trump', 'hubei', 'beijing', 'xinjiang', 'jinping', 'korea', 'xinhua', 'india', 'taiwan','johnson','singapore', 'africa', 'japanese', 'france', 'asian', 'australia', 'french', 'asia', 'leishenshan', 'british', 'qingdao', 'fauci', 'america',  'california', 'sichuan', 'malaysia', 'huawei','thailand', 'shandong', 'italy', 'philippines', 'germany', 'facebook', 'african', 'shenzhen', 'tokyo', 'russian','uygur', '5g', 'pompeo', 'vietnam', 'australian', 'cambodia', 'zhejiang', 'yunnan', 'guangdong', 'korean', 'iran', 'washington']
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) if (w in WORDS or w in SPECIFIC_WORDS) and len(w)>2 and w not in STOP_WORDS ) #remove stop words
   
    return tweet

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [4]:
def frequency_dictionary(df):
  unique_words = {}

  for row in df:
    for word in row.split():
      #if the word is encountered for the first time add to dict as key and set its value to 0
      unique_words.setdefault(word,0)
      #increase the value (i.e the count) of the word by 1 every time it is encountered
      unique_words[word] += 1

  return unique_words

In [50]:
# discard less frequent words
def clean_words(cleaned_text, unuseful_words):
  re_cleaned_text = cleaned_text.copy()
  for txt in range(len(re_cleaned_text)):
    if txt % 2000 == 0:
      print(txt, '/',len(re_cleaned_text))
    w = re_cleaned_text[txt].split()
    for word in unuseful_words:
      while word in w:
        w.remove(word)
    re_cleaned_text[txt] = ' '.join(w)
  return re_cleaned_text

In [11]:
period = '_JanFeb2020'  # '', '_JanFeb2020', '_MarchApril2021', '_SeptOct2020'

In [12]:
China = pd.read_csv('/content/China'+period+'.csv')
USA = pd.read_csv('/content/USA'+period+'.csv')
China_USA = pd.read_csv('/content/China&USA'+period+'.csv')

In [13]:
# number of tweets:
print('China: ', len(China))
print('USA: ', len(USA))
print('China&USA: ', len(China_USA))

China:  2459
USA:  3185
China&USA:  5644


In [14]:
text_China = extract_text(China)
text_USA = extract_text(USA)
text_China_USA = extract_text(China_USA)

0 / 2459
1000 / 2459
2000 / 2459
0 / 3185
1000 / 3185
2000 / 3185
3000 / 3185
0 / 5644
1000 / 5644
2000 / 5644
3000 / 5644
4000 / 5644
5000 / 5644


In [15]:
cleaned_text_China = [lemma_pos_cleaner(txt) for txt in text_China]
cleaned_text_USA = [lemma_pos_cleaner(txt) for txt in text_USA]
cleaned_text_China_USA = [lemma_pos_cleaner(txt) for txt in text_China_USA]

print('China:')
print(cleaned_text_China[0:10])
print()
print('USA:')
print(cleaned_text_USA[0:10])
print()
print('China&USA:')
print(cleaned_text_China_USA[0:10])

China:
['talk university challenge china face deal misconception people medium watch', 'traditional medicine use battle covid answer may surprise', 'italy report first death year old man northern new case report', 'evacuee diamond princess cruise ship land hong hong infect', 'fear stop one couple get marry wedding guest include couple wore surgical mask ceremony', 'group molecular biologist might invent portable device detect infect', 'china world make great stride track infectious disease take unexpected turn make hard track', 'member expert advisory committee say important use technology fight', 'world health organization announce lead team international expert currently china', 'outside china confirm case novel covid half cruise ship dock japan rest scatter among country mostly asia']

USA:
['coronavirus bring anti sentiment south korea', 'coronavirus issue could come bad time start line muller boston university', 'claim two life iran', 'happen catch new', 'vaccine available', 'firs

In [16]:
freq_dict_China = frequency_dictionary(cleaned_text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(cleaned_text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_China_USA = frequency_dictionary(cleaned_text_China_USA)
freq_dict_China_USA = dict(sorted(freq_dict_China_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

# number of words in the cleaned tweets:
print('China: ', len(list(freq_dict_China)))
print('USA: ', len(list(freq_dict_USA)))
print('China&USA: ', len(list(freq_dict_China_USA)))

China:  2893
USA:  2740
China&USA:  3946


In [25]:
# Most frequent words
print('China')
print([key for key in freq_dict_China.keys() if freq_dict_China[key]>100])
print()
print('USA')
print([key for key in freq_dict_USA.keys() if freq_dict_USA[key]>100])
print()
print('China&USA')
print([key for key in freq_dict_China_USA.keys() if freq_dict_China_USA[key]>200])

China
['china', 'novel', 'coronavirus', 'case', 'outbreak', 'hospital', 'patient', 'new', 'fight', 'say', 'confirm', 'people', 'hubei', 'epidemic', 'report', 'medical', 'live', 'health', 'province', 'battle', 'death', 'control', 'support', 'amid', 'day', 'discharge', 'virus', 'infection', 'take', 'official', 'covid', 'first', 'city', 'country', 'effort', 'one', 'infect']

USA
['coronavirus', 'china', 'outbreak', 'case', 'say', 'new', 'spread', 'death', 'health', 'people', 'late', 'cruise', 'ship', 'first', 'report', 'confirm', 'world', 'quarantine', 'virus', 'test', 'japan', 'country', 'fear', 'toll', 'global', 'state', 'travel', 'rise', 'city', 'two', 'day', 'official', 'due', 'hong', 'flight', 'infect', 'beyond', 'number', 'hit', 'passenger', 'amid', 'novel']

China&USA
['coronavirus', 'china', 'outbreak', 'case', 'novel', 'new', 'say', 'hospital', 'people', 'spread', 'death', 'health', 'confirm', 'patient', 'report', 'fight', 'late', 'first', 'virus', 'world', 'hubei', 'cruise', 'da

# less frequent words: not performed on data with only covid, coronavirus, vaccine keys

In [32]:
# less frequent words:
thr = 2
print('Less frequent China: ', len([key for key in freq_dict_China.keys() if freq_dict_China[key]<thr]))
print('More frequent China: ', len([key for key in freq_dict_China.keys() if freq_dict_China[key]>=thr]))
less_frequent_words_China = [key for key in freq_dict_China.keys() if freq_dict_China[key]<thr]
print()
print('Less frequent USA: ', len([key for key in freq_dict_USA.keys() if freq_dict_USA[key]<thr]))
print('More frequent USA:', len([key for key in freq_dict_USA.keys() if freq_dict_USA[key]>=thr]))
less_frequent_words_USA = [key for key in freq_dict_USA.keys() if freq_dict_USA[key]<thr]
print()
print('Less frequent China&USA: ', len([key for key in freq_dict_China_USA.keys() if freq_dict_China_USA[key]<thr]))
print('More frequent China&USA: ', len([key for key in freq_dict_China_USA.keys() if freq_dict_China_USA[key]>=thr]))
less_frequent_words_China_USA = [key for key in freq_dict_China_USA.keys() if freq_dict_China_USA[key]<thr]

Less frequent China:  1132
More frequent China:  1761

Less frequent USA:  820
More frequent USA: 1920

Less frequent China&USA:  1288
More frequent China&USA:  2658


In [None]:
# discard less frequent words
def clean_words(cleaned_text, unuseful_words):
  re_cleaned_text = cleaned_text.copy()
  for txt in range(len(re_cleaned_text)):
    if txt % 2000 == 0:
      print(txt, '/',len(re_cleaned_text))
    w = re_cleaned_text[txt].split()
    for word in unuseful_words:
      while word in w:
        w.remove(word)
    re_cleaned_text[txt] = ' '.join(w)
  return re_cleaned_text

In [None]:
cleaned_mostfreq_text_China = clean_words(cleaned_text_China, less_frequent_words_China)
cleaned_mostfreq_text_USA = clean_words(cleaned_text_USA, less_frequent_words_USA)
cleaned_mostfreq_text_China_USA = clean_words(cleaned_text_China_USA, less_frequent_words_China_USA)

0 / 2438
2000 / 2438
0 / 3185
2000 / 3185
0 / 5623
2000 / 5623
4000 / 5623


In [None]:
freq_dict_China = frequency_dictionary(cleaned_mostfreq_text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(cleaned_mostfreq_text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_China_USA = frequency_dictionary(cleaned_mostfreq_text_China_USA)
freq_dict_China_USA = dict(sorted(freq_dict_China_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

# number of words in the cleaned tweets:
print('China: ', len(list(freq_dict_China)))
print('USA: ', len(list(freq_dict_USA)))
print('China&USA: ', len(list(freq_dict_China_USA)))

China:  497
USA:  576
China&USA:  941


# Build Network

In [33]:
df_China = pd.DataFrame.from_dict(freq_dict_China, orient='index').reset_index()
df_China.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_China.sort_values(by=['Count'], ascending=False, inplace=True)
df_China.reset_index(inplace=True)
df_China.drop(columns="index",inplace=True)

df_USA = pd.DataFrame.from_dict(freq_dict_USA, orient='index').reset_index()
df_USA.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_USA.sort_values(by=['Count'], ascending=False, inplace=True)
df_USA.reset_index(inplace=True)
df_USA.drop(columns="index",inplace=True)

df_China_USA = pd.DataFrame.from_dict(freq_dict_China_USA, orient='index').reset_index()
df_China_USA.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_China_USA.sort_values(by=['Count'], ascending=False, inplace=True)
df_China_USA.reset_index(inplace=True)
df_China_USA.drop(columns="index",inplace=True)

print('China')
print(df_China.iloc[0:30])
print()
print('USA')
print(df_USA.iloc[0:30])
print()
print('China&USA')
print(df_China_USA.iloc[0:30])
print()

China
           Word  Count
0         china   1126
1         novel    834
2   coronavirus    560
3          case    496
4      outbreak    469
5      hospital    441
6       patient    338
7           new    328
8         fight    298
9           say    261
10      confirm    258
11       people    221
12        hubei    189
13     epidemic    187
14       report    175
15      medical    173
16         live    169
17       health    159
18     province    158
19       battle    157
20        death    152
21      control    149
22      support    145
23         amid    142
24          day    131
25    discharge    120
26        virus    118
27    infection    114
28         take    111
29     official    110

USA
           Word  Count
0   coronavirus   2696
1         china   1323
2      outbreak    631
3          case    539
4           say    397
5           new    394
6        spread    381
7         death    317
8        health    293
9        people    270
10         late    240


In [34]:
keys_China = freq_dict_China.keys()  
keys_USA = freq_dict_USA.keys()  
keys_China_USA = freq_dict_China_USA.keys()  

In [35]:
def create_network(cleaned_text):
  network = {}
  #connect the word that appear in the same tweets
  for row in cleaned_text:
    combined_list = [word for word in str.split(row)]
    #for pair in itertools.product(combined_list, combined_list):
    #print(combined_list)
    for pair in itertools.product(combined_list, combined_list):
          #exclude self-loops and count each pair only once because our graph is undirected and we do not take self-loops into account
          if pair[0]!=pair[1] and not(pair[::-1] in network):
              network.setdefault(pair,0)
              network[pair] += 1 
  network_df = pd.DataFrame.from_dict(network, orient="index")
  network_df.columns = ["weight"]
  network_df.sort_values(by="weight",inplace=True, ascending=False)
  return network, network_df

In [37]:
network_China, network_df_China = create_network(cleaned_text_China)
network_USA, network_df_USA = create_network(cleaned_text_USA)
network_China_USA, network_df_China_USA = create_network(cleaned_text_China_USA)

In [38]:
print('China:')
print(network_df_China.iloc[0:30])
print()
print('USA:')
print(network_df_USA.iloc[0:30])
print()
print('China&USA:')
print(network_df_China_USA.iloc[0:30])
print()

China:
                         weight
(china, novel)              493
(confirm, case)             323
(china, outbreak)           296
(china, coronavirus)        291
(novel, coronavirus)        280
(patient, hospital)         265
(new, case)                 256
(china, case)               236
(china, fight)              226
(novel, outbreak)           226
(report, case)              212
(death, case)               191
(say, china)                169
(china, new)                164
(new, confirm)              155
(china, support)            147
(china, confirm)            139
(report, new)               135
(case, novel)               132
(china, epidemic)           127
(case, coronavirus)         126
(death, new)                125
(fight, novel)              125
(hospital, novel)           122
(china, hospital)           120
(province, case)            119
(report, confirm)           119
(coronavirus, outbreak)     119
(discharge, hospital)       117
(china, battle)             117



#Graph


In [57]:
def get_graph(network):
  G = nx.Graph()
  for edge in network:
      G.add_edge(edge[0], edge[1], weight=network[edge])
  return G

In [41]:
G_China = get_graph(network_China)
G_USA = get_graph(network_USA)
G_China_USA = get_graph(network_China_USA)

In [42]:
print('China:')
print('Nodes: ',len(G_China.nodes()))
print('Edges: ',len(G_China.edges()))
print('Is connected: ',nx.is_connected(G_China))
print()
print('USA:')
print('Nodes: ',len(G_USA.nodes()))
print('Edges: ',len(G_USA.edges()))
print('Is connected: ',nx.is_connected(G_USA))
print()
print('China&USA:')
print('Nodes: ',len(G_China_USA.nodes()))
print('Edges: ',len(G_China_USA.edges()))
print('Is connected: ',nx.is_connected(G_China_USA))

China:
Nodes:  2893
Edges:  83425
Is connected:  True

USA:
Nodes:  2740
Edges:  76627
Is connected:  True

China&USA:
Nodes:  3946
Edges:  144653
Is connected:  True


# PageRank

In [43]:
# Calculating the pagerank on graph G, teleportation probability here is 0.15 but since the graph is strongly connected we can set it to zero if we want
pr_China = nx.algorithms.pagerank(G_China,alpha = 1)
pr_China = dict(sorted(pr_China.items(), key=lambda item: item[1],reverse  = True))

pr_USA = nx.algorithms.pagerank(G_USA,alpha = 1)
pr_USA = dict(sorted(pr_USA.items(), key=lambda item: item[1],reverse  = True))

pr_China_USA = nx.algorithms.pagerank(G_China_USA,alpha = 1)
pr_China_USA = dict(sorted(pr_China_USA.items(), key=lambda item: item[1],reverse  = True))

In [44]:
def threshold(vector,threshold):

  l = [(el,vector[el]) for el in vector if vector[el] >= threshold ]

  return pd.DataFrame(l)

In [45]:
def threshold_reverse(vector,threshold):

  l = [(el,vector[el]) for el in vector if vector[el] < threshold ]

  return pd.DataFrame(l)

In [46]:
thr = 0.0004
print('China: ', len(threshold(pr_China,thr)))
print()
print(threshold(pr_China,thr).iloc[:30])
print()
print('USA: ', len(threshold(pr_USA,thr)))
print()
print(threshold(pr_USA,thr).iloc[:30])
print()
print('China&USA: ', len(threshold(pr_China_USA,thr)))
print()
print(threshold(pr_China_USA,thr).iloc[:30])

China:  467

              0         1
0         china  0.038716
1         novel  0.028719
2   coronavirus  0.017859
3          case  0.016549
4      outbreak  0.016076
5      hospital  0.014473
6       patient  0.011330
7           say  0.010865
8           new  0.010741
9         fight  0.010002
10      confirm  0.009148
11       people  0.008910
12     epidemic  0.006814
13        hubei  0.006706
14      medical  0.006684
15       health  0.006355
16     province  0.006287
17       report  0.006146
18        death  0.005576
19       battle  0.005470
20      support  0.005420
21        virus  0.005025
22      control  0.004980
23          day  0.004889
24          one  0.004765
25         amid  0.004547
26         city  0.004440
27         take  0.004437
28         live  0.004171
29      country  0.004169

USA:  467

              0         1
0   coronavirus  0.065659
1         china  0.034819
2      outbreak  0.019553
3          case  0.014994
4           say  0.012874
5           n

# TF-IDF: not performed

In [None]:
tfidf = TfidfVectorizer(ngram_range=(1,1))   # ngram range can be changed to obtain measures regarding n grams instead of single words

X_China = tfidf.fit_transform(cleaned_text_China).toarray()    # entry (i,j) if Tfidf measure of word_list[j] in document i
word_list_China = tfidf.get_feature_names_out()

X_USA = tfidf.fit_transform(cleaned_text_USA).toarray()
word_list_USA = tfidf.get_feature_names_out()

X_China_USA = tfidf.fit_transform(cleaned_text_China_USA).toarray()
word_list_China_USA = tfidf.get_feature_names_out()


In [None]:
tfidf_df_China = pd.DataFrame(X_China,columns = word_list_China)

tfidf_df_USA = pd.DataFrame(X_USA,columns = word_list_USA)

tfidf_df_China_USA = pd.DataFrame(X_China_USA,columns = word_list_China_USA)

In [None]:
tfidf_word_measure_China = np.mean(tfidf_df_China,axis = 0)
tfidf_word_measure_China = tfidf_word_measure_China.sort_values(ascending = False)
tfidf_word_measure_USA = np.mean(tfidf_df_USA,axis = 0)
tfidf_word_measure_USA = tfidf_word_measure_USA.sort_values(ascending = False)
tfidf_word_measure_China_USA = np.mean(tfidf_df_China_USA,axis = 0)
tfidf_word_measure_China_USA = tfidf_word_measure_China_USA.sort_values(ascending = False)

In [None]:
print('China:')
print(tfidf_word_measure_China[0:30])
print()
print('USA:')
print(tfidf_word_measure_USA[0:30])
print()
print('China&USA:')
print(tfidf_word_measure_China_USA[0:30])
print()

China:
covid          0.076069
watch          0.059429
coronavirus    0.047427
case           0.044577
china          0.037314
say            0.037011
new            0.035957
test           0.032966
report         0.031964
president      0.029373
pandemic       0.029153
vaccine        0.026741
trump          0.026549
update         0.026117
health         0.025600
country        0.024406
positive       0.023640
million        0.020711
world          0.020349
people         0.020237
death          0.019359
accord         0.017701
late           0.017458
sept           0.016890
year           0.016416
global         0.016072
day            0.015739
number         0.015450
national       0.014457
infection      0.014375
dtype: float64

USA:
covid          0.063696
coronavirus    0.043593
case           0.032740
say            0.031375
new            0.030973
vaccine        0.029819
trump          0.026958
test           0.026622
president      0.021916
report         0.021543
positive    

# reduced graph

In [47]:
# less important words:
less_important_words_China = list(threshold_reverse(pr_China,thr)[0])

less_important_words_USA = list(threshold_reverse(pr_USA,thr)[0])

less_important_words_China_USA = list(threshold_reverse(pr_China_USA,thr)[0])

In [51]:
cleaned_mostimp_text_China = clean_words(cleaned_text_China,less_important_words_China)
cleaned_mostimp_text_USA = clean_words(cleaned_text_USA,less_important_words_USA)
cleaned_mostimp_text_China_USA = clean_words(cleaned_text_China_USA,less_important_words_China_USA)

0 / 2438
2000 / 2438
0 / 3185
2000 / 3185
0 / 5623
2000 / 5623
4000 / 5623


In [52]:
freq_dict_China = frequency_dictionary(cleaned_mostimp_text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(cleaned_mostimp_text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_China_USA = frequency_dictionary(cleaned_mostimp_text_China_USA)
freq_dict_China_USA = dict(sorted(freq_dict_China_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

# number of words in the cleaned tweets:
print('China: ', len(list(freq_dict_China)))
print('USA: ', len(list(freq_dict_USA)))
print('China&USA: ', len(list(freq_dict_China_USA)))

China:  467
USA:  467
China&USA:  460


In [53]:
df_China = pd.DataFrame.from_dict(freq_dict_China, orient='index').reset_index()
df_China.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_China.sort_values(by=['Count'], ascending=False, inplace=True)
df_China.reset_index(inplace=True)
df_China.drop(columns="index",inplace=True)

df_USA = pd.DataFrame.from_dict(freq_dict_USA, orient='index').reset_index()
df_USA.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_USA.sort_values(by=['Count'], ascending=False, inplace=True)
df_USA.reset_index(inplace=True)
df_USA.drop(columns="index",inplace=True)

df_China_USA = pd.DataFrame.from_dict(freq_dict_China_USA, orient='index').reset_index()
df_China_USA.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_China_USA.sort_values(by=['Count'], ascending=False, inplace=True)
df_China_USA.reset_index(inplace=True)
df_China_USA.drop(columns="index",inplace=True)

print('China')
print(df_China.iloc[0:30])
print()
print('USA')
print(df_USA.iloc[0:30])
print()
print('China&USA')
print(df_China_USA.iloc[0:30])
print()

China
           Word  Count
0         china   1126
1         novel    834
2   coronavirus    560
3          case    496
4      outbreak    469
5      hospital    441
6       patient    338
7           new    328
8         fight    298
9           say    261
10      confirm    258
11       people    221
12        hubei    189
13     epidemic    187
14       report    175
15      medical    173
16         live    169
17       health    159
18     province    158
19       battle    157
20        death    152
21      control    149
22      support    145
23         amid    142
24          day    131
25    discharge    120
26        virus    118
27    infection    114
28         take    111
29     official    110

USA
           Word  Count
0   coronavirus   2696
1         china   1323
2      outbreak    631
3          case    539
4           say    397
5           new    394
6        spread    381
7         death    317
8        health    293
9        people    270
10         late    240


In [54]:
keys_China = freq_dict_China.keys()  
keys_USA = freq_dict_USA.keys()  
keys_China_USA = freq_dict_China_USA.keys()  

In [55]:
network_China, network_df_China = create_network(cleaned_mostimp_text_China)
network_USA, network_df_USA = create_network(cleaned_mostimp_text_USA)
network_China_USA, network_df_China_USA = create_network(cleaned_mostimp_text_China_USA)

In [56]:
print('China:')
print(network_df_China.iloc[0:30])
print()
print('USA:')
print(network_df_USA.iloc[0:30])
print()
print('China&USA:')
print(network_df_China_USA.iloc[0:30])
print()

China:
                         weight
(china, novel)              493
(confirm, case)             323
(china, outbreak)           296
(china, coronavirus)        291
(novel, coronavirus)        280
(patient, hospital)         265
(new, case)                 256
(china, case)               236
(novel, outbreak)           226
(china, fight)              226
(report, case)              212
(death, case)               191
(say, china)                169
(china, new)                164
(new, confirm)              155
(china, support)            147
(china, confirm)            139
(report, new)               135
(case, novel)               132
(china, epidemic)           127
(case, coronavirus)         126
(death, new)                125
(fight, novel)              125
(hospital, novel)           122
(china, hospital)           120
(province, case)            119
(report, confirm)           119
(coronavirus, outbreak)     119
(discharge, hospital)       117
(china, battle)             117



In [58]:
G_China = get_graph(network_China)
G_USA = get_graph(network_USA)
G_China_USA = get_graph(network_China_USA)

In [59]:
print('China:')
print('Nodes: ',len(G_China.nodes()))
print('Edges: ',len(G_China.edges()))
print('Is connected: ',nx.is_connected(G_China))
print()
print('USA:')
print('Nodes: ',len(G_USA.nodes()))
print('Edges: ',len(G_USA.edges()))
print('Is connected: ',nx.is_connected(G_USA))
print()
print('China&USA:')
print('Nodes: ',len(G_China_USA.nodes()))
print('Edges: ',len(G_China_USA.edges()))
print('Is connected: ',nx.is_connected(G_China_USA))

China:
Nodes:  467
Edges:  28897
Is connected:  True

USA:
Nodes:  467
Edges:  27827
Is connected:  True

China&USA:
Nodes:  460
Edges:  40466
Is connected:  True


#Save edge list

In [60]:
filename = './edgelist_China'+period+'.csv'
nx.write_weighted_edgelist(G_China, filename, delimiter=",")
#add header with appropriate column names (works on colab and Linux/Mac(?))
!sed -i.bak 1i"Source,Target,Weight" ./edgelist_China_JanFeb2020.csv
files.download('edgelist_China'+period+'.csv')

filename = './edgelist_USA'+period+'.csv'
nx.write_weighted_edgelist(G_USA, filename, delimiter=",")
#add header with appropriate column names (works on colab and Linux/Mac(?))
!sed -i.bak 1i"Source,Target,Weight" ./edgelist_USA_JanFeb2020.csv
files.download('edgelist_USA'+period+'.csv')

filename = './edgelist_China_USA'+period+'.csv'
nx.write_weighted_edgelist(G_China_USA, filename, delimiter=",")
#add header with appropriate column names (works on colab and Linux/Mac(?))
!sed -i.bak 1i"Source,Target,Weight" ./edgelist_China_USA_JanFeb2020.csv
files.download('edgelist_China_USA'+period+'.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>





# Create Node List




In [61]:
def nodes(freq_dict, name):
  word_nodes = pd.DataFrame.from_dict(freq_dict,orient="index")
  word_nodes.reset_index(inplace=True)
  word_nodes["Label"] = word_nodes["index"]
  word_nodes.rename(columns={"index":"Id",0:"delete"},inplace=True)
  word_nodes = word_nodes.drop(columns=['delete'])
  nodelist = pd.DataFrame()
  nodelist = nodelist.append(word_nodes, ignore_index=True)

  nodelist = nodelist.to_csv("nodelist_"+name+".csv",index=False)
  files.download("nodelist_"+name+".csv")
  return nodelist, word_nodes

In [62]:
nodelist_China, word_nodes_China = nodes(freq_dict_China,'China'+period)
nodelist_USA, word_nodes_USA = nodes(freq_dict_USA,'USA_'+period)
nodelist_China_USA, word_nodes_China_USA = nodes(freq_dict_China_USA,'China_USA'+period)

print('China:')
print(word_nodes_China.head())
print()
print('USA:')
print(word_nodes_USA.head())
print()
print('China&USA:')
print(word_nodes_China_USA.head())
print()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

China:
            Id        Label
0        china        china
1        novel        novel
2  coronavirus  coronavirus
3         case         case
4     outbreak     outbreak

USA:
            Id        Label
0  coronavirus  coronavirus
1        china        china
2     outbreak     outbreak
3         case         case
4          say          say

China&USA:
            Id        Label
0  coronavirus  coronavirus
1        china        china
2     outbreak     outbreak
3         case         case
4        novel        novel

