In [1]:
#notebook to download the csv of edges and nodes of a given network
import os
import requests 
import time
import string
import networkx as nx
import itertools
import pandas as pd
import json
import re
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords

from nltk.corpus import wordnet as wn #importing it
from nltk.stem.wordnet import WordNetLemmatizer #importing wordnet lemmatizer
from nltk import pos_tag #part-of-speech-tagger
from collections import defaultdict #defaultdict returns default value for non-existant keys you try to  access based on the function you passed in the constructor
from google.colab import files

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
def extract_text(df):       #extract the text from the tweets and RT
                            #works ONLY on .csv file
  list_strings = []
  for index in range(len(df)):
    if index % 1000 == 0:
      print(str(index)+' / '+str(len(df)))
    text = df.loc[index]['text']                          #if it is nor trucated nor a RT  i take "text"
    string = -1
    if (df.loc[index,"truncated"] == True):                 #if it is trucated I take "extended_tweet"
        string = df.loc[index,"extended_tweet"]
    if type(df.loc[index,"retweeted_status"]) != float:     #if it is a RT I take retweeted_status
        string = df.loc[index,"retweeted_status"]
    if type(string) == str :
        if(re.search('full_text\':(.+?)https',string) != None):     #if I find "full_text"
          s = re.search('full_text\':(.+?)https',string).group(1)
        if(re.search('text\':(.+?)https',string)!= None):
          s = re.search('text\':(.+?)https',string).group(1)
        else: 
          continue
        list_strings.append(s)
        #print(s)         
    else:
      list_strings.append(text)
      #print(text)
      

  return list_strings

In [3]:
# Cleaning, lemmatising and pos tagging tweets

nltk.download('words')
WORDS = set(nltk.corpus.words.words()) #the last two lines serve to download the corpus of standard English language words
nltk.download('stopwords') #downloading stopwords
STOP_WORDS = set(nltk.corpus.stopwords.words("english")) #taking the stop words from English language
nltk.download('wordnet') #downloading wordnet
nltk.download('averaged_perceptron_tagger') #downloading tagger
tag_map = defaultdict(lambda : wn.NOUN) #here we define that wn.NOUN is the default value for the dict
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

def lemma_pos_cleaner(tweet):

    tweet = re.sub("@[A-Za-z0-9]+","",tweet) # remove mentions
    tweet = re.sub("#[A-Za-z0-9]+", "",tweet) # remove hashtags
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) # remove http links
    tweet = " ".join(tweet.split())
    tweet = str.lower(tweet) #to lowercase 
    tweet = re.sub("'"," ",tweet) # remove aphostrophe

    #basically we use pos_tag function on tokens that we get by applying wordpunct tokenization
    #to tweet (it separates all the words and symbols)
    #then we pass the token along with it's wordnet pos value that we get from the tag_map dictionary (noun, adjective, verb or adverb) to the lemma function (the WordNetLemmatizer())
    lemma_function = WordNetLemmatizer()
    tweet = " ".join(lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in nltk.pos_tag(nltk.wordpunct_tokenize(tweet))) #lemmatize
  

    # francesco: I removed also all 2 letters words and added specific words, words that appears frequently but are discarded because they are not in the english language
    SPECIFIC_WORDS = ['virus', 'coronavirus', 'covid19', 'covid', 'trump', 'hubei', 'beijing', 'xinjiang', 'jinping', 'korea', 'xinhua', 'india', 'taiwan','johnson','singapore', 'africa', 'japanese', 'france', 'asian', 'australia', 'french', 'asia', 'leishenshan', 'british', 'qingdao', 'fauci', 'america',  'california', 'sichuan', 'malaysia', 'huawei','thailand', 'shandong', 'italy', 'philippines', 'germany', 'facebook', 'african', 'shenzhen', 'tokyo', 'russian','uygur', '5g', 'pompeo', 'vietnam', 'australian', 'cambodia', 'zhejiang', 'yunnan', 'guangdong', 'korean', 'iran', 'washington']
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) if (w in WORDS or w in SPECIFIC_WORDS) and len(w)>2 and w not in STOP_WORDS ) #remove stop words
   
    return tweet

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [4]:
def frequency_dictionary(df):
  unique_words = {}

  for row in df:
    for word in row.split():
      #if the word is encountered for the first time add to dict as key and set its value to 0
      unique_words.setdefault(word,0)
      #increase the value (i.e the count) of the word by 1 every time it is encountered
      unique_words[word] += 1

  return unique_words

In [50]:
# discard less frequent words
def clean_words(cleaned_text, unuseful_words):
  re_cleaned_text = cleaned_text.copy()
  for txt in range(len(re_cleaned_text)):
    if txt % 2000 == 0:
      print(txt, '/',len(re_cleaned_text))
    w = re_cleaned_text[txt].split()
    for word in unuseful_words:
      while word in w:
        w.remove(word)
    re_cleaned_text[txt] = ' '.join(w)
  return re_cleaned_text

In [63]:
period = '_MarchApril2021'  # '', '_JanFeb2020', '_MarchApril2021', '_SeptOct2020'

In [64]:
China = pd.read_csv('/content/China'+period+'.csv')
USA = pd.read_csv('/content/USA'+period+'.csv')
China_USA = pd.read_csv('/content/China&USA'+period+'.csv')

In [65]:
# number of tweets:
print('China: ', len(China))
print('USA: ', len(USA))
print('China&USA: ', len(China_USA))

China:  1330
USA:  3932
China&USA:  5262


In [66]:
text_China = extract_text(China)
text_USA = extract_text(USA)
text_China_USA = extract_text(China_USA)

0 / 1330
1000 / 1330
0 / 3932
1000 / 3932
2000 / 3932
3000 / 3932
0 / 5262
1000 / 5262
2000 / 5262
3000 / 5262
4000 / 5262
5000 / 5262


In [67]:
cleaned_text_China = [lemma_pos_cleaner(txt) for txt in text_China]
cleaned_text_USA = [lemma_pos_cleaner(txt) for txt in text_USA]
cleaned_text_China_USA = [lemma_pos_cleaner(txt) for txt in text_China_USA]

print('China:')
print(cleaned_text_China[0:10])
print()
print('USA:')
print(cleaned_text_USA[0:10])
print()
print('China&USA:')
print(cleaned_text_China_USA[0:10])

China:
['death beloved rapper protest killing black america anguish lose lost one covid even historic celebration america pick photo week', 'german chancellor receive first dose covid vaccine announce via government spokesman people daily', 'number suicide drop study find among black', 'total covid jab give far close vaccine administer hour period accord government data', 'would like see covid passport infringe people freedom health passport claim', 'lift ban flight continue easing covid restriction', 'covid pandemic result permanent closure around extra business compare', 'german chancellor receive first dose vaccine', 'report new confirm case', 'million receive least one dose covid vaccine drug develop']

USA:
['decoration fill street old city welcome holy month freer covid', 'migrant worker pile rail station india head back home village covid curb', 'health economics education world change since covid', 'decoration fill street old city welcome holy month freer covid restriction heig

In [68]:
freq_dict_China = frequency_dictionary(cleaned_text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(cleaned_text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_China_USA = frequency_dictionary(cleaned_text_China_USA)
freq_dict_China_USA = dict(sorted(freq_dict_China_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

# number of words in the cleaned tweets:
print('China: ', len(list(freq_dict_China)))
print('USA: ', len(list(freq_dict_USA)))
print('China&USA: ', len(list(freq_dict_China_USA)))

China:  1961
USA:  3330
China&USA:  3840


In [69]:
# Most frequent words
print('China')
print([key for key in freq_dict_China.keys() if freq_dict_China[key]>100])
print()
print('USA')
print([key for key in freq_dict_USA.keys() if freq_dict_USA[key]>100])
print()
print('China&USA')
print([key for key in freq_dict_China_USA.keys() if freq_dict_China_USA[key]>200])

China
['covid', 'vaccine', 'china', 'say', 'case', 'country', 'receive', 'health', 'report', 'first', 'million', 'new', 'dos', 'people', 'pandemic', 'batch']

USA
['covid', 'vaccine', 'say', 'new', 'case', 'johnson', 'shot', 'million', 'use', 'report', 'people', 'get', 'india', 'country', 'first', 'health', 'dos', 'blood', 'vaccination', 'clot', 'death', 'coronavirus', 'pandemic', 'state', 'year', 'brazil', 'surge', 'president', 'world', 'infection', 'day', 'risk', 'record', 'trial', 'may', 'china', 'receive', 'official', 'rise', 'month', 'variant', 'vaccinate', 'high', 'study', 'plan', 'one', 'test', 'need', 'rare', 'week', 'make', 'could', 'amid']

China&USA
['covid', 'vaccine', 'say', 'new', 'case', 'china', 'million', 'report', 'country', 'first', 'health', 'people', 'johnson', 'dos', 'use', 'pandemic', 'shot', 'receive', 'get', 'vaccination', 'death', 'state', 'blood', 'coronavirus', 'india', 'clot', 'year', 'brazil', 'president', 'world', 'day']


# less frequent words: not performed on data with only covid, coronavirus, vaccine keys

In [32]:
# less frequent words:
thr = 2
print('Less frequent China: ', len([key for key in freq_dict_China.keys() if freq_dict_China[key]<thr]))
print('More frequent China: ', len([key for key in freq_dict_China.keys() if freq_dict_China[key]>=thr]))
less_frequent_words_China = [key for key in freq_dict_China.keys() if freq_dict_China[key]<thr]
print()
print('Less frequent USA: ', len([key for key in freq_dict_USA.keys() if freq_dict_USA[key]<thr]))
print('More frequent USA:', len([key for key in freq_dict_USA.keys() if freq_dict_USA[key]>=thr]))
less_frequent_words_USA = [key for key in freq_dict_USA.keys() if freq_dict_USA[key]<thr]
print()
print('Less frequent China&USA: ', len([key for key in freq_dict_China_USA.keys() if freq_dict_China_USA[key]<thr]))
print('More frequent China&USA: ', len([key for key in freq_dict_China_USA.keys() if freq_dict_China_USA[key]>=thr]))
less_frequent_words_China_USA = [key for key in freq_dict_China_USA.keys() if freq_dict_China_USA[key]<thr]

Less frequent China:  1132
More frequent China:  1761

Less frequent USA:  820
More frequent USA: 1920

Less frequent China&USA:  1288
More frequent China&USA:  2658


In [None]:
# discard less frequent words
def clean_words(cleaned_text, unuseful_words):
  re_cleaned_text = cleaned_text.copy()
  for txt in range(len(re_cleaned_text)):
    if txt % 2000 == 0:
      print(txt, '/',len(re_cleaned_text))
    w = re_cleaned_text[txt].split()
    for word in unuseful_words:
      while word in w:
        w.remove(word)
    re_cleaned_text[txt] = ' '.join(w)
  return re_cleaned_text

In [None]:
cleaned_mostfreq_text_China = clean_words(cleaned_text_China, less_frequent_words_China)
cleaned_mostfreq_text_USA = clean_words(cleaned_text_USA, less_frequent_words_USA)
cleaned_mostfreq_text_China_USA = clean_words(cleaned_text_China_USA, less_frequent_words_China_USA)

0 / 2438
2000 / 2438
0 / 3185
2000 / 3185
0 / 5623
2000 / 5623
4000 / 5623


In [None]:
freq_dict_China = frequency_dictionary(cleaned_mostfreq_text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(cleaned_mostfreq_text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_China_USA = frequency_dictionary(cleaned_mostfreq_text_China_USA)
freq_dict_China_USA = dict(sorted(freq_dict_China_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

# number of words in the cleaned tweets:
print('China: ', len(list(freq_dict_China)))
print('USA: ', len(list(freq_dict_USA)))
print('China&USA: ', len(list(freq_dict_China_USA)))

China:  497
USA:  576
China&USA:  941


# Build Network

In [70]:
df_China = pd.DataFrame.from_dict(freq_dict_China, orient='index').reset_index()
df_China.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_China.sort_values(by=['Count'], ascending=False, inplace=True)
df_China.reset_index(inplace=True)
df_China.drop(columns="index",inplace=True)

df_USA = pd.DataFrame.from_dict(freq_dict_USA, orient='index').reset_index()
df_USA.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_USA.sort_values(by=['Count'], ascending=False, inplace=True)
df_USA.reset_index(inplace=True)
df_USA.drop(columns="index",inplace=True)

df_China_USA = pd.DataFrame.from_dict(freq_dict_China_USA, orient='index').reset_index()
df_China_USA.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_China_USA.sort_values(by=['Count'], ascending=False, inplace=True)
df_China_USA.reset_index(inplace=True)
df_China_USA.drop(columns="index",inplace=True)

print('China')
print(df_China.iloc[0:30])
print()
print('USA')
print(df_USA.iloc[0:30])
print()
print('China&USA')
print(df_China_USA.iloc[0:30])
print()

China
           Word  Count
0         covid    942
1       vaccine    757
2         china    313
3           say    175
4          case    166
5       country    162
6       receive    159
7        health    156
8        report    156
9         first    155
10      million    149
11          new    148
12          dos    122
13       people    117
14     pandemic    109
15        batch    105
16    president     77
17        world     77
18         dose     71
19          use     69
20       accord     68
21     national     67
22          day     64
23          one     63
24       brazil     63
25  vaccination     63
26         year     62
27        death     61
28       second     59
29   administer     58

USA
           Word  Count
0         covid   3006
1       vaccine   2114
2           say    869
3           new    412
4          case    392
5       johnson    298
6          shot    258
7       million    257
8           use    249
9        report    246
10       people    246


In [71]:
keys_China = freq_dict_China.keys()  
keys_USA = freq_dict_USA.keys()  
keys_China_USA = freq_dict_China_USA.keys()  

In [72]:
def create_network(cleaned_text):
  network = {}
  #connect the word that appear in the same tweets
  for row in cleaned_text:
    combined_list = [word for word in str.split(row)]
    #for pair in itertools.product(combined_list, combined_list):
    #print(combined_list)
    for pair in itertools.product(combined_list, combined_list):
          #exclude self-loops and count each pair only once because our graph is undirected and we do not take self-loops into account
          if pair[0]!=pair[1] and not(pair[::-1] in network):
              network.setdefault(pair,0)
              network[pair] += 1 
  network_df = pd.DataFrame.from_dict(network, orient="index")
  network_df.columns = ["weight"]
  network_df.sort_values(by="weight",inplace=True, ascending=False)
  return network, network_df

In [73]:
network_China, network_df_China = create_network(cleaned_text_China)
network_USA, network_df_USA = create_network(cleaned_text_USA)
network_China_USA, network_df_China_USA = create_network(cleaned_text_China_USA)

In [74]:
print('China:')
print(network_df_China.iloc[0:30])
print()
print('USA:')
print(network_df_USA.iloc[0:30])
print()
print('China&USA:')
print(network_df_China_USA.iloc[0:30])
print()

China:
                    weight
(covid, vaccine)       524
(vaccine, china)       254
(china, covid)         243
(receive, vaccine)     171
(covid, case)          140
(million, covid)       140
(country, vaccine)     134
(dos, vaccine)         134
(vaccine, say)         132
(covid, say)           126
(covid, health)        126
(first, vaccine)       125
(covid, country)       124
(new, covid)           124
(first, covid)         119
(receive, covid)       118
(covid, report)        114
(million, vaccine)     112
(batch, vaccine)       109
(dos, covid)           104
(vaccine, health)      103
(covid, pandemic)      103
(new, case)             97
(covid, people)         92
(batch, covid)          90
(million, dos)          89
(use, vaccine)          82
(report, case)          77
(dose, vaccine)         73
(vaccine, people)       68

USA:
                      weight
(covid, vaccine)        1450
(covid, say)             641
(vaccine, say)           626
(new, covid)             362
(covi

#Graph


In [75]:
def get_graph(network):
  G = nx.Graph()
  for edge in network:
      G.add_edge(edge[0], edge[1], weight=network[edge])
  return G

In [76]:
G_China = get_graph(network_China)
G_USA = get_graph(network_USA)
G_China_USA = get_graph(network_China_USA)

In [77]:
print('China:')
print('Nodes: ',len(G_China.nodes()))
print('Edges: ',len(G_China.edges()))
print('Is connected: ',nx.is_connected(G_China))
print()
print('USA:')
print('Nodes: ',len(G_USA.nodes()))
print('Edges: ',len(G_USA.edges()))
print('Is connected: ',nx.is_connected(G_USA))
print()
print('China&USA:')
print('Nodes: ',len(G_China_USA.nodes()))
print('Edges: ',len(G_China_USA.edges()))
print('Is connected: ',nx.is_connected(G_China_USA))

China:
Nodes:  1960
Edges:  40024
Is connected:  True

USA:
Nodes:  3330
Edges:  98919
Is connected:  True

China&USA:
Nodes:  3839
Edges:  126667
Is connected:  True


# PageRank

In [78]:
# Calculating the pagerank on graph G, teleportation probability here is 0.15 but since the graph is strongly connected we can set it to zero if we want
pr_China = nx.algorithms.pagerank(G_China,alpha = 1)
pr_China = dict(sorted(pr_China.items(), key=lambda item: item[1],reverse  = True))

pr_USA = nx.algorithms.pagerank(G_USA,alpha = 1)
pr_USA = dict(sorted(pr_USA.items(), key=lambda item: item[1],reverse  = True))

pr_China_USA = nx.algorithms.pagerank(G_China_USA,alpha = 1)
pr_China_USA = dict(sorted(pr_China_USA.items(), key=lambda item: item[1],reverse  = True))

In [79]:
def threshold(vector,threshold):

  l = [(el,vector[el]) for el in vector if vector[el] >= threshold ]

  return pd.DataFrame(l)

In [80]:
def threshold_reverse(vector,threshold):

  l = [(el,vector[el]) for el in vector if vector[el] < threshold ]

  return pd.DataFrame(l)

In [81]:
thr = 0.0004
print('China: ', len(threshold(pr_China,thr)))
print()
print(threshold(pr_China,thr).iloc[:30])
print()
print('USA: ', len(threshold(pr_USA,thr)))
print()
print(threshold(pr_USA,thr).iloc[:30])
print()
print('China&USA: ', len(threshold(pr_China_USA,thr)))
print()
print(threshold(pr_China_USA,thr).iloc[:30])

China:  480

              0         1
0         covid  0.059567
1       vaccine  0.045744
2         china  0.020886
3           say  0.014056
4       country  0.012877
5        health  0.012334
6          case  0.010848
7        report  0.010766
8         first  0.010181
9       million  0.009981
10      receive  0.009459
11          new  0.009430
12       people  0.008845
13          dos  0.008282
14     pandemic  0.007880
15        world  0.005986
16       accord  0.005306
17        batch  0.005286
18          day  0.005094
19    president  0.005074
20          use  0.005050
21     national  0.005036
22         year  0.004926
23       brazil  0.004622
24  vaccination  0.004621
25        death  0.004500
26         city  0.004445
27        state  0.004341
28         dose  0.004290
29          one  0.004040

USA:  512

              0         1
0         covid  0.057114
1       vaccine  0.041153
2           say  0.020410
3           new  0.008660
4          case  0.008049
5       johns

# TF-IDF: not performed

In [None]:
tfidf = TfidfVectorizer(ngram_range=(1,1))   # ngram range can be changed to obtain measures regarding n grams instead of single words

X_China = tfidf.fit_transform(cleaned_text_China).toarray()    # entry (i,j) if Tfidf measure of word_list[j] in document i
word_list_China = tfidf.get_feature_names_out()

X_USA = tfidf.fit_transform(cleaned_text_USA).toarray()
word_list_USA = tfidf.get_feature_names_out()

X_China_USA = tfidf.fit_transform(cleaned_text_China_USA).toarray()
word_list_China_USA = tfidf.get_feature_names_out()


In [None]:
tfidf_df_China = pd.DataFrame(X_China,columns = word_list_China)

tfidf_df_USA = pd.DataFrame(X_USA,columns = word_list_USA)

tfidf_df_China_USA = pd.DataFrame(X_China_USA,columns = word_list_China_USA)

In [None]:
tfidf_word_measure_China = np.mean(tfidf_df_China,axis = 0)
tfidf_word_measure_China = tfidf_word_measure_China.sort_values(ascending = False)
tfidf_word_measure_USA = np.mean(tfidf_df_USA,axis = 0)
tfidf_word_measure_USA = tfidf_word_measure_USA.sort_values(ascending = False)
tfidf_word_measure_China_USA = np.mean(tfidf_df_China_USA,axis = 0)
tfidf_word_measure_China_USA = tfidf_word_measure_China_USA.sort_values(ascending = False)

In [None]:
print('China:')
print(tfidf_word_measure_China[0:30])
print()
print('USA:')
print(tfidf_word_measure_USA[0:30])
print()
print('China&USA:')
print(tfidf_word_measure_China_USA[0:30])
print()

China:
covid          0.076069
watch          0.059429
coronavirus    0.047427
case           0.044577
china          0.037314
say            0.037011
new            0.035957
test           0.032966
report         0.031964
president      0.029373
pandemic       0.029153
vaccine        0.026741
trump          0.026549
update         0.026117
health         0.025600
country        0.024406
positive       0.023640
million        0.020711
world          0.020349
people         0.020237
death          0.019359
accord         0.017701
late           0.017458
sept           0.016890
year           0.016416
global         0.016072
day            0.015739
number         0.015450
national       0.014457
infection      0.014375
dtype: float64

USA:
covid          0.063696
coronavirus    0.043593
case           0.032740
say            0.031375
new            0.030973
vaccine        0.029819
trump          0.026958
test           0.026622
president      0.021916
report         0.021543
positive    

# reduced graph

In [82]:
# less important words:
less_important_words_China = list(threshold_reverse(pr_China,thr)[0])

less_important_words_USA = list(threshold_reverse(pr_USA,thr)[0])

less_important_words_China_USA = list(threshold_reverse(pr_China_USA,thr)[0])

In [83]:
cleaned_mostimp_text_China = clean_words(cleaned_text_China,less_important_words_China)
cleaned_mostimp_text_USA = clean_words(cleaned_text_USA,less_important_words_USA)
cleaned_mostimp_text_China_USA = clean_words(cleaned_text_China_USA,less_important_words_China_USA)

0 / 1294
0 / 3932
2000 / 3932
0 / 5226
2000 / 5226
4000 / 5226


In [84]:
freq_dict_China = frequency_dictionary(cleaned_mostimp_text_China)
freq_dict_China = dict(sorted(freq_dict_China.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_USA = frequency_dictionary(cleaned_mostimp_text_USA)
freq_dict_USA = dict(sorted(freq_dict_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

freq_dict_China_USA = frequency_dictionary(cleaned_mostimp_text_China_USA)
freq_dict_China_USA = dict(sorted(freq_dict_China_USA.items(), key=lambda item: item[1], reverse = True))   #order from more frequent to less frequent word

# number of words in the cleaned tweets:
print('China: ', len(list(freq_dict_China)))
print('USA: ', len(list(freq_dict_USA)))
print('China&USA: ', len(list(freq_dict_China_USA)))

China:  481
USA:  512
China&USA:  502


In [85]:
df_China = pd.DataFrame.from_dict(freq_dict_China, orient='index').reset_index()
df_China.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_China.sort_values(by=['Count'], ascending=False, inplace=True)
df_China.reset_index(inplace=True)
df_China.drop(columns="index",inplace=True)

df_USA = pd.DataFrame.from_dict(freq_dict_USA, orient='index').reset_index()
df_USA.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_USA.sort_values(by=['Count'], ascending=False, inplace=True)
df_USA.reset_index(inplace=True)
df_USA.drop(columns="index",inplace=True)

df_China_USA = pd.DataFrame.from_dict(freq_dict_China_USA, orient='index').reset_index()
df_China_USA.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
df_China_USA.sort_values(by=['Count'], ascending=False, inplace=True)
df_China_USA.reset_index(inplace=True)
df_China_USA.drop(columns="index",inplace=True)

print('China')
print(df_China.iloc[0:30])
print()
print('USA')
print(df_USA.iloc[0:30])
print()
print('China&USA')
print(df_China_USA.iloc[0:30])
print()

China
           Word  Count
0         covid    942
1       vaccine    757
2         china    313
3           say    175
4          case    166
5       country    162
6       receive    159
7        health    156
8        report    156
9         first    155
10      million    149
11          new    148
12          dos    122
13       people    117
14     pandemic    109
15        batch    105
16    president     77
17        world     77
18         dose     71
19          use     69
20       accord     68
21     national     67
22          day     64
23          one     63
24       brazil     63
25  vaccination     63
26         year     62
27        death     61
28       second     59
29   administer     58

USA
           Word  Count
0         covid   3006
1       vaccine   2114
2           say    869
3           new    412
4          case    392
5       johnson    298
6          shot    258
7       million    257
8           use    249
9        report    246
10       people    246


In [86]:
keys_China = freq_dict_China.keys()  
keys_USA = freq_dict_USA.keys()  
keys_China_USA = freq_dict_China_USA.keys()  

In [87]:
network_China, network_df_China = create_network(cleaned_mostimp_text_China)
network_USA, network_df_USA = create_network(cleaned_mostimp_text_USA)
network_China_USA, network_df_China_USA = create_network(cleaned_mostimp_text_China_USA)

In [88]:
print('China:')
print(network_df_China.iloc[0:30])
print()
print('USA:')
print(network_df_USA.iloc[0:30])
print()
print('China&USA:')
print(network_df_China_USA.iloc[0:30])
print()

China:
                    weight
(covid, vaccine)       524
(vaccine, china)       254
(china, covid)         243
(receive, vaccine)     171
(covid, case)          140
(million, covid)       140
(country, vaccine)     134
(dos, vaccine)         134
(vaccine, say)         132
(covid, health)        126
(covid, say)           126
(first, vaccine)       125
(new, covid)           124
(covid, country)       124
(first, covid)         119
(receive, covid)       118
(covid, report)        114
(million, vaccine)     112
(batch, vaccine)       109
(dos, covid)           104
(vaccine, health)      103
(covid, pandemic)      103
(new, case)             97
(covid, people)         92
(batch, covid)          90
(million, dos)          89
(use, vaccine)          82
(report, case)          77
(dose, vaccine)         73
(vaccine, people)       68

USA:
                      weight
(covid, vaccine)        1450
(covid, say)             641
(vaccine, say)           626
(new, covid)             362
(covi

In [89]:
G_China = get_graph(network_China)
G_USA = get_graph(network_USA)
G_China_USA = get_graph(network_China_USA)

In [90]:
print('China:')
print('Nodes: ',len(G_China.nodes()))
print('Edges: ',len(G_China.edges()))
print('Is connected: ',nx.is_connected(G_China))
print()
print('USA:')
print('Nodes: ',len(G_USA.nodes()))
print('Edges: ',len(G_USA.edges()))
print('Is connected: ',nx.is_connected(G_USA))
print()
print('China&USA:')
print('Nodes: ',len(G_China_USA.nodes()))
print('Edges: ',len(G_China_USA.edges()))
print('Is connected: ',nx.is_connected(G_China_USA))

China:
Nodes:  480
Edges:  17152
Is connected:  True

USA:
Nodes:  512
Edges:  34347
Is connected:  True

China&USA:
Nodes:  501
Edges:  39757
Is connected:  True


#Save edge list

In [92]:
filename = './edgelist_China'+period+'.csv'
nx.write_weighted_edgelist(G_China, filename, delimiter=",")
#add header with appropriate column names (works on colab and Linux/Mac(?))
!sed -i.bak 1i"Source,Target,Weight" ./edgelist_China__MarchApril2021.csv
files.download('edgelist_China'+period+'.csv')

filename = './edgelist_USA'+period+'.csv'
nx.write_weighted_edgelist(G_USA, filename, delimiter=",")
#add header with appropriate column names (works on colab and Linux/Mac(?))
!sed -i.bak 1i"Source,Target,Weight" ./edgelist_USA__MarchApril2021.csv
files.download('edgelist_USA'+period+'.csv')

filename = './edgelist_China_USA'+period+'.csv'
nx.write_weighted_edgelist(G_China_USA, filename, delimiter=",")
#add header with appropriate column names (works on colab and Linux/Mac(?))
!sed -i.bak 1i"Source,Target,Weight" ./edgelist_China_USA__MarchApril2021.csv
files.download('edgelist_China_USA'+period+'.csv')

sed: can't read ./edgelist_China__MarchApril2021.csv: No such file or directory


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

sed: can't read ./edgelist_USA__MarchApril2021.csv: No such file or directory


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

sed: can't read ./edgelist_China_USA__MarchApril2021.csv: No such file or directory


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>





# Create Node List




In [61]:
def nodes(freq_dict, name):
  word_nodes = pd.DataFrame.from_dict(freq_dict,orient="index")
  word_nodes.reset_index(inplace=True)
  word_nodes["Label"] = word_nodes["index"]
  word_nodes.rename(columns={"index":"Id",0:"delete"},inplace=True)
  word_nodes = word_nodes.drop(columns=['delete'])
  nodelist = pd.DataFrame()
  nodelist = nodelist.append(word_nodes, ignore_index=True)

  nodelist = nodelist.to_csv("nodelist_"+name+".csv",index=False)
  files.download("nodelist_"+name+".csv")
  return nodelist, word_nodes

In [93]:
nodelist_China, word_nodes_China = nodes(freq_dict_China,'China'+period)
nodelist_USA, word_nodes_USA = nodes(freq_dict_USA,'USA_'+period)
nodelist_China_USA, word_nodes_China_USA = nodes(freq_dict_China_USA,'China_USA'+period)

print('China:')
print(word_nodes_China.head())
print()
print('USA:')
print(word_nodes_USA.head())
print()
print('China&USA:')
print(word_nodes_China_USA.head())
print()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

China:
        Id    Label
0    covid    covid
1  vaccine  vaccine
2    china    china
3      say      say
4     case     case

USA:
        Id    Label
0    covid    covid
1  vaccine  vaccine
2      say      say
3      new      new
4     case     case

China&USA:
        Id    Label
0    covid    covid
1  vaccine  vaccine
2      say      say
3      new      new
4     case     case

