In [3]:
import pandas as pd
import spacy
from collections import Counter
import time
import pickle
import multiprocessing as mp
from multiprocessing import Pool, Process

# from functions import read_sort_save, replace_content, process_text, split, wordcounts_clean

# PART 1 - Preprocessing

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
stopwords = nlp.Defaults.stop_words

In [6]:
print(len(stopwords))
print(stopwords)

326
{'or', 'nine', 'here', 'more', 'we', 'ever', "'s", 'enough', 'when', 'behind', 'five', 'however', 'whereafter', 'another', 'none', 'cannot', 'move', 'anywhere', 'only', 'everything', 'for', 'without', 'well', 'anyway', 'very', 'anything', 'ourselves', 'forty', 'with', "'re", 'hers', 'some', '‘d', 'why', 'most', 'own', 'formerly', 'can', 'moreover', 'name', "'ve", 'down', 'part', 'throughout', 'using', 'either', 'is', 'onto', 'whole', 'never', 'somehow', 'give', 'so', 'mine', 'call', 'neither', 'thru', 'therefore', '’ll', 'us', 'many', 'see', 'hereby', 'ours', 'an', 'anyone', 'its', 'our', 'bottom', 'into', '’s', 'even', 'whatever', 'she', 'itself', 'serious', 'say', 'keep', 'was', 'any', 'almost', 'per', 'seeming', 'a', 'among', 'go', 'ten', "'d", 'who', 'already', 'since', 'thereupon', "'m", 'doing', 'front', 'themselves', 'quite', 'put', 'will', 'latter', 'whereupon', 'this', 'noone', 'elsewhere', 'seems', 'until', 'eight', 'it', 'above', 'along', '‘re', 'side', 'while', 'whereve

In [7]:
redict = {
"are n't" : "aren't",   "ca n't" : "can't",     "could n't" : "couldn't",   "did n't" : "didn't",       "does n't" : "doesn't",\
"do n't" : "don't",     "had n't" : "hadn't",   "has n't" : "hasn't",       "have n't" : "haven't",     "h e'd" : "he'd",\
"h e'll" : "he'll",     "h e's" : "he's",       "i 'd" : "i'd",             "i 'll" : "i'll",           "i 'm" : "i'm",\
"i 've" : "i've",       "is n't" : "isn't",     "i t's" : "it's",           "le t's" : "let's",         "must n't" : "mustn't",\
"sha n't" : "shan't",   "sh e'd" : "she'd",     "sh e'll" : "she'll",       "sh e's" : "she's",         "should n't" : "shouldn't",\
"tha t's" : "that's",   "ther e's" : "there's", "the y'll" : "they'll",     "the y're" : "they're",     "the y've" : "they've",\
"w e'd" : "we'd",       "w e're" : "we're",     "w e've" : "we've",         "were n't" : "weren't",     "wha t'll" : "what'll",\
"wha t're" : "what're", "wha t's" : "what's",   "wha t've" : "what've",     "wher e's" : "where's",     "wh o'd" : "who'd",\
"wh o'll" : "who'll",   "wh o're" : "who're",   "wh o's" : "who's",         "wh o've" : "who've",       "wo n't" : "won't",\
"would n't" : "wouldn't","yo u'd" : "you'd",    "yo u'll" : "you'll",       "yo u're" : "you're",       "yo u've" : "you've",\
# " 's" : "'s",         # " 're": "'re",    
"new zealand" : "",     "<p>" : "",             "<h>" : "",                 " @ " : "",                 "@" : "",
"\n" : ""
}

In [8]:
## List of items to remove from word count dictionaries

In [9]:
remove_list= [
    "'s", "n't", "'re", "u", "ve", \
    "'", "''", "'m", "/", "'ll", \
    "*", "'d", "'ve", "m", "mr", \
    "ms", "dr", "mrs", "."
]

In [18]:
## Function based on dict, replaces key with the value on the target

In [19]:
def replace_content(dict_replace, target):
    for check, replacer in list(dict_replace.items()):
        target = target.replace(check, replacer)

    return target

In [20]:
## Function to read .csv file as pandas dataframe, sort values by year, and save as .pkl file

In [21]:
def read_sort_save(filename1, filename2):
    dataframe = pd.read_csv(filename1, usecols=['year', 'article_text_Ngram_stopword_lemmatize'])
    dataframe.sort_values(by=['year'], inplace=True, ignore_index=True)
    dataframe.to_pickle(filename2)

In [22]:
## Function to create Word Count Dictionary which filters out stopwords and named entitites

## First, parse through article and lowercase all words

## Next, make corrections to words with apostrophers using redict dictionary

## Append words from processed document to list with stopwords (wordfreq1) or list without stopwords (wordfreq2)

## Return counter dictionaries of each list, save each list (stopwords/nostopwords) as .pkl files



In [23]:
## READ AND UNDERSTAND HOW FUNCTION WORKS

In [24]:
def split(iteration, n):  
   
   quotient, remainder = divmod(len(iteration), n)

   split_data = [
      
      iteration[
         # FLOOR
         i * quotient + min(i, remainder)
         :
         # CEILING
         (i + 1) * quotient + min(i + 1, remainder)

         ] for i in range(6)
   ]

   split_data_dictionary = {x[0] : x[1] for x in enumerate(split_data)}
   
   return split_data_dictionary

In [25]:
def process_text(text, L1, L2):
    text_data = text.lower()
    new_text = replace_content(redict, text_data)
    document = nlp(new_text)
    for ent in document:
        if ent.ent_type:
            L1.append(ent.text)
        else:
            L2.append(ent.text)
    

In [26]:
def wordcounts(file1_input, file1_output, file2_output):
    
    L1 = []
    L2 = []  

    dataframe = pd.read_pickle(file1_input)

    sub_list = split(dataframe['article_text_Ngram_stopword_lemmatize'], 6)

    for i in sub_list:
        for j in sub_list[i]:
            process_text(j, L1, L2)    

    count_stopwords = Counter(word for word in L1)
    count_nostopwords = Counter(word for word in L1)

    with open(file1_output, 'wb') as handle:
        pickle.dump(count_stopwords, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    with open(file2_output, 'wb') as handle:
        pickle.dump(count_nostopwords, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
## After checking word count lists, clean for named entities and nlp terms from data using removedict and stopword lists

In [28]:
def wordcounts_clean(file1_input, file2_input, remove_list, stopword_list, file1_output, file2_output):

    stopwords_dataframe = pd.read_pickle(file1_input)
    nostopwords_dataframe = pd.read_pickle(file2_input)

    count_stopwords = dict(zip(stopwords_dataframe[0], stopwords_dataframe[1]))

    count_nostopwords = dict(zip(nostopwords_dataframe[0], nostopwords_dataframe[1]))

    for i in remove_list:
        if i in count_stopwords:
            del count_stopwords[i]
        if i in count_nostopwords:
            del count_nostopwords[i]

    for i in stopword_list:
        if i in count_stopwords:
            del count_stopwords[i]
        if i in count_nostopwords:
            del count_nostopwords[i]

    df_count_stopwords = pd.DataFrame(Counter(count_stopwords).most_common(500))
    df_count_nostopwords = pd.DataFrame(Counter(count_nostopwords).most_common(500))

    with open(file1_output, 'wb') as handle:
        pickle.dump(df_count_stopwords, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    with open(file2_output, 'wb') as handle:
        pickle.dump(df_count_nostopwords, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [73]:
def wordcounts_print_sample(dictionary):
    for i in enumerate(Counter(dictionary).most_common(300)):
        if i[0] < 20 or i[0] > 279:
            print(i)
        elif i[0] == (149):
            print("\n\t ... \n")

In [13]:
## Open and read excel files as pandas dataframes: cols = ['year', 'article_text_Ngram_stopword_lemmatize']
## Sort articles by year
## Save as .pkl file

## Note: Pakistan (PK) and South Africa (ZA) were omitted due to insufficient articles

In [None]:
## Australia AU
read_sort_save('AU_domestic_Ngram_stopword_lematize.csv', 'df_au.pkl')

In [None]:
wordcounts('df_au.pkl', 'df_counts_au1.pkl', 'df_counts_au2.pkl')

In [11]:
remove_items = ["'s", "n't", "'re", "u", "ve", "'", "''", "'m", "/", "'ll", "*", "'d", "'ve", "m"]

[remove_list.append(i) for i in remove_items if i not in remove_list]

[]

In [12]:
wordcounts_clean('df_counts_au1.pkl', 
               'df_counts_au2.pkl',
               remove_list,
               stopwords,
               'df_counts_au1.pkl',
               'df_counts_au2.pkl'
               )

In [75]:
df_counts_au_nostopwords = pd.read_pickle('df_counts_au2.pkl')

df_counts_au_nostopwords = dict(zip(df_counts_au_nostopwords[0], df_counts_au_nostopwords[1]))

wordcounts_print_sample(df_counts_au_nostopwords)

(0, ('time', 99918))
(1, ('people', 83357))
(2, ('like', 73990))
(3, ('new', 69815))
(4, ('work', 65096))
(5, ('use', 63113))
(6, ('come', 57492))
(7, ('need', 51243))
(8, ('government', 51129))
(9, ('look', 49322))
(10, ('think', 47609))
(11, ('know', 46152))
(12, ('way', 45194))
(13, ('include', 45178))
(14, ('game', 43195))
(15, ('good', 41740))
(16, ('want', 41397))
(17, ('high', 39172))
(18, ('market', 38863))
(19, ('world', 38654))

	 ... 

(280, ('condition', 10379))
(281, ('appear', 10360))
(282, ('league', 10360))
(283, ('understand', 10271))
(284, ('personal', 10251))
(285, ('campaign', 10240))
(286, ('benefit', 10196))
(287, ('target', 10172))
(288, ('labor', 10115))
(289, ('goal', 10113))
(290, ('hope', 10105))
(291, ('history', 10080))
(292, ('clear', 10078))
(293, ('gold', 10043))
(294, ('statement', 10021))
(295, ('note', 10012))
(296, ('land', 10003))
(297, ('film', 9998))
(298, ('example', 9953))
(299, ('bring', 9951))


In [None]:
## Bangladesh BD
read_sort_save('BD_domestic_Ngram_stopword_lematize.csv', 'df_bd.pkl')

In [None]:
wordcounts('df_bd.pkl', 'df_counts_bd1.pkl', 'df_counts_bd2.pkl')

In [None]:
remove_items = ['bangladeshi', 'sheikh', 'crore', \
                'tk', 'bnp', 'bangabandhu', 'bangladesh', \
               'rahman', 'hossain', 'hasina', 'upazila', \
               'bangabandhu', 'bangla', 'indian', 'myanmar', \
               'chittagong', 'khan', 'rohingya', 'chowdhury', 'prof'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [15]:
wordcounts_clean('df_counts_bd1.pkl', 
               'df_counts_bd2.pkl',
               remove_list,
               stopwords,
               'df_counts_bd1.pkl',
               'df_counts_bd2.pkl'
               )

In [72]:
df_counts_bd_nostopwords = pd.read_pickle('df_counts_bd2.pkl')

df_counts_bd_nostopwords = dict(zip(df_counts_bd_nostopwords[0], df_counts_bd_nostopwords[1]))

wordcounts_print_sample(df_counts_bd_nostopwords)

(0, ('country', 22510))
(1, ('government', 20966))
(2, ('people', 19966))
(3, ('minister', 12601))
(4, ('time', 12261))
(5, ('work', 11527))
(6, ('police', 10164))
(7, ('come', 9810))
(8, ('high', 9434))
(9, ('use', 8819))
(10, ('new', 8614))
(11, ('include', 8222))
(12, ('case', 8117))
(13, ('add', 8097))
(14, ('area', 7970))
(15, ('world', 7732))
(16, ('need', 7700))
(17, ('bank', 7421))
(18, ('official', 7324))
(19, ('report', 7164))

	 ... 

(280, ('initiative', 2090))
(281, ('control', 2089))
(282, ('award', 2085))
(283, ('coronavirus', 2084))
(284, ('tax', 2083))
(285, ('financial', 2075))
(286, ('raise', 2074))
(287, ('management', 2070))
(288, ('quality', 2067))
(289, ('effort', 2067))
(290, ('conduct', 2065))
(291, ('thing', 2057))
(292, ('recently', 2045))
(293, ('join', 2043))
(294, ('grow', 2041))
(295, ('conference', 2034))
(296, ('medical', 2033))
(297, ('address', 2033))
(298, ('field', 2032))
(299, ('complete', 2029))


In [None]:
## Canada CA
read_sort_save('CA_domestic_Ngram_stopword_lematize.csv', 'df_ca.pkl')

In [None]:
wordcounts('df_ca.pkl', 'df_counts_ca1.pkl', 'df_counts_ca2.pkl')

In [None]:
remove_items = ['ottowa', 'ontario', 'cbc', 'toolong', 'vancouver' ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [18]:
wordcounts_clean('df_counts_ca1.pkl', 
               'df_counts_ca2.pkl',
               remove_list,
               stopwords,
               'df_counts_ca1.pkl',
               'df_counts_ca2.pkl'
               )

In [74]:
df_counts_ca_nostopwords = pd.read_pickle('df_counts_ca2.pkl')

df_counts_ca_nostopwords = dict(zip(df_counts_ca_nostopwords[0], df_counts_ca_nostopwords[1]))

wordcounts_print_sample(df_counts_ca_nostopwords)

(0, ('time', 101831))
(1, ('people', 96695))
(2, ('use', 81684))
(3, ('new', 80842))
(4, ('like', 78735))
(5, ('work', 78459))
(6, ('come', 69959))
(7, ('company', 62410))
(8, ('include', 60010))
(9, ('game', 57024))
(10, ('look', 55700))
(11, ('know', 54572))
(12, ('want', 54483))
(13, ('right', 52872))
(14, ('information', 52481))
(15, ('comment', 52001))
(16, ('need', 51774))
(17, ('way', 50782))
(18, ('city', 50557))
(19, ('team', 49435))

	 ... 

(280, ('general', 12725))
(281, ('serve', 12710))
(282, ('involve', 12703))
(283, ('shot', 12681))
(284, ('reason', 12643))
(285, ('control', 12616))
(286, ('related', 12598))
(287, ('course', 12530))
(288, ('land', 12522))
(289, ('bank', 12466))
(290, ('strong', 12449))
(291, ('management', 12399))
(292, ('data', 12377))
(293, ('energy', 12353))
(294, ('access', 12342))
(295, ('list', 12297))
(296, ('growth', 12263))
(297, ('encourage', 12262))
(298, ('film', 12255))
(299, ('role', 12249))


In [None]:
## United Kingdom GB
read_sort_save('GB_domestic_Ngram_stopword_lematize.csv', 'df_gb.pkl')

In [None]:
wordcounts('df_gb.pkl', 'df_counts_gb1.pkl', 'df_counts_gb2.pkl')

In [None]:
remove_items = ['uk', 'trump']

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [21]:
wordcounts_clean('df_counts_gb1.pkl', 
               'df_counts_gb2.pkl',
               remove_list,
               stopwords,
               'df_counts_gb1.pkl',
               'df_counts_gb2.pkl'
               )

In [76]:
df_counts_gb_nostopwords = pd.read_pickle('df_counts_gb2.pkl')

df_counts_gb_nostopwords = dict(zip(df_counts_gb_nostopwords[0], df_counts_gb_nostopwords[1]))

wordcounts_print_sample(df_counts_gb_nostopwords)

(0, ('people', 48863))
(1, ('time', 48544))
(2, ('like', 39139))
(3, ('work', 38795))
(4, ('new', 37046))
(5, ('come', 33054))
(6, ('use', 32345))
(7, ('know', 25840))
(8, ('look', 24954))
(9, ('world', 24617))
(10, ('way', 24316))
(11, ('want', 24149))
(12, ('need', 23568))
(13, ('include', 22977))
(14, ('think', 21758))
(15, ('life', 21306))
(16, ('game', 20243))
(17, ('good', 19542))
(18, ('government', 18847))
(19, ('right', 18820))

	 ... 

(280, ('word', 5778))
(281, ('space', 5772))
(282, ('reason', 5762))
(283, ('style', 5703))
(284, ('possible', 5695))
(285, ('sure', 5693))
(286, ('fight', 5693))
(287, ('likely', 5691))
(288, ('black', 5685))
(289, ('room', 5679))
(290, ('student', 5664))
(291, ('chief', 5658))
(292, ('council', 5655))
(293, ('carry', 5629))
(294, ('fund', 5613))
(295, ('available', 5593))
(296, ('fire', 5589))
(297, ('performance', 5582))
(298, ('light', 5581))
(299, ('major', 5579))


In [None]:
## Ghana GH
read_sort_save('GH_domestic_Ngram_stopword_lematize.csv', 'df_gh.pkl')

In [None]:
wordcounts('df_gh.pkl', 'df_counts_gh1.pkl', 'df_counts_gh2.pkl')

In [None]:
remove_items = ['ghana', 'accra', 'ghanaian', 'ghanaians', 'npp', 'ndc', 'addo', 'nana',\
               'mahama', 'akufo', 'gh', 'kumasi', 'fm', 'prof'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [24]:
wordcounts_clean('df_counts_gh1.pkl', 
               'df_counts_gh2.pkl',
               remove_list,
               stopwords,
               'df_counts_gh1.pkl',
               'df_counts_gh2.pkl'
               )

In [77]:
df_counts_gh_nostopwords = pd.read_pickle('df_counts_gh2.pkl')

df_counts_gh_nostopwords = dict(zip(df_counts_gh_nostopwords[0], df_counts_gh_nostopwords[1]))

wordcounts_print_sample(df_counts_gh_nostopwords)

(0, ('country', 28303))
(1, ('government', 26010))
(2, ('people', 22788))
(3, ('president', 21453))
(4, ('time', 16691))
(5, ('come', 14847))
(6, ('new', 14518))
(7, ('work', 14492))
(8, ('use', 13881))
(9, ('national', 13612))
(10, ('service', 13458))
(11, ('state', 13292))
(12, ('need', 13132))
(13, ('party', 13013))
(14, ('company', 12639))
(15, ('development', 12539))
(16, ('school', 12535))
(17, ('public', 12129))
(18, ('know', 11194))
(19, ('region', 11045))

	 ... 

(280, ('death', 3135))
(281, ('large', 3121))
(282, ('building', 3118))
(283, ('vote', 3110))
(284, ('indicate', 3082))
(285, ('seek', 3078))
(286, ('association', 3077))
(287, ('individual', 3073))
(288, ('live', 3067))
(289, ('demand', 3062))
(290, ('peace', 3040))
(291, ('vice', 3038))
(292, ('establish', 3019))
(293, ('close', 3016))
(294, ('left', 3014))
(295, ('central', 3013))
(296, ('tell', 3010))
(297, ('candidate', 2988))
(298, ('farmer', 2983))
(299, ('senior', 2983))


In [25]:
# Note: Hong Kong appears to have much less data than other countries

In [25]:
## Hong Kong HK
read_sort_save('HK_domestic_Ngram_stopword_lematize.csv', 'df_hk.pkl')

In [None]:
wordcounts('df_hk.pkl', 'df_counts_hk1.pkl', 'df_counts_hk2.pkl')

In [None]:
remove_items = ['lt', 'gt', '/p', 'p', 'hk', 'class=', 'http', 'asia', 'lam', 'co', \
               'chan', 'p1', 'beijing', 'href=', 'wp'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [28]:
wordcounts_clean('df_counts_hk1.pkl', 
               'df_counts_hk2.pkl',
               remove_list,
               stopwords,
               'df_counts_hk1.pkl',
               'df_counts_hk2.pkl'
               )

In [78]:
df_counts_hk_nostopwords = pd.read_pickle('df_counts_hk2.pkl')

df_counts_hk_nostopwords = dict(zip(df_counts_hk_nostopwords[0], df_counts_hk_nostopwords[1]))

wordcounts_print_sample(df_counts_hk_nostopwords)

(0, ('amp', 3972))
(1, ('company', 3826))
(2, ('government', 3738))
(3, ('people', 3349))
(4, ('business', 3084))
(5, ('time', 3051))
(6, ('market', 2900))
(7, ('new', 2787))
(8, ('law', 2549))
(9, ('report', 2392))
(10, ('include', 2237))
(11, ('work', 2053))
(12, ('public', 1988))
(13, ('city', 1940))
(14, ('high', 1933))
(15, ('use', 1928))
(16, ('country', 1920))
(17, ('bank', 1790))
(18, ('group', 1746))
(19, ('come', 1723))

	 ... 

(280, ('able', 447))
(281, ('process', 447))
(282, ('form', 446))
(283, ('cause', 445))
(284, ('view', 444))
(285, ('seek', 443))
(286, ('role', 443))
(287, ('reason', 441))
(288, ('domestic', 441))
(289, ('remain', 441))
(290, ('range', 441))
(291, ('activity', 440))
(292, ('study', 439))
(293, ('charge', 439))
(294, ('sign', 439))
(295, ('despite', 438))
(296, ('body', 438))
(297, ('likely', 437))
(298, ('current', 436))
(299, ('general', 436))


In [None]:
## Ireland IE
read_sort_save('IE_domestic_Ngram_stopword_lematize.csv', 'df_ie.pkl')

In [None]:
wordcounts('df_ie.pkl', 'df_counts_ie1.pkl', 'df_counts_ie2.pkl')

In [None]:
remove_items = [
                'ireland', 'irish', 'dublin', 'cooky', 'limerick', 'co', 'galway', 'uk', 'derry', 'yea'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [31]:
wordcounts_clean('df_counts_ie1.pkl', 
               'df_counts_ie2.pkl',
               remove_list,
               stopwords,
               'df_counts_ie1.pkl',
               'df_counts_ie2.pkl'
               )

In [79]:
df_counts_ie_nostopwords = pd.read_pickle('df_counts_ie2.pkl')

df_counts_ie_nostopwords = dict(zip(df_counts_ie_nostopwords[0], df_counts_ie_nostopwords[1]))

wordcounts_print_sample(df_counts_ie_nostopwords)

(0, ('time', 77019))
(1, ('use', 72634))
(2, ('people', 61005))
(3, ('come', 50201))
(4, ('work', 48931))
(5, ('new', 47556))
(6, ('like', 42155))
(7, ('website', 37221))
(8, ('team', 32182))
(9, ('home', 32092))
(10, ('site', 31766))
(11, ('look', 31373))
(12, ('game', 29067))
(13, ('know', 29061))
(14, ('need', 28496))
(15, ('day', 28163))
(16, ('good', 28144))
(17, ('place', 28050))
(18, ('include', 27576))
(19, ('family', 27170))

	 ... 

(280, ('miss', 7790))
(281, ('range', 7776))
(282, ('option', 7753))
(283, ('drive', 7744))
(284, ('later', 7725))
(285, ('finish', 7722))
(286, ('film', 7719))
(287, ('hope', 7701))
(288, ('remember', 7687))
(289, ('raise', 7632))
(290, ('click', 7628))
(291, ('agree', 7623))
(292, ('control', 7617))
(293, ('grow', 7615))
(294, ('question', 7609))
(295, ('action', 7604))
(296, ('saw', 7593))
(297, ('claim', 7591))
(298, ('force', 7572))
(299, ('act', 7561))


In [None]:
## India IN
read_sort_save('IN_domestic_Ngram_stopword_lematize.csv', 'df_in.pkl')

In [None]:
wordcounts('df_in.pkl', 'df_counts_in1.pkl', 'df_counts_in2.pkl')

In [None]:
remove_items = [
               'rs', 'delhi', 'crore', 'ist', 'singh', 'gmt', 'bjp', 'modi', 'indi', 'lakh', \
               'pradesh', 'facebook'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [34]:
wordcounts_clean('df_counts_in1.pkl', 
               'df_counts_in2.pkl',
               remove_list,
               stopwords,
               'df_counts_in1.pkl',
               'df_counts_in2.pkl'
               )

In [80]:
df_counts_in_nostopwords = pd.read_pickle('df_counts_in2.pkl')

df_counts_in_nostopwords = dict(zip(df_counts_in_nostopwords[0], df_counts_in_nostopwords[1]))

wordcounts_print_sample(df_counts_in_nostopwords)

(0, ('government', 73660))
(1, ('time', 69275))
(2, ('state', 65463))
(3, ('new', 61042))
(4, ('people', 59907))
(5, ('come', 57492))
(6, ('like', 56214))
(7, ('work', 49861))
(8, ('company', 49197))
(9, ('country', 45834))
(10, ('use', 43461))
(11, ('high', 43275))
(12, ('minister', 41688))
(13, ('police', 37447))
(14, ('need', 37376))
(15, ('case', 37257))
(16, ('film', 36489))
(17, ('team', 34827))
(18, ('market', 34272))
(19, ('issue', 34170))

	 ... 

(280, ('near', 9795))
(281, ('old', 9742))
(282, ('consider', 9723))
(283, ('committee', 9719))
(284, ('foreign', 9700))
(285, ('earlier', 9641))
(286, ('accuse', 9582))
(287, ('device', 9580))
(288, ('let', 9561))
(289, ('quality', 9546))
(290, ('course', 9522))
(291, ('happen', 9478))
(292, ('seek', 9474))
(293, ('scheme', 9469))
(294, ('carry', 9464))
(295, ('stand', 9431))
(296, ('current', 9427))
(297, ('conduct', 9406))
(298, ('customer', 9387))
(299, ('station', 9375))


In [None]:
## Jamaica JM
read_sort_save('JM_domestic_Ngram_stopword_lematize.csv', 'df_jm.pkl')

In [None]:
wordcounts('df_jm.pkl', 'df_counts_jm1.pkl', 'df_counts_jm2.pkl')

In [None]:
remove_items = [
                'caribbean', 'jamaica', 'jamaican', 'st', 'jamaicans', 'kingston', 'montego', \
                'reggae', 'pnp'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [37]:
wordcounts_clean('df_counts_jm1.pkl', 
               'df_counts_jm2.pkl',
               remove_list,
               stopwords,
               'df_counts_jm1.pkl',
               'df_counts_jm2.pkl'
               )

In [81]:
df_counts_jm_nostopwords = pd.read_pickle('df_counts_jm2.pkl')

df_counts_jm_nostopwords = dict(zip(df_counts_jm_nostopwords[0], df_counts_jm_nostopwords[1]))

wordcounts_print_sample(df_counts_jm_nostopwords)

(0, ('time', 29117))
(1, ('minister', 29005))
(2, ('work', 27897))
(3, ('people', 27789))
(4, ('government', 26488))
(5, ('country', 25510))
(6, ('school', 25263))
(7, ('new', 23885))
(8, ('come', 22996))
(9, ('child', 21634))
(10, ('use', 21393))
(11, ('need', 21383))
(12, ('high', 20116))
(13, ('include', 19472))
(14, ('person', 19439))
(15, ('community', 18486))
(16, ('public', 18438))
(17, ('development', 18362))
(18, ('national', 18258))
(19, ('business', 18248))

	 ... 

(280, ('develop', 4744))
(281, ('final', 4718))
(282, ('write', 4715))
(283, ('bring', 4711))
(284, ('black', 4687))
(285, ('reduce', 4679))
(286, ('manager', 4671))
(287, ('comment', 4654))
(288, ('foreign', 4652))
(289, ('lose', 4623))
(290, ('initiative', 4623))
(291, ('risk', 4618))
(292, ('amp', 4603))
(293, ('environment', 4596))
(294, ('citizen', 4585))
(295, ('encourage', 4559))
(296, ('away', 4540))
(297, ('partner', 4538))
(298, ('covid', 4521))
(299, ('competition', 4500))


In [None]:
## Kenya KE
read_sort_save('KE_domestic_Ngram_stopword_lematize.csv', 'df_ke.pkl')

In [None]:
wordcounts('df_ke.pkl', 'df_counts_ke1.pkl', 'df_counts_ke2.pkl')

In [None]:
remove_items = [
                'kenya', 'nairobi', 'raila', 'uhuru', 'mp', 'mombasa', 'odinga', 'kenyan', 'ruto', \
                'kenyatta' 
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [40]:
wordcounts_clean('df_counts_ke1.pkl', 
               'df_counts_ke2.pkl',
               remove_list,
               stopwords,
               'df_counts_ke1.pkl',
               'df_counts_ke2.pkl'
               )

In [82]:
df_counts_ke_nostopwords = pd.read_pickle('df_counts_ke2.pkl')

df_counts_ke_nostopwords = dict(zip(df_counts_ke_nostopwords[0], df_counts_ke_nostopwords[1]))

wordcounts_print_sample(df_counts_ke_nostopwords)

(0, ('government', 28224))
(1, ('country', 25395))
(2, ('people', 22643))
(3, ('time', 21596))
(4, ('president', 21091))
(5, ('county', 20705))
(6, ('come', 17559))
(7, ('work', 16792))
(8, ('new', 16562))
(9, ('use', 16317))
(10, ('police', 15603))
(11, ('national', 15497))
(12, ('public', 14465))
(13, ('court', 13989))
(14, ('high', 13657))
(15, ('like', 13510))
(16, ('service', 13347))
(17, ('need', 13321))
(18, ('include', 12640))
(19, ('school', 12497))

	 ... 

(280, ('rule', 3616))
(281, ('question', 3602))
(282, ('game', 3588))
(283, ('operation', 3581))
(284, ('final', 3578))
(285, ('travel', 3552))
(286, ('clear', 3545))
(287, ('action', 3537))
(288, ('vehicle', 3536))
(289, ('station', 3517))
(290, ('effort', 3510))
(291, ('far', 3486))
(292, ('award', 3472))
(293, ('produce', 3470))
(294, ('tax', 3467))
(295, ('target', 3462))
(296, ('building', 3459))
(297, ('programme', 3453))
(298, ('economy', 3448))
(299, ('period', 3441))


In [None]:
## Sri Lanka LK
read_sort_save('LK_domestic_Ngram_stopword_lematize.csv', 'df_lk.pkl')

In [None]:
wordcounts('df_lk.pkl', 'df_counts_lk1.pkl', 'df_counts_lk2.pkl')

In [None]:
remove_items = [
                'sri', 'lanka', 'colombo', 'lankan', 'tamil', 'rs', 'rajapaksa', 'sirisena', \
                'sinhala', 'buddhist', 'tamils', 'mahinda', 'ceylon'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [43]:
wordcounts_clean('df_counts_lk1.pkl', 
               'df_counts_lk2.pkl',
               remove_list,
               stopwords,
               'df_counts_lk1.pkl',
               'df_counts_lk2.pkl'
               )

In [83]:
df_counts_lk_nostopwords = pd.read_pickle('df_counts_lk2.pkl')

df_counts_lk_nostopwords = dict(zip(df_counts_lk_nostopwords[0], df_counts_lk_nostopwords[1]))

wordcounts_print_sample(df_counts_lk_nostopwords)

(0, ('country', 18584))
(1, ('government', 17051))
(2, ('people', 13226))
(3, ('president', 12330))
(4, ('minister', 11345))
(5, ('time', 10794))
(6, ('state', 9263))
(7, ('new', 9108))
(8, ('come', 7976))
(9, ('use', 7914))
(10, ('work', 7629))
(11, ('high', 7518))
(12, ('world', 7488))
(13, ('need', 7023))
(14, ('include', 6911))
(15, ('international', 6898))
(16, ('national', 6603))
(17, ('development', 6586))
(18, ('issue', 6155))
(19, ('political', 6095))

	 ... 

(280, ('institution', 1789))
(281, ('recent', 1786))
(282, ('launch', 1782))
(283, ('represent', 1781))
(284, ('history', 1780))
(285, ('facility', 1780))
(286, ('investigation', 1770))
(287, ('vehicle', 1761))
(288, ('living', 1761))
(289, ('recently', 1757))
(290, ('request', 1756))
(291, ('produce', 1753))
(292, ('training', 1751))
(293, ('away', 1751))
(294, ('left', 1750))
(295, ('remain', 1748))
(296, ('story', 1745))
(297, ('appoint', 1744))
(298, ('travel', 1744))
(299, ('website', 1742))


In [None]:
## Malaysia MY
read_sort_save('MY_domestic_Ngram_stopword_lematize.csv', 'df_my.pkl')

In [None]:
wordcounts('df_my.pkl', 'df_counts_my1.pkl', 'df_counts_my2.pkl')

In [None]:
remove_items = [
                'malaysia', 'bhd', 'malaysians', 'sabah', 'lumpur', 'umno', 'kuala', 'datuk', \
                'najib', 'bn', 'malay', 'pas', 'mohd'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [46]:
wordcounts_clean('df_counts_my1.pkl', 
               'df_counts_my2.pkl',
               remove_list,
               stopwords,
               'df_counts_my1.pkl',
               'df_counts_my2.pkl'
               )

In [84]:
df_counts_my_nostopwords = pd.read_pickle('df_counts_my2.pkl')

df_counts_my_nostopwords = dict(zip(df_counts_my_nostopwords[0], df_counts_my_nostopwords[1]))

wordcounts_print_sample(df_counts_my_nostopwords)

(0, ('government', 34692))
(1, ('country', 25729))
(2, ('people', 25653))
(3, ('new', 24631))
(4, ('time', 24115))
(5, ('use', 23905))
(6, ('minister', 21194))
(7, ('state', 20642))
(8, ('company', 18220))
(9, ('high', 18003))
(10, ('work', 17900))
(11, ('like', 17897))
(12, ('come', 17464))
(13, ('need', 17413))
(14, ('comment', 17125))
(15, ('right', 15180))
(16, ('public', 15155))
(17, ('group', 15099))
(18, ('report', 15070))
(19, ('include', 14931))

	 ... 

(280, ('accord', 4053))
(281, ('stop', 4050))
(282, ('instead', 4049))
(283, ('begin', 4045))
(284, ('revenue', 4040))
(285, ('capital', 4038))
(286, ('deal', 4020))
(287, ('special', 4006))
(288, ('past', 3961))
(289, ('reduce', 3946))
(290, ('office', 3927))
(291, ('chairman', 3927))
(292, ('learn', 3927))
(293, ('profit', 3923))
(294, ('performance', 3918))
(295, ('opposition', 3917))
(296, ('customer', 3896))
(297, ('understand', 3877))
(298, ('body', 3877))
(299, ('require', 3875))


In [None]:
## Nigeria NG
read_sort_save('NG_domestic_Ngram_stopword_lematize.csv', 'df_ng.pkl')

In [None]:
wordcounts('df_ng.pkl', 'df_counts_ng1.pkl', 'df_counts_ng2.pkl')

In [None]:
remove_items = [
                'nigeria', 'nigerians', 'lagos', 'abuja', 'pdp', 'apc', 'nigerian', 'buhari', \
                'boko', 'niger'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [49]:
wordcounts_clean('df_counts_ng1.pkl', 
               'df_counts_ng2.pkl',
               remove_list,
               stopwords,
               'df_counts_ng1.pkl',
               'df_counts_ng2.pkl'
               )

In [85]:
df_counts_ng_nostopwords = pd.read_pickle('df_counts_ng2.pkl')

df_counts_ng_nostopwords = dict(zip(df_counts_ng_nostopwords[0], df_counts_ng_nostopwords[1]))

wordcounts_print_sample(df_counts_ng_nostopwords)

(0, ('state', 166160))
(1, ('government', 101682))
(2, ('people', 87846))
(3, ('country', 77670))
(4, ('president', 63295))
(5, ('time', 53674))
(6, ('come', 50587))
(7, ('governor', 44600))
(8, ('national', 41904))
(9, ('know', 41285))
(10, ('like', 40458))
(11, ('federal', 39649))
(12, ('new', 39288))
(13, ('need', 39026))
(14, ('work', 36905))
(15, ('use', 36815))
(16, ('party', 34917))
(17, ('election', 33964))
(18, ('member', 33079))
(19, ('issue', 32753))

	 ... 

(280, ('away', 9264))
(281, ('access', 9255))
(282, ('special', 9247))
(283, ('rule', 9247))
(284, ('past', 9245))
(285, ('water', 9180))
(286, ('food', 9119))
(287, ('appeal', 9099))
(288, ('worker', 9090))
(289, ('platform', 9066))
(290, ('secretary', 9060))
(291, ('campaign', 9023))
(292, ('represent', 9013))
(293, ('conduct', 8995))
(294, ('view', 8989))
(295, ('require', 8945))
(296, ('demand', 8898))
(297, ('available', 8894))
(298, ('resource', 8888))
(299, ('quality', 8821))


In [None]:
## New Zealand NZ
read_sort_save('NZ_domestic_Ngram_stopword_lematize.csv', 'df_nz.pkl')

In [None]:
wordcounts('df_nz.pkl', 'df_counts_nz1.pkl', 'df_counts_nz2.pkl')

In [None]:
remove_items = [
                'auckland', 'gt', 'nz', 'zealand', 'christchurch', 'maori', 'kiwi', 'zealanders', \
                'otago', 'te', ' ', 'ers', 'dunedin'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [52]:
wordcounts_clean('df_counts_nz1.pkl', 
               'df_counts_nz2.pkl',
               remove_list,
               stopwords,
               'df_counts_nz1.pkl',
               'df_counts_nz2.pkl'
               )

In [86]:
df_counts_nz_nostopwords = pd.read_pickle('df_counts_nz2.pkl')

df_counts_nz_nostopwords = dict(zip(df_counts_nz_nostopwords[0], df_counts_nz_nostopwords[1]))

wordcounts_print_sample(df_counts_nz_nostopwords)

(0, ('people', 75023))
(1, ('time', 66007))
(2, ('work', 62730))
(3, ('new', 56268))
(4, ('come', 45495))
(5, ('like', 44970))
(6, ('need', 43598))
(7, ('use', 40259))
(8, ('government', 36245))
(9, ('include', 34029))
(10, ('look', 33152))
(11, ('business', 33040))
(12, ('want', 32814))
(13, ('company', 32397))
(14, ('high', 32369))
(15, ('way', 31763))
(16, ('world', 31423))
(17, ('good', 31215))
(18, ('know', 30530))
(19, ('change', 29393))

	 ... 

(280, ('clear', 7988))
(281, ('idea', 7960))
(282, ('improve', 7909))
(283, ('finish', 7888))
(284, ('learn', 7877))
(285, ('prime', 7823))
(286, ('sure', 7817))
(287, ('worker', 7816))
(288, ('covid', 7816))
(289, ('claim', 7813))
(290, ('organisation', 7810))
(291, ('significant', 7804))
(292, ('later', 7799))
(293, ('management', 7796))
(294, ('save', 7788))
(295, ('announce', 7748))
(296, ('district', 7740))
(297, ('environment', 7738))
(298, ('supply', 7731))
(299, ('age', 7730))


In [None]:
## Philippines PH
read_sort_save('PH_domestic_Ngram_stopword_lematize.csv', 'df_ph.pkl')

In [None]:
wordcounts('df_ph.pkl', 'df_counts_ph1.pkl', 'df_counts_ph2.pkl')

In [None]:
remove_items = [
                'f', 'fr', 'n', 'r', 'wh', 'are', 'ne', 'or', 'al', 'ut', 'manila', 'ver', \
                'philippine', 'filipino', 'pint', 'duterte', 'de', 'g', 'curt', 'd', 'filipinos', \
                'ff', 'barangay', 'll', 't', 'me', 's', 'sunstar', 'webster', 'a'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [None]:
from textblob import TextBlob
 
# a = "cmputr"           # incorrect spelling
# print("original text: "+str(a))

counts_ph2_fix = {}

for i, j in counts_ph2_copy.items():
    b = TextBlob(str(i))
    counts_ph2_fix[str(b.correct())] = j
    
# b = TextBlob(a)
 
# prints the corrected spelling
# print("corrected text: "+str(b.correct()))

In [89]:
wordcounts_clean('df_counts_ph1.pkl', 
               'df_counts_ph2.pkl',
               remove_list,
               stopwords,
               'df_counts_ph1.pkl',
               'df_counts_ph2.pkl'
               )

In [90]:
df_counts_ph_nostopwords = pd.read_pickle('df_counts_ph2.pkl')

df_counts_ph_nostopwords = dict(zip(df_counts_ph_nostopwords[0], df_counts_ph_nostopwords[1]))

wordcounts_print_sample(df_counts_ph_nostopwords)

(0, ('frm', 158563))
(1, ('nt', 126112))
(2, ('mre', 62080))
(3, ('city', 59707))
(4, ('ur', 53152))
(5, ('gvernment', 52619))
(6, ('wuld', 51676))
(7, ('time', 50058))
(8, ('ther', 48107))
(9, ('yu', 44998))
(10, ('abut', 44134))
(11, ('peple', 43767))
(12, ('president', 42575))
(13, ('tw', 40438))
(14, ('new', 39723))
(15, ('use', 38808))
(16, ('like', 36488))
(17, ('cuntry', 36090))
(18, ('nly', 34588))
(19, ('nw', 33001))

	 ... 

(280, ('men', 7738))
(281, ('challenge', 7729))
(282, ('address', 7720))
(283, ('test', 7679))
(284, ('live', 7677))
(285, ('study', 7654))
(286, ('administratin', 7646))
(287, ('arund', 7611))
(288, ('facility', 7609))
(289, ('peratins', 7594))
(290, ('pay', 7570))
(291, ('suth', 7529))
(292, ('design', 7514))
(293, ('far', 7460))
(294, ('miss', 7457))
(295, ('let', 7457))
(296, ('plant', 7412))
(297, ('arrest', 7401))
(298, ('reflect', 7385))
(299, ('smething', 7375))


In [60]:
## FIX SPELLING ERRORS IN DICTIONARY WITH TEXTBLOB function, REPLACE FILE

In [91]:
from textblob import TextBlob

# a = "cmputr"           # incorrect spelling
# print("original text: "+str(a))
# b = TextBlob(a)
 
# prints the corrected spelling
# print("corrected text: "+str(b.correct()))

df_counts_ph_nostopwords_fix = {}

for i, j in df_counts_ph_nostopwords.items():
    b = TextBlob(str(i))
    df_counts_ph_nostopwords_fix[str(b.correct())] = j

wordcounts_print_sample(df_counts_ph_nostopwords_fix)

with open('df_counts_ph2.pkl', 'wb') as handle:
    pickle.dump(df_counts_ph_nostopwords_fix, handle, protocol=pickle.HIGHEST_PROTOCOL)



(0, ('from', 158563))
(1, ('are', 62080))
(2, ('city', 59707))
(3, ('or', 53152))
(4, ('government', 52619))
(5, ('would', 51676))
(6, ('time', 50058))
(7, ('you', 44998))
(8, ('but', 44134))
(9, ('people', 43767))
(10, ('president', 42575))
(11, ('new', 39723))
(12, ('use', 38808))
(13, ('like', 36488))
(14, ('country', 36090))
(15, ('only', 34588))
(16, ('national', 31992))
(17, ('right', 29393))
(18, ('through', 27853))
(19, ('high', 27290))

	 ... 

(280, ('meeting', 7349))
(281, ('budget', 7328))
(282, ('air', 7301))
(283, ('fact', 7288))
(284, ('island', 7276))
(285, ('vehicle', 7242))
(286, ('performance', 7238))
(287, ('training', 7233))
(288, ('class', 7213))
(289, ('coming', 7191))
(290, ('webster', 7183))
(291, ('known', 7138))
(292, ('ensure', 7106))
(293, ('real', 7066))
(294, ('available', 7037))
(295, ('agreement', 7017))
(296, ('various', 7008))
(297, ('ban', 7008))
(298, ('fire', 7008))
(299, ('early', 7001))


In [None]:
## Pakistan PK
## Omitted due to insufficient articles from NOW data

In [None]:
## Singapore SG
read_sort_save('SG_domestic_Ngram_stopword_lematize.csv', 'df_sg.pkl')

In [None]:
wordcounts('df_sg.pkl', 'df_counts_sg1.pkl', 'df_counts_sg2.pkl')

In [63]:
remove_items = [
                's', 'singapore', 'singaporeans', 'asia', 'facebook'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

[None, None]

In [64]:
wordcounts_clean('df_counts_sg1.pkl', 
               'df_counts_sg2.pkl',
               remove_list,
               stopwords,
               'df_counts_sg1.pkl',
               'df_counts_sg2.pkl'
               )

In [92]:
df_counts_sg_nostopwords = pd.read_pickle('df_counts_sg2.pkl')

df_counts_sg_nostopwords = dict(zip(df_counts_sg_nostopwords[0], df_counts_sg_nostopwords[1]))

wordcounts_print_sample(df_counts_ph_nostopwords_fix)

(0, ('from', 158563))
(1, ('are', 62080))
(2, ('city', 59707))
(3, ('or', 53152))
(4, ('government', 52619))
(5, ('would', 51676))
(6, ('time', 50058))
(7, ('you', 44998))
(8, ('but', 44134))
(9, ('people', 43767))
(10, ('president', 42575))
(11, ('new', 39723))
(12, ('use', 38808))
(13, ('like', 36488))
(14, ('country', 36090))
(15, ('only', 34588))
(16, ('national', 31992))
(17, ('right', 29393))
(18, ('through', 27853))
(19, ('high', 27290))

	 ... 

(280, ('meeting', 7349))
(281, ('budget', 7328))
(282, ('air', 7301))
(283, ('fact', 7288))
(284, ('island', 7276))
(285, ('vehicle', 7242))
(286, ('performance', 7238))
(287, ('training', 7233))
(288, ('class', 7213))
(289, ('coming', 7191))
(290, ('webster', 7183))
(291, ('known', 7138))
(292, ('ensure', 7106))
(293, ('real', 7066))
(294, ('available', 7037))
(295, ('agreement', 7017))
(296, ('various', 7008))
(297, ('ban', 7008))
(298, ('fire', 7008))
(299, ('early', 7001))


In [None]:
## Tanzania TZ
read_sort_save('TZ_domestic_Ngram_stopword_lematize.csv', 'df_tz.pkl')

In [None]:
wordcounts('df_tz.pkl', 'df_counts_tz1.pkl', 'df_counts_tz2.pkl')

In [None]:
remove_items = [
                'salaam', 'dar', 'e', 'tanzania', 'zanzibar', 'tanzanians', 'prof'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [67]:
wordcounts_clean('df_counts_tz1.pkl', 
               'df_counts_tz2.pkl',
               remove_list,
               stopwords,
               'df_counts_tz1.pkl',
               'df_counts_tz2.pkl'
               )

In [93]:
df_counts_tz_nostopwords = pd.read_pickle('df_counts_tz2.pkl')

df_counts_tz_nostopwords = dict(zip(df_counts_tz_nostopwords[0], df_counts_tz_nostopwords[1]))

wordcounts_print_sample(df_counts_tz_nostopwords)

(0, ('government', 10537))
(1, ('country', 9670))
(2, ('people', 6117))
(3, ('use', 5641))
(4, ('development', 4884))
(5, ('project', 4395))
(6, ('include', 4188))
(7, ('need', 4073))
(8, ('work', 4071))
(9, ('new', 4009))
(10, ('service', 3995))
(11, ('minister', 3945))
(12, ('president', 3807))
(13, ('time', 3698))
(14, ('business', 3576))
(15, ('sector', 3523))
(16, ('school', 3431))
(17, ('national', 3425))
(18, ('company', 3355))
(19, ('public', 3353))

	 ... 

(280, ('rural', 884))
(281, ('port', 883))
(282, ('plant', 883))
(283, ('key', 877))
(284, ('income', 876))
(285, ('bring', 869))
(286, ('global', 869))
(287, ('free', 863))
(288, ('direct', 863))
(289, ('measure', 858))
(290, ('farm', 857))
(291, ('join', 851))
(292, ('control', 851))
(293, ('body', 850))
(294, ('view', 849))
(295, ('despite', 846))
(296, ('tourism', 845))
(297, ('complete', 844))
(298, ('recently', 844))
(299, ('network', 842))


In [None]:
## United States US
read_sort_save('US_domestic_Ngram_stopword_lematize.csv', 'df_us.pkl')

In [None]:
wordcounts('df_us.pkl', 'df_counts_us1.pkl', 'df_counts_us2.pkl')

In [None]:
remove_items = [
                'trump', 'facebook', 'obama'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [70]:
wordcounts_clean('df_counts_us1.pkl', 
               'df_counts_us2.pkl',
               remove_list,
               stopwords,
               'df_counts_us1.pkl',
               'df_counts_us2.pkl'
               )

In [94]:
df_counts_us_nostopwords = pd.read_pickle('df_counts_us2.pkl')

df_counts_us_nostopwords = dict(zip(df_counts_us_nostopwords[0], df_counts_us_nostopwords[1]))

wordcounts_print_sample(df_counts_us_nostopwords)

(0, ('people', 92775))
(1, ('time', 91124))
(2, ('like', 90828))
(3, ('new', 74341))
(4, ('use', 70516))
(5, ('work', 67046))
(6, ('come', 59949))
(7, ('know', 58216))
(8, ('way', 51141))
(9, ('want', 47687))
(10, ('look', 45199))
(11, ('think', 45198))
(12, ('world', 43895))
(13, ('include', 43605))
(14, ('company', 43497))
(15, ('need', 43304))
(16, ('state', 42442))
(17, ('life', 42273))
(18, ('right', 41205))
(19, ('high', 41018))

	 ... 

(280, ('clear', 10851))
(281, ('moment', 10830))
(282, ('available', 10789))
(283, ('leave', 10771))
(284, ('fight', 10768))
(285, ('air', 10763))
(286, ('phone', 10761))
(287, ('mother', 10760))
(288, ('bring', 10682))
(289, ('model', 10595))
(290, ('hope', 10591))
(291, ('content', 10589))
(292, ('strong', 10556))
(293, ('stand', 10543))
(294, ('light', 10537))
(295, ('account', 10531))
(296, ('college', 10506))
(297, ('outside', 10499))
(298, ('access', 10493))
(299, ('network', 10449))


In [7]:
## South Africa ZA
## Omitted due to insufficient articles

In [23]:
df_counts_au_nostopwords = pd.read_pickle('df_counts_au2.pkl')
dict_au = dict(zip(df_counts_au_nostopwords[0], df_counts_au_nostopwords[1]))
dict_au = dict(Counter(dict_au).most_common(300))

df_counts_bd_nostopwords = pd.read_pickle('df_counts_bd2.pkl')
dict_bd = dict(zip(df_counts_bd_nostopwords[0], df_counts_bd_nostopwords[1]))
dict_bd = dict(Counter(dict_bd).most_common(300))

df_counts_ca_nostopwords = pd.read_pickle('df_counts_ca2.pkl')
dict_ca = dict(zip(df_counts_ca_nostopwords[0], df_counts_ca_nostopwords[1]))
dict_ca = dict(Counter(dict_ca).most_common(300))

df_counts_gb_nostopwords = pd.read_pickle('df_counts_gb2.pkl')
dict_gb = dict(zip(df_counts_gb_nostopwords[0], df_counts_gb_nostopwords[1]))
dict_gb = dict(Counter(dict_gb).most_common(300))

df_counts_gh_nostopwords = pd.read_pickle('df_counts_gh2.pkl')
dict_gh = dict(zip(df_counts_gh_nostopwords[0], df_counts_gh_nostopwords[1]))
dict_gh = dict(Counter(dict_gh).most_common(300))

df_counts_hk_nostopwords = pd.read_pickle('df_counts_hk2.pkl')
dict_hk = dict(zip(df_counts_hk_nostopwords[0], df_counts_hk_nostopwords[1]))
dict_hk = dict(Counter(dict_hk).most_common(300))

df_counts_ie_nostopwords = pd.read_pickle('df_counts_ie2.pkl')
dict_ie = dict(zip(df_counts_ie_nostopwords[0], df_counts_ie_nostopwords[1]))
dict_ie = dict(Counter(dict_ie).most_common(300))

df_counts_in_nostopwords = pd.read_pickle('df_counts_in2.pkl')
dict_in = dict(zip(df_counts_in_nostopwords[0], df_counts_in_nostopwords[1]))
dict_in = dict(Counter(dict_in).most_common(300))

df_counts_jm_nostopwords = pd.read_pickle('df_counts_jm2.pkl')
dict_jm = dict(zip(df_counts_jm_nostopwords[0], df_counts_jm_nostopwords[1]))
dict_jm = dict(Counter(dict_jm).most_common(300))

df_counts_ke_nostopwords = pd.read_pickle('df_counts_ke2.pkl')
dict_ke = dict(zip(df_counts_ke_nostopwords[0], df_counts_ke_nostopwords[1]))
dict_ke = dict(Counter(dict_ke).most_common(300))

df_counts_lk_nostopwords = pd.read_pickle('df_counts_lk2.pkl')
dict_lk = dict(zip(df_counts_lk_nostopwords[0], df_counts_lk_nostopwords[1]))
dict_lk = dict(Counter(dict_lk).most_common(300))

df_counts_my_nostopwords = pd.read_pickle('df_counts_my2.pkl')
dict_my = dict(zip(df_counts_my_nostopwords[0], df_counts_my_nostopwords[1]))
dict_my = dict(Counter(dict_my).most_common(300))

df_counts_ng_nostopwords = pd.read_pickle('df_counts_ng2.pkl')
dict_ng = dict(zip(df_counts_ng_nostopwords[0], df_counts_ng_nostopwords[1]))
dict_ng = dict(Counter(dict_ng).most_common(300))

df_counts_nz_nostopwords = pd.read_pickle('df_counts_nz2.pkl')
dict_nz = dict(zip(df_counts_nz_nostopwords[0], df_counts_nz_nostopwords[1]))
dict_nz = dict(Counter(dict_nz).most_common(300))

df_counts_ph_nostopwords = pd.read_pickle('df_counts_ph2.pkl')
dict_ph = df_counts_ph_nostopwords
dict_ph = dict(Counter(dict_ph).most_common(300))

df_counts_sg_nostopwords = pd.read_pickle('df_counts_sg2.pkl')
dict_sg = dict(zip(df_counts_sg_nostopwords[0], df_counts_sg_nostopwords[1]))
dict_sg = dict(Counter(dict_sg).most_common(300))

df_counts_tz_nostopwords = pd.read_pickle('df_counts_tz2.pkl')
dict_tz = dict(zip(df_counts_tz_nostopwords[0], df_counts_tz_nostopwords[1]))
dict_tz = dict(Counter(dict_tz).most_common(300))

df_counts_us_nostopwords = pd.read_pickle('df_counts_us2.pkl')
dict_us = dict(zip(df_counts_us_nostopwords[0], df_counts_us_nostopwords[1]))
dict_us = dict(Counter(dict_us).most_common(300))

In [25]:
data = [dict_au,dict_bd,dict_ca,dict_gb,dict_gh,dict_hk,dict_ie,dict_in,dict_jm,dict_ke,dict_lk,dict_my,dict_ng,dict_nz,dict_ph,dict_sg,dict_tz,dict_us]

In [27]:
all_countries_no_stopwords = pd.DataFrame.from_dict(data)
all_countries_no_stopwords.index = ['Australia AU', 'Bangladesh BD', 'Canada CA', 'United Kingdom GB',
                                    'Ghana GH','Hong Kong HK','Ireland IE','India IN',
                                    'Jamaica JM','Kenya KE','Sri Lanka LK','Malaysia MY',
                                    'Nigeria NG','New Zealand NZ','Philippines PH','Singapore SG',
                                    'Tanzania TZ','United States US']

all_countries_no_stopwords

Unnamed: 0,time,people,like,new,work,use,come,need,government,look,...,natural,meet,implementation,infrastructure,union,rural,port,white,character,effect
Australia AU,99918,83357,73990,69815,65096,63113,57492,51243.0,51129,49322.0,...,,,,,,,,,,
Bangladesh BD,12261,19966,7089,8614,11527,8819,9810,7700.0,20966,3217.0,...,,,,,,,,,,
Canada CA,101831,96695,78735,80842,78459,81684,69959,51774.0,49154,55700.0,...,,,,,,,,,,
United Kingdom GB,48544,48863,39139,37046,38795,32345,33054,23568.0,18847,24954.0,...,,,,,,,,,,
Ghana GH,16691,22788,9709,14518,14492,13881,14847,13132.0,26010,5758.0,...,,,,,,,,,,
Hong Kong HK,3051,3349,1585,2787,2053,1928,1723,1424.0,3738,987.0,...,,,,,,,,,,
Ireland IE,77019,61005,42155,47556,48931,72634,50201,28496.0,18515,31373.0,...,,,,,,,,,,
India IN,69275,59907,56214,61042,49861,43461,57492,37376.0,73660,30884.0,...,,,,,,,,,,
Jamaica JM,29117,27789,16414,23885,27897,21393,22996,21383.0,26488,11887.0,...,,,,,,,,,,
Kenya KE,21596,22643,13510,16562,16792,16317,17559,13321.0,28224,7517.0,...,,,,,,,,,,
