In [None]:
import pandas as pd
import spacy
from collections import Counter
import time
import pickle
import multiprocessing as mp
from multiprocessing import Pool, Process

# from functions import read_sort_save, replace_content, process_text, split, wordcounts_clean

# PART 1 - Preprocessing

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
stopwords = nlp.Defaults.stop_words

In [4]:
print(len(stopwords))
print(stopwords)

326
{'under', 'yourself', 'wherever', 'which', 'neither', 'since', 'these', 'and', 'everyone', 'thence', 'whoever', 'off', 'or', 'such', 'bottom', 'least', 'unless', 'someone', 'they', 'whenever', 'be', 'she', 'therein', 'yourselves', 'themselves', 'herein', 'ourselves', 'six', 'put', 'until', 'keep', 'should', 'front', 'before', 'take', 'former', 'you', 'further', 'now', 'formerly', 'none', 'done', 'top', 'their', 'three', 'some', 'against', 'does', 'except', 'hereafter', 'hereby', 'while', "'s", 'seeming', 'too', 'ten', 'n‘t', 'why', 'below', 'myself', 'less', 'say', 'whereafter', 'beforehand', 'upon', 'sometime', 'seemed', 'seem', 'get', 'anyhow', 'is', 'may', 'where', 'most', 'this', 'becomes', 'elsewhere', 'afterwards', 'also', '‘m', 'become', 'whose', 'nowhere', 'yours', 'used', 'down', "n't", 'was', 'our', 'without', 'might', 'together', 'whereupon', 'its', 'will', 'still', '‘d', 'if', 'has', 'beyond', 'then', 'as', 'with', 'once', 'hundred', 'above', 'there', 'please', 'made', 

In [5]:
redict = {
"are n't" : "aren't",   "ca n't" : "can't",     "could n't" : "couldn't",   "did n't" : "didn't",       "does n't" : "doesn't",\
"do n't" : "don't",     "had n't" : "hadn't",   "has n't" : "hasn't",       "have n't" : "haven't",     "h e'd" : "he'd",\
"h e'll" : "he'll",     "h e's" : "he's",       "i 'd" : "i'd",             "i 'll" : "i'll",           "i 'm" : "i'm",\
"i 've" : "i've",       "is n't" : "isn't",     "i t's" : "it's",           "le t's" : "let's",         "must n't" : "mustn't",\
"sha n't" : "shan't",   "sh e'd" : "she'd",     "sh e'll" : "she'll",       "sh e's" : "she's",         "should n't" : "shouldn't",\
"tha t's" : "that's",   "ther e's" : "there's", "the y'll" : "they'll",     "the y're" : "they're",     "the y've" : "they've",\
"w e'd" : "we'd",       "w e're" : "we're",     "w e've" : "we've",         "were n't" : "weren't",     "wha t'll" : "what'll",\
"wha t're" : "what're", "wha t's" : "what's",   "wha t've" : "what've",     "wher e's" : "where's",     "wh o'd" : "who'd",\
"wh o'll" : "who'll",   "wh o're" : "who're",   "wh o's" : "who's",         "wh o've" : "who've",       "wo n't" : "won't",\
"would n't" : "wouldn't","yo u'd" : "you'd",    "yo u'll" : "you'll",       "yo u're" : "you're",       "yo u've" : "you've",\
# " 's" : "'s",         # " 're": "'re",    
"new zealand" : "",     "<p>" : "",             "<h>" : "",                 " @ " : "",                 "@" : "",
"\n" : ""
}

In [6]:
## List of items to remove from word count dictionaries

In [7]:
remove_list= [
    "'s", "n't", "'re", "u", "ve", \
    "'", "''", "'m", "/", "'ll", \
    "*", "'d", "'ve", "m", "mr", \
    "ms", "dr", "mrs", "."
]

In [18]:
## Function based on dict, replaces key with the value on the target

In [19]:
def replace_content(dict_replace, target):
    for check, replacer in list(dict_replace.items()):
        target = target.replace(check, replacer)

    return target

In [20]:
## Function to read .csv file as pandas dataframe, sort values by year, and save as .pkl file

In [21]:
def read_sort_save(filename1, filename2):
    dataframe = pd.read_csv(filename1, usecols=['year', 'article_text_Ngram_stopword_lemmatize'])
    dataframe.sort_values(by=['year'], inplace=True, ignore_index=True)
    dataframe.to_pickle(filename2)

In [22]:
## Function to create Word Count Dictionary which filters out stopwords and named entitites

## First, parse through article and lowercase all words

## Next, make corrections to words with apostrophers using redict dictionary

## Append words from processed document to list with stopwords (wordfreq1) or list without stopwords (wordfreq2)

## Return counter dictionaries of each list, save each list (stopwords/nostopwords) as .pkl files



In [23]:
## READ AND UNDERSTAND HOW FUNCTION WORKS

In [24]:
def split(iteration, n):  
   
   quotient, remainder = divmod(len(iteration), n)

   split_data = [
      
      iteration[
         # FLOOR
         i * quotient + min(i, remainder)
         :
         # CEILING
         (i + 1) * quotient + min(i + 1, remainder)

         ] for i in range(6)
   ]

   split_data_dictionary = {x[0] : x[1] for x in enumerate(split_data)}
   
   return split_data_dictionary

In [25]:
def process_text(text, L1, L2):
    text_data = text.lower()
    new_text = replace_content(redict, text_data)
    document = nlp(new_text)
    for ent in document:
        if ent.ent_type:
            L1.append(ent.text)
        else:
            L2.append(ent.text)
    

In [26]:
def wordcounts(file1_input, file1_output, file2_output):
    
    L1 = []
    L2 = []  

    dataframe = pd.read_pickle(file1_input)

    sub_list = split(dataframe['article_text_Ngram_stopword_lemmatize'], 6)

    for i in sub_list:
        for j in sub_list[i]:
            process_text(j, L1, L2)    

    count_stopwords = Counter(word for word in L1)
    count_nostopwords = Counter(word for word in L1)

    with open(file1_output, 'wb') as handle:
        pickle.dump(count_stopwords, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    with open(file2_output, 'wb') as handle:
        pickle.dump(count_nostopwords, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
## After checking word count lists, clean for named entities and nlp terms from data using removedict and stopword lists

In [28]:
def wordcounts_clean(file1_input, file2_input, remove_list, stopword_list, file1_output, file2_output):

    stopwords_dataframe = pd.read_pickle(file1_input)
    nostopwords_dataframe = pd.read_pickle(file2_input)

    count_stopwords = dict(zip(stopwords_dataframe[0], stopwords_dataframe[1]))

    count_nostopwords = dict(zip(nostopwords_dataframe[0], nostopwords_dataframe[1]))

    for i in remove_list:
        if i in count_stopwords:
            del count_stopwords[i]
        if i in count_nostopwords:
            del count_nostopwords[i]

    for i in stopword_list:
        if i in count_stopwords:
            del count_stopwords[i]
        if i in count_nostopwords:
            del count_nostopwords[i]

    df_count_stopwords = pd.DataFrame(Counter(count_stopwords).most_common(500))
    df_count_nostopwords = pd.DataFrame(Counter(count_nostopwords).most_common(500))

    with open(file1_output, 'wb') as handle:
        pickle.dump(df_count_stopwords, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    with open(file2_output, 'wb') as handle:
        pickle.dump(df_count_nostopwords, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
## Open and read excel files as pandas dataframes: cols = ['year', 'article_text_Ngram_stopword_lemmatize']
## Sort articles by year
## Save as .pkl file

## Note: Pakistan (PK) and South Africa (ZA) were omitted due to insufficient articles

In [None]:
## Australia AU
read_sort_save('AU_domestic_Ngram_stopword_lematize.csv', 'df_au.pkl')

In [None]:
wordcounts('df_au.pkl', 'df_counts_au1.pkl', 'df_counts_au2.pkl')

In [11]:
remove_items = ["'s", "n't", "'re", "u", "ve", "'", "''", "'m", "/", "'ll", "*", "'d", "'ve", "m"]

[remove_list.append(i) for i in remove_items if i not in remove_list]

[]

In [12]:
wordcounts_clean('df_counts_au1.pkl', 
               'df_counts_au2.pkl',
               remove_list,
               stopwords,
               'df_counts_au1.pkl',
               'df_counts_au2.pkl'
               )

In [8]:
df_counts_au_nostopwords = pd.read_pickle('df_counts_au2.pkl')

df_counts_au_nostopwords = dict(zip(df_counts_au_nostopwords[0], df_counts_au_nostopwords[1]))

for i in enumerate(Counter(df_counts_au_nostopwords).most_common(300)):
    print(i)

(0, ('time', 99918))
(1, ('people', 83357))
(2, ('like', 73990))
(3, ('new', 69815))
(4, ('work', 65096))
(5, ('use', 63113))
(6, ('come', 57492))
(7, ('need', 51243))
(8, ('government', 51129))
(9, ('look', 49322))
(10, ('think', 47609))
(11, ('know', 46152))
(12, ('way', 45194))
(13, ('include', 45178))
(14, ('game', 43195))
(15, ('good', 41740))
(16, ('want', 41397))
(17, ('high', 39172))
(18, ('market', 38863))
(19, ('world', 38654))
(20, ('company', 36958))
(21, ('change', 36302))
(22, ('thing', 34861))
(23, ('big', 34666))
(24, ('right', 34097))
(25, ('business', 33743))
(26, ('report', 32855))
(27, ('team', 32075))
(28, ('start', 31979))
(29, ('day', 31683))
(30, ('home', 31660))
(31, ('information', 31310))
(32, ('great', 30983))
(33, ('play', 30863))
(34, ('state', 30849))
(35, ('life', 30432))
(36, ('provide', 29711))
(37, ('service', 29554))
(38, ('point', 28701))
(39, ('car', 28522))
(40, ('long', 28221))
(41, ('best', 27873))
(42, ('country', 27651))
(43, ('help', 27638))


In [None]:
## Bangladesh BD
read_sort_save('BD_domestic_Ngram_stopword_lematize.csv', 'df_bd.pkl')

In [None]:
wordcounts('df_bd.pkl', 'df_counts_bd1.pkl', 'df_counts_bd2.pkl')

In [None]:
remove_items = ['bangladeshi', 'sheikh', 'crore', \
                'tk', 'bnp', 'bangabandhu', 'bangladesh', \
               'rahman', 'hossain', 'hasina', 'upazila', \
               'bangabandhu', 'bangla', 'indian', 'myanmar', \
               'chittagong', 'khan', 'rohingya', 'chowdhury', 'prof'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [15]:
wordcounts_clean('df_counts_bd1.pkl', 
               'df_counts_bd2.pkl',
               remove_list,
               stopwords,
               'df_counts_bd1.pkl',
               'df_counts_bd2.pkl'
               )

In [16]:
df_counts_bd_nostopwords = pd.read_pickle('df_counts_bd2.pkl')

df_counts_bd_nostopwords = dict(zip(df_counts_bd_nostopwords[0], df_counts_bd_nostopwords[1]))

for i in enumerate(Counter(df_counts_bd_nostopwords).most_common(300)):
    print(i)

(0, ('country', 22510))
(1, ('government', 20966))
(2, ('people', 19966))
(3, ('minister', 12601))
(4, ('time', 12261))
(5, ('work', 11527))
(6, ('police', 10164))
(7, ('come', 9810))
(8, ('high', 9434))
(9, ('use', 8819))
(10, ('new', 8614))
(11, ('include', 8222))
(12, ('case', 8117))
(13, ('add', 8097))
(14, ('area', 7970))
(15, ('world', 7732))
(16, ('need', 7700))
(17, ('bank', 7421))
(18, ('official', 7324))
(19, ('report', 7164))
(20, ('day', 7131))
(21, ('like', 7089))
(22, ('student', 7014))
(23, ('party', 6932))
(24, ('state', 6917))
(25, ('law', 6701))
(26, ('project', 6697))
(27, ('member', 6590))
(28, ('development', 6557))
(29, ('told', 6551))
(30, ('international', 6515))
(31, ('leader', 6462))
(32, ('issue', 6457))
(33, ('national', 6452))
(34, ('number', 6256))
(35, ('city', 6221))
(36, ('general', 5950))
(37, ('power', 5937))
(38, ('woman', 5865))
(39, ('local', 5832))
(40, ('start', 5831))
(41, ('court', 5822))
(42, ('prime', 5749))
(43, ('market', 5736))
(44, ('chil

In [None]:
## Canada CA
read_sort_save('CA_domestic_Ngram_stopword_lematize.csv', 'df_ca.pkl')

In [None]:
wordcounts('df_ca.pkl', 'df_counts_ca1.pkl', 'df_counts_ca2.pkl')

In [None]:
remove_items = ['ottowa', 'ontario', 'cbc', 'toolong', 'vancouver' ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [18]:
wordcounts_clean('df_counts_ca1.pkl', 
               'df_counts_ca2.pkl',
               remove_list,
               stopwords,
               'df_counts_ca1.pkl',
               'df_counts_ca2.pkl'
               )

In [19]:
df_counts_ca_nostopwords = pd.read_pickle('df_counts_ca2.pkl')

df_counts_ca_nostopwords = dict(zip(df_counts_ca_nostopwords[0], df_counts_ca_nostopwords[1]))

for i in enumerate(Counter(df_counts_ca_nostopwords).most_common(300)):
    print(i)

(0, ('time', 101831))
(1, ('people', 96695))
(2, ('use', 81684))
(3, ('new', 80842))
(4, ('like', 78735))
(5, ('work', 78459))
(6, ('come', 69959))
(7, ('company', 62410))
(8, ('include', 60010))
(9, ('game', 57024))
(10, ('look', 55700))
(11, ('know', 54572))
(12, ('want', 54483))
(13, ('right', 52872))
(14, ('information', 52481))
(15, ('comment', 52001))
(16, ('need', 51774))
(17, ('way', 50782))
(18, ('city', 50557))
(19, ('team', 49435))
(20, ('government', 49154))
(21, ('think', 46044))
(22, ('home', 44341))
(23, ('help', 43253))
(24, ('high', 42619))
(25, ('community', 42170))
(26, ('good', 40758))
(27, ('change', 40473))
(28, ('service', 40302))
(29, ('business', 39974))
(30, ('play', 39818))
(31, ('world', 39759))
(32, ('life', 39724))
(33, ('family', 38974))
(34, ('share', 38609))
(35, ('start', 38515))
(36, ('day', 38415))
(37, ('thing', 37975))
(38, ('report', 37770))
(39, ('police', 36581))
(40, ('point', 35368))
(41, ('place', 35257))
(42, ('school', 34009))
(43, ('countr

In [None]:
## United Kingdom GB
read_sort_save('GB_domestic_Ngram_stopword_lematize.csv', 'df_gb.pkl')

In [None]:
wordcounts('df_gb.pkl', 'df_counts_gb1.pkl', 'df_counts_gb2.pkl')

In [None]:
remove_items = ['uk', 'trump']

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [21]:
wordcounts_clean('df_counts_gb1.pkl', 
               'df_counts_gb2.pkl',
               remove_list,
               stopwords,
               'df_counts_gb1.pkl',
               'df_counts_gb2.pkl'
               )

In [22]:
df_counts_gb_nostopwords = pd.read_pickle('df_counts_gb2.pkl')

df_counts_gb_nostopwords = dict(zip(df_counts_gb_nostopwords[0], df_counts_gb_nostopwords[1]))

for i in enumerate(Counter(df_counts_gb_nostopwords).most_common(300)):
    print(i)

(0, ('people', 48863))
(1, ('time', 48544))
(2, ('like', 39139))
(3, ('work', 38795))
(4, ('new', 37046))
(5, ('come', 33054))
(6, ('use', 32345))
(7, ('know', 25840))
(8, ('look', 24954))
(9, ('world', 24617))
(10, ('way', 24316))
(11, ('want', 24149))
(12, ('need', 23568))
(13, ('include', 22977))
(14, ('think', 21758))
(15, ('life', 21306))
(16, ('game', 20243))
(17, ('good', 19542))
(18, ('government', 18847))
(19, ('right', 18820))
(20, ('company', 18276))
(21, ('home', 18254))
(22, ('day', 18046))
(23, ('help', 17996))
(24, ('thing', 17946))
(25, ('high', 17662))
(26, ('place', 17316))
(27, ('start', 17209))
(28, ('play', 16909))
(29, ('change', 16735))
(30, ('end', 16689))
(31, ('team', 16662))
(32, ('big', 16455))
(33, ('great', 16354))
(34, ('set', 16106))
(35, ('country', 16075))
(36, ('best', 15408))
(37, ('provide', 15253))
(38, ('child', 15219))
(39, ('long', 15210))
(40, ('add', 15179))
(41, ('business', 15133))
(42, ('family', 15120))
(43, ('told', 15046))
(44, ('service

In [None]:
## Ghana GH
read_sort_save('GH_domestic_Ngram_stopword_lematize.csv', 'df_gh.pkl')

In [None]:
wordcounts('df_gh.pkl', 'df_counts_gh1.pkl', 'df_counts_gh2.pkl')

In [None]:
remove_items = ['ghana', 'accra', 'ghanaian', 'ghanaians', 'npp', 'ndc', 'addo', 'nana',\
               'mahama', 'akufo', 'gh', 'kumasi', 'fm', 'prof'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [24]:
wordcounts_clean('df_counts_gh1.pkl', 
               'df_counts_gh2.pkl',
               remove_list,
               stopwords,
               'df_counts_gh1.pkl',
               'df_counts_gh2.pkl'
               )

In [25]:
df_counts_gh_nostopwords = pd.read_pickle('df_counts_gh2.pkl')

df_counts_gh_nostopwords = dict(zip(df_counts_gh_nostopwords[0], df_counts_gh_nostopwords[1]))

for i in enumerate(Counter(df_counts_gh_nostopwords).most_common(300)):
    print(i)

(0, ('country', 28303))
(1, ('government', 26010))
(2, ('people', 22788))
(3, ('president', 21453))
(4, ('time', 16691))
(5, ('come', 14847))
(6, ('new', 14518))
(7, ('work', 14492))
(8, ('use', 13881))
(9, ('national', 13612))
(10, ('service', 13458))
(11, ('state', 13292))
(12, ('need', 13132))
(13, ('party', 13013))
(14, ('company', 12639))
(15, ('development', 12539))
(16, ('school', 12535))
(17, ('public', 12129))
(18, ('know', 11194))
(19, ('region', 11045))
(20, ('high', 10971))
(21, ('report', 10847))
(22, ('member', 10634))
(23, ('business', 10441))
(24, ('world', 9921))
(25, ('include', 9830))
(26, ('minister', 9791))
(27, ('like', 9709))
(28, ('support', 9496))
(29, ('education', 9393))
(30, ('issue', 9384))
(31, ('health', 9267))
(32, ('project', 9223))
(33, ('good', 9184))
(34, ('add', 9115))
(35, ('bank', 9024))
(36, ('police', 9006))
(37, ('lead', 8860))
(38, ('help', 8788))
(39, ('general', 8714))
(40, ('life', 8665))
(41, ('way', 8427))
(42, ('sector', 8274))
(43, ('ch

In [25]:
# Note: Hong Kong appears to have much less data than other countries

In [25]:
## Hong Kong HK
read_sort_save('HK_domestic_Ngram_stopword_lematize.csv', 'df_hk.pkl')

In [None]:
wordcounts('df_hk.pkl', 'df_counts_hk1.pkl', 'df_counts_hk2.pkl')

In [None]:
remove_items = ['lt', 'gt', '/p', 'p', 'hk', 'class=', 'http', 'asia', 'lam', 'co', \
               'chan', 'p1', 'beijing', 'href=', 'wp'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [28]:
wordcounts_clean('df_counts_hk1.pkl', 
               'df_counts_hk2.pkl',
               remove_list,
               stopwords,
               'df_counts_hk1.pkl',
               'df_counts_hk2.pkl'
               )

In [29]:
df_counts_hk_nostopwords = pd.read_pickle('df_counts_hk2.pkl')

df_counts_hk_nostopwords = dict(zip(df_counts_hk_nostopwords[0], df_counts_hk_nostopwords[1]))

for i in enumerate(Counter(df_counts_hk_nostopwords).most_common(300)):
    print(i)

(0, ('amp', 3972))
(1, ('company', 3826))
(2, ('government', 3738))
(3, ('people', 3349))
(4, ('business', 3084))
(5, ('time', 3051))
(6, ('market', 2900))
(7, ('new', 2787))
(8, ('law', 2549))
(9, ('report', 2392))
(10, ('include', 2237))
(11, ('work', 2053))
(12, ('public', 1988))
(13, ('city', 1940))
(14, ('high', 1933))
(15, ('use', 1928))
(16, ('country', 1920))
(17, ('bank', 1790))
(18, ('group', 1746))
(19, ('come', 1723))
(20, ('world', 1698))
(21, ('like', 1585))
(22, ('share', 1532))
(23, ('police', 1520))
(24, ('issue', 1513))
(25, ('legal', 1491))
(26, ('base', 1431))
(27, ('need', 1424))
(28, ('firm', 1362))
(29, ('price', 1337))
(30, ('lead', 1337))
(31, ('investor', 1322))
(32, ('service', 1318))
(33, ('investment', 1312))
(34, ('case', 1312))
(35, ('local', 1311))
(36, ('system', 1303))
(37, ('chief', 1280))
(38, ('right', 1265))
(39, ('large', 1223))
(40, ('executive', 1218))
(41, ('state', 1209))
(42, ('global', 1189))
(43, ('international', 1186))
(44, ('mainland', 1

In [None]:
## Ireland IE
read_sort_save('IE_domestic_Ngram_stopword_lematize.csv', 'df_ie.pkl')

In [None]:
wordcounts('df_ie.pkl', 'df_counts_ie1.pkl', 'df_counts_ie2.pkl')

In [None]:
remove_items = [
                'ireland', 'irish', 'dublin', 'cooky', 'limerick', 'co', 'galway', 'uk', 'derry', 'yea'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [31]:
wordcounts_clean('df_counts_ie1.pkl', 
               'df_counts_ie2.pkl',
               remove_list,
               stopwords,
               'df_counts_ie1.pkl',
               'df_counts_ie2.pkl'
               )

In [32]:
df_counts_ie_nostopwords = pd.read_pickle('df_counts_ie2.pkl')

df_counts_ie_nostopwords = dict(zip(df_counts_ie_nostopwords[0], df_counts_ie_nostopwords[1]))

for i in enumerate(Counter(df_counts_ie_nostopwords).most_common(300)):
    print(i)

(0, ('time', 77019))
(1, ('use', 72634))
(2, ('people', 61005))
(3, ('come', 50201))
(4, ('work', 48931))
(5, ('new', 47556))
(6, ('like', 42155))
(7, ('website', 37221))
(8, ('team', 32182))
(9, ('home', 32092))
(10, ('site', 31766))
(11, ('look', 31373))
(12, ('game', 29067))
(13, ('know', 29061))
(14, ('need', 28496))
(15, ('day', 28163))
(16, ('good', 28144))
(17, ('place', 28050))
(18, ('include', 27576))
(19, ('family', 27170))
(20, ('want', 27060))
(21, ('way', 27045))
(22, ('life', 26701))
(23, ('help', 26466))
(24, ('play', 25619))
(25, ('set', 25414))
(26, ('great', 25154))
(27, ('number', 24026))
(28, ('point', 23912))
(29, ('local', 23785))
(30, ('provide', 23756))
(31, ('start', 23279))
(32, ('service', 23151))
(33, ('think', 22947))
(34, ('change', 22733))
(35, ('visit', 22636))
(36, ('best', 22398))
(37, ('told', 22252))
(38, ('high', 22183))
(39, ('player', 22034))
(40, ('world', 21800))
(41, ('child', 21783))
(42, ('man', 20870))
(43, ('club', 20838))
(44, ('group', 20

In [None]:
## India IN
read_sort_save('IN_domestic_Ngram_stopword_lematize.csv', 'df_in.pkl')

In [None]:
wordcounts('df_in.pkl', 'df_counts_in1.pkl', 'df_counts_in2.pkl')

In [None]:
remove_items = [
               'rs', 'delhi', 'crore', 'ist', 'singh', 'gmt', 'bjp', 'modi', 'indi', 'lakh', \
               'pradesh', 'facebook'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [34]:
wordcounts_clean('df_counts_in1.pkl', 
               'df_counts_in2.pkl',
               remove_list,
               stopwords,
               'df_counts_in1.pkl',
               'df_counts_in2.pkl'
               )

In [35]:
df_counts_in_nostopwords = pd.read_pickle('df_counts_in2.pkl')

df_counts_in_nostopwords = dict(zip(df_counts_in_nostopwords[0], df_counts_in_nostopwords[1]))

for i in enumerate(Counter(df_counts_in_nostopwords).most_common(300)):
    print(i)

(0, ('government', 73660))
(1, ('time', 69275))
(2, ('state', 65463))
(3, ('new', 61042))
(4, ('people', 59907))
(5, ('come', 57492))
(6, ('like', 56214))
(7, ('work', 49861))
(8, ('company', 49197))
(9, ('country', 45834))
(10, ('use', 43461))
(11, ('high', 43275))
(12, ('minister', 41688))
(13, ('police', 37447))
(14, ('need', 37376))
(15, ('case', 37257))
(16, ('film', 36489))
(17, ('team', 34827))
(18, ('market', 34272))
(19, ('issue', 34170))
(20, ('world', 33566))
(21, ('include', 33561))
(22, ('report', 32387))
(23, ('add', 32019))
(24, ('look', 30884))
(25, ('times', 30683))
(26, ('good', 29899))
(27, ('day', 29800))
(28, ('start', 29145))
(29, ('share', 28755))
(30, ('city', 28615))
(31, ('help', 27982))
(32, ('bank', 27878))
(33, ('way', 27743))
(34, ('know', 27705))
(35, ('business', 27509))
(36, ('life', 27120))
(37, ('lead', 27102))
(38, ('want', 26979))
(39, ('woman', 26361))
(40, ('set', 26306))
(41, ('student', 26273))
(42, ('play', 26130))
(43, ('web', 26069))
(44, ('c

In [None]:
## Jamaica JM
read_sort_save('JM_domestic_Ngram_stopword_lematize.csv', 'df_jm.pkl')

In [None]:
wordcounts('df_jm.pkl', 'df_counts_jm1.pkl', 'df_counts_jm2.pkl')

In [None]:
remove_items = [
                'caribbean', 'jamaica', 'jamaican', 'st', 'jamaicans', 'kingston', 'montego', \
                'reggae', 'pnp'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [37]:
wordcounts_clean('df_counts_jm1.pkl', 
               'df_counts_jm2.pkl',
               remove_list,
               stopwords,
               'df_counts_jm1.pkl',
               'df_counts_jm2.pkl'
               )

In [38]:
df_counts_jm_nostopwords = pd.read_pickle('df_counts_jm2.pkl')

df_counts_jm_nostopwords = dict(zip(df_counts_jm_nostopwords[0], df_counts_jm_nostopwords[1]))

for i in enumerate(Counter(df_counts_jm_nostopwords).most_common(300)):
    print(i)

(0, ('time', 29117))
(1, ('minister', 29005))
(2, ('work', 27897))
(3, ('people', 27789))
(4, ('government', 26488))
(5, ('country', 25510))
(6, ('school', 25263))
(7, ('new', 23885))
(8, ('come', 22996))
(9, ('child', 21634))
(10, ('use', 21393))
(11, ('need', 21383))
(12, ('high', 20116))
(13, ('include', 19472))
(14, ('person', 19439))
(15, ('community', 18486))
(16, ('public', 18438))
(17, ('development', 18362))
(18, ('national', 18258))
(19, ('business', 18248))
(20, ('company', 16909))
(21, ('service', 16723))
(22, ('member', 16564))
(23, ('world', 16483))
(24, ('like', 16414))
(25, ('know', 16055))
(26, ('police', 15663))
(27, ('provide', 15315))
(28, ('way', 15208))
(29, ('student', 14842))
(30, ('place', 14703))
(31, ('programme', 14630))
(32, ('want', 14608))
(33, ('add', 14476))
(34, ('health', 14349))
(35, ('life', 14199))
(36, ('state', 14068))
(37, ('support', 14010))
(38, ('woman', 14009))
(39, ('international', 13786))
(40, ('right', 13734))
(41, ('told', 13574))
(42, 

In [None]:
## Kenya KE
read_sort_save('KE_domestic_Ngram_stopword_lematize.csv', 'df_ke.pkl')

In [None]:
wordcounts('df_ke.pkl', 'df_counts_ke1.pkl', 'df_counts_ke2.pkl')

In [None]:
remove_items = [
                'kenya', 'nairobi', 'raila', 'uhuru', 'mp', 'mombasa', 'odinga', 'kenyan', 'ruto', \
                'kenyatta' 
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [40]:
wordcounts_clean('df_counts_ke1.pkl', 
               'df_counts_ke2.pkl',
               remove_list,
               stopwords,
               'df_counts_ke1.pkl',
               'df_counts_ke2.pkl'
               )

In [41]:
df_counts_ke_nostopwords = pd.read_pickle('df_counts_ke2.pkl')

df_counts_ke_nostopwords = dict(zip(df_counts_ke_nostopwords[0], df_counts_ke_nostopwords[1]))

for i in enumerate(Counter(df_counts_ke_nostopwords).most_common(300)):
    print(i)

(0, ('government', 28224))
(1, ('country', 25395))
(2, ('people', 22643))
(3, ('time', 21596))
(4, ('president', 21091))
(5, ('county', 20705))
(6, ('come', 17559))
(7, ('work', 16792))
(8, ('new', 16562))
(9, ('use', 16317))
(10, ('police', 15603))
(11, ('national', 15497))
(12, ('public', 14465))
(13, ('court', 13989))
(14, ('high', 13657))
(15, ('like', 13510))
(16, ('service', 13347))
(17, ('need', 13321))
(18, ('include', 12640))
(19, ('school', 12497))
(20, ('want', 12479))
(21, ('know', 12080))
(22, ('report', 12067))
(23, ('state', 12015))
(24, ('woman', 11854))
(25, ('business', 11464))
(26, ('case', 11239))
(27, ('leader', 11147))
(28, ('lead', 10937))
(29, ('way', 10591))
(30, ('team', 10528))
(31, ('issue', 10478))
(32, ('add', 10347))
(33, ('company', 10235))
(34, ('world', 10164))
(35, ('start', 9931))
(36, ('home', 9868))
(37, ('good', 9857))
(38, ('life', 9820))
(39, ('officer', 9803))
(40, ('child', 9496))
(41, ('election', 9278))
(42, ('help', 9260))
(43, ('project', 

In [None]:
## Sri Lanka LK
read_sort_save('LK_domestic_Ngram_stopword_lematize.csv', 'df_lk.pkl')

In [None]:
wordcounts('df_lk.pkl', 'df_counts_lk1.pkl', 'df_counts_lk2.pkl')

In [None]:
remove_items = [
                'sri', 'lanka', 'colombo', 'lankan', 'tamil', 'rs', 'rajapaksa', 'sirisena', \
                'sinhala', 'buddhist', 'tamils', 'mahinda', 'ceylon'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [43]:
wordcounts_clean('df_counts_lk1.pkl', 
               'df_counts_lk2.pkl',
               remove_list,
               stopwords,
               'df_counts_lk1.pkl',
               'df_counts_lk2.pkl'
               )

In [44]:
df_counts_lk_nostopwords = pd.read_pickle('df_counts_lk2.pkl')

df_counts_lk_nostopwords = dict(zip(df_counts_lk_nostopwords[0], df_counts_lk_nostopwords[1]))

for i in enumerate(Counter(df_counts_lk_nostopwords).most_common(300)):
    print(i)

(0, ('country', 18584))
(1, ('government', 17051))
(2, ('people', 13226))
(3, ('president', 12330))
(4, ('minister', 11345))
(5, ('time', 10794))
(6, ('state', 9263))
(7, ('new', 9108))
(8, ('come', 7976))
(9, ('use', 7914))
(10, ('work', 7629))
(11, ('high', 7518))
(12, ('world', 7488))
(13, ('need', 7023))
(14, ('include', 6911))
(15, ('international', 6898))
(16, ('national', 6603))
(17, ('development', 6586))
(18, ('issue', 6155))
(19, ('political', 6095))
(20, ('right', 5762))
(21, ('public', 5722))
(22, ('place', 5598))
(23, ('provide', 5593))
(24, ('like', 5550))
(25, ('leader', 5514))
(26, ('report', 5469))
(27, ('power', 5396))
(28, ('party', 5373))
(29, ('police', 5193))
(30, ('business', 5187))
(31, ('group', 5125))
(32, ('company', 5104))
(33, ('area', 5086))
(34, ('project', 5055))
(35, ('foreign', 5034))
(36, ('bank', 4962))
(37, ('member', 4961))
(38, ('service', 4933))
(39, ('know', 4870))
(40, ('follow', 4855))
(41, ('general', 4823))
(42, ('lead', 4786))
(43, ('suppor

In [None]:
## Malaysia MY
read_sort_save('MY_domestic_Ngram_stopword_lematize.csv', 'df_my.pkl')

In [None]:
wordcounts('df_my.pkl', 'df_counts_my1.pkl', 'df_counts_my2.pkl')

In [None]:
remove_items = [
                'malaysia', 'bhd', 'malaysians', 'sabah', 'lumpur', 'umno', 'kuala', 'datuk', \
                'najib', 'bn', 'malay', 'pas', 'mohd'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [46]:
wordcounts_clean('df_counts_my1.pkl', 
               'df_counts_my2.pkl',
               remove_list,
               stopwords,
               'df_counts_my1.pkl',
               'df_counts_my2.pkl'
               )

In [47]:
df_counts_my_nostopwords = pd.read_pickle('df_counts_my2.pkl')

df_counts_my_nostopwords = dict(zip(df_counts_my_nostopwords[0], df_counts_my_nostopwords[1]))

for i in enumerate(Counter(df_counts_my_nostopwords).most_common(300)):
    print(i)

(0, ('government', 34692))
(1, ('country', 25729))
(2, ('people', 25653))
(3, ('new', 24631))
(4, ('time', 24115))
(5, ('use', 23905))
(6, ('minister', 21194))
(7, ('state', 20642))
(8, ('company', 18220))
(9, ('high', 18003))
(10, ('work', 17900))
(11, ('like', 17897))
(12, ('come', 17464))
(13, ('need', 17413))
(14, ('comment', 17125))
(15, ('right', 15180))
(16, ('public', 15155))
(17, ('group', 15099))
(18, ('report', 15070))
(19, ('include', 14931))
(20, ('market', 14687))
(21, ('price', 14334))
(22, ('want', 13752))
(23, ('issue', 13670))
(24, ('business', 12774))
(25, ('know', 12671))
(26, ('property', 12398))
(27, ('car', 12275))
(28, ('service', 12097))
(29, ('good', 12088))
(30, ('look', 12003))
(31, ('project', 11847))
(32, ('case', 11804))
(33, ('police', 11792))
(34, ('party', 11683))
(35, ('day', 11675))
(36, ('world', 11622))
(37, ('development', 11254))
(38, ('add', 11137))
(39, ('share', 11105))
(40, ('help', 10952))
(41, ('increase', 10840))
(42, ('local', 10755))
(43

In [None]:
## Nigeria NG
read_sort_save('NG_domestic_Ngram_stopword_lematize.csv', 'df_ng.pkl')

In [None]:
wordcounts('df_ng.pkl', 'df_counts_ng1.pkl', 'df_counts_ng2.pkl')

In [None]:
remove_items = [
                'nigeria', 'nigerians', 'lagos', 'abuja', 'pdp', 'apc', 'nigerian', 'buhari', \
                'boko', 'niger'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [49]:
wordcounts_clean('df_counts_ng1.pkl', 
               'df_counts_ng2.pkl',
               remove_list,
               stopwords,
               'df_counts_ng1.pkl',
               'df_counts_ng2.pkl'
               )

In [50]:
df_counts_ng_nostopwords = pd.read_pickle('df_counts_ng2.pkl')

df_counts_ng_nostopwords = dict(zip(df_counts_ng_nostopwords[0], df_counts_ng_nostopwords[1]))

for i in enumerate(Counter(df_counts_ng_nostopwords).most_common(300)):
    print(i)

(0, ('state', 166160))
(1, ('government', 101682))
(2, ('people', 87846))
(3, ('country', 77670))
(4, ('president', 63295))
(5, ('time', 53674))
(6, ('come', 50587))
(7, ('governor', 44600))
(8, ('national', 41904))
(9, ('know', 41285))
(10, ('like', 40458))
(11, ('federal', 39649))
(12, ('new', 39288))
(13, ('need', 39026))
(14, ('work', 36905))
(15, ('use', 36815))
(16, ('party', 34917))
(17, ('election', 33964))
(18, ('member', 33079))
(19, ('issue', 32753))
(20, ('company', 32635))
(21, ('world', 31263))
(22, ('service', 30768))
(23, ('public', 30192))
(24, ('want', 30159))
(25, ('bank', 30035))
(26, ('way', 29985))
(27, ('development', 29785))
(28, ('good', 29433))
(29, ('life', 29241))
(30, ('political', 28901))
(31, ('business', 28793))
(32, ('include', 28425))
(33, ('police', 28239))
(34, ('oil', 27870))
(35, ('security', 27829))
(36, ('report', 27810))
(37, ('court', 27563))
(38, ('general', 27363))
(39, ('high', 27235))
(40, ('area', 26830))
(41, ('power', 26596))
(42, ('lead

In [None]:
## New Zealand NZ
read_sort_save('NZ_domestic_Ngram_stopword_lematize.csv', 'df_nz.pkl')

In [None]:
wordcounts('df_nz.pkl', 'df_counts_nz1.pkl', 'df_counts_nz2.pkl')

In [None]:
remove_items = [
                'auckland', 'gt', 'nz', 'zealand', 'christchurch', 'maori', 'kiwi', 'zealanders', \
                'otago', 'te', ' ', 'ers', 'dunedin'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [52]:
wordcounts_clean('df_counts_nz1.pkl', 
               'df_counts_nz2.pkl',
               remove_list,
               stopwords,
               'df_counts_nz1.pkl',
               'df_counts_nz2.pkl'
               )

In [53]:
df_counts_nz_nostopwords = pd.read_pickle('df_counts_nz2.pkl')

df_counts_nz_nostopwords = dict(zip(df_counts_nz_nostopwords[0], df_counts_nz_nostopwords[1]))

for i in enumerate(Counter(df_counts_nz_nostopwords).most_common(300)):
    print(i)

(0, ('people', 75023))
(1, ('time', 66007))
(2, ('work', 62730))
(3, ('new', 56268))
(4, ('come', 45495))
(5, ('like', 44970))
(6, ('need', 43598))
(7, ('use', 40259))
(8, ('government', 36245))
(9, ('include', 34029))
(10, ('look', 33152))
(11, ('business', 33040))
(12, ('want', 32814))
(13, ('company', 32397))
(14, ('high', 32369))
(15, ('way', 31763))
(16, ('world', 31423))
(17, ('good', 31215))
(18, ('know', 30530))
(19, ('change', 29393))
(20, ('country', 29205))
(21, ('help', 27698))
(22, ('think', 27601))
(23, ('home', 27498))
(24, ('team', 26481))
(25, ('start', 26409))
(26, ('support', 25986))
(27, ('family', 25708))
(28, ('day', 25199))
(29, ('big', 24961))
(30, ('right', 24805))
(31, ('community', 24464))
(32, ('thing', 24348))
(33, ('school', 24341))
(34, ('place', 24039))
(35, ('year', 23799))
(36, ('life', 23647))
(37, ('service', 23369))
(38, ('market', 22720))
(39, ('number', 21371))
(40, ('report', 21238))
(41, ('health', 21161))
(42, ('public', 21048))
(43, ('council'

In [None]:
## Philippines PH
read_sort_save('PH_domestic_Ngram_stopword_lematize.csv', 'df_ph.pkl')

In [None]:
wordcounts('df_ph.pkl', 'df_counts_ph1.pkl', 'df_counts_ph2.pkl')

In [None]:
remove_items = [
                'f', 'fr', 'n', 'r', 'wh', 'are', 'ne', 'or', 'al', 'ut', 'manila', 'ver', \
                'philippine', 'filipino', 'pint', 'duterte', 'de', 'g', 'curt', 'd', 'filipinos', \
                'ff', 'barangay', 'll', 't', 'me', 's', 'sunstar', 'webster', 'a'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [None]:
from textblob import TextBlob
 
# a = "cmputr"           # incorrect spelling
# print("original text: "+str(a))

counts_ph2_fix = {}

for i, j in counts_ph2_copy.items():
    b = TextBlob(str(i))
    counts_ph2_fix[str(b.correct())] = j
    
# b = TextBlob(a)
 
# prints the corrected spelling
# print("corrected text: "+str(b.correct()))

In [32]:
wordcounts_clean('df_counts_ph1.pkl', 
               'df_counts_ph2.pkl',
               remove_list,
               stopwords,
               'df_counts_ph1.pkl',
               'df_counts_ph2.pkl'
               )

In [33]:
df_counts_ph_nostopwords = pd.read_pickle('df_counts_ph2.pkl')

df_counts_ph_nostopwords = dict(zip(df_counts_ph_nostopwords[0], df_counts_ph_nostopwords[1]))

for i in enumerate(Counter(df_counts_ph_nostopwords).most_common(300)):
    print(i)

(0, ('frm', 158563))
(1, ('nt', 126112))
(2, ('mre', 62080))
(3, ('city', 59707))
(4, ('ur', 53152))
(5, ('gvernment', 52619))
(6, ('wuld', 51676))
(7, ('time', 50058))
(8, ('ther', 48107))
(9, ('yu', 44998))
(10, ('abut', 44134))
(11, ('peple', 43767))
(12, ('president', 42575))
(13, ('tw', 40438))
(14, ('new', 39723))
(15, ('use', 38808))
(16, ('like', 36488))
(17, ('cuntry', 36090))
(18, ('nly', 34588))
(19, ('nw', 33001))
(20, ('natinal', 31992))
(21, ('right', 29393))
(22, ('thrugh', 27853))
(23, ('need', 27581))
(24, ('high', 27290))
(25, ('include', 27180))
(26, ('int', 27056))
(27, ('wrld', 26966))
(28, ('public', 26666))
(29, ('culd', 25690))
(30, ('team', 25665))
(31, ('shuld', 25658))
(32, ('filipin', 25214))
(33, ('year', 25165))
(34, ('way', 24861))
(35, ('hw', 24731))
(36, ('gd', 24485))
(37, ('help', 24097))
(38, ('lcal', 23775))
(39, ('want', 23754))
(40, ('befre', 23732))
(41, ('day', 23378))
(42, ('business', 23037))
(43, ('family', 22682))
(44, ('lead', 22375))
(45, 

In [60]:
## FIX SPELLING ERRORS IN DICTIONARY WITH TEXTBLOB function, REPLACE FILE

In [35]:
from textblob import TextBlob

# a = "cmputr"           # incorrect spelling
# print("original text: "+str(a))
# b = TextBlob(a)
 
# prints the corrected spelling
# print("corrected text: "+str(b.correct()))

df_counts_ph_nostopwords_fix = {}

for i, j in df_counts_ph_nostopwords.items():
    b = TextBlob(str(i))
    df_counts_ph_nostopwords_fix[str(b.correct())] = j

for i in enumerate(Counter(df_counts_ph_nostopwords_fix).most_common(300)):
    print(i)

with open('df_counts_ph2.pkl', 'wb') as handle:
    pickle.dump(df_counts_ph_nostopwords_fix, handle, protocol=pickle.HIGHEST_PROTOCOL)



(0, ('from', 158563))
(1, ('are', 62080))
(2, ('city', 59707))
(3, ('or', 53152))
(4, ('government', 52619))
(5, ('would', 51676))
(6, ('time', 50058))
(7, ('you', 44998))
(8, ('but', 44134))
(9, ('people', 43767))
(10, ('president', 42575))
(11, ('new', 39723))
(12, ('use', 38808))
(13, ('like', 36488))
(14, ('country', 36090))
(15, ('only', 34588))
(16, ('national', 31992))
(17, ('right', 29393))
(18, ('through', 27853))
(19, ('high', 27290))
(20, ('include', 27180))
(21, ('world', 26966))
(22, ('public', 26666))
(23, ('could', 25690))
(24, ('team', 25665))
(25, ('should', 25658))
(26, ('filipino', 25214))
(27, ('year', 25165))
(28, ('way', 24861))
(29, ('go', 24485))
(30, ('help', 24097))
(31, ('local', 23775))
(32, ('want', 23754))
(33, ('before', 23732))
(34, ('day', 23378))
(35, ('business', 23037))
(36, ('family', 22682))
(37, ('lead', 22375))
(38, ('start', 22371))
(39, ('add', 21958))
(40, ('life', 21449))
(41, ('group', 21184))
(42, ('fund', 21131))
(43, ('must', 20461))
(44,

In [None]:
## Pakistan PK
## Omitted due to insufficient articles from NOW data

In [None]:
## Singapore SG
read_sort_save('SG_domestic_Ngram_stopword_lematize.csv', 'df_sg.pkl')

In [None]:
wordcounts('df_sg.pkl', 'df_counts_sg1.pkl', 'df_counts_sg2.pkl')

In [63]:
remove_items = [
                's', 'singapore', 'singaporeans', 'asia', 'facebook'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

[None, None]

In [64]:
wordcounts_clean('df_counts_sg1.pkl', 
               'df_counts_sg2.pkl',
               remove_list,
               stopwords,
               'df_counts_sg1.pkl',
               'df_counts_sg2.pkl'
               )

In [65]:
df_counts_sg_nostopwords = pd.read_pickle('df_counts_sg2.pkl')

df_counts_sg_nostopwords = dict(zip(df_counts_sg_nostopwords[0], df_counts_sg_nostopwords[1]))

for i in enumerate(Counter(df_counts_sg_nostopwords).most_common(300)):
    print(i)

(0, ('new', 21763))
(1, ('company', 16779))
(2, ('time', 16261))
(3, ('work', 15171))
(4, ('use', 14346))
(5, ('people', 13069))
(6, ('market', 12765))
(7, ('like', 12335))
(8, ('world', 11838))
(9, ('include', 11545))
(10, ('come', 11514))
(11, ('business', 11365))
(12, ('need', 10567))
(13, ('service', 10499))
(14, ('high', 9779))
(15, ('look', 9331))
(16, ('help', 9277))
(17, ('add', 8999))
(18, ('report', 8435))
(19, ('government', 8432))
(20, ('country', 8394))
(21, ('share', 8369))
(22, ('team', 8266))
(23, ('want', 7735))
(24, ('technology', 7571))
(25, ('group', 7455))
(26, ('know', 7338))
(27, ('way', 7188))
(28, ('base', 7139))
(29, ('start', 7047))
(30, ('good', 6997))
(31, ('development', 6984))
(32, ('public', 6904))
(33, ('lead', 6874))
(34, ('provide', 6853))
(35, ('change', 6704))
(36, ('minister', 6697))
(37, ('global', 6648))
(38, ('issue', 6482))
(39, ('best', 6421))
(40, ('support', 6375))
(41, ('life', 6362))
(42, ('case', 6179))
(43, ('place', 6177))
(44, ('game',

In [None]:
## Tanzania TZ
read_sort_save('TZ_domestic_Ngram_stopword_lematize.csv', 'df_tz.pkl')

In [None]:
# One article greater than 1000000 characters, preventing NLP from compiling, article removed

# start_time = time.time()

# df_tz = pd.read_pickle('df_tz.pkl')

# counts_tz1 = {}
# counts_tz2 = {}

# wordfreq_tz1 = []
# wordfreq_tz2 = []

# for index, row in df_tz.iterrows():
#     if isinstance(row['article_text_Ngram_stopword_lemmatize'], str) and (len(row['article_text_Ngram_stopword_lemmatize']) < 1000000): 
#         text_data = row['article_text_Ngram_stopword_lemmatize'].lower()
#         new_text = replace_content(redict, text_data)
#         document = nlp(new_text)

#         [wordfreq_tz1.append(ent.text) for ent in document if ent.ent_type_]
#         [wordfreq_tz2.append(ent.text) for ent in document if not ent.ent_type_]
        
        
# counts_tz1 = Counter(word for word in wordfreq_tz1)
# counts_tz2 = Counter(word for word in wordfreq_tz2)

# elapsed_time = time.time() - start_time
# time.strftime("Runtime: %H Hours : %M Minutes : %S Seconds", time.gmtime(elapsed_time))

In [None]:
wordcounts('df_tz.pkl', 'df_counts_tz1.pkl', 'df_counts_tz2.pkl')

In [None]:
remove_items = [
                'salaam', 'dar', 'e', 'tanzania', 'zanzibar', 'tanzanians', 'prof'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [67]:
wordcounts_clean('df_counts_tz1.pkl', 
               'df_counts_tz2.pkl',
               remove_list,
               stopwords,
               'df_counts_tz1.pkl',
               'df_counts_tz2.pkl'
               )

In [68]:
df_counts_tz_nostopwords = pd.read_pickle('df_counts_tz2.pkl')

df_counts_tz_nostopwords = dict(zip(df_counts_tz_nostopwords[0], df_counts_tz_nostopwords[1]))

for i in enumerate(Counter(df_counts_tz_nostopwords).most_common(300)):
    print(i)

(0, ('government', 10537))
(1, ('country', 9670))
(2, ('people', 6117))
(3, ('use', 5641))
(4, ('development', 4884))
(5, ('project', 4395))
(6, ('include', 4188))
(7, ('need', 4073))
(8, ('work', 4071))
(9, ('new', 4009))
(10, ('service', 3995))
(11, ('minister', 3945))
(12, ('president', 3807))
(13, ('time', 3698))
(14, ('business', 3576))
(15, ('sector', 3523))
(16, ('school', 3431))
(17, ('national', 3425))
(18, ('company', 3355))
(19, ('public', 3353))
(20, ('area', 3326))
(21, ('region', 3293))
(22, ('health', 3149))
(23, ('support', 3116))
(24, ('increase', 3099))
(25, ('come', 3082))
(26, ('high', 2914))
(27, ('water', 2878))
(28, ('education', 2850))
(29, ('director', 2836))
(30, ('provide', 2830))
(31, ('child', 2663))
(32, ('member', 2624))
(33, ('issue', 2614))
(34, ('report', 2608))
(35, ('add', 2595))
(36, ('help', 2593))
(37, ('market', 2582))
(38, ('good', 2537))
(39, ('number', 2535))
(40, ('local', 2528))
(41, ('state', 2487))
(42, ('bank', 2464))
(43, ('note', 2444))

In [None]:
## United States US
read_sort_save('US_domestic_Ngram_stopword_lematize.csv', 'df_us.pkl')

In [None]:
wordcounts('df_us.pkl', 'df_counts_us1.pkl', 'df_counts_us2.pkl')

In [None]:
remove_items = [
                'trump', 'facebook', 'obama'
              ]

[remove_list.append(i) for i in remove_items if i not in remove_list]

In [70]:
wordcounts_clean('df_counts_us1.pkl', 
               'df_counts_us2.pkl',
               remove_list,
               stopwords,
               'df_counts_us1.pkl',
               'df_counts_us2.pkl'
               )

In [71]:
df_counts_us_nostopwords = pd.read_pickle('df_counts_us2.pkl')

df_counts_us_nostopwords = dict(zip(df_counts_us_nostopwords[0], df_counts_us_nostopwords[1]))

for i in enumerate(Counter(df_counts_us_nostopwords).most_common(300)):
    print(i)

(0, ('people', 92775))
(1, ('time', 91124))
(2, ('like', 90828))
(3, ('new', 74341))
(4, ('use', 70516))
(5, ('work', 67046))
(6, ('come', 59949))
(7, ('know', 58216))
(8, ('way', 51141))
(9, ('want', 47687))
(10, ('look', 45199))
(11, ('think', 45198))
(12, ('world', 43895))
(13, ('include', 43605))
(14, ('company', 43497))
(15, ('need', 43304))
(16, ('state', 42442))
(17, ('life', 42273))
(18, ('right', 41205))
(19, ('high', 41018))
(20, ('game', 40898))
(21, ('thing', 38953))
(22, ('help', 36122))
(23, ('report', 35216))
(24, ('woman', 33786))
(25, ('country', 33171))
(26, ('good', 32723))
(27, ('day', 32704))
(28, ('start', 32070))
(29, ('change', 31782))
(30, ('president', 31404))
(31, ('home', 30584))
(32, ('school', 30265))
(33, ('end', 30195))
(34, ('child', 29635))
(35, ('big', 29515))
(36, ('long', 29379))
(37, ('family', 29010))
(38, ('group', 28905))
(39, ('point', 28610))
(40, ('great', 28344))
(41, ('government', 28283))
(42, ('police', 28249))
(43, ('try', 27653))
(44, (

In [7]:
## South Africa ZA
## Omitted due to insufficient articles

In [None]:
df_counts_au_nostopwords = pd.read_pickle('df_counts_au2.pkl')
dict_au = dict(zip(df_counts_au_nostopwords[0], df_counts_au_nostopwords[1]))

In [23]:
df_counts_au_nostopwords = pd.read_pickle('df_counts_au2.pkl')
dict_au = dict(zip(df_counts_au_nostopwords[0], df_counts_au_nostopwords[1]))
dict_au = dict(Counter(dict_au).most_common(300))

df_counts_bd_nostopwords = pd.read_pickle('df_counts_bd2.pkl')
dict_bd = dict(zip(df_counts_bd_nostopwords[0], df_counts_bd_nostopwords[1]))
dict_bd = dict(Counter(dict_bd).most_common(300))

df_counts_ca_nostopwords = pd.read_pickle('df_counts_ca2.pkl')
dict_ca = dict(zip(df_counts_ca_nostopwords[0], df_counts_ca_nostopwords[1]))
dict_ca = dict(Counter(dict_ca).most_common(300))

df_counts_gb_nostopwords = pd.read_pickle('df_counts_gb2.pkl')
dict_gb = dict(zip(df_counts_gb_nostopwords[0], df_counts_gb_nostopwords[1]))
dict_gb = dict(Counter(dict_gb).most_common(300))

df_counts_gh_nostopwords = pd.read_pickle('df_counts_gh2.pkl')
dict_gh = dict(zip(df_counts_gh_nostopwords[0], df_counts_gh_nostopwords[1]))
dict_gh = dict(Counter(dict_gh).most_common(300))

df_counts_hk_nostopwords = pd.read_pickle('df_counts_hk2.pkl')
dict_hk = dict(zip(df_counts_hk_nostopwords[0], df_counts_hk_nostopwords[1]))
dict_hk = dict(Counter(dict_hk).most_common(300))

df_counts_ie_nostopwords = pd.read_pickle('df_counts_ie2.pkl')
dict_ie = dict(zip(df_counts_ie_nostopwords[0], df_counts_ie_nostopwords[1]))
dict_ie = dict(Counter(dict_ie).most_common(300))

df_counts_in_nostopwords = pd.read_pickle('df_counts_in2.pkl')
dict_in = dict(zip(df_counts_in_nostopwords[0], df_counts_in_nostopwords[1]))
dict_in = dict(Counter(dict_in).most_common(300))

df_counts_jm_nostopwords = pd.read_pickle('df_counts_jm2.pkl')
dict_jm = dict(zip(df_counts_jm_nostopwords[0], df_counts_jm_nostopwords[1]))
dict_jm = dict(Counter(dict_jm).most_common(300))

df_counts_ke_nostopwords = pd.read_pickle('df_counts_ke2.pkl')
dict_ke = dict(zip(df_counts_ke_nostopwords[0], df_counts_ke_nostopwords[1]))
dict_ke = dict(Counter(dict_ke).most_common(300))

df_counts_lk_nostopwords = pd.read_pickle('df_counts_lk2.pkl')
dict_lk = dict(zip(df_counts_lk_nostopwords[0], df_counts_lk_nostopwords[1]))
dict_lk = dict(Counter(dict_lk).most_common(300))

df_counts_my_nostopwords = pd.read_pickle('df_counts_my2.pkl')
dict_my = dict(zip(df_counts_my_nostopwords[0], df_counts_my_nostopwords[1]))
dict_my = dict(Counter(dict_my).most_common(300))

df_counts_ng_nostopwords = pd.read_pickle('df_counts_ng2.pkl')
dict_ng = dict(zip(df_counts_ng_nostopwords[0], df_counts_ng_nostopwords[1]))
dict_ng = dict(Counter(dict_ng).most_common(300))

df_counts_nz_nostopwords = pd.read_pickle('df_counts_nz2.pkl')
dict_nz = dict(zip(df_counts_nz_nostopwords[0], df_counts_nz_nostopwords[1]))
dict_nz = dict(Counter(dict_nz).most_common(300))

df_counts_ph_nostopwords = pd.read_pickle('df_counts_ph2.pkl')
dict_ph = df_counts_ph_nostopwords
dict_ph = dict(Counter(dict_ph).most_common(300))

df_counts_sg_nostopwords = pd.read_pickle('df_counts_sg2.pkl')
dict_sg = dict(zip(df_counts_sg_nostopwords[0], df_counts_sg_nostopwords[1]))
dict_sg = dict(Counter(dict_sg).most_common(300))

df_counts_tz_nostopwords = pd.read_pickle('df_counts_tz2.pkl')
dict_tz = dict(zip(df_counts_tz_nostopwords[0], df_counts_tz_nostopwords[1]))
dict_tz = dict(Counter(dict_tz).most_common(300))

df_counts_us_nostopwords = pd.read_pickle('df_counts_us2.pkl')
dict_us = dict(zip(df_counts_us_nostopwords[0], df_counts_us_nostopwords[1]))
dict_us = dict(Counter(dict_us).most_common(300))

In [24]:
dict_us

{'people': 92775,
 'time': 91124,
 'like': 90828,
 'new': 74341,
 'use': 70516,
 'work': 67046,
 'come': 59949,
 'know': 58216,
 'way': 51141,
 'want': 47687,
 'look': 45199,
 'think': 45198,
 'world': 43895,
 'include': 43605,
 'company': 43497,
 'need': 43304,
 'state': 42442,
 'life': 42273,
 'right': 41205,
 'high': 41018,
 'game': 40898,
 'thing': 38953,
 'help': 36122,
 'report': 35216,
 'woman': 33786,
 'country': 33171,
 'good': 32723,
 'day': 32704,
 'start': 32070,
 'change': 31782,
 'president': 31404,
 'home': 30584,
 'school': 30265,
 'end': 30195,
 'child': 29635,
 'big': 29515,
 'long': 29379,
 'family': 29010,
 'group': 28905,
 'point': 28610,
 'great': 28344,
 'government': 28283,
 'police': 28249,
 'try': 27653,
 'team': 27274,
 'found': 27233,
 'story': 26915,
 'lead': 26677,
 'place': 26677,
 'business': 26444,
 'write': 26191,
 'case': 26031,
 'play': 25866,
 'city': 25837,
 'told': 25809,
 'find': 24779,
 'best': 24669,
 'year': 24626,
 'lot': 24346,
 'number': 24

In [25]:
data = [dict_au,dict_bd,dict_ca,dict_gb,dict_gh,dict_hk,dict_ie,dict_in,dict_jm,dict_ke,dict_lk,dict_my,dict_ng,dict_nz,dict_ph,dict_sg,dict_tz,dict_us]

# all_countries_no_stopwords = pd.DataFrame.from_dict(data, orient='index', columns=['AU','BD','CA','GB','GH','HK','IE','IN','JM','KE','LK','MY','NG','NZ','PH','SG','TZ','US'])

# all_countries_no_stopwords

In [26]:
data

[{'time': 99918,
  'people': 83357,
  'like': 73990,
  'new': 69815,
  'work': 65096,
  'use': 63113,
  'come': 57492,
  'need': 51243,
  'government': 51129,
  'look': 49322,
  'think': 47609,
  'know': 46152,
  'way': 45194,
  'include': 45178,
  'game': 43195,
  'good': 41740,
  'want': 41397,
  'high': 39172,
  'market': 38863,
  'world': 38654,
  'company': 36958,
  'change': 36302,
  'thing': 34861,
  'big': 34666,
  'right': 34097,
  'business': 33743,
  'report': 32855,
  'team': 32075,
  'start': 31979,
  'day': 31683,
  'home': 31660,
  'information': 31310,
  'great': 30983,
  'play': 30863,
  'state': 30849,
  'life': 30432,
  'provide': 29711,
  'service': 29554,
  'point': 28701,
  'car': 28522,
  'long': 28221,
  'best': 27873,
  'country': 27651,
  'help': 27638,
  'year': 27623,
  'end': 26499,
  'place': 25950,
  'run': 25677,
  'family': 25601,
  'support': 25273,
  'public': 24968,
  'lot': 24948,
  'number': 24731,
  'lead': 24718,
  'player': 24558,
  'price': 244

In [27]:
all_countries_no_stopwords = pd.DataFrame.from_dict(data)
all_countries_no_stopwords.index = ['Australia AU', 'Bangladesh BD', 'Canada CA', 'United Kingdom GB',
                                    'Ghana GH','Hong Kong HK','Ireland IE','India IN',
                                    'Jamaica JM','Kenya KE','Sri Lanka LK','Malaysia MY',
                                    'Nigeria NG','New Zealand NZ','Philippines PH','Singapore SG',
                                    'Tanzania TZ','United States US']

all_countries_no_stopwords

Unnamed: 0,time,people,like,new,work,use,come,need,government,look,...,natural,meet,implementation,infrastructure,union,rural,port,white,character,effect
Australia AU,99918,83357,73990,69815,65096,63113,57492,51243.0,51129,49322.0,...,,,,,,,,,,
Bangladesh BD,12261,19966,7089,8614,11527,8819,9810,7700.0,20966,3217.0,...,,,,,,,,,,
Canada CA,101831,96695,78735,80842,78459,81684,69959,51774.0,49154,55700.0,...,,,,,,,,,,
United Kingdom GB,48544,48863,39139,37046,38795,32345,33054,23568.0,18847,24954.0,...,,,,,,,,,,
Ghana GH,16691,22788,9709,14518,14492,13881,14847,13132.0,26010,5758.0,...,,,,,,,,,,
Hong Kong HK,3051,3349,1585,2787,2053,1928,1723,1424.0,3738,987.0,...,,,,,,,,,,
Ireland IE,77019,61005,42155,47556,48931,72634,50201,28496.0,18515,31373.0,...,,,,,,,,,,
India IN,69275,59907,56214,61042,49861,43461,57492,37376.0,73660,30884.0,...,,,,,,,,,,
Jamaica JM,29117,27789,16414,23885,27897,21393,22996,21383.0,26488,11887.0,...,,,,,,,,,,
Kenya KE,21596,22643,13510,16562,16792,16317,17559,13321.0,28224,7517.0,...,,,,,,,,,,
