# Scraping Population from Wikipedia

In [1]:
# http://lxml.de/lxmlhtml.html
import requests
from lxml.html import fromstring, parse
from itertools import islice

# http://stackoverflow.com/a/1779324/7782
import locale
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) 

url = "https://en.wikipedia.org/w/index.php?title=List_of_countries_by_population_(United_Nations)&oldid=590438477"
page = requests.get(url).content.decode("UTF-8")

doc = fromstring(page)

def parse_rank(col):
    try:
        rank = int(col.text)
        return rank
    except:
        return None

def parse_name(col):
    try:
        # find all the anchors and if href points is the form "/wiki"
        name = "; ".join([a.text for a in col.findall(".//a") if a.attrib["href"].startswith("/wiki/")])
        return name
    except:
        return None

def parse_pop(col):
    return locale.atoi(col.text)


def country_by_pop():
    
    for row in islice(doc.xpath("""//*[@id="mw-content-text"]/table[1]/tr"""),2, None):
        cols = row.findall(".//td")
        yield (parse_rank(cols[0]), parse_name(cols[1]), parse_pop(cols[2]))
    
for (i, row) in enumerate(islice(country_by_pop(), None)):
    print i, 
    for col in row:
        if type(col) == 'unicode':
            print col.encode("UTF-8"), 
        else:
            print col, 
    print

0 1 China 1385566537
1 2 India 1252139596
2 3 United States 320050716
3 4 Indonesia 249865631
4 5 Brazil 200361925
5 6 Pakistan 182142594
6 7 Nigeria 173615345
7 8 Bangladesh 156594962
8 9 Russia 142833689
9 10 Japan 127143577
10 11 Mexico 122332399
11 12 Philippines 98393574
12 13 Ethiopia 94100756
13 14 Vietnam 91679733
14 15 Germany 82726626
15 16 Egypt 82056378
16 17 Iran 77447168
17 18 Turkey 74932641
18 19 Congo, Democratic Republic of the 67513677
19 20 Thailand 67010502
20 21 France 64291280
21 22 United Kingdom 63136265
22 23 Italy 60990277
23 24 Myanmar 53259018
24 25 South Africa 52776130
25 26 Korea, South 49262698
26 27 Tanzania 49253126
27 28 Colombia 48321405
28 29 Spain 46926963
29 30 Ukraine 45238805
30 31 Kenya 44353691
31 32 Argentina 41446246
32 33 Algeria 39208194
33 34 Poland 38216635
34 35 Sudan 37964306
35 36 Uganda 37578876
36 37 Canada 35181704
37 38 Iraq 33765232
38 39 Morocco 33008150
39 40 Afghanistan 30551674
40 41 Venezuela 30405207
41 42 Peru 30375603
42

In [2]:
import json
s = json.dumps([row for row in country_by_pop()], ensure_ascii=True)

In [3]:
type(s)

str

In [4]:
print s

[[1, "China", 1385566537], [2, "India", 1252139596], [3, "United States", 320050716], [4, "Indonesia", 249865631], [5, "Brazil", 200361925], [6, "Pakistan", 182142594], [7, "Nigeria", 173615345], [8, "Bangladesh", 156594962], [9, "Russia", 142833689], [10, "Japan", 127143577], [11, "Mexico", 122332399], [12, "Philippines", 98393574], [13, "Ethiopia", 94100756], [14, "Vietnam", 91679733], [15, "Germany", 82726626], [16, "Egypt", 82056378], [17, "Iran", 77447168], [18, "Turkey", 74932641], [19, "Congo, Democratic Republic of the", 67513677], [20, "Thailand", 67010502], [21, "France", 64291280], [22, "United Kingdom", 63136265], [23, "Italy", 60990277], [24, "Myanmar", 53259018], [25, "South Africa", 52776130], [26, "Korea, South", 49262698], [27, "Tanzania", 49253126], [28, "Colombia", 48321405], [29, "Spain", 46926963], [30, "Ukraine", 45238805], [31, "Kenya", 44353691], [32, "Argentina", 41446246], [33, "Algeria", 39208194], [34, "Poland", 38216635], [35, "Sudan", 37964306], [36, "Ugan

In [5]:
# https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json

# read population in
import json
import requests

pop_json_url = "https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json"
pop_list= requests.get(pop_json_url).json()
pop_list

[[1, u'China', 1385566537],
 [2, u'India', 1252139596],
 [3, u'United States', 320050716],
 [4, u'Indonesia', 249865631],
 [5, u'Brazil', 200361925],
 [6, u'Pakistan', 182142594],
 [7, u'Nigeria', 173615345],
 [8, u'Bangladesh', 156594962],
 [9, u'Russia', 142833689],
 [10, u'Japan', 127143577],
 [11, u'Mexico', 122332399],
 [12, u'Philippines', 98393574],
 [13, u'Ethiopia', 94100756],
 [14, u'Vietnam', 91679733],
 [15, u'Germany', 82726626],
 [16, u'Egypt', 82056378],
 [17, u'Iran', 77447168],
 [18, u'Turkey', 74932641],
 [19, u'Congo, Democratic Republic of the', 67513677],
 [20, u'Thailand', 67010502],
 [21, u'France', 64291280],
 [22, u'United Kingdom', 63136265],
 [23, u'Italy', 60990277],
 [24, u'Myanmar', 53259018],
 [25, u'South Africa', 52776130],
 [26, u'Korea, South', 49262698],
 [27, u'Tanzania', 49253126],
 [28, u'Colombia', 48321405],
 [29, u'Spain', 46926963],
 [30, u'Ukraine', 45238805],
 [31, u'Kenya', 44353691],
 [32, u'Argentina', 41446246],
 [33, u'Algeria', 3920819

In [6]:
world_pop = sum([r[2] for r in pop_list])
world_pop

7162119434

In [7]:
# http://stackoverflow.com/a/15889203/7782
def cumsum(lis):
    total = 0
    for x in lis:
        total += x
        yield total


In [8]:
cum_pop = list(cumsum((r[2] for r in pop_list)))
cum_pop

[1385566537,
 2637706133,
 2957756849,
 3207622480,
 3407984405,
 3590126999,
 3763742344,
 3920337306,
 4063170995,
 4190314572,
 4312646971,
 4411040545,
 4505141301,
 4596821034,
 4679547660,
 4761604038,
 4839051206,
 4913983847,
 4981497524,
 5048508026,
 5112799306,
 5175935571,
 5236925848,
 5290184866,
 5342960996,
 5392223694,
 5441476820,
 5489798225,
 5536725188,
 5581963993,
 5626317684,
 5667763930,
 5706972124,
 5745188759,
 5783153065,
 5820731941,
 5855913645,
 5889678877,
 5922687027,
 5953238701,
 5983643908,
 6014019511,
 6043736476,
 6072670578,
 6101499448,
 6129296905,
 6155201503,
 6181035255,
 6205930735,
 6230338116,
 6253680669,
 6277010441,
 6299935292,
 6322189251,
 6344087312,
 6365785897,
 6387257515,
 6408530743,
 6428846829,
 6446678099,
 6464297807,
 6481232646,
 6497991875,
 6514432461,
 6530795028,
 6546532906,
 6562001109,
 6577302759,
 6592437928,
 6606976568,
 6621126216,
 6635259496,
 6648084810,
 6659861332,
 6671606521,
 6682902694,
 6694168323,

In [9]:
import bisect
import random

In [10]:
# http://docs.python.org/2/library/bisect.html
bisect.bisect_left(cum_pop,world_pop/2)

5

In [11]:
float(cum_pop[5])/world_pop

0.5012660054169099

In [12]:
len(cum_pop)

233

In [13]:
pop_list[0][1]

u'China'

In [14]:
from itertools import repeat
from collections import Counter

def random_country_weighted_by_pop():
    while True:
        yield pop_list[bisect.bisect_left(cum_pop,random.randint(1,world_pop))][1]
        
Counter(islice(random_country_weighted_by_pop(),5))

Counter({u'Turkey': 1, u'Indonesia': 1, u'India': 1, u'Bangladesh': 1, u'Myanmar': 1})

# CIA Handbook

In [15]:
import requests
import locale
import json

locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) 

cia_url = "https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt"
content = requests.get(cia_url).content

In [18]:
[r.split("\t") for r in content.strip().split("\r")]

[['1      China                                              1,355,692,576                                     '],
 ['\n2      India                                              1,236,344,631                                     '],
 ['\n3      European Union                                     511,434,812                                       '],
 ['\n4      United States                                      318,892,103                                       '],
 ['\n5      Indonesia                                          253,609,643                                       '],
 ['\n6      Brazil                                             202,656,788                                       '],
 ['\n7      Pakistan                                           196,174,380                                       '],
 ['\n8      Nigeria                                            177,155,754                                       '],
 ['\n9      Bangladesh                                         166

In [17]:
cia_pop_list = [(int(x[0]), x[1], locale.atoi(x[2])) for x in [r.split("\t") for r in content.strip().split("\r")]]
cia_pop_list

ValueError: invalid literal for int() with base 10: '1      China                                              1,355,692,576                                     '

In [None]:
print json.dumps(cia_pop_list)

In [None]:
# https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json
# https://gist.github.com/rdhyee/8530164
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/2119rank.html
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt


import json
import requests

cia_json_url = "https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json"
cia_list= requests.get(cia_json_url).json()
cia_list

In [None]:
cia_world_pop = sum([r[2] for r in cia_list if r[1] != 'European Union'])
cia_world_pop

In [None]:
cia_world_pop, world_pop, cia_world_pop/float(world_pop)

# Comparing two lists

In [None]:
# set of entities for Wikipedia
wk_entities = set([r[1] for r in pop_list])
wk_entities

In [None]:
cia_entities = set([r[1] for r in cia_list])

In [None]:
len(wk_entities), len(cia_entities)

In [None]:
# http://docs.python.org/2/library/stdtypes.html#set
# intersection
len(wk_entities & cia_entities)

In [None]:
# symmetric diff
wk_entities ^ cia_entities

In [None]:
wk_entities - cia_entities

In [None]:
cia_entities - wk_entities

In [None]:
len(wk_entities or cia_entities)