In [1]:
import numpy as np
import pandas as pd
import webbrowser as wb
from bs4 import BeautifulSoup
import requests

from pandas import Series, DataFrame
from numpy.random import randn

countries = DataFrame(index=['switzerland', 'Netherlands','United_States', 'Canada', 'Australia', 'New_Zealand', 'Norway', 'United_Kingdom', 
                            'Germany', 'Belgium', 'Sweden', 'France', 'Italy', 'Danemark', 'Finland', 'Turkey', 'Iran'])

link = 'https://en.wikipedia.org/wiki/{0}'
country_list_links = Series(['List_of_countries_by_Human_Development_Index', 'List_of_countries_by_income_equality','List_of_countries_by_GDP_(PPP)_per_capita','List_of_countries_by_real_GDP_growth_rate'
                            'List_of_countries_by_GDP_(nominal)_per_capita','List_of_countries_and_dependencies_by_population','List_of_countries_and_dependencies_by_population','List_of_countries_by_average_wage'
                            ,'Satisfaction_with_Life_Index','Happy_Planet_Index','List_of_countries_by_unemployment_rate','List_of_countries_by_job_security','List_of_countries_by_inflation_rate'
                        ,'Ease_of_doing_business_index','Index_of_Economic_Freedom','List_of_countries_by_tax_rates','List_of_minimum_wages_by_country','List_of_countries_by_past_and_projected_GDP_(nominal)_per_capita',
                             'List_of_countries_by_GDP_growth_1980–2010','List_of_countries_by_GDP_(real)_per_capita_growth_rate','Global_Peace_Index'])
links = []
for i in range(len(country_list_links)):
    links.append(link.format(country_list_links[i]))

In [2]:
def getRawTables(link):
    """extract tables of a page"""
    # Request content from web page
    result = requests.get(link)
    c = result.content

    # Set as Beautiful Soup Object
    soup = BeautifulSoup(c)
    #print(soup)
    rawTables = soup.findAll("table",{'class':'wikitable'})
    print(len(rawTables))
    return rawTables

In [3]:
def getRawDataFromTable(table):
    """extract raw datas of cells in one table"""
    cells=table.findAll("td")
    
    data = []
    for i in range(len(cells)):
        link = cells[i].findAll("a")
        if len(link) > 0:
            data.append(link[0].text)
        else:
            data.append(cells[i].text)
            
    return data

In [4]:
def getColumnsFromTable(table):
    """extract column names for each table"""
    temp = table.findAll("th")
    
    columns = []
    for i in range(len(temp)):
        columns.append(temp[i].text)
        
    return columns

In [5]:
def formDataFrame(table):
    """function that gets table and call to other function to form a dataframe"""
    rawData = Series(getRawDataFromTable(table))
    columns = getColumnsFromTable(table)
    columnCount = len(columns)
    rowCount = len(rawData)/columnCount
    
    res = DataFrame(rawData.reshape(rowCount, columnCount), columns = columns)
    
    return res

In [13]:
def extractTables(link):
    tables = getRawTables(link)
    
    dfTables = []
    
    #dfTables.append(formDataFrame(tables[0]))
    
    for i in range(len(tables)):
            dfTables.append(formDataFrame(tables[i]))
            
    return dfTables

In [24]:
def desiredCountriesData(partLink):
    """extract data for those countries that I'm looking for"""
    link = 'https://en.wikipedia.org/wiki/{0}'
    countries = DataFrame(index=['Switzerland', 'Netherlands','United States', 'Canada', 'Australia', 'New Zealand', 'Norway', 'United_Kingdom', 
                            'Germany', 'Belgium', 'Sweden', 'France', 'Italy', 'Danemark', 'Finland', 'Turkey', 'Iran'])
    
    link = link.format(partLink)
    print(link)
    wb.open(link)
    tables = extractTables(link)
    
    res = []
    for i in range(len(tables)):
        res.append(tables[i][tables[i]['Country'].isin(countries.index)])
    
    
    return res

In [44]:
#[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
#[2]
res = desiredCountriesData(country_list_links[2])
for i in range(len(res)):
    #print('{0}_{1}'.format(country_list_links[1], i))
    print(res[i])

https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita
3
   Rank        Country    Int$
5     6         Norway  66,937
8     9    Switzerland  58,087
10   10  United States  54,597
14   14    Netherlands  47,355
15   15      Australia  46,433
17   17         Sweden  45,986
18   18        Germany  45,888
20   20         Canada  44,843
23   23        Belgium  42,973
24   24         France  40,375
25   25        Finland  40,347
29   29          Italy  35,486
31   31    New Zealand  35,152
61   61         Turkey  19,610
70   70           Iran  17,114
   Rank        Country    Int$  Year
6     6         Norway  64,406  2013
8     8    Switzerland  56,950  2013
12   10  United States  53,042  2013
13   11    Netherlands  46,162  2013
17   15         Sweden  44,658  2013
18   16        Germany  43,884  2013
21   19      Australia  43,202  2013
22   20         Canada  42,753  2013
24   22        Belgium  41,575  2013
25   23        Finland  39,740  2013
27   25         France 

In [45]:
gdp = res[0]
for i in range(len(gdp['Int$'].values)):
    gdp['Int$'].values[i] = int(gdp['Int$'].values[i].replace(',', ''))

In [47]:
gdp = gdp.drop('Rank',1)

In [48]:
gdp

Unnamed: 0,Country,Int$
5,Norway,66937
8,Switzerland,58087
10,United States,54597
14,Netherlands,47355
15,Australia,46433
17,Sweden,45986
18,Germany,45888
20,Canada,44843
23,Belgium,42973
24,France,40375


In [50]:
countries = countries.merge(gdp, on='Country')

In [51]:
countries

Unnamed: 0,Rank,Country,Int$_x,Int$_y


In [57]:
gdp_pp = DataFrame(gdp['Int$'], index = gdp['Country'],  columns = ['GDP_PPP_PerCapita'])
gdp_pp

Unnamed: 0_level_0,GDP_PPP_PerCapita
Country,Unnamed: 1_level_1
Norway,
Switzerland,
United States,
Netherlands,
Australia,
Sweden,
Germany,
Canada,
Belgium,
France,


In [56]:
gdp['Country']

5            Norway
8       Switzerland
10    United States
14      Netherlands
15        Australia
17           Sweden
18          Germany
20           Canada
23          Belgium
24           France
25          Finland
29            Italy
31      New Zealand
61           Turkey
70             Iran
Name: Country, dtype: object