In [1]:
import os

import numpy as np
import pandas as pd
from ddf_utils import ddf_reader as dr

In [2]:
# Directory handed to ddf_reader as its dataset search path (presumably where the
# dataset folders opened below live -- confirm against ddf_utils).
# The DDF_SEARCH_PATH environment variable overrides it so the notebook can run on
# another machine; the original local checkout path remains the fallback.
dr.config.DDF_SEARCH_PATH = os.environ.get(
    'DDF_SEARCH_PATH', '/Users/semio/src/work/Gapminder/')

In [3]:
# Open the three DDF datasets used in this notebook, in order:
# `old_dataset` (gapminder_world), `new_dataset` (systema_globalis) and `wdi`
# (World Bank world_development_indicators).
old_dataset, new_dataset, wdi = [
    dr.DDF(dataset_id)
    for dataset_id in (
        'ddf--gapminder--gapminder_world',
        'ddf--gapminder--systema_globalis',
        'ddf--world_bank--world_development_indicators',
    )
]

In [4]:
def get_comp_df(indicator):
    '''Build a side-by-side comparison frame for one indicator.

    The datapoints for `indicator` are read from `old_dataset` and
    `new_dataset`; the indicator column is renamed to 'gw' (old dataset) and
    'sg' (new dataset) respectively, and the two frames are joined
    column-wise on their index.

    Returns a DataFrame with the columns 'gw' and 'sg'.
    '''
    labelled_frames = []
    for label, dataset in (('gw', old_dataset), ('sg', new_dataset)):
        datapoints = dataset.get_datapoint_df(indicator)
        labelled_frames.append(datapoints.rename(columns={indicator: label}))

    return pd.concat(labelled_frames, axis=1)

In [5]:
def rval(indicator):
    '''Calculate the overall r value between the old and new values of an indicator.

    Parameters
    ----------
    indicator : str
        Indicator name, passed on to `get_comp_df`.

    Returns
    -------
    float
        The correlation between the 'gw' and 'sg' columns of the comparison
        frame, computed by `DataFrame.corr()` with its default settings.
    '''
    comp = get_comp_df(indicator)
    # `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
    return comp.corr().loc['gw', 'sg']

In [6]:
# Sample indicator used to spot-check get_comp_df() and rval() on a single series.
indicator = 'agricultural_land_percent_of_land_area'

In [7]:
# Preview the first rows of the combined gw/sg frame for the sample indicator.
get_comp_df(indicator).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,gw,sg
geo,time,Unnamed: 2_level_1,Unnamed: 3_level_1
abw,1961,11.11,11.11111
abw,1962,11.11,11.11111
abw,1963,11.11,11.11111
abw,1964,11.11,11.11111
abw,1965,11.11,11.11111


In [9]:
rval(indicator)  # correlation between the gw and sg values of `agricultural_land_percent_of_land_area`

0.99293955301907588

In [10]:
# All WDI indicators that need to be compared, one per line.

indicator_list = '''agricultural_land_percent_of_land_area
agriculture_percent_of_gdp
aid_received_per_person_current_us
aid_received_percent_of_gni
aid_received_total_us_inflation_adjusted
alternative_gdp_per_capita_ppp_wb
alternative_poverty_percent_below_nationally_defined_poverty
armed_forces_personnel_percent_of_labor_force
armed_forces_personnel_total
arms_exports_us_inflation_adjusted
arms_imports_us_inflation_adjusted
births_attended_by_skilled_health_staff_percent_of_total
broadband_subscribers
broadband_subscribers_per_100_people
cell_phones_per_100_people
cell_phones_total
children_out_of_school_primary
children_out_of_school_primary_female
children_out_of_school_primary_male
co2_intensity_of_economic_output_kg_co2_per_2005_ppp_of_gdp
contraceptive_use_percent_of_women_ages_15_49
debt_servicing_costs_percent_of_exports_and_net_income_from_abroad
debt_to_foreigners_by_public_and_private_percent_of_gni
electricity_use_per_person
electricity_use_total
energy_use_per_person
energy_use_total
expenditure_per_student_primary_percent_of_gdp_per_person
expenditure_per_student_secondary_percent_of_gdp_per_person
expenditure_per_student_tertiary_percent_of_gdp_per_person
exports_percent_of_gdp
exports_unit_value_index_2000100
external_debt_total_us_not_inflation_adjusted
extreme_poverty_percent_people_below_125_a_day
fixed_line_and_mobile_phone_subscribers_per_100_people
foreign_direct_investment_net_inflows_percent_of_gdp
foreign_direct_investment_net_outflows_percent_of_gdp
forest_area_sq_km
gdppercapita_growth_percent_per_year
gdppercapita_us_inflation_adjusted
gnipercapita_atlasmethod_current_us
gnipercapita_constant_2000_us
gnipercapita_ppp_current_international
high_technology_exports_percent_of_manufactured_exports
imports_percent_of_gdp
imports_unit_value_index_2000100
improved_sanitation_overall_access_percent
improved_sanitation_rural_access_percent
improved_sanitation_urban_access_percent
improved_water_source_overall_access_percent
improved_water_source_rural_access_percent
improved_water_source_urban_access_percent
income_share_of_2nd_poorest_20percent
income_share_of_2nd_richest_20percent
income_share_of_middle_20percent
income_share_of_poorest_10percent
income_share_of_poorest_20percent
income_share_of_richest_10percent
income_share_of_richest_20percent
industry_percent_of_gdp
inequality_index_gini
inflation_annual_percent
internet_users_per_100_people
internet_users_total_number
investments_percent_of_gdp
malnutrition_weight_for_age_percent_of_children_under_5
market_value_of_listed_companies_percent_of_gdp
medical_doctors_per_1000_people
merchandise_trade_percent_of_gdp
military_expenditure_percent_of_gdp
net_barter_terms_of_trade_2000_100
population_growth_annual_percent
population_in_urban_agglomerations_m_1_million_percent_of_total
poverty_percent_people_below_2_a_day
present_value_of_debt_percent_of_gni
primary_completion_rate_total_percent_of_relevant_age_group
primary_school_completion_percent_of_boys
primary_school_completion_percent_of_girls
pump_price_for_gasoline_us_per_liter
ratio_of_girls_to_boys_in_primary_and_secondary_education_perc
ratio_of_young_literate_females_to_males_percent_ages_15_24
rural_poverty_percent_rural_people_below_national_rural
services_percent_of_gdp
surface_area_sq_km
tax_revenue_percent_of_gdp
total_gdp_us_inflation_adjusted
total_gni_ppp_current_international
total_reserves_percent_of_debt_to_foreigners
trade_balance_percent_of_gdp
trade_balance_us_not_inflation_adjusted
urban_population
urban_population_growth_annual_percent
urban_population_percent_of_total
urban_poverty_percent_urban_people_below_national_urban
'''

In [11]:
# Overall r value for each indicator, keyed by indicator name.
#
# `str.split()` without an argument splits on any whitespace and drops empty
# strings, so the result does not depend on `indicator_list` ending with exactly
# one newline (the previous `split('\n')[:-1]` would silently drop the last
# indicator if that trailing newline were ever removed).
rvalues = {name: rval(name) for name in indicator_list.split()}

In [23]:
# Show the indicators with the lowest overall r value first.
(
    pd.DataFrame.from_dict(rvalues, orient='index')
    .sort_values(by=0)
    .head()
)

Unnamed: 0,0
foreign_direct_investment_net_inflows_percent_of_gdp,0.295232
urban_poverty_percent_urban_people_below_national_urban,0.376169
foreign_direct_investment_net_outflows_percent_of_gdp,0.399264
aid_received_percent_of_gni,0.539419
alternative_poverty_percent_below_nationally_defined_poverty,0.741762


In [24]:
# calculate per geo statistics

In [26]:
def per_geo_rval(indicator):
    '''Calculate the r value between old and new values of an indicator, per geo.

    Parameters
    ----------
    indicator : str
        Indicator name, passed on to `get_comp_df`.

    Returns
    -------
    pandas.Series
        One r value per key of the first index level of the comparison frame,
        sorted by that key and named after `indicator`. `min_periods=5` is
        passed to `DataFrame.corr`, so groups with fewer than 5 paired
        observations yield NaN.
    '''
    comp = get_comp_df(indicator)
    # `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
    res = (comp.groupby(level=0)
               .apply(lambda x: x.corr(min_periods=5).loc['gw', 'sg'])
               .sort_index())
    res.name = indicator
    return res

In [27]:
# Per-geo r value for every indicator: each per_geo_rval() Series becomes one
# column of `result`.
#
# `str.split()` without an argument splits on any whitespace and drops empty
# strings, so this no longer relies on `indicator_list` ending with exactly one
# newline (as the previous `split('\n')[:-1]` did).
res = [per_geo_rval(name) for name in indicator_list.split()]

result = pd.concat(res, axis=1)

In [28]:
# Export the per-geo r values (one column per indicator) to CSV.
# NOTE(review): absolute, user-specific output path -- adjust before running elsewhere.
result.to_csv('/Users/semio/Desktop/res.csv')

In [61]:
# Overall r values for each geo: first stack the gw/sg comparison frames of all
# indicators into one frame indexed by (geo, time, indicator).
#
# `assign` + `set_index(..., append=True)` adds the indicator name as an extra
# index level after the existing geo/time levels, without the reset/set round-trip.
# `str.split()` without an argument drops empty strings, so this no longer relies
# on `indicator_list` ending with exactly one newline.
res = [
    get_comp_df(name)
    .assign(indicator=name)
    .set_index('indicator', append=True)
    for name in indicator_list.split()
]

all_indicators_df = pd.concat(res)

In [68]:
# One r value per geo, computed over the (gw, sg) pairs of all indicators for that geo.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
geo_rvals = all_indicators_df.groupby(level=0).apply(lambda x: x.corr().loc['gw', 'sg'])

In [70]:
# Geos with the lowest overall r values, i.e. where the two datasets disagree most.
geo_rvals.sort_values().head()

geo
nru   -0.995431
abw    0.593955
vut    0.876868
gnq    0.904568
kir    0.905109
dtype: float64

For the full results, see https://docs.google.com/spreadsheets/d/1z6Qlmti2u-135ozclHeK9W7rZ5Rw0YGeI1fjcOPgl8E/edit#gid=945342426

In [34]:
# check if geo mapping is wrong in some low r-value cases

In [35]:
# Open the Gapminder geo entity domain dataset to inspect how geos are identified.
geo = dr.DDF('ddf--gapminder--geo_entity_domain')

In [36]:
# 'country' entry of the Gapminder geo dataset's entities; looked up by geo id below.
geo_gap = geo.get_entities()['country']

In [71]:
# Gapminder's record for 'abw', one of the geos with a low overall r value.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
geo_gap.loc['abw']

gwid                              i12
name                            Aruba
world_6region                 america
income_groups             high_income
landlocked                  coastline
g77_and_oecd_countries         others
main_religion_2008          christian
gapminder_list                  Aruba
alternative_1                     NaN
alternative_2                     NaN
alternative_3                     NaN
alternative_4_cdiac             Aruba
pandg                             NaN
god_id                             AW
alt_5                             NaN
upper_case_name                 ARUBA
iso3166_1_alpha2                   AW
iso3166_1_alpha3                  ABW
iso3166_1_numeric                 533
iso3166_2                         NaN
unicode_region_subtag              AW
arb1                              NaN
arb2                              NaN
arb3                              NaN
arb4                              NaN
arb5                              NaN
arb6        

In [38]:
# 'country' entry of the WDI dataset's entities, to compare against the Gapminder record.
geo_wdi = wdi.get_entities()['country']

In [72]:
# Print the index label of every WDI country row that contains the value 'Aruba'.
for country_id, row in geo_wdi.iterrows():
    if 'Aruba' not in row.values:
        continue
    print(country_id)

abw


In [73]:
# WDI's record for 'abw', to check that it refers to the same country as Gapminder's.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
geo_wdi.loc['abw']

country_code                                                                                       ABW
short_name                                                                                       Aruba
table_name                                                                                       Aruba
long_name                                                                                        Aruba
2_alpha_code                                                                                        AW
currency_unit                                                                            Aruban florin
special_notes                                        SNA data for 2000-2011 are updated from offici...
region                                                                       Latin America & Caribbean
income_group                                                                               High income
wb_2_code                                                                