# Reading NIH data file
Initially, we need to load the data so we are able to look more into it.

In [1]:
import pandas as pd

dfAfrica = pd.read_csv("datasets/africa.csv", encoding="utf-8-sig")
dfAmericas = pd.read_csv("datasets/americas.csv", encoding="utf-8-sig")
dfEMed = pd.read_csv("datasets/mediterranean.csv", encoding="utf-8-sig")
dfEurope = pd.read_csv("datasets/europe.csv", encoding="utf-8-sig")
dfSEAsia = pd.read_csv("datasets/asia.csv", encoding="utf-8-sig")
dfWPacific = pd.read_csv("datasets/pacific.csv", encoding="utf-8-sig")

df = [dfAfrica, dfAmericas, dfEMed, dfEurope, dfSEAsia, dfWPacific]

result = pd.concat(df)

print result.head()

               GHO PUBLISHSTATE  YEAR REGION    AGEGROUP   SEX  Display Value  \
0  LIFE_0000000033    PUBLISHED  1990    AFR  AGE100PLUS  BTSX           60.0   
1  LIFE_0000000035    PUBLISHED  2013    AFR    AGE85-89  BTSX            4.1   
2  LIFE_0000000035    PUBLISHED  2000    AFR    AGE55-59  BTSX           18.8   
3  LIFE_0000000032    PUBLISHED  2012    AFR    AGE45-49  FMLE         3411.0   
4  LIFE_0000000035    PUBLISHED  1990    AFR    AGE25-29   MLE           38.8   

      Numeric  Low  High  Comments  
0    60.01235  NaN   NaN       NaN  
1     4.14085  NaN   NaN       NaN  
2    18.76440  NaN   NaN       NaN  
3  3410.96100  NaN   NaN       NaN  
4    38.79806  NaN   NaN       NaN  


We got data. Next step is cleaning certain columns, including:
AGEGROUP, SEX, GHO (values for specific metrics, explained below), and we drop PUBLISHSTATE, Low (empty), High (also empty), Comments, and Display Value (rounded Numeric column).

In [2]:
def cleanData(data):
    data.AGEGROUP.replace(("AGE100PLUS", "AGE95-99", "AGE90-94", "AGE85-89", 
                             "AGE80-84", "AGE75-79", "AGE70-74", "AGE65-69", 
                             "AGE60-64", "AGE55-59", "AGE50-54", "AGE45-49", 
                             "AGE40-44", "AGE35-39", "AGE30-34", "AGE25-29", 
                             "AGE20-24", "AGE15-19", "AGE10-14", "AGE5-9", 
                             "AGE1-4", "AGELT1"), 
                          (100, 95, 90, 85, 80, 75, 70, 65, 60, 55, 50, 
                             45, 40, 35, 30, 25, 20, 15, 10, 5, 1, 0.5), 
                          inplace = True)
    
    data.SEX.replace(("BTSX", "FMLE", "MLE"), (2, 1, 0), inplace = True)
    
    # V not a continuous value, just for ease of referencing this column
    data.GHO.replace(("LIFE_0000000029", "LIFE_0000000030", "LIFE_0000000031", 
                      "LIFE_0000000032", "LIFE_0000000033", "LIFE_0000000034",
                      "LIFE_0000000035"),
                     (29, 30, 31, 32, 33, 34, 35), inplace = True)
    
    data.drop(["PUBLISHSTATE", "Low", "High", "Comments", "Display Value"], 
              inplace = True, axis = 1)
    
# based on Mack's code, it's golden
def regionCatSeparate(region, cat):
#     get dataframe consisting of onle both values for sexes
    bothSex = result[result.SEX == 2]
#     create a new dataframe with only the category
    GHOval = bothSex[bothSex.GHO == cat]
#     sort the ages from low to high
    GHOval.sort_index(by = ["AGEGROUP"], ascending = True, inplace = True)
#     create a new dataframe with only the certain region's data
    regionVal = GHOval[GHOval.REGION == region]

    return regionVal

# I want a list of all dataframes for easy iteration, building that now
def listData(data):
    dataframe = [[], [], [], [], [], [], []]
    region = ["AFR", "AMR", "SEAR", "EUR", "EMR", "WPR"]

    for cat in range(7):
        for area in region:
            dataframe[cat].append(regionCatSeparate(area, (cat + 29)))

    return dataframe

Meanings of values in GHO column:

29  nMx - age-specific death rate between ages x and x+n

30  nqx - probability of dying between ages x and x+n

31  lx - number of people left alive at age x

32  ndx - number of people dying between ages x and x+n

33  nLx - person-years lived between ages x and x+n

34  Tx - person-years lived above age x

35  ex - expectation of life at age x

In [3]:
cleanData(result)
allData = listData(cleanData)
# now have everything in one huge list
print allData

[[      GHO  YEAR REGION  AGEGROUP  SEX  Numeric
1008   29  2012    AFR       0.5    2  0.06553
231    29  2013    AFR       0.5    2  0.06384
1073   29  1990    AFR       0.5    2  0.11275
859    29  2000    AFR       0.5    2  0.10009
837    29  2012    AFR       1.0    2  0.00902
1602   29  2000    AFR       1.0    2  0.01722
327    29  2013    AFR       1.0    2  0.00858
1186   29  1990    AFR       1.0    2  0.02008
1711   29  2000    AFR       5.0    2  0.00596
1681   29  2013    AFR       5.0    2  0.00416
1831   29  2012    AFR       5.0    2  0.00426
336    29  1990    AFR       5.0    2  0.00667
166    29  1990    AFR      10.0    2  0.00345
1204   29  2000    AFR      10.0    2  0.00312
868    29  2013    AFR      10.0    2  0.00255
1004   29  2012    AFR      10.0    2  0.00261
280    29  1990    AFR      15.0    2  0.00412
1766   29  2012    AFR      15.0    2  0.00308
1604   29  2000    AFR      15.0    2  0.00368
1613   29  2013    AFR      15.0    2  0.00302
355    29  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace)


From this point and this data, we now want to plot the different values on a world map. The code below was from a tutorial (found here: http://scitools.org.uk/cartopy/docs/latest/tutorials/using_the_shapereader.html) and the map files can be found at http://thematicmapping.org/downloads/world_borders.php.

WIP :: Map downloads are timing out from gadm.org, so unable to plot

In [None]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import cartopy.io.shapereader as shpreader
import itertools
import numpy as np

shapename = 'admin_0_countries'
countries_shp = shpreader.natural_earth(resolution='110m',
                                        category='cultural', name=shapename)

# some nice "earthy" colors
earth_colors = np.array([(199, 233, 192),
                                (161, 217, 155),
                                (116, 196, 118),
                                (65, 171, 93),
                                (35, 139, 69),
                                ]) / 255.
earth_colors = itertools.cycle(earth_colors)



ax = plt.axes(projection=ccrs.PlateCarree())
for country in shpreader.Reader(countries_shp).records():
    print country.attributes['name_long'], earth_colors.next()
    ax.add_geometries(country.geometry, ccrs.PlateCarree(),
                      facecolor=earth_colors.next(),
                      label=country.attributes['name_long'])

plt.show()

Afghanistan [ 0.78039216  0.91372549  0.75294118]
Angola [ 0.45490196  0.76862745  0.4627451 ]
Albania [ 0.1372549   0.54509804  0.27058824]
United Arab Emirates [ 0.63137255  0.85098039  0.60784314]
Argentina [ 0.25490196  0.67058824  0.36470588]
Armenia [ 0.78039216  0.91372549  0.75294118]
Antarctica [ 0.45490196  0.76862745  0.4627451 ]
French Southern and Antarctic Lands [ 0.1372549   0.54509804  0.27058824]
Australia [ 0.63137255  0.85098039  0.60784314]
Austria [ 0.25490196  0.67058824  0.36470588]
Azerbaijan [ 0.78039216  0.91372549  0.75294118]
Burundi [ 0.45490196  0.76862745  0.4627451 ]
Belgium [ 0.1372549   0.54509804  0.27058824]
Benin [ 0.63137255  0.85098039  0.60784314]
Burkina Faso [ 0.25490196  0.67058824  0.36470588]
Bangladesh [ 0.78039216  0.91372549  0.75294118]
Bulgaria [ 0.45490196  0.76862745  0.4627451 ]
Bahamas [ 0.1372549   0.54509804  0.27058824]
Bosnia and Herzegovina [ 0.63137255  0.85098039  0.60784314]
Belarus [ 0.25490196  0.67058824  0.36470588]
Beli

axes property.  A removal date has not been set.


UnicodeDecodeError: 'ascii' codec can't decode byte 0xf4 in position 1: ordinal not in range(128)