# Examples

Let's see if we can answer some questions with Pandas.

In [None]:
import pandas as pd

- What country has the greatest total hand washing percentage?
- What country has the lowest?
- What country has the greatest difference in percentage between their Urban and Rural locations?
- What country has the lowest difference in percentage between their Urban and Rural locations?
- What country has seen the greatest increase in total percentage in consecutive years?
- What is the average "Total" percentage over the past 5 years?

In [None]:
# Dataset: population with basic handwashing facilities at home (%).
# CSV rows 0 and 2 are combined into a two-level column header (later cells
# address columns with pairs such as ('2015', 'Total')); the first CSV column
# becomes the row index.
df = pd.read_csv('WSH_HYGIENE_BASIC.csv', index_col=0, header=[0, 2])
df.head()

In [None]:
# What country has the greatest total hand washing percentage?
# Start by looking at every column filed under the top-level label '2015'.
df['2015'].head()

In [None]:
# A (level-0, level-1) tuple picks one column out of the two-level header.
df[('2015', 'Total')].head()

In [None]:
# .max() returns only the value, not the country it belongs to -- and several
# countries could share that value.
df[('2015', 'Total')].max()

In [None]:
# Boolean mask: True for every row whose 2015 total equals the column maximum
# (so ties are all kept). Indexing with the mask keeps the row labels visible.
slice_ = df[('2015', 'Total')].eq(df[('2015', 'Total')].max())
df[('2015', 'Total')].loc[slice_]

In [None]:
# Lowest? Same masking approach, this time against the column minimum.
slice_ = df[('2015', 'Total')].eq(df[('2015', 'Total')].min())
df[('2015', 'Total')].loc[slice_]

In [None]:
# What country has the greatest difference in percentage between their Urban and Rural locations?
# Look at the 2015 columns again to see what is available.
df['2015'].head()

In [None]:
# Urban minus Rural for 2015, row by row. The result is stored under a new name
# so the original df is not overwritten. Note that subtracting two columns
# yields a Series, even though the variable carries a df_ prefix.
df_diff = df[('2015', 'Urban')].sub(df[('2015', 'Rural')])
df_diff.head()

In [None]:
# Row(s) whose Urban - Rural value equals the largest one.
df_diff.loc[df_diff.eq(df_diff.max())]

In [None]:
# Row(s) whose Urban - Rural value equals the smallest one.
# São Tomé and Príncipe is a West African Island
# NOTE(review): this is the smallest *signed* difference; if Rural can exceed
# Urban, the smallest gap would need df_diff.abs() instead -- confirm intent.
df_diff.loc[df_diff.eq(df_diff.min())]

In [None]:
# What country has seen the greatest increase in total percentage in consecutive years?
# Column selector: any level-0 label, paired with level-1 label 'Total'.
# This tuple is exactly what pd.IndexSlice[:, 'Total'] evaluates to.
index = (slice(None), 'Total')
df_totals = df.loc[:, index]
df_totals.head()

In [None]:
# Flatten the column header: every remaining column is a 'Total' column, so the
# second header level carries no information and is dropped.
# The frame is rebuilt from `df` instead of re-assigning df_totals from itself,
# so the cell gives the same result however many times it is run (calling
# droplevel again on an already single-level header raises).
df_totals = df.loc[:, pd.IndexSlice[:, 'Total']].droplevel(1, axis=1)
df_totals.head()

In [None]:
# New method: diff(axis=1) subtracts, within each row, the column to the left
# from each column (the first column has nothing to its left and becomes NaN).
# Keep in mind that a negative value here indicates a positive change over time
# (presumably because the year columns run newest to oldest -- verify on the output).
df_totals.diff(periods=1, axis='columns').head()

In [None]:
# Most negative column-to-column step in each row.
df_totals.diff(axis='columns').min(axis='columns').head()

In [None]:
# One value per row: the most negative column-to-column step in that row.
# (The result is a Series, despite the df_ prefix.)
df_max_diff = df_totals.diff(axis='columns').min(axis='columns')
df_max_diff.head()

In [None]:
# Keep only the row(s) whose value equals the overall minimum of df_max_diff.
series_min = df_max_diff.loc[df_max_diff.eq(df_max_diff.min())]
series_min.head()

In [None]:
# The row label(s) of series_min, i.e. the label(s) in df_max_diff whose value
# equals its minimum.
series_min.index

In [None]:
# Full row(s) of column-to-column differences for the label(s) in series_min.
df_totals.diff(axis='columns').loc[series_min.index, :]

In [None]:
# Same selection, transposed: the former column labels become the rows and the
# label(s) from series_min become the column(s).
df_madagascar = df_totals.diff(axis='columns').loc[series_min.index, :].transpose()
df_madagascar.head()

In [None]:
# Element-wise comparison; series_min is matched against the column labels.
df_madagascar.eq(series_min).head()

In [None]:
# Indexing a DataFrame with a boolean DataFrame keeps the shape and puts NaN
# wherever the mask is False -- hence the NaNs instead of a single row.
df_madagascar.where(df_madagascar == series_min)

In [None]:
# Extra work since series_min is not a single number and df_madagascar is a
# fully fledged DataFrame: pull out one scalar and one column, then use an
# ordinary boolean row mask.
# Both the column label and the scalar are read from series_min (its first
# entry) instead of hard-coding a country name or averaging the values to
# obtain a number, so the cell keeps working if the data changes.
top_country = series_min.index[0]
steepest_change = series_min.iloc[0]
df_madagascar[df_madagascar.loc[:, top_country] == steepest_change]

In [None]:
# What is the average "Total" percentage over the past 5 years?
# Label-based slice of the columns from '2015' through '2011'; with .loc both
# end labels are included.
df_five_year = df_totals.loc[:, slice('2015', '2011')]
df_five_year.head()

In [None]:
# Mean across the columns, i.e. one average per row.
df_five_year.mean(axis='columns').head()

In [None]:
# Hmm, this does not fill the NaNs: when given a Series, DataFrame.fillna looks
# up the fill value for each *column* by column label, but this Series is
# labelled by row.
df_five_year.fillna(value=df_five_year.mean(axis='columns')).head()

In [None]:
# Asking fillna to work along axis=1 with a Series is not the answer either:
# pandas raises NotImplementedError. The error is the point of this cell, so it
# is caught and shown rather than left to abort a Restart & Run All.
try:
    filled_rowwise = df_five_year.fillna(df_five_year.mean(axis=1), axis=1).head()
except NotImplementedError as err:
    filled_rowwise = f'NotImplementedError: {err}'
filled_rowwise

In [None]:
# Workaround: transpose so each original row becomes a column, fill every
# column's NaNs with that column's mean, then transpose back.
# Still some NaNs, but only where the whole row had no value at all.
# Let's remove those next.
df_five_year.T.pipe(lambda flipped: flipped.fillna(flipped.mean())).T

In [None]:
# Fill each row's NaNs with that row's mean (via the transpose trick), then
# drop every row that still contains a NaN.
df_cleaned = (
    df_five_year.T
    .pipe(lambda flipped: flipped.fillna(flipped.mean()))
    .T
    .dropna(axis='index', how='any')
)
df_cleaned.head()

In [None]:
# Mean of each column over all remaining rows. Looks like we are trending up.
df_cleaned.mean(axis='index')

In [None]:
%matplotlib inline

df_cleaned.sort_index(axis=1).mean().plot()