In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Indicator file names, one per line; the spellings are kept exactly as given
# because DataAnalysis.get_data builds a path from the name it receives.
all_files = [
    'lifeExpectancyAtBirth',
    'HALElifeExpectancyAtBirth',
    'maternalMortalityRatio',
    'birthAttendedBySkilledPersonal',
    'infantMortalityRate',
    'neonatalMortalityRate',
    'under5MortalityRate',
    'incedenceOfMalaria',
    'incedenceOfTuberculosis',
    'hepatitusBsurfaceAntigen',
    'interventionAgianstNTDs',
    'newHivInfections',
    '30-70cancerChdEtc',
    'crudeSuicideRates',
    'AlcoholSubstanceAbuse',
    'roadTrafficDeaths',
    'reproductiveAgeWomen',
    'adolescentBirthRate',
    'uhcCoverage',
    'dataAvailibilityForUhc',
    'population10SDG3.8.2',
    'population25SDG3.8.2',
    'airPollutionDeathRate',
    'mortalityRateUnsafeWash',
    'mortalityRatePoisoning',
    'tobaccoAge15',
    'medicalDoctors',
    'nursingAndMidwife',
    'Dentists',
    'Pharmacists',
    'eliminateViolenceAgainstWomen',
    'basicDrinkingWaterServices',
    'atLeastBasicSanitizationServices',
    'safelySanitization',
    'basicHandWashing',
    'cleanFuelAndTech',
]

In [3]:
class DataAnalysis():
    """Small helpers for loading, cleaning and summarising the dataset CSV files."""

    def __init__(self):
        # Matches a bracketed numeric range such as "[10.1-14.5]".
        self.rebrackets = re.compile(r'\[[\d.]+\-[\d.]+\]')

    def get_data(self, fname):
        """Read ``dataset/<fname>.csv`` and return it as a DataFrame.

        Args:
            fname: File name without the ``.csv`` extension.

        Returns:
            The DataFrame produced by ``pd.read_csv``.
        """
        return pd.read_csv(os.path.join('dataset', f"{fname}.csv"))

    def remove_brackets(self, numstr):
        """Return the number that precedes a bracketed range in a string value.

        Args:
            numstr: Value to clean, e.g. ``"12.3 [10.1-14.5]"``.

        Returns:
            * ``numstr`` unchanged when it is not a string;
            * ``float`` of the text before the first bracketed range when one
              is found;
            * ``np.nan`` when the string holds no bracketed range (this also
              applies to plain numeric strings such as ``"12.3"``).

        Raises:
            ValueError: If the text before the bracketed range is not a
                valid float literal.
        """
        # Non-string values (already numeric, None, NaN, ...) pass through.
        if not isinstance(numstr, str):
            return numstr
        re_match = self.rebrackets.search(numstr)
        if re_match is None:
            return np.nan
        return float(numstr[:re_match.start()].strip())

    def column_info(self, df):
        """Print a short summary of every column in ``df``.

        For each column the number of distinct values is printed; numeric
        columns additionally get ``min ~ median ~ max`` and columns with
        fewer than 10 distinct values get a per-value row count.

        Args:
            df: DataFrame to summarise.
        """
        print(f"TOTAL - {len(df)}")
        for col in df.columns:
            # NOTE(review): set() may count each missing value separately;
            # confirm whether Series.nunique(dropna=False) is wanted instead.
            colset = set(df[col])
            colset_len = len(colset)
            print(f"{col}: {colset_len}")
            if pd.api.types.is_numeric_dtype(df[col]):
                print(f" - {df[col].min()} ~ {df[col].median()} ~ {df[col].max()}")
            if colset_len < 10:
                for colset_ele in colset:
                    # Vectorised count instead of iterating the Series in Python.
                    count = int((df[col] == colset_ele).sum())
                    print(f" - {colset_ele}: {count}")

    def index_change(self, df, cols=('Period',), values=(None,)):
        """Filter ``df`` column by column and index the result by ``'Location'``.

        ``cols`` and ``values`` are paired positionally. The filters are
        applied one after another, so each one sees only the rows kept by
        the previous ones.

        Args:
            df: DataFrame containing a ``'Location'`` column and every
                column named in ``cols``.
            cols: Column names to filter on.
            values: Value to keep for the matching column; ``None`` keeps the
                rows holding that column's maximum.

        Returns:
            The filtered DataFrame with ``'Location'`` as its index.
        """
        for col, value in zip(cols, values):
            # Identity check: `== None` can be overloaded by array-like values.
            target = df[col].max() if value is None else value
            df = df[df[col] == target]
        return df.set_index('Location')

In [4]:
# Helper instance reused by the cells below for loading and summarising files.
da = DataAnalysis()

In [5]:
# Load one indicator file and print its per-column summary.
df = da.get_data('cleanFuelAndTech')
da.column_info(df)
# Last expression of the cell, so the first rows are rendered as its output.
df.head()

TOTAL - 3610
Location: 191
Indicator: 1
 - Proportion of population with primary reliance on clean fuels and technologies (%): 3610
Period: 19
 - 2000 ~ 2009.0 ~ 2018
First Tooltip: 91
 - 5 ~ 81.0 ~ 95


Unnamed: 0,Location,Indicator,Period,First Tooltip
0,Afghanistan,Proportion of population with primary reliance...,2018,37
1,Afghanistan,Proportion of population with primary reliance...,2017,34
2,Afghanistan,Proportion of population with primary reliance...,2016,31
3,Afghanistan,Proportion of population with primary reliance...,2015,29
4,Afghanistan,Proportion of population with primary reliance...,2014,27


In [6]:
# (file name, columns to filter on, value to keep per column); a None value
# selects the rows holding that column's maximum.
snapshot_specs = [
    ('lifeExpectancyAtBirth', ['Period', 'Dim1'], [None, 'Both sexes']),
    ('cleanFuelAndTech', ['Period'], [None]),
    ('basicDrinkingWaterServices', ['Period'], [None]),
    ('atLeastBasicSanitizationServices', ['Period', 'Dim1'], [None, 'Total']),
]

for position, (indicator, filter_cols, filter_values) in enumerate(snapshot_specs):
    # Every heading except the first is preceded by a blank line.
    heading = indicator if position == 0 else f"\n{indicator}"
    print(heading)
    snapshot = da.index_change(da.get_data(indicator), filter_cols, filter_values)
    da.column_info(snapshot)

lifeExpectancyAtBirth
TOTAL - 183
Period: 1
 - 2019 ~ 2019.0 ~ 2019
 - 2019: 183
Indicator: 1
 - Life expectancy at birth (years): 183
Dim1: 1
 - Both sexes: 183
First Tooltip: 175
 - 50.75 ~ 73.74 ~ 84.26

cleanFuelAndTech
TOTAL - 190
Indicator: 1
 - Proportion of population with primary reliance on clean fuels and technologies (%): 190
Period: 1
 - 2018 ~ 2018.0 ~ 2018
 - 2018: 190
First Tooltip: 54
 - 5 ~ 84.0 ~ 95

basicDrinkingWaterServices
TOTAL - 190
Period: 1
 - 2017 ~ 2017.0 ~ 2017
 - 2017: 190
Indicator: 1
 - Population using at least basic drinking-water services (%): 190
First Tooltip: 144
 - 22.83 ~ 92.265 ~ 100.0

atLeastBasicSanitizationServices
TOTAL - 188
Indicator: 1
 - Population using at least basic sanitation services (%): 188
Period: 1
 - 2017 ~ 2017.0 ~ 2017
 - 2017: 188
Dim1: 1
 - Total: 188
First Tooltip: 167
 - 7.32 ~ 89.065 ~ 100.0
