# Chapter 4 - Missing Values

In [183]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [184]:
# Read the COVID totals CSV into a DataFrame, then print a summary of its
# columns (non-null counts and dtypes) to see where values are missing.
covidtotals = pd.read_csv(filepath_or_buffer="data/covidtotalswithmissings.csv")
covidtotals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 12 columns):
iso_code           210 non-null object
lastdate           210 non-null object
location           210 non-null object
total_cases        210 non-null int64
total_deaths       210 non-null int64
total_cases_pm     209 non-null float64
total_deaths_pm    209 non-null float64
population         210 non-null float64
pop_density        198 non-null float64
median_age         186 non-null float64
gdp_per_capita     182 non-null float64
hosp_beds          164 non-null float64
dtypes: float64(7), int64(2), object(3)
memory usage: 19.8+ KB


In [185]:
# Column groups reused by the missing-value checks below.

# Location label plus the case/death totals (raw and per-million "_pm" columns).
totvars = [
    "location",
    "total_cases",
    "total_deaths",
    "total_cases_pm",
    "total_deaths_pm",
]

# Population and other country-level descriptive columns.
demovars = [
    "population",
    "pop_density",
    "median_age",
    "gdp_per_capita",
    "hosp_beds",
]
 

## Check missing values (isnull) - demovars

In [186]:
# Number of missing values in each demographic column
covidtotals.loc[:, demovars].isna().sum(axis=0)

population         0
pop_density       12
median_age        24
gdp_per_capita    28
hosp_beds         46
dtype: int64

In [187]:
# Number of missing demographic values in each row, followed by how many
# rows have each count of missing values.
demovarsmisscnt = covidtotals.loc[:, demovars].isna().sum(axis="columns")
demovarsmisscnt.value_counts()

0    156
1     24
2     12
3     10
4      8
dtype: int64

In [188]:
# Rows missing 3 or more of the demographic variables: show the first five,
# transposed so that each selected row becomes a column.
(
    covidtotals
    .loc[demovarsmisscnt >= 3, ["location", *demovars]]
    .head()
    .T
)

Unnamed: 0,3,5,24,28,64
location,Andorra,Anguilla,Bonaire Sint Eustatius and Saba,British Virgin Islands,Faeroe Islands
population,77265,15002,26221,30237,48865
pop_density,163.755,,,207.973,35.308
median_age,,,,,
gdp_per_capita,,,,,
hosp_beds,,,,,


## Check missing values (isnull) - totvars

In [189]:
# Number of missing values in each case/death column
covidtotals.loc[:, totvars].isna().sum(axis=0)

location           0
total_cases        0
total_deaths       0
total_cases_pm     1
total_deaths_pm    1
dtype: int64

In [190]:
# Number of missing case/death values in each row, followed by how many
# rows have each count of missing values.
totvarsmisscnt = covidtotals.loc[:, totvars].isna().sum(axis="columns")
totvarsmisscnt.value_counts()

0    209
2      1
dtype: int64

In [191]:
# Full record (transposed) of every row with at least one missing case/death value
covidtotals.loc[totvarsmisscnt > 0, :].T

Unnamed: 0,87
iso_code,HKG
lastdate,2020-05-26
location,Hong Kong
total_cases,0
total_deaths,0
total_cases_pm,
total_deaths_pm,
population,7.49699e+06
pop_density,7039.71
median_age,44.8


In [198]:
# Fill missing per-million rates by recomputing them from the raw totals and
# the population (total / (population / 1,000,000)).
#
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on covidtotals["col"]: that chained in-place form
# acts on an intermediate object, raises a FutureWarning on pandas 2.x and
# leaves the DataFrame unchanged under Copy-on-Write (default in pandas 3.0).
population_millions = covidtotals["population"] / 1000000
covidtotals["total_cases_pm"] = covidtotals["total_cases_pm"].fillna(
    covidtotals["total_cases"] / population_millions
)
covidtotals["total_deaths_pm"] = covidtotals["total_deaths_pm"].fillna(
    covidtotals["total_deaths"] / population_millions
)

In [200]:
# Re-display the Hong Kong record (transposed) to check the per-million columns
covidtotals.loc[covidtotals["iso_code"] == "HKG", :].T

Unnamed: 0,87
iso_code,HKG
lastdate,2020-05-26
location,Hong Kong
total_cases,0
total_deaths,0
total_cases_pm,0
total_deaths_pm,0
population,7.49699e+06
pop_density,7039.71
median_age,44.8
