## Import the libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load the data

In [6]:
df = pd.read_csv("raw_data/covidtotalswithmissings.csv")


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   iso_code         210 non-null    object 
 1   lastdate         210 non-null    object 
 2   location         210 non-null    object 
 3   total_cases      210 non-null    int64  
 4   total_deaths     210 non-null    int64  
 5   total_cases_pm   209 non-null    float64
 6   total_deaths_pm  209 non-null    float64
 7   population       210 non-null    float64
 8   pop_density      198 non-null    float64
 9   median_age       186 non-null    float64
 10  gdp_per_capita   182 non-null    float64
 11  hosp_beds        164 non-null    float64
dtypes: float64(7), int64(2), object(3)
memory usage: 19.8+ KB


## Create two lists for total related columns and demographic related columns

In [11]:
total_cols = ['location', 'total_cases', 'total_deaths', 'total_cases_pm', 'total_deaths_pm']

dmg_cols = ['population', 'pop_density', 'median_age', 'gdp_per_capita', 'hosp_beds']


## Find missing values in demographic data

In [15]:
df[dmg_cols].isnull().sum()

population         0
pop_density       12
median_age        24
gdp_per_capita    28
hosp_beds         46
dtype: int64

## Set the axis=1 to get the number of demographic variables that are missing for each country (Missing values accross rows)

In [30]:
dmg_cols_miss_count = df[dmg_cols].isnull().sum(axis=1)\

dmg_cols_miss_count

0      0
1      0
2      0
3      3
4      1
      ..
205    0
206    3
207    0
208    0
209    0
Length: 210, dtype: int64

## Get the value counts for the dmg columns

In [25]:
dmg_cols_miss_count.value_counts()

0    156
1     24
2     12
3     10
4      8
Name: count, dtype: int64

## List the countries with 3 or more missing values for the demographic columns

In [28]:
df.loc[dmg_cols_miss_count >=3, ['location'] + dmg_cols].head(5)

Unnamed: 0,location,population,pop_density,median_age,gdp_per_capita,hosp_beds
3,Andorra,77265.0,163.755,,,
5,Anguilla,15002.0,,,,
24,Bonaire Sint Eustatius and Saba,26221.0,,,,
28,British Virgin Islands,30237.0,207.973,,,
64,Faeroe Islands,48865.0,35.308,,,


## Check Covid Case data (total_cols) for missing values

In [34]:
df[total_cols].isnull().sum(axis=0)

location           0
total_cases        0
total_deaths       0
total_cases_pm     1
total_deaths_pm    1
dtype: int64

In [36]:
total_cols_miss_count = df[total_cols].isnull().sum(axis =1)

total_cols_miss_count

0      0
1      0
2      0
3      0
4      0
      ..
205    0
206    0
207    0
208    0
209    0
Length: 210, dtype: int64

In [38]:
total_cols_miss_count.value_counts()

0    209
2      1
Name: count, dtype: int64

In [41]:
df.loc[total_cols_miss_count>0]

Unnamed: 0,iso_code,lastdate,location,total_cases,total_deaths,total_cases_pm,total_deaths_pm,population,pop_density,median_age,gdp_per_capita,hosp_beds
87,HKG,2020-05-26,Hong Kong,0,0,,,7496988.0,7039.714,44.8,56054.92,


## Fixing the missing values with fillna() method

In [44]:
df.total_cases_pm.fillna(df.total_cases/(df.population/1000000), inplace = True)

df.total_deaths_pm.fillna(df.total_deaths/(df.population/1000000), inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.total_cases_pm.fillna(df.total_cases/(df.population/1000000), inplace = True)


In [48]:
df[total_cols].isnull().sum(axis =0)

location           0
total_cases        0
total_deaths       0
total_cases_pm     0
total_deaths_pm    0
dtype: int64