In [1]:
# Import Dependencies 
import pandas as pd
import numpy as np

In [2]:
# Read CSV File
owid_data = pd.read_csv("Resources/world_data.csv", low_memory=False)
owid_data.head()

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths,weekly_cases,weekly_deaths,biweekly_cases,biweekly_deaths
0,2019-12-31,Afghanistan,0.0,0.0,,,,,,
1,2020-01-01,Afghanistan,0.0,0.0,,,,,,
2,2020-01-02,Afghanistan,0.0,0.0,,,,,,
3,2020-01-03,Afghanistan,0.0,0.0,,,,,,
4,2020-01-04,Afghanistan,0.0,0.0,,,,,,


In [3]:
# Refine the DataFrame to only contain columns needed for analyses.
owid_data2 = owid_data[["date", "location", "new_cases", "new_deaths"
                        , "total_cases", "total_deaths"]]
owid_data2.head()

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths
0,2019-12-31,Afghanistan,0.0,0.0,,
1,2020-01-01,Afghanistan,0.0,0.0,,
2,2020-01-02,Afghanistan,0.0,0.0,,
3,2020-01-03,Afghanistan,0.0,0.0,,
4,2020-01-04,Afghanistan,0.0,0.0,,


In [4]:
# Generate a list containing only countries of interest.
countries = ["China", "Japan", "New Zealand", "United Kingdom", "Italy", "Sweden", "United States", "Brazil"]

# Create a variable to join all elements of our list with a passed delimiter. 
magic = "|".join(countries)

# Define the final DataFrame by using the refined DF above...
#...along with the variable for the joint countries list. 
owid_df = owid_data2[owid_data2["location"].str.contains(magic,regex=True)]
owid_df

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths
6779,2019-12-31,Brazil,0.0,0.0,,
6780,2020-01-01,Brazil,0.0,0.0,,
6781,2020-01-02,Brazil,0.0,0.0,,
6782,2020-01-03,Brazil,0.0,0.0,,
6783,2020-01-04,Brazil,0.0,0.0,,
...,...,...,...,...,...,...
50148,2020-10-24,United States Virgin Islands,3.0,0.0,1346.0,21.0
50149,2020-10-25,United States Virgin Islands,2.0,0.0,1348.0,21.0
50150,2020-10-26,United States Virgin Islands,0.0,0.0,1348.0,21.0
50151,2020-10-27,United States Virgin Islands,0.0,0.0,1348.0,21.0


In [5]:
# Drop the extra country included due to a matching name
owid_clean_df = owid_df.drop(owid_df[owid_df["location"]=="United States Virgin Islands"].index)
owid_clean_df

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths
6779,2019-12-31,Brazil,0.0,0.0,,
6780,2020-01-01,Brazil,0.0,0.0,,
6781,2020-01-02,Brazil,0.0,0.0,,
6782,2020-01-03,Brazil,0.0,0.0,,
6783,2020-01-04,Brazil,0.0,0.0,,
...,...,...,...,...,...,...
49929,2020-10-24,United States,85329.0,953.0,8493669.0,223995.0
49930,2020-10-25,United States,83056.0,904.0,8576725.0,224899.0
49931,2020-10-26,United States,59440.0,331.0,8636165.0,225230.0
49932,2020-10-27,United States,68359.0,505.0,8704524.0,225735.0


In [8]:
# Replace all missing values with 0.0  ????????????? Cell runs but NaN's remain!!!
owid_clean_df.replace(np.nan, 0.0)
owid_clean_df

Unnamed: 0,date,location,new_cases,new_deaths,total_cases,total_deaths
6779,2019-12-31,Brazil,0.0,0.0,,
6780,2020-01-01,Brazil,0.0,0.0,,
6781,2020-01-02,Brazil,0.0,0.0,,
6782,2020-01-03,Brazil,0.0,0.0,,
6783,2020-01-04,Brazil,0.0,0.0,,
...,...,...,...,...,...,...
49929,2020-10-24,United States,85329.0,953.0,8493669.0,223995.0
49930,2020-10-25,United States,83056.0,904.0,8576725.0,224899.0
49931,2020-10-26,United States,59440.0,331.0,8636165.0,225230.0
49932,2020-10-27,United States,68359.0,505.0,8704524.0,225735.0


In [7]:
# Save clean DataFrame to a CSV file
owid_clean_df.to_csv("output_data/owid_clean.csv", index=False)