# Inspect Data in CSV Files

In this Jupiter Notebook, we will try to understand the different columns in the 8 CSV files, located in the `covid-data` folder.

In [18]:
import pandas as pd

In [19]:
countries_aggregated = pd.read_csv("covid-data/countries-aggregated.csv", parse_dates=['Date'])
key_countries_pivoted = pd.read_csv("covid-data/key-countries-pivoted.csv", parse_dates=['Date'])
reference = pd.read_csv("covid-data/reference.csv")
time_series_19_covid_combined = pd.read_csv("covid-data/time-series-19-covid-combined.csv", parse_dates=['Date'])
us_confirmed = pd.read_csv("covid-data/us-confirmed.csv", parse_dates=['Date'])
us_deaths = pd.read_csv("covid-data/us-deaths.csv", parse_dates=['Date'])
us_simplified = pd.read_csv("covid-data/us-simplified.csv", parse_dates=['Date'])
worldwide_aggregate = pd.read_csv("covid-data/worldwide-aggregate.csv", parse_dates=['Date'])

In [20]:
# Countries Aggregated
countries_aggregated.info()
countries_aggregated.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161568 entries, 0 to 161567
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date       161568 non-null  datetime64[ns]
 1   Country    161568 non-null  object        
 2   Confirmed  161568 non-null  int64         
 3   Recovered  161568 non-null  int64         
 4   Deaths     161568 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 6.2+ MB


Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,0,0,0
1,2020-01-23,Afghanistan,0,0,0


In [None]:
import csv
import uuid

# Generate unique uuid for countries_aggregated

input_file = "covid-data/countries-aggregated.csv"
output_file = "covid-data/countries-aggregated-with-uuid.csv"

with open(input_file, 'r') as csvfile, open(output_file, 'w', newline='') as outfile:
    reader = csv.DictReader(csvfile)
    fieldnames = ['id'] + reader.fieldnames
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    
    writer.writeheader()
    for row in reader:
        row['id'] = str(uuid.uuid4())
        writer.writerow(row)

In [21]:
# Key Countries Pivoted
key_countries_pivoted.info()
key_countries_pivoted.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            816 non-null    datetime64[ns]
 1   China           816 non-null    int64         
 2   US              816 non-null    int64         
 3   United_Kingdom  816 non-null    int64         
 4   Italy           816 non-null    int64         
 5   France          816 non-null    int64         
 6   Germany         816 non-null    int64         
 7   Spain           816 non-null    int64         
 8   Iran            816 non-null    int64         
dtypes: datetime64[ns](1), int64(8)
memory usage: 57.5 KB


Unnamed: 0,Date,China,US,United_Kingdom,Italy,France,Germany,Spain,Iran
0,2020-01-22,548,1,0,0,0,0,0,0
1,2020-01-23,643,1,0,0,0,0,0,0


In [22]:
# Reference
reference.info()
reference.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4316 entries, 0 to 4315
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   UID             4316 non-null   int64  
 1   iso2            4311 non-null   object 
 2   iso3            4312 non-null   object 
 3   code3           4312 non-null   float64
 4   FIPS            3384 non-null   float64
 5   Admin2          3343 non-null   object 
 6   Province_State  4117 non-null   object 
 7   Country_Region  4316 non-null   object 
 8   Lat             4168 non-null   float64
 9   Long_           4168 non-null   float64
 10  Combined_Key    4316 non-null   object 
 11  Population      4165 non-null   float64
dtypes: float64(5), int64(1), object(6)
memory usage: 404.8+ KB


Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population
0,4,AF,AFG,4.0,,,,Afghanistan,33.93911,67.709953,Afghanistan,38928341.0
1,8,AL,ALB,8.0,,,,Albania,41.1533,20.1683,Albania,2877800.0


In [23]:
# Time Series 19 Covid Combined
time_series_19_covid_combined.info()
time_series_19_covid_combined.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231744 entries, 0 to 231743
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Date            231744 non-null  datetime64[ns]
 1   Country/Region  231744 non-null  object        
 2   Province/State  72624 non-null   object        
 3   Confirmed       231744 non-null  int64         
 4   Recovered       218688 non-null  float64       
 5   Deaths          231744 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 10.6+ MB


Unnamed: 0,Date,Country/Region,Province/State,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,,0,0.0,0
1,2020-01-23,Afghanistan,,0,0.0,0


In [24]:
# US Confirmed
us_confirmed.info()
us_confirmed.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2727072 entries, 0 to 2727071
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Admin2          object        
 1   Date            datetime64[ns]
 2   Case            int64         
 3   Country/Region  object        
 4   Province/State  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 104.0+ MB


Unnamed: 0,Admin2,Date,Case,Country/Region,Province/State
0,Autauga,2020-01-22,0,US,Alabama
1,Autauga,2020-01-23,0,US,Alabama


In [25]:
# US Deaths
us_deaths.info()
us_deaths.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2727072 entries, 0 to 2727071
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Admin2          object        
 1   Date            datetime64[ns]
 2   Case            int64         
 3   Country/Region  object        
 4   Province/State  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 104.0+ MB


Unnamed: 0,Admin2,Date,Case,Country/Region,Province/State
0,Autauga,2020-01-22,0,US,Alabama
1,Autauga,2020-01-23,0,US,Alabama


In [26]:
# US Simplified
us_simplified.info()
us_simplified.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2727072 entries, 0 to 2727071
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Date            datetime64[ns]
 1   Admin2          object        
 2   Province/State  object        
 3   Confirmed       int64         
 4   Deaths          int64         
 5   Country/Region  object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 124.8+ MB


Unnamed: 0,Date,Admin2,Province/State,Confirmed,Deaths,Country/Region
0,2020-01-22,Autauga,Alabama,0,0,US
1,2020-01-23,Autauga,Alabama,0,0,US


In [27]:
# Worldwide Aggregate
worldwide_aggregate.info()
worldwide_aggregate.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           816 non-null    datetime64[ns]
 1   Confirmed      816 non-null    int64         
 2   Recovered      816 non-null    int64         
 3   Deaths         816 non-null    int64         
 4   Increase rate  815 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 32.0 KB


Unnamed: 0,Date,Confirmed,Recovered,Deaths,Increase rate
0,2020-01-22,557,30,17,
1,2020-01-23,657,32,18,17.953321
