In [1]:
### automating pull from git
import git
import os
from glob import glob
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import shutil

### Data Source: https://github.com/CSSEGISandData/COVID-19

Johns Hopkins COVID research GitHub repo

In [2]:
def gitpull(git_dir, clone_dir, git_repo):
    """Clone ``git_repo`` into ``clone_dir`` if it is absent, otherwise pull it.

    Parameters
    ----------
    git_dir : str
        Path of the local checkout (e.g. ``'../COVID-19'``). Its final path
        component is the directory name looked for inside ``clone_dir``.
    clone_dir : str
        Directory in which the repository is cloned when no checkout exists.
    git_repo : str
        Remote URL passed to ``git clone``.

    Returns
    -------
    None
        The text returned by the git command is printed.
    """
    # normpath strips a trailing separator, so '../COVID-19/' still yields
    # 'COVID-19' instead of the empty string that split('/')[-1] would give
    # (which would trigger a clone on top of an existing checkout).
    repo = os.path.basename(os.path.normpath(git_dir))
    if repo not in os.listdir(clone_dir):
        clone_path = os.path.abspath(clone_dir)
        print(f"Cloning {git_repo} repo")
        print(f"at the path {clone_path}")
        print(git.Git(clone_path).clone(git_repo))
    else:
        # Checkout already present: fast-forward it in place.
        g = git.cmd.Git(git_dir)
        print(g.pull())
# Parent directory for clones, remote URL, and local checkout path of the
# CSSEGISandData/COVID-19 repository.
clone_dir = '../'
git_repo = 'https://github.com/CSSEGISandData/COVID-19.git'
git_dir = '../COVID-19'

# Clone on first run, pull on later runs.
gitpull(git_dir=git_dir, clone_dir=clone_dir, git_repo=git_repo)

Updating 7b8a6cb4..865c933c
Fast-forward
 csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv   |    3 +
 .../csse_covid_19_daily_reports/04-03-2020.csv     | 2626 ++++++++
 .../csse_covid_19_daily_reports/04-04-2020.csv     | 2680 ++++++++
 .../csse_covid_19_daily_reports/04-05-2020.csv     | 2765 +++++++++
 .../time_series_covid19_confirmed_US.csv           | 6508 ++++++++++----------
 .../time_series_covid19_confirmed_global.csv       |  522 +-
 .../time_series_covid19_deaths_US.csv              | 6508 ++++++++++----------
 .../time_series_covid19_deaths_global.csv          |  522 +-
 .../time_series_covid19_recovered_global.csv       |  494 +-
 .../who_covid_19_sit_rep_time_series.csv           |  489 +-
 10 files changed, 15617 insertions(+), 7500 deletions(-)
 create mode 100644 csse_covid_19_data/csse_covid_19_daily_reports/04-03-2020.csv
 create mode 100644 csse_covid_19_data/csse_covid_19_daily_reports/04-04-2020.csv
 create mode 100644 csse_covid_19_data/csse_covid_19_daily_repo

### All data files

In [3]:
# Glob pattern matching every daily-report CSV inside the local COVID-19 checkout.
data_path = '../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/*.csv'

# Expand the pattern into a list of file paths (glob does not guarantee any ordering).
all_paths = glob(data_path)

In [4]:
def _report_sort_key(path):
    """Return a (year, month, day) sort key for a 'MM-DD-YYYY.csv' report path."""
    stem = os.path.splitext(os.path.basename(path))[0]
    parts = stem.split('-')
    if len(parts) == 3:
        month, day, year = parts
        return (year, month, day)
    # Unexpected file name: place it before the dated reports, ordered by name.
    return ('', '', stem)


# glob() returns paths in arbitrary order, but the later
# drop_duplicates(..., keep='last') depends on row order. Reading the daily
# reports chronologically makes the combined frame reproducible.
li = [
    pd.read_csv(filename, index_col=None, header=0)
    for filename in sorted(all_paths, key=_report_sort_key)
]

# Stack all daily reports into one frame with a fresh 0..n-1 index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [5]:
# (col_a, col_b) pairs of column names that hold the same field under two
# different headers. Each pair is passed to replace_nan_col_a, which keeps
# col_a (filled from col_b where it is null) and drops col_b.
col_pairs = [
    ('Province/State', 'Province_State'),
    ('Country/Region', 'Country_Region'), 
    ('Last Update', 'Last_Update'),
    ('Latitude', 'Lat'),
    ('Longitude', 'Long_')
]

### Data Cleaning

In [6]:
### correct the columns in data frame

def replace_nan_col_a(df, col_a, col_b):
    """Merge ``col_b`` into ``col_a`` in place, leaving only ``col_a``.

    Where ``col_a`` is null its value is taken from ``col_b``; ``col_b`` is
    then dropped from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col_a : str
        Name of the column that is kept.
    col_b : str
        Name of the column whose values fill the gaps in ``col_a``.

    Returns
    -------
    None
        ``df`` is mutated; nothing is returned.
    """
    if col_b not in df.columns:
        # Only one naming scheme is present (or the merge already ran):
        # there is nothing to fold in.
        return
    if col_a not in df.columns:
        # Only the alternate name exists: expose it under the kept name.
        df.rename(columns={col_b: col_a}, inplace=True)
        return
    df[col_a] = np.where(pd.isnull(df[col_a]), df[col_b], df[col_a])
    df.drop(col_b, axis=1, inplace=True)

In [7]:
# Fold every alternate-named column into its counterpart on `frame`.
for kept_col, alt_col in col_pairs:
    replace_nan_col_a(frame, kept_col, alt_col)

### Data Types

In [8]:
# Summarise column dtypes and non-null counts of the combined frame.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53995 entries, 0 to 53994
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  48153 non-null  object 
 1   Country/Region  53995 non-null  object 
 2   Last Update     53995 non-null  object 
 3   Confirmed       53976 non-null  float64
 4   Deaths          53554 non-null  float64
 5   Recovered       53607 non-null  float64
 6   Latitude        50948 non-null  float64
 7   Longitude       50948 non-null  float64
 8   FIPS            42154 non-null  float64
 9   Admin2          42514 non-null  object 
 10  Active          46378 non-null  float64
 11  Combined_Key    46378 non-null  object 
dtypes: float64(7), object(5)
memory usage: 4.9+ MB


In [9]:
# Convert the 'Last Update' column (object dtype after loading) to datetime values.
frame['Last Update'] = pd.to_datetime(frame['Last Update'])

In [10]:
# Re-check the dtypes to confirm 'Last Update' is now a datetime column.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53995 entries, 0 to 53994
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  48153 non-null  object        
 1   Country/Region  53995 non-null  object        
 2   Last Update     53995 non-null  datetime64[ns]
 3   Confirmed       53976 non-null  float64       
 4   Deaths          53554 non-null  float64       
 5   Recovered       53607 non-null  float64       
 6   Latitude        50948 non-null  float64       
 7   Longitude       50948 non-null  float64       
 8   FIPS            42154 non-null  float64       
 9   Admin2          42514 non-null  object        
 10  Active          46378 non-null  float64       
 11  Combined_Key    46378 non-null  object        
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 4.9+ MB


### Duplicates

In [11]:
### Drop rows that repeat the same (country, province, last update, Admin2) key,
### keeping the last occurrence. Original rationale: the last one has longitude
### and latitude. NOTE(review): which row is "last" depends on the order in which
### the daily reports were concatenated; confirm that order is deterministic.
df = frame.drop_duplicates(['Country/Region', 'Province/State', 'Last Update', 'Admin2'], keep='last')

In [12]:
# Create data/master_data (and the parent 'data' directory) if missing.
# exist_ok=True makes the cell safe to re-run and avoids the listdir lookups.
os.makedirs('data/master_data', exist_ok=True)

# Persist the de-duplicated frame.
df.to_csv('data/master_data/john_hopkins_research.csv')

## Data Source: https://github.com/nychealth/coronavirus-data

NYC health Coronavirus

In [13]:
# Parent directory for clones, remote URL, and local checkout path of the
# nychealth/coronavirus-data repository.
clone_dir = '../'
git_repo = 'https://github.com/nychealth/coronavirus-data.git'
git_dir = '../coronavirus-data'

# Clone on first run, pull on later runs.
gitpull(git_dir=git_dir, clone_dir=clone_dir, git_repo=git_repo)

Updating 10efdd3..98a7fd1
Fast-forward
 boro.csv            |  12 +-
 by-age.csv          |  12 +-
 by-sex.csv          |   6 +-
 case-hosp-death.csv |  55 ++++----
 summary.csv         |   8 +-
 testing.csv         |  41 ------
 tests-by-zcta.csv   | 358 ++++++++++++++++++++++++++--------------------------
 7 files changed, 227 insertions(+), 265 deletions(-)
 delete mode 100644 testing.csv


### list all the files in the dataset

In [14]:
# Show the entries of the local checkout that git_dir currently points to.
os.listdir(git_dir)

['.git',
 'boro.csv',
 'by-age.csv',
 'by-sex.csv',
 'case-hosp-death.csv',
 'README.md',
 'summary.csv',
 'tests-by-zcta.csv']

In [15]:
### create new_york directory in master data
# The original path began with '7/' instead of 'data/', so the mkdir targeted
# a directory tree that does not exist. makedirs with exist_ok=True also
# replaces the listdir check and is safe to re-run.
os.makedirs('data/master_data/new_york', exist_ok=True)

In [16]:
# Copy every CSV at the top level of the checkout into the master data folder.
all_csvs = glob(git_dir + '/*.csv')
dest_dir = 'data/master_data/new_york/'
# Ensure the destination exists so this cell does not depend on an earlier mkdir.
os.makedirs(dest_dir, exist_ok=True)
for f in all_csvs:
    # os.path.basename understands the platform's path separators, replacing
    # the manual split on '\\' or '/'.
    file_name = os.path.basename(f)
    file_path = os.path.join(dest_dir, file_name)
    shutil.copyfile(f, file_path)

### Country Level COVID Data

In [17]:
# Parent directory for clones, remote URL, and local checkout path of the
# owid/covid-19-data repository.
clone_dir = '../'
git_repo = 'https://github.com/owid/covid-19-data.git'
git_dir = '../covid-19-data'

# Clone on first run, pull on later runs.
gitpull(git_dir=git_dir, clone_dir=clone_dir, git_repo=git_repo)

Updating 42e40eb..f9606a9
Fast-forward
 input/ecdc/releases/2020-04-04.xlsx           |   Bin 0 -> 378556 bytes
 input/ecdc/releases/2020-04-05.xlsx           |   Bin 0 -> 387709 bytes
 input/ecdc/releases/2020-04-06.xlsx           |   Bin 0 -> 396128 bytes
 input/ecdc/releases/latest.csv                |   633 +-
 input/owid/eu_countries.csv                   |    29 +
 input/owid/wb_income_groups.csv               |   219 +
 public/data/ecdc/COVID-2019 - ECDC (2020).csv | 21022 +++++++++++++-----------
 public/data/ecdc/full_data.csv                |   614 +-
 public/data/ecdc/new_cases.csv                |   195 +-
 public/data/ecdc/new_cases_per_million.csv    |   195 +-
 public/data/ecdc/new_deaths.csv               |   195 +-
 public/data/ecdc/new_deaths_per_million.csv   |   195 +-
 public/data/ecdc/total_cases.csv              |   195 +-
 public/data/ecdc/total_cases_per_million.csv  |   195 +-
 public/data/ecdc/total_deaths.csv             |   195 +-
 public/data/ecdc/total_de

In [18]:
# Create the country-level output directory (and any missing parents).
# exist_ok=True makes the cell safe to re-run.
os.makedirs('data/master_data/country_level', exist_ok=True)

In [19]:
# Copy every ECDC CSV from the covid-19-data checkout into the country-level folder.
for f in glob('../covid-19-data/public/data/ecdc/*.csv'):
    # os.path.basename understands the platform's path separators, replacing
    # the manual split on '\\' or '/'.
    file_name = os.path.basename(f)
    dest_path = os.path.join('data/master_data/country_level/', file_name)
    shutil.copyfile(f, dest_path)