In [67]:
### automating pull from git
import git
import os
from glob import glob
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import shutil

### Data Source: https://github.com/CSSEGISandData/COVID-19

Johns Hopkins University CSSE COVID-19 GitHub repository

In [68]:
def gitpull(git_dir, clone_dir, git_repo):
    """Clone ``git_repo`` under ``clone_dir`` if missing, otherwise pull ``git_dir``.

    Parameters
    ----------
    git_dir : str
        Path of the local working copy. Its last path component is the
        directory name looked for inside ``clone_dir``.
    clone_dir : str
        Directory in which ``git clone`` is run when the working copy is absent.
    git_repo : str
        Remote URL handed to ``git clone``.

    Returns
    -------
    None
        The text returned by the git command is printed, not returned.

    Notes
    -----
    ``git clone`` is invoked without an explicit target directory, so the
    existence check only matches the clone when the last component of
    ``git_dir`` equals the directory name git picks for ``git_repo``.
    """
    # normpath strips a trailing separator, so '../COVID-19/' still yields
    # 'COVID-19' rather than the empty string a plain split('/') produces
    # (which made an existing checkout look missing and triggered a re-clone).
    repo = os.path.basename(os.path.normpath(git_dir))
    if not os.path.isdir(os.path.join(clone_dir, repo)):
        clone_path = os.path.abspath(clone_dir)
        print(f"Cloning {git_repo} repo")
        print(f"at the path {clone_path}")
        print(git.Git(clone_path).clone(git_repo))
    else:
        print(git.cmd.Git(git_dir).pull())
# COVID-19 repository: remote URL, the directory the clone is made in, and the
# resulting local working copy.
git_repo = 'https://github.com/CSSEGISandData/COVID-19.git'
clone_dir = '../'
git_dir = '../COVID-19'

gitpull(git_dir=git_dir, clone_dir=clone_dir, git_repo=git_repo)

Updating 6af1fed7..7b8a6cb4
Fast-forward
 csse_covid_19_data/README.md                     |  24 +
 csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv | 792 ++++++++++++-----------
 2 files changed, 421 insertions(+), 395 deletions(-)


### All data files

In [69]:
# Glob pattern for the daily report CSVs inside the cloned COVID-19 working copy.
data_path = '../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/*.csv'

# glob() returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the row order of the concatenated frame -- and therefore which duplicate a
# later keep='last' de-duplication retains -- reproducible between runs.
all_paths = sorted(glob(data_path))

In [70]:
# Read every daily report (first row is the header, no index column) and stack
# the pieces into one frame with a fresh 0..n-1 index.
li = [pd.read_csv(csv_path, index_col=None, header=0) for csv_path in all_paths]

frame = pd.concat(li, axis=0, ignore_index=True)

In [71]:
# Column-name pairs, each unpacked into replace_nan_col_a as (col_a, col_b):
# col_a is kept, col_b fills col_a's missing values and is then dropped.
# Each pair appears to be two spellings of the same field -- TODO confirm
# against the daily report files.
col_pairs = [
    ('Province/State', 'Province_State'),
    ('Country/Region', 'Country_Region'),
    ('Last Update', 'Last_Update'),
    ('Latitude', 'Lat'),
    ('Longitude', 'Long_'),
]

### Data Cleaning

In [72]:
### correct the columns in data frame

def replace_nan_col_a(df, col_a, col_b):
    """Fill the missing values of ``col_a`` from ``col_b`` and drop ``col_b``.

    The frame is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    col_a : str
        Column that is kept; its null entries are replaced by the value of
        ``col_b`` on the same row.
    col_b : str
        Column used as the fallback; removed from ``df`` afterwards.

    Returns
    -------
    None

    Notes
    -----
    If ``col_b`` is not present the call is a no-op, so re-running the
    cleaning cell after ``col_b`` has already been merged and dropped no
    longer raises ``KeyError``. If only ``col_b`` is present it is renamed
    to ``col_a``.
    """
    if col_b not in df.columns:
        # Already merged on a previous run (or never present): nothing to do.
        return
    if col_a not in df.columns:
        # Only the fallback column exists, so it simply becomes col_a.
        df.rename(columns={col_b: col_a}, inplace=True)
        return
    df[col_a] = np.where(pd.isnull(df[col_a]), df[col_b], df[col_a])
    df.drop(col_b, axis=1, inplace=True)

In [73]:
# Merge each fallback column into its counterpart, one pair at a time.
for col_a, col_b in col_pairs:
    replace_nan_col_a(frame, col_a, col_b)

### Data Types

In [74]:
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45927 entries, 0 to 45926
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  40618 non-null  object 
 1   Country/Region  45927 non-null  object 
 2   Last Update     45927 non-null  object 
 3   Confirmed       45908 non-null  float64
 4   Deaths          45486 non-null  float64
 5   Recovered       45539 non-null  float64
 6   Latitude        43039 non-null  float64
 7   Longitude       43039 non-null  float64
 8   FIPS            34926 non-null  float64
 9   Admin2          35243 non-null  object 
 10  Active          38310 non-null  float64
 11  Combined_Key    38310 non-null  object 
dtypes: float64(7), object(5)
memory usage: 4.2+ MB


In [75]:
# Convert 'Last Update' from object dtype to datetime64 values.
# NOTE(review): no explicit format is passed, so parsing relies on pandas'
# inference -- confirm every daily file uses a format the installed pandas
# version parses consistently.
frame['Last Update'] = pd.to_datetime(frame['Last Update'])

In [76]:
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45927 entries, 0 to 45926
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  40618 non-null  object        
 1   Country/Region  45927 non-null  object        
 2   Last Update     45927 non-null  datetime64[ns]
 3   Confirmed       45908 non-null  float64       
 4   Deaths          45486 non-null  float64       
 5   Recovered       45539 non-null  float64       
 6   Latitude        43039 non-null  float64       
 7   Longitude       43039 non-null  float64       
 8   FIPS            34926 non-null  float64       
 9   Admin2          35243 non-null  object        
 10  Active          38310 non-null  float64       
 11  Combined_Key    38310 non-null  object        
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 4.2+ MB


### Duplicates

In [77]:
### De-duplicate on country, province, update timestamp and Admin2.
### keep='last' retains the final occurrence in frame order; per the original
### note, that is the row expected to carry longitude and latitude.
dedup_keys = ['Country/Region', 'Province/State', 'Last Update', 'Admin2']
df = frame.drop_duplicates(subset=dedup_keys, keep='last')

In [78]:
# Create data/master_data (including data/ itself) when missing. makedirs with
# exist_ok=True replaces the two listdir() membership checks with a single
# call that is a no-op when the directories already exist.
os.makedirs('data/master_data', exist_ok=True)

# Persist the de-duplicated frame; with to_csv's defaults the index is written
# as the first column.
df.to_csv('data/master_data/john_hopkins_research.csv')

## Data Source: https://github.com/nychealth/coronavirus-data

NYC Health coronavirus data (GitHub repository)

In [79]:
# NYC Health repository: remote URL, the directory the clone is made in, and
# the resulting local working copy (git_dir is reused by the cells below).
git_repo = 'https://github.com/nychealth/coronavirus-data.git'
clone_dir = '../'
git_dir = '../coronavirus-data'

gitpull(git_dir=git_dir, clone_dir=clone_dir, git_repo=git_repo)

Updating dd2010c..10efdd3
Fast-forward
 .DS_Store           | Bin 6148 -> 0 bytes
 boro.csv            |  12 ++++++------
 by-age.csv          |  12 ++++++------
 by-sex.csv          |   6 +++---
 case-hosp-death.csv |  49 +++++++++++++++++++++++++------------------------
 summary.csv         |   8 ++++----
 6 files changed, 44 insertions(+), 43 deletions(-)
 delete mode 100644 .DS_Store


### list all the files in the dataset

In [80]:
os.listdir(git_dir)

['.git',
 'boro.csv',
 'by-age.csv',
 'by-sex.csv',
 'case-hosp-death.csv',
 'README.md',
 'summary.csv',
 'testing.csv',
 'tests-by-zcta.csv']

In [81]:
### create new_york directory in master data
# The directory has to live under data/master_data, where the copy step writes
# its files (the previous '7/master_data/new_york' path pointed elsewhere).
# makedirs also creates missing parents and is a no-op if it already exists.
os.makedirs('data/master_data/new_york', exist_ok=True)

In [82]:
all_csvs = glob(git_dir + '/*.csv')
dest_dir = 'data/master_data/new_york/'

# Ensure the destination exists so this cell does not depend on an earlier
# cell having created it.
os.makedirs(dest_dir, exist_ok=True)

for f in all_csvs:
    # os.path.basename understands the platform's path separators, replacing
    # the manual split on '\\' or '/'.
    file_name = os.path.basename(f)
    file_path = os.path.join(dest_dir, file_name)
    shutil.copyfile(f, file_path)

### Country Level COVID Data

In [83]:
# OWID covid-19-data repository: remote URL, the directory the clone is made
# in, and the resulting local working copy.
git_repo = 'https://github.com/owid/covid-19-data.git'
clone_dir = '../'
git_dir = '../covid-19-data'

gitpull(git_dir=git_dir, clone_dir=clone_dir, git_repo=git_repo)

Updating cc667f1..42e40eb
Fast-forward
 input/ecdc/ecdc_country_standardized.csv      |   411 +-
 input/ecdc/releases/2020-04-03.xlsx           |   Bin 0 -> 369620 bytes
 input/ecdc/releases/latest.csv                |   218 +-
 input/owid/continents.csv                     |   286 +
 public/data/ecdc/COVID-2019 - ECDC (2020).csv | 18843 +++++++++++++-----------
 public/data/ecdc/full_data.csv                |   215 +-
 public/data/ecdc/locations.csv                |   409 +-
 public/data/ecdc/new_cases.csv                |   191 +-
 public/data/ecdc/new_cases_per_million.csv    |   191 +-
 public/data/ecdc/new_deaths.csv               |   191 +-
 public/data/ecdc/new_deaths_per_million.csv   |   191 +-
 public/data/ecdc/total_cases.csv              |   191 +-
 public/data/ecdc/total_cases_per_million.csv  |   191 +-
 public/data/ecdc/total_deaths.csv             |   191 +-
 public/data/ecdc/total_deaths_per_million.csv |   191 +-
 scripts/ecdc.py                               |    11 

In [84]:
# Create the country-level output directory. makedirs builds any missing
# parent (the listdir() check raised if data/master_data did not exist yet)
# and is a no-op when the directory is already there.
os.makedirs('data/master_data/country_level', exist_ok=True)

In [85]:
# Ensure the destination exists so this cell does not depend on an earlier
# cell having created it.
os.makedirs('data/master_data/country_level/', exist_ok=True)

for f in glob('../covid-19-data/public/data/ecdc/*.csv'):
    # os.path.basename understands the platform's path separators, replacing
    # the manual split on '\\' or '/'.
    file_name = os.path.join('data/master_data/country_level/', os.path.basename(f))
    shutil.copyfile(f, file_name)