In [36]:
### automating pull from git
import git
import os
from glob import glob
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import shutil

### Data Source: https://github.com/CSSEGISandData/COVID-19

Johns Hopkins COVID-19 research GitHub repository

In [27]:
def gitpull(git_dir, clone_dir, git_repo):
    """Clone ``git_repo`` under ``clone_dir`` if it is missing, otherwise pull.

    Parameters
    ----------
    git_dir : str
        Path of the local working copy. Its last path component is the
        directory name looked for inside ``clone_dir``; when that directory
        exists, ``git pull`` is run in ``git_dir``.
    clone_dir : str
        Directory in which ``git clone`` is run when the working copy is absent.
    git_repo : str
        Remote URL handed to ``git clone``.

    Returns
    -------
    None
        The text returned by the git command is printed, not returned.
    """
    # normpath drops a trailing separator (so '../COVID-19/' still yields
    # 'COVID-19' rather than ''), and basename honours the platform's path
    # separators instead of assuming '/'.
    repo = os.path.basename(os.path.normpath(git_dir))
    # isdir (rather than a name lookup in os.listdir) avoids mistaking a plain
    # file with the same name for an existing checkout.
    if not os.path.isdir(os.path.join(clone_dir, repo)):
        clone_path = os.path.abspath(clone_dir)
        print(f"Cloning {git_repo} repo")
        print(f"at the path {clone_path}")
        # NOTE(review): the clone target name is chosen by git from the URL;
        # this assumes it matches the last component of git_dir -- confirm.
        print(git.Git(clone_path).clone(git_repo))
    else:
        g = git.cmd.Git(git_dir)
        print(g.pull())
# Where the Johns Hopkins repository lives locally, the directory it is cloned
# into, and the remote it is cloned from.
clone_dir = '../'
git_repo = 'https://github.com/CSSEGISandData/COVID-19.git'
git_dir = '../COVID-19'

# Clone on first run, pull on later runs.
gitpull(git_dir=git_dir, clone_dir=clone_dir, git_repo=git_repo)

Already up to date.


### All data files

In [7]:
# Glob pattern matching every daily-report CSV in the local COVID-19 checkout.
data_path = '../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/*.csv'

# glob() returns matches in arbitrary, OS-dependent order. Sorting makes the
# read order (and therefore the row order of the concatenated frame, which
# drop_duplicates(keep='last') depends on) reproducible between runs.
all_paths = sorted(glob(data_path))

In [8]:
# Read every daily report into its own DataFrame (first row is the header, no
# column is used as the index) ...
li = [pd.read_csv(csv_path, index_col=None, header=0) for csv_path in all_paths]

# ... then stack them row-wise under a fresh 0..n-1 index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [9]:
# (column to keep, column to merge into it) pairs. Each tuple is later unpacked
# into replace_nan_col_a(frame, col_a, col_b), which fills gaps in the first
# column from the second and then drops the second.
col_pairs = list(zip(
    ['Province/State', 'Country/Region', 'Last Update', 'Latitude', 'Longitude'],
    ['Province_State', 'Country_Region', 'Last_Update', 'Lat', 'Long_'],
))

### Data Cleaning

In [10]:
### correct the columns in data frame

def replace_nan_col_a(df, col_a, col_b):
    """Fill missing values of ``col_a`` from ``col_b``, then remove ``col_b``.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col_a : str
        Column that is kept; its null entries are replaced.
    col_b : str
        Column supplying the replacement values; deleted afterwards.
    """
    missing_in_a = df[col_a].isna()
    # Row by row: take col_b where col_a is null, otherwise keep col_a.
    df[col_a] = np.where(missing_in_a, df[col_b], df[col_a])
    del df[col_b]

In [11]:
# Merge each secondary column into its primary column, one pair at a time.
for primary_col, fallback_col in col_pairs:
    replace_nan_col_a(frame, primary_col, fallback_col)

### Data Types

In [12]:
# Show every column's dtype and non-null count for the combined frame.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43358 entries, 0 to 43357
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  38226 non-null  object 
 1   Country/Region  43358 non-null  object 
 2   Last Update     43358 non-null  object 
 3   Confirmed       43339 non-null  float64
 4   Deaths          42917 non-null  float64
 5   Recovered       42970 non-null  float64
 6   Latitude        40524 non-null  float64
 7   Longitude       40524 non-null  float64
 8   FIPS            32638 non-null  float64
 9   Admin2          32938 non-null  object 
 10  Active          35741 non-null  float64
 11  Combined_Key    35741 non-null  object 
dtypes: float64(7), object(5)
memory usage: 4.0+ MB


In [13]:
# Parse the 'Last Update' column into pandas datetimes, overwriting the column.
frame['Last Update'] = pd.to_datetime(frame['Last Update'])

In [14]:
# Re-inspect the dtypes to confirm 'Last Update' is now datetime64[ns].
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43358 entries, 0 to 43357
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  38226 non-null  object        
 1   Country/Region  43358 non-null  object        
 2   Last Update     43358 non-null  datetime64[ns]
 3   Confirmed       43339 non-null  float64       
 4   Deaths          42917 non-null  float64       
 5   Recovered       42970 non-null  float64       
 6   Latitude        40524 non-null  float64       
 7   Longitude       40524 non-null  float64       
 8   FIPS            32638 non-null  float64       
 9   Admin2          32938 non-null  object        
 10  Active          35741 non-null  float64       
 11  Combined_Key    35741 non-null  object        
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 4.0+ MB


### Duplicates

In [15]:
# Rows that share country, province, update timestamp and Admin2 count as
# duplicates. Only the final occurrence is kept because, per the original
# note, that is the one carrying longitude and latitude.
df = frame.drop_duplicates(
    subset=['Country/Region', 'Province/State', 'Last Update', 'Admin2'],
    keep='last',
)

In [38]:
# Ensure data/master_data exists. makedirs creates the parent 'data' directory
# as needed, and exist_ok=True makes the cell safe to re-run; this replaces the
# separate listdir-based existence checks for each level.
os.makedirs('data/master_data', exist_ok=True)

# Persist the de-duplicated Johns Hopkins data (the DataFrame index is written
# as the first CSV column, as before).
df.to_csv('data/master_data/john_hopkins_research.csv')

## Data Source: https://github.com/nychealth/coronavirus-data

NYC Health coronavirus data GitHub repository

In [28]:
# Local checkout, parent directory and remote URL for the NYC Health
# coronavirus-data repository.
clone_dir = '../'
git_repo = 'https://github.com/nychealth/coronavirus-data.git'
git_dir = '../coronavirus-data'

# Clone on first run, pull on later runs.
gitpull(git_dir=git_dir, clone_dir=clone_dir, git_repo=git_repo)

Cloning https://github.com/nychealth/coronavirus-data.git repo
at the path C:\Users\sahil\Workspace\Columbia\Spring 2020\Courses\Business Analytics\Project



### list all the files in the dataset

In [29]:
# Display the entries at the top level of the checkout named by git_dir.
os.listdir(git_dir)

['.DS_Store',
 '.git',
 'boro.csv',
 'by-age.csv',
 'by-sex.csv',
 'case-hosp-death.csv',
 'README.md',
 'summary.csv',
 'testing.csv',
 'tests-by-zcta.csv']

In [45]:
### create new_york directory in master data
# The original mkdir targeted '7/master_data/new_york', a typo for the
# 'data/master_data' tree that the existence check inspects. makedirs with
# exist_ok=True creates any missing parents and is safe to re-run.
os.makedirs('data/master_data/new_york', exist_ok=True)

In [49]:
# Copy every CSV found directly inside git_dir into the new_york folder of the
# master data directory, keeping the original file names.
all_csvs = glob(git_dir + '/*.csv')
dest_dir = 'data/master_data/new_york/'
for f in all_csvs:
    # os.path.basename understands the platform's path separators, so no
    # manual splitting on '\\' versus '/' is needed.
    file_name = os.path.basename(f)
    file_path = os.path.join(dest_dir, file_name)
    shutil.copyfile(f, file_path)

### Country Level COVID Data

In [51]:
# Local checkout, parent directory and remote URL for the owid/covid-19-data
# repository.
clone_dir = '../'
git_repo = 'https://github.com/owid/covid-19-data.git'
git_dir = '../covid-19-data'

# Clone on first run, pull on later runs.
gitpull(git_dir=git_dir, clone_dir=clone_dir, git_repo=git_repo)

Updating c3bcf2d..cc667f1
Fast-forward
 public/data/ecdc/COVID-2019 - ECDC (2020).csv | 17790 ++++++++++++------------
 scripts/ecdc.py                               |     2 +-
 scripts/shared.py                             |     5 +
 3 files changed, 8901 insertions(+), 8896 deletions(-)


In [54]:
# Ensure the country_level output folder exists. makedirs with exist_ok=True
# also creates missing parent directories and is safe to re-run, replacing the
# listdir-based existence check.
os.makedirs('data/master_data/country_level', exist_ok=True)

In [55]:
# Copy every ECDC CSV from the covid-19-data checkout into the country_level
# folder of the master data directory, keeping the original file names.
for f in glob('../covid-19-data/public/data/ecdc/*.csv'):
    # os.path.basename understands the platform's path separators, so no
    # manual splitting on '\\' versus '/' is needed.
    file_name = os.path.basename(f)
    # Separate name for the destination so file_name keeps meaning "name only".
    file_path = 'data/master_data/country_level/' + file_name
    shutil.copyfile(f, file_path)