In [53]:
### automating pull from git
import git
import os
from glob import glob
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [123]:
# Local checkout of the data repository, one directory above the notebook.
git_dir = '../COVID-19'

# Decide between "update" and "first-time clone" by looking for the checkout
# directory name in the parent folder.
already_cloned = 'COVID-19' in os.listdir('../')

if already_cloned:
    # Existing checkout: pull and echo git's own status message.
    g = git.cmd.Git(git_dir)
    print(g.pull())
else:
    # No checkout yet: report where the clone will land, then clone.
    clone_path = os.path.abspath('../')
    print("Cloning https://github.com/CSSEGISandData/COVID-19.git repo")
    print(f"at the path {clone_path}")
    git.Git('../').clone('https://github.com/CSSEGISandData/COVID-19.git')

Already up to date.


In [55]:
# Glob pattern matching every daily-report CSV inside the cloned repository.
data_path = '../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/*.csv'

# glob() yields matches in arbitrary, filesystem-dependent order, but the
# de-duplication step later in this notebook keeps the *last* duplicate row,
# so the result depends on the order the files are loaded in. Sorting makes
# every run load the files in the same order.
# NOTE(review): this is a plain lexicographic sort of the paths; if the file
# names encode dates as MM-DD-YYYY it is chronological only within a single
# year -- confirm and switch to a date-parsing key if needed.
all_paths = sorted(glob(data_path))

In [56]:
# Parse each daily report into its own DataFrame (first row is the header,
# no column is promoted to the index).
li = [
    pd.read_csv(csv_path, index_col=None, header=0)
    for csv_path in all_paths
]

# Stack all reports row-wise into one frame with a fresh 0..n-1 index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [57]:
# Pairs of columns that carry the same information under two spellings.
# Each entry is (column to keep, alternate column to merge into it and drop).
_column_aliases = {
    'Province/State': 'Province_State',
    'Country/Region': 'Country_Region',
    'Last Update': 'Last_Update',
    'Latitude': 'Lat',
    'Longitude': 'Long_',
}
col_pairs = list(_column_aliases.items())

### Data Cleaning

In [58]:
### Merge each pair of equivalent columns (see col_pairs) into a single column

def replace_nan_col_a(df, col_a, col_b):
    """Fill missing values of ``col_a`` from ``col_b`` and drop ``col_b``, in place.

    The frame is mutated directly and nothing is returned. The function is
    safe to run more than once on the same frame (e.g. when a notebook cell
    is re-executed): once ``col_b`` has been merged and dropped, later calls
    are no-ops.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col_a : str
        Column that is kept; its null entries are filled from ``col_b``.
    col_b : str
        Alternate column supplying the fill values; removed afterwards.

    Raises
    ------
    KeyError
        If neither ``col_a`` nor ``col_b`` exists in ``df``.
    """
    if col_b not in df.columns:
        if col_a not in df.columns:
            raise KeyError(col_a)
        # col_b is already gone (merged earlier): nothing left to do.
        return

    if col_a not in df.columns:
        # Only the alternate spelling is present, so it simply becomes col_a.
        df.rename(columns={col_b: col_a}, inplace=True)
        return

    # Row-wise coalesce: take col_b where col_a is null, otherwise keep col_a.
    df[col_a] = np.where(pd.isnull(df[col_a]), df[col_b], df[col_a])
    # In-place drop is deliberate: callers rely on `df` itself being updated.
    df.drop(columns=col_b, inplace=True)

In [59]:
# Collapse every (kept column, alternate column) pair in the combined frame.
for kept_col, alternate_col in col_pairs:
    replace_nan_col_a(frame, kept_col, alternate_col)

### Data Types

In [87]:
# Print the frame summary: column names, non-null counts and dtypes.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40873 entries, 0 to 40872
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province/State  35917 non-null  object 
 1   Country/Region  40873 non-null  object 
 2   Last Update     40873 non-null  object 
 3   Confirmed       40854 non-null  float64
 4   Deaths          40432 non-null  float64
 5   Recovered       40485 non-null  float64
 6   Latitude        38040 non-null  float64
 7   Longitude       38040 non-null  float64
 8   FIPS            30467 non-null  float64
 9   Admin2          30715 non-null  object 
 10  Active          33256 non-null  float64
 11  Combined_Key    33256 non-null  object 
dtypes: float64(7), object(5)
memory usage: 3.7+ MB


In [90]:
# Parse the 'Last Update' column into pandas datetimes, overwriting the
# original column on `frame`.
frame['Last Update'] = pd.to_datetime(frame['Last Update'])

In [124]:
# Print the frame summary again to check the dtype of 'Last Update'.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40873 entries, 0 to 40872
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  35917 non-null  object        
 1   Country/Region  40873 non-null  object        
 2   Last Update     40873 non-null  datetime64[ns]
 3   Confirmed       40854 non-null  float64       
 4   Deaths          40432 non-null  float64       
 5   Recovered       40485 non-null  float64       
 6   Latitude        38040 non-null  float64       
 7   Longitude       38040 non-null  float64       
 8   FIPS            30467 non-null  float64       
 9   Admin2          30715 non-null  object        
 10  Active          33256 non-null  float64       
 11  Combined_Key    33256 non-null  object        
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 3.7+ MB


### Duplicates

In [112]:
### Rows sharing country, province, timestamp and Admin2 are duplicates;
### keep the last occurrence, as that one has longitude and latitude.
dedup_keys = ['Country/Region', 'Province/State', 'Last Update', 'Admin2']
df = frame.drop_duplicates(subset=dedup_keys, keep='last')

In [116]:
# Make sure the output directory exists. makedirs(exist_ok=True) replaces the
# listdir-check-then-mkdir pair with a single call that does not fail when
# the directory is already there.
os.makedirs('data', exist_ok=True)

# Write the de-duplicated frame; to_csv's default also writes the index.
df.to_csv('data/master_data.csv')