In [288]:
import pandas as pd
from datetime import datetime


In [289]:
# Download the three JHU CSSE time-series tables and tag every row with the
# kind of count it holds. keep_default_na=False tells pandas not to turn its
# default NA markers into NaN while parsing.
# NOTE(review): presumably this keeps blank Province/State cells usable as
# groupby/merge keys further down -- confirm.
confirmed = pd.read_csv(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv",
    keep_default_na=False,
).assign(Case_Type='Confirmed')
deaths = pd.read_csv(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv",
    keep_default_na=False,
).assign(Case_Type='Deaths')
recovered = pd.read_csv(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv",
    keep_default_na=False,
).assign(Case_Type='Recovered')

# Columns that identify a series; they are kept as identifiers when the
# tables are unpivoted and are used as the grouping key later on.
key_columns = ['Country/Region', 'Province/State', 'Lat', 'Long', 'Case_Type']

data = [confirmed, deaths, recovered]

# Row count of each table (displayed as the cell output).
[len(frame.index) for frame in data]

[225, 225, 225]

In [290]:
def unpivot(df, id_vars=None):
    """Reshape a wide table into long format with one row per key and date.

    Every column not listed in ``id_vars`` is treated as a date column: its
    header becomes a value in the new ``Date`` column (parsed to datetime)
    and its cell becomes the matching value in the new ``Cases`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing the identifier columns plus one column per date.
    id_vars : list of str, optional
        Identifier columns to keep as-is. Defaults to the notebook-level
        ``key_columns`` list, which preserves the original single-argument
        call ``unpivot(df)``.

    Returns
    -------
    pandas.DataFrame
        New long-format frame with the identifier columns followed by
        ``Date`` and ``Cases``. The input frame is not modified.
    """
    if id_vars is None:
        # Fall back to the notebook-wide key list so existing callers such as
        # map(unpivot, data) keep working unchanged.
        id_vars = key_columns

    # unpivot all non-key columns
    melted = df.melt(id_vars=id_vars, var_name='Date', value_name='Cases')
    # change our new Date field to Date type
    melted['Date'] = pd.to_datetime(melted['Date'])

    return melted

# Long-format version of each table, in the same order as `data`.
unpivoted_data = [unpivot(frame) for frame in data]

# Sanity check on the parsed dates of the first table (cell output).
unpivoted_data[0]["Date"].describe()

count                   10350
unique                     46
top       2020-02-12 00:00:00
freq                      225
first     2020-01-22 00:00:00
last      2020-03-07 00:00:00
Name: Date, dtype: object

In [291]:
# Order every table by its key columns and then by date, so that within each
# key group the rows run chronologically.
sorted_data = [
    frame.sort_values(by=[*key_columns, 'Date'], ascending=True)
    for frame in unpivoted_data
]

sorted_data[0].tail(5)

Unnamed: 0,Country/Region,Province/State,Lat,Long,Case_Type,Date,Cases
9265,Vietnam,,16.0,108.0,Confirmed,2020-03-03,16
9490,Vietnam,,16.0,108.0,Confirmed,2020-03-04,16
9715,Vietnam,,16.0,108.0,Confirmed,2020-03-05,16
9940,Vietnam,,16.0,108.0,Confirmed,2020-03-06,16
10165,Vietnam,,16.0,108.0,Confirmed,2020-03-07,18


In [293]:
# Day-over-day change per key group: current value minus the previous row's
# value within the same group. The first row of each group has no predecessor,
# so the shift fills it with 0 and its Difference equals its Cases value.
for df in sorted_data:
    df["Difference"] = df["Cases"].sub(
        df.groupby(key_columns)["Cases"].shift(1, fill_value=0)
    )

# Stack the three case-type tables into one frame (original row labels kept).
concated = pd.concat(sorted_data)

concated.tail(5)



Unnamed: 0,Country/Region,Province/State,Lat,Long,Case_Type,Date,Cases,Difference
9265,Vietnam,,16.0,108.0,Recovered,2020-03-03,16,0
9490,Vietnam,,16.0,108.0,Recovered,2020-03-04,16,0
9715,Vietnam,,16.0,108.0,Recovered,2020-03-05,16,0
9940,Vietnam,,16.0,108.0,Recovered,2020-03-06,16,0
10165,Vietnam,,16.0,108.0,Recovered,2020-03-07,16,0


In [294]:
# Split the stacked frame back out by case type.
# NOTE: this rebinds `confirmed`, `deaths` and `recovered`, which previously
# held the raw wide tables loaded at the top of the notebook.
confirmed = concated.loc[concated["Case_Type"] == "Confirmed"]
deaths = concated.loc[concated["Case_Type"] == "Deaths"]
recovered = concated.loc[concated["Case_Type"] == "Recovered"]

# Line the three series up side by side per country/province/date.
# Confirmed columns keep their names; deaths get a "_d" suffix and recovered
# a "_r" suffix. validate="one_to_one" makes pandas raise if the join keys
# are duplicated on either side.
active = (
    confirmed
    .merge(
        deaths,
        on=["Country/Region", "Province/State", "Date"],
        suffixes=("", "_d"),
        validate="one_to_one",
    )
    .merge(
        recovered,
        on=["Country/Region", "Province/State", "Date"],
        suffixes=("", "_r"),
        validate="one_to_one",
    )
)

active.head()

Unnamed: 0,Country/Region,Province/State,Lat,Long,Case_Type,Date,Cases,Difference,Lat_d,Long_d,Case_Type_d,Cases_d,Difference_d,Lat_r,Long_r,Case_Type_r,Cases_r,Difference_r
0,Afghanistan,,33.0,65.0,Confirmed,2020-01-22,0,0,33.0,65.0,Deaths,0,0,33.0,65.0,Recovered,0,0
1,Afghanistan,,33.0,65.0,Confirmed,2020-01-23,0,0,33.0,65.0,Deaths,0,0,33.0,65.0,Recovered,0,0
2,Afghanistan,,33.0,65.0,Confirmed,2020-01-24,0,0,33.0,65.0,Deaths,0,0,33.0,65.0,Recovered,0,0
3,Afghanistan,,33.0,65.0,Confirmed,2020-01-25,0,0,33.0,65.0,Deaths,0,0,33.0,65.0,Recovered,0,0
4,Afghanistan,,33.0,65.0,Confirmed,2020-01-26,0,0,33.0,65.0,Deaths,0,0,33.0,65.0,Recovered,0,0


In [295]:
# Turn the merged frame into the "Active" series:
#   active = confirmed - recovered - deaths
# applied to both the cumulative count and the daily difference.
# NOTE: Cases and Difference are overwritten in place, so running this cell a
# second time without re-running the merge subtracts recovered/deaths again.
active["Case_Type"] = 'Active'
active["Cases"] = active["Cases"].sub(active["Cases_r"]).sub(active["Cases_d"])
active["Difference"] = (
    active["Difference"]
    .sub(active["Difference_r"])
    .sub(active["Difference_d"])
)

active.tail()

Unnamed: 0,Country/Region,Province/State,Lat,Long,Case_Type,Date,Cases,Difference,Lat_d,Long_d,Case_Type_d,Cases_d,Difference_d,Lat_r,Long_r,Case_Type_r,Cases_r,Difference_r
10345,Vietnam,,16.0,108.0,Active,2020-03-03,0,0,16.0,108.0,Deaths,0,0,16.0,108.0,Recovered,16,0
10346,Vietnam,,16.0,108.0,Active,2020-03-04,0,0,16.0,108.0,Deaths,0,0,16.0,108.0,Recovered,16,0
10347,Vietnam,,16.0,108.0,Active,2020-03-05,0,0,16.0,108.0,Deaths,0,0,16.0,108.0,Recovered,16,0
10348,Vietnam,,16.0,108.0,Active,2020-03-06,0,0,16.0,108.0,Deaths,0,0,16.0,108.0,Recovered,16,0
10349,Vietnam,,16.0,108.0,Active,2020-03-07,2,2,16.0,108.0,Deaths,0,0,16.0,108.0,Recovered,16,0


In [296]:
# Append the Active rows to the Confirmed/Deaths/Recovered rows.
# join="inner" keeps only the columns present in both frames, which drops the
# suffixed "_d"/"_r" helper columns carried by `active`.
# NOTE: `data` was the list of raw tables earlier in the notebook; from here
# on it is the combined DataFrame.
data = pd.concat(objs=[concated, active], join="inner")

# Case types present in the combined frame (cell output).
data["Case_Type"].unique()

array(['Confirmed', 'Deaths', 'Recovered', 'Active'], dtype=object)

In [297]:
# Stamp every row with the time of this export, expressed in UTC.
# datetime.utcnow() is deprecated since Python 3.12, so take a timezone-aware
# "now" in UTC and strip the tzinfo: the stored value stays a naive UTC
# timestamp, keeping the CSV column format unchanged.
data["Last_Update_Date"] = pd.Timestamp.now(tz="UTC").tz_localize(None)

# Write the combined table without the (non-unique) row index.
data.to_csv("~/downloads/JHU_COVID-19_active.csv", index=False)