# Dataset - Global

In [1]:
import pandas as pd

## Initialize

In [2]:
srcGlobal = "./time_series_covid19_confirmed_global.csv"
dest = "./time_series_covid19_confirmed_global_transformed.csv"

# Read data
globalDf = pd.read_csv(srcGlobal)
globalDf.head(20)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/24/21,2/25/21,2/26/21,2/27/21,2/28/21,3/1/21,3/2/21,3/3/21,3/4/21,3/5/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,55664,55680,55696,55707,55714,55733,55759,55770,55775,55827
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,103327,104313,105229,106215,107167,107931,108823,109674,110521,111301
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,112461,112622,112805,112960,113092,113255,113430,113593,113761,113948
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,10775,10799,10822,10849,10866,10889,10908,10948,10976,10998
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,20640,20695,20759,20782,20807,20854,20882,20923,20981,21026
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,646,701,701,726,730,769,769,769,813,813
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,2085411,2093645,2098728,2104197,2107365,2112023,2118676,2126531,2133963,2141854
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,170945,171227,171510,171793,172058,172216,172456,172816,173307,173749
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,118,118,118,118,118,118,120,120,120,120
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,5162,5166,5172,5177,5180,5183,5189,5193,5205,5207


## Data Manipulation

In [3]:
# Renaming columns
globalDf.rename(columns = {
    "Province/State": "Province_State", 
    "Country/Region": "Country_Region"
}, inplace = True)

# Name the "Province_State" as "Country_Region" if marked as NaN
globalDf["Province_State"] = globalDf["Province_State"].mask(pd.isnull, globalDf["Country_Region"])

# Derive confirmed cases per day and attach back to the source dataframe
locationsDf = globalDf.iloc[:, :4]
datesDf = globalDf.iloc[:, 4:].diff(axis = 1)
diffDf = pd.concat([locationsDf, datesDf], axis = 1)

# Transform spreading "date & confirmed cases" data into "Date" adn "Confirmed Cases"
globalDf = diffDf.melt(
    id_vars = ["Province_State", "Country_Region", "Lat", "Long"],
    var_name = "Date",
    value_name = "Confirmed Cases")

In [4]:
globalDf

Unnamed: 0,Province_State,Country_Region,Lat,Long,Date,Confirmed Cases
0,Afghanistan,Afghanistan,33.939110,67.709953,1/22/20,
1,Albania,Albania,41.153300,20.168300,1/22/20,
2,Algeria,Algeria,28.033900,1.659600,1/22/20,
3,Andorra,Andorra,42.506300,1.521800,1/22/20,
4,Angola,Angola,-11.202700,17.873900,1/22/20,
...,...,...,...,...,...,...
112061,Vietnam,Vietnam,14.058324,108.277199,3/5/21,6.0
112062,West Bank and Gaza,West Bank and Gaza,31.952200,35.233200,3/5/21,1826.0
112063,Yemen,Yemen,15.552727,48.516388,3/5/21,36.0
112064,Zambia,Zambia,-13.133897,27.849332,3/5/21,654.0


## Save Dataframe

In [5]:
globalDf.to_csv(dest, index = False)