# US Level Vaccination Data from JHU

This notebook loads US vaccination data from the repository of the 2020 JHU Vaccines Dashboard, which is operated by Johns Hopkins University's Centers for Civic Impact.

In [1]:
import datetime
import os

import pandas as pd

In [2]:
# papermill parameters
# Directory the final CSV is written into; the output cell combines it with the
# file name. NOTE(review): presumably overridden per run via papermill — confirm
# the cell carries the `parameters` tag.
output_folder = "../output/"

In [3]:
# Download the US vaccine timeline CSV straight from GitHub.
# skipinitialspace=True discards blanks that directly follow a delimiter.
df = pd.read_csv(
    "https://raw.githubusercontent.com/govex/COVID-19/master/"
    "data_tables/vaccine_data/us_data/time_series/vaccine_data_us_timeline.csv",
    skipinitialspace=True,
)

In [4]:
# The source file is the US timeline, so every row gets the same country value.
df['Country_Region'] = 'US'
# Stamp every row with the time of this run, in UTC.
# datetime.utcnow() is deprecated since Python 3.12, so take a timezone-aware
# "now" and then drop the tzinfo: the stored value stays a naive UTC timestamp,
# which keeps the text written to the CSV in the same format as before.
df["Last_Update_Date"] = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

### Data cleansing

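Remove stray Unicode formatting characters (U+202C, "pop directional formatting") from `Stage_One_Doses` so that the column can be converted to a number.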

In [5]:
# U+202C (POP DIRECTIONAL FORMATTING) is an invisible character. Work on the
# text form of the column, delete every occurrence of it (literal match, not a
# regex), then convert the cleaned text to float.
df['Stage_One_Doses'] = (
    df['Stage_One_Doses']
    .astype(str)
    .str.replace("\u202C", "", regex=False)
    .astype(float)
)



## Set Last_Reported_Flag

In [6]:
# Flag the rows whose Date equals the largest Date value in the frame.
# NOTE(review): Date is compared exactly as loaded (no date parsing is visible
# in this notebook), so "largest" presumably relies on ISO-style date strings
# sorting chronologically — confirm against the source file.
df['Last_Reported_Flag'] = df["Date"].eq(df["Date"].max())

## Pivot the data by vaccine type, flatten the multi-level columns, and reset the index

In [7]:
# Spread the Vaccine_Type values into columns: each (metric, vaccine type) pair
# becomes its own column, and rows are keyed by the index fields listed below.
df = df.pivot(
    index=[
        "Province_State", "Date", "FIPS", "Country_Region", "Lat", "Long_",
        "Stage_One_Doses", "Stage_Two_Doses", "Combined_Key",
        "Last_Update_Date", "Last_Reported_Flag",
    ],
    columns=["Vaccine_Type"],
    values=["Doses_alloc", "Doses_shipped", "Doses_admin"],
)
# The pivot produces two-level column labels (metric, vaccine type). Collapse
# each pair into one flat name such as "Doses_alloc_Johnson_&_Johnson": join the
# two parts with a space, trim, then turn every space (including the ones inside
# the vaccine name) into an underscore. This single pass replaces the previous
# duplicated assignment and the no-op get_level_values(0) on flat columns.
df.columns = [" ".join(col).strip().replace(" ", "_") for col in df.columns.values]
# Turn the index keys back into ordinary columns.
df = df.reset_index()

In [14]:
# Current column name -> name used in the published file. Only the columns
# listed here are renamed; all other columns keep their existing names.
col_map = {
    # Dose-stage columns become the PEOPLE_* names.
    "Stage_One_Doses": "PEOPLE_TOTAL",
    "Stage_Two_Doses": "PEOPLE_TOTAL_2ND_DOSE",

    # Columns for the "All" vaccine type.
    "Doses_alloc_All": "DOSES_ALLOC_TOTAL",
    "Doses_shipped_All": "DOSES_SHIPPED_TOTAL",
    "Doses_admin_All": "DOSES_ADMIN_ALL",

    # Johnson & Johnson columns: the "&" is spelled out as AND.
    "Doses_alloc_Johnson_&_Johnson": "DOSES_ALLOC_JOHNSON_AND_JOHNSON",
    "Doses_shipped_Johnson_&_Johnson": "DOSES_SHIPPED_JOHNSON_AND_JOHNSON",
    "Doses_admin_Johnson_&_Johnson": "DOSES_ADMIN_JOHNSON_AND_JOHNSON",
}

df = df.rename(col_map, axis="columns")

## Output

Finally, we write the result to `JHU_VACCINES.csv` in the `output` folder, as a CSV file without the index column.

In [16]:
# Write the selected columns, in the order listed, to JHU_VACCINES.csv inside
# output_folder, without the DataFrame index.
# os.path.join is used instead of string concatenation so the path is still
# correct when output_folder is supplied without a trailing slash.
df.to_csv(
    os.path.join(output_folder, "JHU_VACCINES.csv"),
    index=False,
    columns=[
        "Date", "Province_State", "FIPS",
        "DOSES_ALLOC_TOTAL", "Doses_alloc_Moderna", "Doses_alloc_Pfizer", "DOSES_ALLOC_JOHNSON_AND_JOHNSON", "Doses_alloc_Unassigned", "Doses_alloc_Unknown",
        "DOSES_SHIPPED_TOTAL", "Doses_shipped_Moderna", "Doses_shipped_Pfizer", "DOSES_SHIPPED_JOHNSON_AND_JOHNSON", "Doses_shipped_Unassigned", "Doses_shipped_Unknown",
        "DOSES_ADMIN_ALL", "Doses_admin_Moderna", "Doses_admin_Pfizer", "DOSES_ADMIN_JOHNSON_AND_JOHNSON", "Doses_admin_Unassigned", "Doses_admin_Unknown",
        "PEOPLE_TOTAL", "PEOPLE_TOTAL_2ND_DOSE", "Country_Region", "Last_Update_Date", "Last_Reported_Flag",
    ],
)