In [1]:
import os
import pandas as pd

In [2]:
# Input: directory listed and read by the aggregation cell. The sort key used
# there slices file names as MM-DD-YYYY.csv, so files are expected in that form.
data_dir = "./csse_covid_19_daily_reports_us"
# Output: path of the CSV written by the final cell.
output_file_path = "./time_series_covid19_tests_US_transformation.csv"

In [3]:
# Build a long table of day-over-day test counts per state.
#
# Each daily report carries a cumulative test count per state. The first report
# (chronologically) only seeds the baseline; every later report contributes one
# row per state holding "cumulative today - cumulative on the previous report".

# Daily report files in chronological order. Names look like MM-DD-YYYY.csv,
# so the key rearranges the slices into (year, month, day).
file_names = sorted(
    (name for name in os.listdir(data_dir) if name.endswith(".csv")),
    key=lambda name: (name[6:10], name[:2], name[3:5]),
)

header = ["Province_State", "Country_Region", "Lat", "Long", "Date", "Number of Tests"]

# Columns that may hold the cumulative test count, in order of preference.
test_count_fields = ["Total_Test_Results", "People_Tested"]

daily_frames = []        # one frame (columns == header) per report after the first
prev_cumulative = None   # previous report's cumulative counts, indexed by Province_State

for file_name in file_names:
    df_daily = pd.read_csv(os.path.join(data_dir, file_name))

    # Drop rows without coordinates ("Diamond Princess", "Grand Princess",
    # "Recovered") and order by state. sort_values without inplace returns a
    # new frame, so nothing below mutates a filtered slice.
    df_daily = df_daily[df_daily["Lat"].notna()].sort_values(by="Province_State")

    field = next((name for name in test_count_fields if name in df_daily.columns), None)
    assert field is not None, (
        f"{file_name}: expected one of {test_count_fields} in the columns"
    )
    assert df_daily["Province_State"].is_unique, (
        f"{file_name}: duplicate Province_State rows, cannot align with the previous day"
    )

    # Cumulative counts keyed by state, in the same row order as df_daily.
    cumulative = df_daily.set_index("Province_State")[field]

    # Skip the first day (04-12-2020): it has no predecessor to diff against.
    if prev_cumulative is None:
        prev_cumulative = cumulative
        continue

    # Align the previous day on today's states by label, not by row position.
    # A state missing from the previous report yields <NA> instead of a
    # shifted (wrong) difference or a shape error.
    increments = cumulative - prev_cumulative.reindex(cumulative.index)
    prev_cumulative = cumulative

    df_daily = df_daily.assign(**{
        # Int64 (nullable) keeps integer output while still allowing missing values.
        # .array hands over the values positionally; df_daily's own index is unrelated.
        "Number of Tests": increments.astype("Int64").array,
        "Long": df_daily["Long_"],
        "Date": file_name.split(".")[0],
    })
    daily_frames.append(df_daily[header])

# Concatenate once at the end; pd.concat raises on an empty list, so fall back
# to an empty frame with the expected columns when there were fewer than two reports.
df_aggregated = (
    pd.concat(daily_frames, ignore_index=True)
    if daily_frames
    else pd.DataFrame(columns=header)
)
df_aggregated

Unnamed: 0,Province_State,Country_Region,Lat,Long,Date,Number of Tests
0,Alabama,US,32.3182,-86.9023,04-13-2020,2607
1,Alaska,US,61.3707,-152.4044,04-13-2020,-208
2,American Samoa,US,-14.2710,-170.1320,04-13-2020,0
3,Arizona,US,33.7298,-111.4312,04-13-2020,898
4,Arkansas,US,34.9697,-92.3731,04-13-2020,1082
...,...,...,...,...,...,...
32363,Virginia,US,37.7693,-78.1700,11-11-2021,16609
32364,Washington,US,47.4009,-121.4905,11-11-2021,0
32365,West Virginia,US,38.4912,-80.9545,11-11-2021,15610
32366,Wisconsin,US,44.2685,-89.6165,11-11-2021,0


In [4]:
# Write the aggregated table to output_file_path; index=False leaves the row index out of the file.
df_aggregated.to_csv(output_file_path, index=False)