In [119]:
import csv
import json
import pandas as pd
import numpy as np
from pprint import pprint
from collections import defaultdict, Counter
from dateutil.parser import *

def get_lineitem(d):
    """Build a typed line item from one CSV row given as a dict of strings.

    Each numeric column listed below is converted with ``int`` or ``float``
    in the listed order. If any conversion raises ``ValueError``, every
    numeric field of the returned item is set to ``0`` (not only the one that
    failed). A column missing from ``d`` raises ``KeyError``.

    Args:
        d: Mapping of column name to raw string value; must also contain
            ``demogLabel``, which is copied through unchanged.

    Returns:
        dict with ``demogLabel`` first, followed by the numeric fields in the
        order they are listed in ``converters``.
    """
    # (column name, converter) pairs; the order fixes both the parse order
    # and the key order of the returned dict.
    converters = (
        ("popUS", int),
        ("demogPop", int),
        ("percentPop", float),
        ("cases", int),
        ("totalcases", int),
        ("percentCases", float),
        ("availableCases", int),
        ("missingCases", int),
        ("caserate", float),
        ("deaths", int),
        ("totaldeaths", int),
        ("percentDeaths", float),
        ("availableDeaths", int),
        ("missingDeaths", int),
        ("deathrate", float),
    )

    try:
        numbers = {column: convert(d[column]) for column, convert in converters}
    except ValueError:
        # One unparseable value blanks the whole row to integer zeros.
        numbers = {column: 0 for column, _ in converters}

    lineitem = {"demogLabel": d['demogLabel']}
    lineitem.update(numbers)
    return lineitem

def demogdata(fn="USDemogData.csv", out_fn="../nationalDemogdata.json"):
    """Convert the national demographics CSV into a nested JSON file.

    Rows whose ``demographicVar`` is "Race" or "Age" are parsed with
    ``get_lineitem`` and grouped by their ``demogLabel``; rows with any other
    ``demographicVar`` and completely blank lines are skipped. The file
    written has the shape::

        {"Race": [{label: [lineitem, ...]}], "Age": [{label: [lineitem, ...]}]}

    Args:
        fn: Path of the CSV file to read; its first row must be the header.
        out_fn: Path of the JSON file to write. The default is the location
            this function wrote to before the parameter existed.

    Raises:
        StopIteration: if the CSV file has no header row.
        KeyError: if a non-blank row lacks ``demographicVar`` or, for a
            Race/Age row, a column that ``get_lineitem`` reads.
    """
    # One label -> list-of-lineitems mapping per supported demographic
    # variable; insertion order fixes the key order of the JSON output.
    grouped = {"Race": defaultdict(list), "Age": defaultdict(list)}

    # newline="" is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(fn, "r", newline="") as fp:
        reader = csv.reader(fp)
        header = next(reader)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; nothing to parse.
                continue
            d = {k: v.strip() for k, v in zip(header, row)}

            bucket = grouped.get(d['demographicVar'])
            if bucket is None:
                # Not a Race/Age row: skip it without parsing its numbers.
                continue
            bucket[d['demogLabel']].append(get_lineitem(d))

    # Each variable maps to a single-element list holding its label mapping.
    output = {name: [dict(by_label)] for name, by_label in grouped.items()}

    with open(out_fn, "w") as fp:
        json.dump(output, fp, indent=2)
                
def select_demographic(demog_df, demographic_var, exclude_labels=()):
    """Return the rows of one demographic variable, minus excluded labels.

    Args:
        demog_df: DataFrame with at least the ``demographicVar``,
            ``demogLabel`` and ``caserate`` columns.
        demographic_var: Value of ``demographicVar`` to keep, e.g. "Race".
        exclude_labels: Label fragments. A row is dropped when its
            ``demogLabel`` contains any of them; the fragments are joined
            with ``|`` and matched as a regular expression by
            ``Series.str.contains``. An empty sequence excludes nothing.

    Returns:
        A new DataFrame with the matching rows that have both a label and a
        case rate. The index of ``demog_df`` is preserved.
    """
    labels = demog_df['demogLabel']
    # Explicit boolean mask: rows of the requested variable that have a label.
    keep = (demog_df['demographicVar'] == demographic_var) & labels.notna()
    if len(exclude_labels) > 0:
        # na=False keeps the result strictly boolean so it can be negated.
        excluded = labels.str.contains('|'.join(exclude_labels), na=False)
        keep = keep & ~excluded
    return demog_df.loc[keep].dropna(subset=['caserate'])


if __name__ == "__main__":

    # Write the full nested JSON first, then derive the summary figures.
    demogdata()
    demog_df = pd.read_csv("USDemogData.csv")

    # Race: the groups with the highest and the lowest case rate.
    exclude_race = ["NHPI", "Unknown Race"]
    race_rows = select_demographic(demog_df, "Race", exclude_race)
    max_race = race_rows['caserate'].idxmax()
    min_race = race_rows['caserate'].idxmin()

    race_descriptives = {}
    for row_id in (max_race, min_race):
        # float() turns the NumPy scalar into a plain JSON-serialisable number.
        race_descriptives[race_rows.at[row_id, 'demogLabel']] = float(
            race_rows.at[row_id, 'caserate'])

    # Age: the two highest and the two lowest case rates. Sorting the rows
    # once keeps each label paired with its own rate, even when rates tie.
    exclude_age = ["Unknown"]
    age_rows = select_demographic(demog_df, "Age", exclude_age).sort_values(
        'caserate', kind='stable')
    age_labels = age_rows['demogLabel'].tolist()
    age_rates = age_rows['caserate'].tolist()

    age_descriptives = {}
    # Insertion order: highest, second highest, second lowest, lowest.
    for position in (-1, -2, 1, 0):
        # Skip positions that do not exist when fewer than two rows remain.
        if -len(age_rates) <= position < len(age_rates):
            age_descriptives[age_labels[position]] = float(age_rates[position])

    # Each section is a single-element list holding its label -> rate mapping.
    demog_all = {"Race": [race_descriptives], "Age": [age_descriptives]}

    with open("../../../src/components/Pre-Processed Data/demogDescriptives.json", "w") as fp:
        json.dump(demog_all, fp, indent=2)

In [118]:
# exclude_age = ["Unknown"]
# age_sorted = np.sort(demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False])]['caserate'])
    
# age_sorted
# highest_age = demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False]) &
#                  (demog_df['caserate'] == age_sorted[-1])]['demogLabel'].values[0]

# second_highest_age = demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False]) &
#                  (demog_df['caserate'] == age_sorted[-2])]['demogLabel'].values[0]

# second_lowest_age = demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False]) &
#                  (demog_df['caserate'] == age_sorted[1])]['demogLabel'].values[0]

# lowest_age = demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False]) &
#                  (demog_df['caserate'] == age_sorted[0])]['demogLabel'].values[0]


# age_descriptives = defaultdict(dict)

# age_descriptives[highest_age] = age_sorted[-1]
# age_descriptives[second_highest_age] = age_sorted[-2]
# age_descriptives[second_lowest_age] = age_sorted[1]
# age_descriptives[lowest_age] = age_sorted[0]
# age_descriptives

defaultdict(dict,
            {'18 - 29 Years': 6447.87,
             '85+ Years': 6233.21,
             '5 - 17 Years': 2486.28,
             '0 - 4 Years': 1418.28})

In [None]:
#    national_report.insert_one({"Title": "racedataAll", "racedataAll": data})


In [19]:
# demog_df = pd.read_csv("USDemogData.csv")

In [24]:

# demog_df #'caserate'

Unnamed: 0,date,demographicVar,demographic,demogLabel,popUS,demogPop,percentPop,cases,totalcases,percentCases,availableCases,missingCases,caserate,deaths,totaldeaths,percentDeaths,availableDeaths,missingDeaths,deathrate
0,2021-01-05,Race,Hispanic,Hispanic,327167434,60572237,18.45,1690195,15294370,21.5,-9999,-9999,2790.38,26368,250603,13.5,-9999,-9999,43.53
1,2021-01-05,Race,Non-Hispanic American Natives,American Natives,327167434,2434908,0.74,98965,15294370,1.3,-9999,-9999,4064.42,1877,250603,1.0,-9999,-9999,77.09
2,2021-01-05,Race,Non-Hispanic Asian,Asian,327167434,18905879,5.76,265100,15294370,3.4,-9999,-9999,1402.21,8173,250603,4.2,-9999,-9999,43.23
3,2021-01-05,Race,Non-Hispanic African American,African American,327167434,41147488,12.54,991989,15294370,12.6,-9999,-9999,2410.81,32356,250603,16.6,-9999,-9999,78.63
4,2021-01-05,Race,Non-Hispanic NHPI,NHPI,327167434,595908,0.18,28884,15294370,0.4,-9999,-9999,4847.06,424,250603,0.2,-9999,-9999,71.15
5,2021-01-05,Race,Non-Hispanic White,White,327167434,197309822,60.11,4330999,15294370,55.2,-9999,-9999,2195.02,118452,250603,60.6,-9999,-9999,60.03
6,2021-01-05,Race,Unknown Race,Unknown Race,-9999,-9999,-9999.0,7447737,15294370,-9999.0,51,49,-9999.0,55138,250603,-9999.0,77,23,-9999.0
7,2021-01-05,Age,0 - 4 Years,0 - 4 Years,327167434,19767670,6.04,280360,15294370,1.8,-9999,-9999,1418.28,66,250603,0.0,-9999,-9999,0.33
8,2021-01-05,Age,5 - 17 Years,5 - 17 Years,327167434,53661722,16.4,1334182,15294370,8.8,-9999,-9999,2486.28,153,250603,0.1,-9999,-9999,0.29
9,2021-01-05,Age,18 - 29 Years,18 - 29 Years,327167434,53715647,16.42,3463514,15294370,22.8,-9999,-9999,6447.87,1296,250603,0.5,-9999,-9999,2.41


In [54]:
# exclude = ["NHPI", "Unknown Race"]
# max_id = demog_df[(demog_df['demographicVar'] == "Race") & 
#                   (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False])]['caserate'].idxmax()
# max_group = demog_df[demog_df['demographicVar'] == "Race"]['demogLabel'][max_id]
# min_id = demog_df[(demog_df['demographicVar'] == "Race") & 
#                   (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False])]['caserate'].idxmin()
# min_group = demog_df[demog_df['demographicVar'] == "Race"]['demogLabel'][min_id]
# max_value = max(demog_df[(demog_df['demographicVar'] == "Race") & 
#                   (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False])]['caserate'])
# min_value = min(demog_df[(demog_df['demographicVar'] == "Race") & 
#                   (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False])]['caserate'])

# demog_descriptives = defaultdict(dict)
# demog_descriptives[max_group] = max_value
# demog_descriptives[min_group] = min_value

# with open(f"../demogDescriptives.json", "w") as fp:
#     json.dump(demog_descriptives, fp, indent=2)

American Natives 4064.42 Asian 1402.21


In [48]:

# demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False]

0             Hispanic
1     American Natives
2                Asian
3     African American
5                White
7          0 - 4 Years
8         5 - 17 Years
9        18 - 29 Years
10       30 - 39 Years
11       40 - 49 Years
12       50 - 64 Years
13       65 - 74 Years
14       75 - 84 Years
15           85+ Years
16             Unknown
Name: demogLabel, dtype: object