In [1]:
import csv
import json
import pandas as pd
import numpy as np
from pprint import pprint
from collections import defaultdict, Counter
from dateutil.parser import *

def get_lineitem(d):
    """Convert one raw CSV row (a dict of strings) into a typed record.

    Each numeric column is read from ``d`` and passed through ``int`` or
    ``float`` as listed in ``columns`` below. The conversion is
    all-or-nothing: if any single value raises ``ValueError`` (for example an
    empty string), every numeric field in the result is set to ``0``.

    Only ``ValueError`` is handled; a missing key raises ``KeyError``.

    Returns a dict whose first key is ``"demogLabel"`` (copied unchanged from
    ``d``), followed by the numeric fields in the order listed in ``columns``.
    """
    # (column name, converter) pairs, in the order they appear in the output.
    columns = (
        ("popUS", int),
        ("demogPop", int),
        ("percentPop", float),
        ("cases", int),
        ("totalcases", int),
        ("percentCases", float),
        ("availableCases", int),
        ("missingCases", int),
        ("caserate", float),
        ("deaths", int),
        ("totaldeaths", int),
        ("percentDeaths", float),
        ("availableDeaths", int),
        ("missingDeaths", int),
        ("deathrate", float),
    )

    try:
        numbers = {name: convert(d[name]) for name, convert in columns}
    except ValueError:
        # One unparseable value blanks the whole row to zeros.
        numbers = dict.fromkeys((name for name, _ in columns), 0)

    record = {"demogLabel": d['demogLabel']}
    record.update(numbers)
    return record

def demogdata(fn="USDemogData.csv", out_fn="../nationalDemogdata.json"):
    """Group the demographic CSV by race and age label and dump it as JSON.

    Every data row is turned into a dict keyed by the header row (values
    stripped of surrounding whitespace). Rows whose ``demographicVar`` is
    ``"Race"`` or ``"Age"`` are converted with ``get_lineitem`` and appended to
    the list for their ``demogLabel``; all other rows are ignored.

    The JSON written to ``out_fn`` has the shape::

        {"Race": [{label: [item, ...], ...}], "Age": [{label: [item, ...], ...}]}

    Parameters
    ----------
    fn : str
        Path of the input CSV. Its first row is used as the header.
    out_fn : str
        Path of the JSON file to write. Defaults to the location that was
        previously hard-coded.

    Raises
    ------
    StopIteration
        If the input file has no header row.
    KeyError
        If a non-blank row lacks ``demographicVar`` or a column that
        ``get_lineitem`` reads.
    """
    grouped = {"Race": defaultdict(list), "Age": defaultdict(list)}

    # newline="" lets the csv module do its own line-ending handling, so a
    # quoted field containing a line break is parsed as one field.
    with open(fn, "r", newline="") as fp:
        reader = csv.reader(fp)
        header = next(reader)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            d = {k: v.strip() for k, v in zip(header, row)}

            by_label = grouped.get(d['demographicVar'])
            if by_label is None:
                # Neither "Race" nor "Age": nothing to record for this row.
                continue
            by_label[d['demogLabel']].append(get_lineitem(d))

    # Each per-variable mapping is wrapped in a one-element list to keep the
    # JSON layout that existing consumers of this file read.
    output = {variable: [dict(by_label)] for variable, by_label in grouped.items()}

    with open(out_fn, "w") as fp:
        json.dump(output, fp, indent=2)
                
def _rows_for_variable(demog_df, variable, exclude):
    """Return the rows of ``demog_df`` for one demographic variable.

    Keeps rows whose ``demographicVar`` equals ``variable``, drops rows whose
    ``demogLabel`` matches any entry of ``exclude`` (entries are joined with
    ``|`` and matched as a regular expression, as a substring search), drops
    rows with a missing label, and drops rows with a missing ``caserate``.
    """
    is_variable = demog_df['demographicVar'] == variable
    labels = demog_df['demogLabel']
    if exclude:
        # na=True marks unlabeled rows as excluded, so the mask is purely
        # boolean and never carries NaN into the indexing step.
        is_excluded = labels.str.contains('|'.join(exclude), na=True).astype(bool)
    else:
        # An empty pattern would match every label, so skip the regex.
        is_excluded = labels.isna()
    return demog_df[is_variable & ~is_excluded].dropna(subset=['caserate'])


def _to_builtin(value):
    """Turn a numpy scalar into the equivalent Python number for ``json``."""
    return value.item() if isinstance(value, np.generic) else value


def caserate_descriptives(demog_df,
                          exclude_race=("NHPI", "Unknown Race"),
                          exclude_age=("Unknown",)):
    """Summarise the extreme case rates by race and by age group.

    Parameters
    ----------
    demog_df : pandas.DataFrame
        Must provide the columns ``demographicVar``, ``demogLabel`` and
        ``caserate``.
    exclude_race, exclude_age : sequence of str
        Label patterns left out of the race / age comparison.

    Returns
    -------
    dict
        ``{"Race": [race], "Age": [age]}`` where ``race`` maps the labels with
        the highest and lowest case rate to those rates, and ``age`` maps the
        highest, second-highest, second-lowest and lowest age labels to theirs
        (inserted in that order).

    Raises
    ------
    ValueError
        If no race rows, or fewer than two age rows, remain after filtering.
    """
    race_rows = _rows_for_variable(demog_df, "Race", exclude_race)
    if race_rows.empty:
        raise ValueError("no Race rows with a case rate left after exclusions")
    race_rates = race_rows['caserate']
    race_descriptives = {}
    # Highest first, then lowest; with a single group both land on one key.
    for row_id in (race_rates.idxmax(), race_rates.idxmin()):
        race_descriptives[race_rows.loc[row_id, 'demogLabel']] = _to_builtin(race_rates.loc[row_id])

    age_rows = _rows_for_variable(demog_df, "Age", exclude_age)
    if len(age_rows) < 2:
        raise ValueError("need at least two Age rows with a case rate after exclusions")
    # Sorting whole rows keeps each label attached to its rate, so tied rates
    # still resolve to distinct labels instead of a float-equality lookup.
    age_rows = age_rows.sort_values('caserate', kind='stable')
    age_descriptives = {}
    # Positions in ascending order: highest, second highest, second lowest, lowest.
    for position in (-1, -2, 1, 0):
        label = age_rows['demogLabel'].iloc[position]
        age_descriptives[label] = _to_builtin(age_rows['caserate'].iloc[position])

    return {"Race": [race_descriptives], "Age": [age_descriptives]}


if __name__=="__main__":

    # Regenerate the grouped JSON, then the descriptive summary, from the CSV.
    demogdata()
    demog_df = pd.read_csv("USDemogData.csv")
    demog_all = caserate_descriptives(demog_df)

    with open("../../../src/components/Pre-Processed Data/demogDescriptives.json", "w") as fp:
        json.dump(demog_all, fp, indent=2)

In [2]:
# exclude_age = ["Unknown"]
# age_sorted = np.sort(demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False])]['caserate'])
    
# age_sorted
# highest_age = demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False]) &
#                  (demog_df['caserate'] == age_sorted[-1])]['demogLabel'].values[0]

# second_highest_age = demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False]) &
#                  (demog_df['caserate'] == age_sorted[-2])]['demogLabel'].values[0]

# second_lowest_age = demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False]) &
#                  (demog_df['caserate'] == age_sorted[1])]['demogLabel'].values[0]

# lowest_age = demog_df[(demog_df['demographicVar'] == "Age") & 
#                       (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude_age)) == False]) &
#                  (demog_df['caserate'] == age_sorted[0])]['demogLabel'].values[0]


# age_descriptives = defaultdict(dict)

# age_descriptives[highest_age] = age_sorted[-1]
# age_descriptives[second_highest_age] = age_sorted[-2]
# age_descriptives[second_lowest_age] = age_sorted[1]
# age_descriptives[lowest_age] = age_sorted[0]
# age_descriptives

In [3]:
#    national_report.insert_one({"Title": "racedataAll", "racedataAll": data})


In [4]:
# demog_df = pd.read_csv("USDemogData.csv")

In [5]:

# demog_df #'caserate'

In [6]:
# exclude = ["NHPI", "Unknown Race"]
# max_id = demog_df[(demog_df['demographicVar'] == "Race") & 
#                   (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False])]['caserate'].idxmax()
# max_group = demog_df[demog_df['demographicVar'] == "Race"]['demogLabel'][max_id]
# min_id = demog_df[(demog_df['demographicVar'] == "Race") & 
#                   (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False])]['caserate'].idxmin()
# min_group = demog_df[demog_df['demographicVar'] == "Race"]['demogLabel'][min_id]
# max_value = max(demog_df[(demog_df['demographicVar'] == "Race") & 
#                   (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False])]['caserate'])
# min_value = min(demog_df[(demog_df['demographicVar'] == "Race") & 
#                   (demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False])]['caserate'])

# demog_descriptives = defaultdict(dict)
# demog_descriptives[max_group] = max_value
# demog_descriptives[min_group] = min_value

# with open(f"../demogDescriptives.json", "w") as fp:
#     json.dump(demog_descriptives, fp, indent=2)

In [7]:

# demog_df['demogLabel'][demog_df['demogLabel'].str.contains('|'.join(exclude)) == False]