In [427]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required
import sklearn
assert sklearn.__version__ >= "0.20"
import numpy as np
import os
import tarfile
import urllib
import pandas as pd
import urllib.request
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn import preprocessing
from sklearn import metrics
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
from pathlib import Path

In [428]:
data = pd.read_csv("02-19-2023.csv")

In [429]:
geoJSONDict = {

  "type": "FeatureCollection",
  "crs": {
    "type": "name",
    "properties": {
      "name": "urn:ogc:def:crs:OGC:1.3:CRS84"
    }
  },
  "features": []
}


In [430]:
# incident rate = (Cases/population)*100,000
# population = (Cases*100,000)/incident rate
#data["Population"] = (data["Confirmed"] * 1000)/data["Incident_Rate"]
#data["Incident_Rate"] = (data["Deaths"] / data["Population"])* 1000
data["Case_Fatality_Ratio"] = (data["Case_Fatality_Ratio"]*100).round()
data = data[data['Confirmed'] >= data['Deaths']]
data = data.sort_values(by='Incident_Rate', ascending=False)
data.head(50)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
3495,48301.0,Loving,Texas,US,2023-02-20 04:21:05,31.849476,-103.581857,396,1,,,"Loving, Texas, US",234319.526627,25.0
755,2068.0,Denali,Alaska,US,2023-02-20 04:21:05,63.67264,-150.007611,1801,2,,,"Denali, Alaska, US",85884.597043,11.0
765,2180.0,Nome,Alaska,US,2023-02-20 04:21:05,64.903207,-164.03538,7547,7,,,"Nome, Alaska, US",75439.82407,9.0
767,2188.0,Northwest Arctic,Alaska,US,2023-02-20 04:21:05,67.049192,-159.750395,5747,22,,,"Northwest Arctic, Alaska, US",75410.051174,38.0
3468,48247.0,Jim Hogg,Texas,US,2023-02-20 04:21:05,27.044539,-98.696819,3792,23,,,"Jim Hogg, Texas, US",72923.076923,61.0
1106,13053.0,Chattahoochee,Georgia,US,2023-02-20 04:21:05,32.343412,-84.788092,7886,24,,,"Chattahoochee, Georgia, US",72302.191253,30.0
189,,,Faroe Islands,Denmark,2023-02-20 04:21:05,61.8926,-6.9118,34658,28,,,"Faroe Islands, Denmark",70926.020669,8.0
750,2050.0,Bethel,Alaska,US,2023-02-20 04:21:05,60.909805,-159.856183,12837,47,,,"Bethel, Alaska, US",69819.427826,37.0
606,,,,San Marino,2023-02-20 04:21:05,43.9424,12.4578,23521,122,,,San Marino,69305.792916,52.0
17,,,,Austria,2023-02-20 04:21:05,47.5162,14.5501,5863481,21820,,,Austria,65103.493072,37.0


In [431]:
data.loc[data['Country_Region']=='Chad']

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
94,,,,Chad,2023-02-20 04:21:05,15.4542,18.7322,7661,194,,,Chad,46.639874,253.0


In [432]:
data["Deaths"].sum()

6854432

In [433]:
data["Case_Fatality_Ratio"].sum()

574322.0

In [434]:
data["Incident_Rate"].sum()

111222738.00460437

In [435]:
us_data = data.loc[data['Country_Region']=='US']
data = data[data['Country_Region']!='US']

In [436]:
us_data

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
3495,48301.0,Loving,Texas,US,2023-02-20 04:21:05,31.849476,-103.581857,396,1,,,"Loving, Texas, US",234319.526627,25.0
755,2068.0,Denali,Alaska,US,2023-02-20 04:21:05,63.672640,-150.007611,1801,2,,,"Denali, Alaska, US",85884.597043,11.0
765,2180.0,Nome,Alaska,US,2023-02-20 04:21:05,64.903207,-164.035380,7547,7,,,"Nome, Alaska, US",75439.824070,9.0
767,2188.0,Northwest Arctic,Alaska,US,2023-02-20 04:21:05,67.049192,-159.750395,5747,22,,,"Northwest Arctic, Alaska, US",75410.051174,38.0
3468,48247.0,Jim Hogg,Texas,US,2023-02-20 04:21:05,27.044539,-98.696819,3792,23,,,"Jim Hogg, Texas, US",72923.076923,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3753,90051.0,Unassigned,Virginia,US,2020-12-21 13:27:30,,,0,0,,,"Unassigned, Virginia, US",,
3798,90053.0,Unassigned,Washington,US,2023-02-20 04:21:05,,,3303,6,,,"Unassigned, Washington, US",,18.0
3852,90054.0,Unassigned,West Virginia,US,2021-07-31 23:21:38,,,0,0,,,"Unassigned, West Virginia, US",,
3922,90055.0,Unassigned,Wisconsin,US,2023-02-20 04:21:05,,,0,0,,,"Unassigned, Wisconsin, US",,


In [437]:
#data struct to store states
# for every state in us_data: 
# get average of Case_Fatality_Ratio, median of lat and long, country
# insert row into data

states =  us_data.Province_State.values.tolist()
states = list(dict.fromkeys(states))

for state in states:
    tempDf = us_data.loc[us_data['Province_State']==state]
    stateDf = tempDf.groupby('Province_State', as_index=False).Case_Fatality_Ratio.sum()
    lat = tempDf.groupby('Province_State', as_index=False).Lat.median().iloc[0]['Lat']
    long = tempDf.groupby('Province_State', as_index=False).Long_.median().iloc[0]['Long_']
  #  country = "US"
    stateDf["Lat"] = lat
    stateDf["Long_"] = long
   # stateDf["Country_Region"] = country
    stateDf
    data = data.append(stateDf, ignore_index=True)

  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append

  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append

In [438]:
clean_data = data.drop(['FIPS', 'Admin2','Province_State','Country_Region','Last_Update','Incident_Rate','Recovered','Active','Combined_Key','Deaths','Confirmed'], axis=1)
clean_data = clean_data.dropna()
clean_data.loc[~(clean_data==0.0).all(axis=1)]

Unnamed: 0,Lat,Long_,Case_Fatality_Ratio
0,61.892600,-6.911800,8.0
1,43.942400,12.457800,52.0
2,47.516200,14.550100,37.0
3,46.151200,14.995500,53.0
4,4.535300,114.727700,8.0
...,...,...,...
784,44.492680,-69.654828,1552.0
785,38.904178,-77.016560,80.0
786,15.097900,145.673900,30.0
787,18.335800,-64.896300,53.0


In [439]:
clean_data = clean_data.sort_values(by='Case_Fatality_Ratio', ascending=False)
clean_data.head(50)

Unnamed: 0,Lat,Long_,Case_Fatality_Ratio
733,31.783209,-98.543035,47602.0
735,32.749765,-83.68845,32006.0
763,38.428825,-92.57215,19266.0
744,37.375699,-78.173251,18537.0
739,29.459336,-82.31831,16503.0
754,38.481194,-98.086073,16052.0
775,42.036571,-93.46499,15601.0
736,37.579299,-84.869131,15309.0
771,43.987942,-85.049805,14008.0
766,41.22781,-98.527989,13775.0


In [440]:
#The mistake made here is Im rendering EACH AND EVERY covid case 
#Instead of that, create an object with long, lat and total case count.
#Hence we should only have approx 4000 json objects instead of 600 million+
#TODO: Figure out how mapbox is gonna render it



for i in range(len(clean_data)):
    
    longitude = clean_data.iloc[i]['Long_']
    latitude = clean_data.iloc[i]['Lat']
    data_points = int(clean_data.iloc[i]['Case_Fatality_Ratio'])
    
    
    for x in range(data_points):
        geometry = {
        "geometry": {
          "coordinates": [
            longitude,
            latitude
          ]
        }
      }
        geoJSONDict["features"].append(geometry)
    

json_object = json.dumps(geoJSONDict, separators=(',', ":"))
with open("../server/case_fatality_ratio.json", "w") as outfile:
    outfile.write(json_object)
    