In [135]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required
import sklearn
assert sklearn.__version__ >= "0.20"
import numpy as np
import os
import tarfile
import urllib
import pandas as pd
import urllib.request
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn import preprocessing
from sklearn import metrics
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
from pathlib import Path

In [164]:
data = pd.read_csv("02-19-2023.csv")

In [165]:
geoJSONDict = {

  "type": "FeatureCollection",
  "crs": {
    "type": "name",
    "properties": {
      "name": "urn:ogc:def:crs:OGC:1.3:CRS84"
    }
  },
  "features": []
}


In [166]:
# incident rate = (Cases/population)*100,000
# population = (Cases*100,000)/incident rate
data["Population"] = (data["Confirmed"] * 100000)/data["Incident_Rate"]
data["Case_Fatality_Ratio"] = (data["Case_Fatality_Ratio"]*100).round()
data = data.sort_values(by='Case_Fatality_Ratio', ascending=False)
data.head(50)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Population
1386,90017.0,Unassigned,Illinois,US,2023-02-20 04:21:05,,,65,5038,,,"Unassigned, Illinois, US",,775077.0,
2936,90040.0,Unassigned,Oklahoma,US,2023-02-20 04:21:05,,,152,2173,,,"Unassigned, Oklahoma, US",,142961.0,
367,,,,"Korea, North",2023-02-20 04:21:05,40.3399,127.5101,1,6,,,"Korea, North",0.003879,60000.0,25778815.0
1075,90012.0,Unassigned,Florida,US,2023-02-20 04:21:05,,,3289,2124,,,"Unassigned, Florida, US",,6458.0,
33,,,Unknown,Belgium,2023-02-20 04:21:05,,,63090,33663,,,"Unknown, Belgium",,5336.0,
2108,90027.0,Unassigned,Minnesota,US,2023-02-20 04:21:05,,,902,417,,,"Unassigned, Minnesota, US",,4623.0,
381,,,,MS Zaandam,2023-02-20 04:21:05,,,9,2,,,MS Zaandam,,2222.0,
3119,72999.0,Unassigned,Puerto Rico,US,2023-02-20 04:21:05,,,26318,5771,,,"Unassigned, Puerto Rico, US",,2193.0,
4013,,,,Yemen,2023-02-20 04:21:05,15.552727,48.516388,11945,2159,,,Yemen,40.048994,1807.0,29825968.0
994,90008.0,Unassigned,Colorado,US,2023-02-20 04:21:05,,,256,23,,,"Unassigned, Colorado, US",,898.0,


In [167]:
data = data.drop([1386,2936,367])
data

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Population
1075,90012.0,Unassigned,Florida,US,2023-02-20 04:21:05,,,3289,2124,,,"Unassigned, Florida, US",,6458.0,
33,,,Unknown,Belgium,2023-02-20 04:21:05,,,63090,33663,,,"Unknown, Belgium",,5336.0,
2108,90027.0,Unassigned,Minnesota,US,2023-02-20 04:21:05,,,902,417,,,"Unassigned, Minnesota, US",,4623.0,
381,,,,MS Zaandam,2023-02-20 04:21:05,,,9,2,,,MS Zaandam,,2222.0,
3119,72999.0,Unassigned,Puerto Rico,US,2023-02-20 04:21:05,,,26318,5771,,,"Unassigned, Puerto Rico, US",,2193.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3852,90054.0,Unassigned,West Virginia,US,2021-07-31 23:21:38,,,0,0,,,"Unassigned, West Virginia, US",,,
3922,90055.0,Unassigned,Wisconsin,US,2023-02-20 04:21:05,,,0,0,,,"Unassigned, Wisconsin, US",,,
3954,90056.0,Unassigned,Wyoming,US,2023-01-08 23:21:00,,,0,0,,,"Unassigned, Wyoming, US",,,
3991,,,Channel Islands,United Kingdom,2023-02-20 04:21:05,49.3723,-2.3644,0,0,,,"Channel Islands, United Kingdom",0.0,,


In [168]:
us_data = data.loc[data['Country_Region']=='US']
data = data[data['Country_Region']!='US']
data

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Population
33,,,Unknown,Belgium,2023-02-20 04:21:05,,,63090,33663,,,"Unknown, Belgium",,5336.0,
381,,,,MS Zaandam,2023-02-20 04:21:05,,,9,2,,,MS Zaandam,,2222.0,
4013,,,,Yemen,2023-02-20 04:21:05,15.552727,48.516388,11945,2159,,,Yemen,40.048994,1807.0,29825968.0
422,,,Michoacan,Mexico,2023-02-20 04:21:05,19.566500,-101.706800,114366,9026,,,"Michoacan, Mexico",2370.082818,789.0,4825401.0
641,,,,Sudan,2023-02-20 04:21:05,12.862800,30.217600,63775,5011,,,Sudan,145.441421,786.0,43849269.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,,,Unknown,Mexico,2023-02-20 04:21:05,,,0,0,,,"Unknown, Mexico",,,
463,,,Unknown,Netherlands,2023-02-20 04:21:05,,,0,2,,,"Unknown, Netherlands",,,
639,,,Unknown,Spain,2023-02-20 04:21:05,,,0,0,,,"Unknown, Spain",,,
3991,,,Channel Islands,United Kingdom,2023-02-20 04:21:05,49.372300,-2.364400,0,0,,,"Channel Islands, United Kingdom",0.000000,,


In [160]:
us_data

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Population
1075,90012.0,Unassigned,Florida,US,2023-02-20 04:21:05,,,3289,2124,,,"Unassigned, Florida, US",,6458.0,
2108,90027.0,Unassigned,Minnesota,US,2023-02-20 04:21:05,,,902,417,,,"Unassigned, Minnesota, US",,4623.0,
3119,72999.0,Unassigned,Puerto Rico,US,2023-02-20 04:21:05,,,26318,5771,,,"Unassigned, Puerto Rico, US",,2193.0,
994,90008.0,Unassigned,Colorado,US,2023-02-20 04:21:05,,,256,23,,,"Unassigned, Colorado, US",,898.0,
2483,32029.0,Storey,Nevada,US,2023-02-20 04:21:05,39.448755,-119.525021,196,14,,,"Storey, Nevada, US",4753.820034,714.0,4123.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3243,90046.0,Unassigned,South Dakota,US,2023-02-20 04:21:05,,,0,0,,,"Unassigned, South Dakota, US",,,
3753,90051.0,Unassigned,Virginia,US,2020-12-21 13:27:30,,,0,0,,,"Unassigned, Virginia, US",,,
3852,90054.0,Unassigned,West Virginia,US,2021-07-31 23:21:38,,,0,0,,,"Unassigned, West Virginia, US",,,
3922,90055.0,Unassigned,Wisconsin,US,2023-02-20 04:21:05,,,0,0,,,"Unassigned, Wisconsin, US",,,


In [169]:
#data struct to store states
# for every state in us_data: 
# get average of Case_Fatality_Ratio, median of lat and long, country
# insert row into data

states =  us_data.Province_State.values.tolist()
states = list(dict.fromkeys(states))

for state in states:
    tempDf = us_data.loc[us_data['Province_State']==state]
    stateDf = tempDf.groupby('Province_State', as_index=False).Case_Fatality_Ratio.max()
    lat = tempDf.groupby('Province_State', as_index=False).Lat.median().iloc[0]['Lat']
    long = tempDf.groupby('Province_State', as_index=False).Long_.median().iloc[0]['Long_']
    country = "US"
    stateDf["Lat"] = lat
    stateDf["Long_"] = long
    stateDf["Country_Region"] = country
    stateDf
    data = data.append(stateDf, ignore_index=True)

  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append

  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)
  data = data.append(stateDf, ignore_index=True)


In [171]:
sampleDf = data.loc[data['Province_State']=='Florida']
data

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Population
0,,,Unknown,Belgium,2023-02-20 04:21:05,,,63090.0,33663.0,,,"Unknown, Belgium",,5336.0,
1,,,,MS Zaandam,2023-02-20 04:21:05,,,9.0,2.0,,,MS Zaandam,,2222.0,
2,,,,Yemen,2023-02-20 04:21:05,15.552727,48.516388,11945.0,2159.0,,,Yemen,40.048994,1807.0,29825968.0
3,,,Michoacan,Mexico,2023-02-20 04:21:05,19.566500,-101.706800,114366.0,9026.0,,,"Michoacan, Mexico",2370.082818,789.0,4825401.0
4,,,,Sudan,2023-02-20 04:21:05,12.862800,30.217600,63775.0,5011.0,,,Sudan,145.441421,786.0,43849269.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,,,Virgin Islands,US,,18.335800,-64.896300,,,,,,,53.0,
791,,,American Samoa,US,,-14.271000,-170.132000,,,,,,,41.0,
792,,,Northern Mariana Islands,US,,15.097900,145.673900,,,,,,,30.0,
793,,,Diamond Princess,US,,,,,,,,,,0.0,


In [174]:
clean_data = data.drop(['FIPS', 'Admin2','Province_State','Country_Region','Last_Update','Deaths','Recovered','Active','Combined_Key','Incident_Rate','Confirmed','Population'], axis=1)
clean_data = clean_data.dropna()
clean_data.loc[~(clean_data==0.0).all(axis=1)]

Unnamed: 0,Lat,Long_,Confirmed,Deaths,Case_Fatality_Ratio
2,15.552727,48.516388,11945.0,2159.0,1807.0
3,19.566500,-101.706800,114366.0,9026.0,789.0
4,12.862800,30.217600,63775.0,5011.0,786.0
5,19.041400,-98.206300,218997.0,16731.0,764.0
6,19.173800,-96.134200,234451.0,17141.0,731.0
...,...,...,...,...,...
721,50.973900,5.342000,348771.0,0.0,0.0
722,-7.109500,177.649300,2805.0,0.0,0.0
723,35.745200,95.995600,782.0,0.0,0.0
724,37.269200,106.165500,1276.0,0.0,0.0


In [163]:
clean_data = clean_data.sort_values(by='Case_Fatality_Ratio', ascending=False)
clean_data.head(50)

Unnamed: 0,Lat,Long_,Confirmed,Deaths,Recovered,Case_Fatality_Ratio


In [151]:
#The mistake made here is Im rendering EACH AND EVERY covid case 
#Instead of that, create an object with long, lat and total case count.
#Hence we should only have approx 4000 json objects instead of 600 million+
#TODO: Figure out how mapbox is gonna render it



for i in range(len(clean_data)):
    
    longitude = clean_data.iloc[i]['Long_']
    latitude = clean_data.iloc[i]['Lat']
    data_points = int(clean_data.iloc[i]['Case_Fatality_Ratio'])
    
    
    for x in range(data_points):
        geometry = {
        "geometry": {
          "coordinates": [
            longitude,
            latitude
          ]
        }
      }
        geoJSONDict["features"].append(geometry)
    

json_object = json.dumps(geoJSONDict, separators=(',', ":"))
with open("../server/case_fatality_ratio.json", "w") as outfile:
    outfile.write(json_object)
    

In [None]:
csv_total