In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import json

pd.set_option('display.max_rows', 500)

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Understanding

* RKI, webscrape (webscraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* John Hopkins (GITHUB) https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retreive data https://npgeo-corona-npgeo-de.hub.arcgis.com/

In [2]:
data_path_git = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw = pd.read_csv(data_path_git)

In [3]:
pd_raw

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/1/22,6/2/22,6/3/22,6/4/22,6/5/22,6/6/22,6/7/22,6/8/22,6/9/22,6/10/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,180419,180520,180584,180615,180615,180688,180741,180784,180864,180864
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,276221,276221,276310,276342,276401,276415,276468,276518,276583,276638
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,265887,265889,265889,265889,265897,265900,265904,265909,265920,265925
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,42894,42894,43067,43067,43067,43067,43067,43224,43224,43224
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,99761,99761,99761,99761,99761,99761,99761,99761,99761,99761
5,,Antarctica,-71.9499,23.347,0,0,0,0,0,0,...,11,11,11,11,11,11,11,11,11,11
6,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,8253,8295,8295,8378,8378,8378,8378,8378,8406,8479
7,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,9230573,9230573,9230573,9230573,9276618,9276618,9276618,9276618,9276618,9276618
8,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,422963,422963,422963,422963,422963,423006,423006,423006,423006,423006
9,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,133582,134286,134286,134286,134286,136860,137664,138407,138919,139894


In [8]:
data_path_rki = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/rki_covid_data.csv'
pd_raw_rki = pd.read_csv(data_path_rki)
pd_raw_rki

Unnamed: 0,Federal State,Electronically Submitted Cases,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,number,difference to the�previous day,cases,7-day,deaths
1,,,,in the,incidence,
2,,,,last,,
3,,,,7 days,,
4,Baden-Wuerttemberg,3714108,7826,26526,238.9,16182
5,Bavaria,4956999,10994,40540,308.5,24.167
6,Berlin,1061611,2.237,7585,207,4628
7,Brandenburg,797080,1290,4432,175.1,5688
8,Bremen,204310,752,2545,374.2,780
9,Hamburg,594566,2388,6.135,331.2,2679


In [9]:
page = requests.get('https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html')

In [10]:
soup = BeautifulSoup(page.content, 'html.parser')

In [11]:
html_table = soup.find('table')
all_rows = html_table.find_all('tr')

In [12]:
final_data_list=[]

In [13]:
for pos, rows in enumerate(all_rows):
    col_list = [each_col.get_text(strip=True) for each_col in rows.find_all('td')]
    final_data_list.append(col_list)

In [14]:
final_data_list

[[],
 [],
 ['Baden-Württem\xadberg', '3.954.862', '16.465', '75.703', '681,8', '16.393'],
 ['Bayern', '5.278.363', '21.178', '98.294', '748,0', '24.537'],
 ['Berlin', '1.128.352', '3.500', '18.725', '511,0', '4.661'],
 ['Branden\xadburg', '839.781', '2.548', '12.312', '486,4', '5.728'],
 ['Bremen', '223.349', '1.117', '4.944', '726,9', '795'],
 ['Hamburg', '652.554', '2.965', '11.730', '633,2', '2.738'],
 ['Hessen', '2.102.381', '11.982', '55.799', '886,7', '10.442'],
 ['Meck\xadlenburg-Vor\xadpommern',
  '526.217',
  '1.731',
  '9.348',
  '580,3',
  '2.271'],
 ['Nieder\xadsachsen', '2.728.132', '13.990', '74.025', '924,9', '9.758'],
 ['Nord\xadrhein-West\xadfalen',
  '5.909.232',
  '22.940',
  '125.214',
  '698,5',
  '25.873'],
 ['Rhein\xadland-Pfalz', '1.283.407', '5.771', '26.268', '640,9', '5.778'],
 ['Saarland', '351.428', '1.874', '9.468', '962,2', '1.731'],
 ['Sachsen', '1.583.049', '3.630', '17.159', '423,0', '15.620'],
 ['Sachsen-Anhalt', '756.368', '1.865', '9.506', '435,9', 

In [15]:
pd.DataFrame(final_data_list).dropna().rename(columns={0:'State',1:'Total cases',2:'Difference to the previous day',3:'Cases in the last',
                                                      4:'7-day incidence',5:'Deaths'})

Unnamed: 0,State,Total cases,Difference to the previous day,Cases in the last,7-day incidence,Deaths
2,Baden-Württem­berg,3.954.862,16.465,75.703,6818,16.393
3,Bayern,5.278.363,21.178,98.294,7480,24.537
4,Berlin,1.128.352,3.5,18.725,5110,4.661
5,Branden­burg,839.781,2.548,12.312,4864,5.728
6,Bremen,223.349,1.117,4.944,7269,795.0
7,Hamburg,652.554,2.965,11.73,6332,2.738
8,Hessen,2.102.381,11.982,55.799,8867,10.442
9,Meck­lenburg-Vor­pommern,526.217,1.731,9.348,5803,2.271
10,Nieder­sachsen,2.728.132,13.99,74.025,9249,9.758
11,Nord­rhein-West­falen,5.909.232,22.94,125.214,6985,25.873


# REST API calls

In [16]:
data = requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronafälle_in_den_Bundesländern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

In [17]:
json_obj = json.loads(data.content)

In [18]:
type(json_obj)

dict

In [19]:
final_list=[]
for k,v in enumerate (json_obj['features'][:]):
        final_list.append(v['attributes'])              

In [20]:
pd.DataFrame(final_list)

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death,cases7_bl_per_100k,cases7_bl,death7_bl,cases7_bl_per_100k_txt,AdmUnitId
0,1,1,Schleswig-Holstein,Land,2910875,15,874629,1657231200000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,30046.944647,45737310000.0,2881496.0,2626,925.735389,26947,5,9257,1
1,2,2,Hamburg,Freie und Hansestadt,1852478,6,652554,1657231200000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,35226.005383,2089396000.0,418800.2,2738,633.2059,11730,6,6332,2
2,3,3,Niedersachsen,Land,8003421,9,2728132,1657231200000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,34087.073515,129983600000.0,4008988.0,9758,924.916982,74025,5,9249,3
3,4,4,Bremen,Freie Hansestadt,680130,5,223349,1657231200000,4,4132268b-54de-4327-ac1e-760e915112f1,32839.163101,1119157000.0,335717.7,795,726.919854,4944,1,7269,4
4,5,5,Nordrhein-Westfalen,Land,17925570,10,5909232,1657231200000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,32965.378507,87829360000.0,2648673.0,25873,698.521721,125214,22,6985,5
5,6,6,Hessen,Land,6293154,7,2102381,1657231200000,6,93277ac4-e8fc-48c7-8940-028dc2ed66af,33407.429724,52359130000.0,2148244.0,10442,886.661919,55799,5,8867,6
6,7,7,Rheinland-Pfalz,Land,4098391,11,1283407,1657231200000,7,e9b4296f-9be2-4e53-9a58-ccf1396cb03d,31314.898944,47838770000.0,1774430.0,5778,640.934455,26268,2,6409,7
7,8,8,Baden-Württemberg,Land,11103043,1,3954862,1657231200000,8,80394ddf-c6a4-4a6e-be8e-0259a81b22a9,35619.622476,81517320000.0,2544320.0,16393,681.822091,75703,5,6818,8
8,9,9,Bayern,Freistaat,13140183,2,5278363,1657231200000,9,1ff920f4-62cd-4a4f-b8c9-f042f2a3e00a,40169.630819,163485500000.0,3898618.0,24537,748.041332,98294,15,7480,9
9,10,10,Saarland,Land,983991,12,351428,1657231200000,10,e3396a6f-8a30-4fdf-8df7-def77dd38bea,35714.5543,6060692000.0,562678.9,1731,962.203923,9468,3,9622,10


## API access via REST service, e.g. USA data

In [None]:
headers = {
    'Cache-Control': 'no-cache',
    'Subscription-Key': '28ee4219700f48718be78b057beb7eb4',
}

response = requests.get('https://api.smartable.ai/coronavirus/stats/US', headers=headers)
print(response)