In [27]:
!ls

Business_Understanding.ipynb
Data_Preparation.ipynb
Data_Understanding.ipynb
Data_Understanding_EDA.ipynb
Evaluation_Walk_through.ipynb
Modeling_spread.ipynb


In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import subprocess
import os

pd.set_option('display.max_rows', 500)

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Understanding 
* `Robert Koch Institute (webscraping)` https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* `John Hopkins (GITHUB)` https://github.com/CSSEGISandData/COVID-19.git
* `REST API services` to retrieve data https://npgeo-corona-npgeo-de.hub.arcgis.com/

# GITHUB csv data
To be done in terminal 
* cd `/Data_Science_COVID-19/data/raw/`
* git clone `https://github.com/CSSEGISandData/COVID-19.git`

In [29]:
git_pull = subprocess.Popen("/usr/bin/git pull",
                            cwd=os.path.dirname('../data/raw/COVID-19/'),
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

(out, error) = git_pull.communicate()

print("Error : " + str(error))
print("out : " + str(out))

Error : b'The system cannot find the path specified.\r\n'
out : b''


In [30]:
data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw = pd.read_csv(data_path)

In [31]:
pd_raw

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/10/22,6/11/22,6/12/22,6/13/22,6/14/22,6/15/22,6/16/22,6/17/22,6/18/22,6/19/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,180864,180864,180864,181120,181178,181236,181465,181534,181574,181666
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,276638,276690,276731,276731,276821,276821,276821,277141,277141,277409
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,265925,265925,265927,265937,265943,265952,265964,265968,265971,265975
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,43224,43224,43224,43224,43224,43449,43449,43449,43449,43449
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,99761,99761,99761,99761,99761,99761,99761,99761,99761,99761
5,,Antarctica,-71.9499,23.347,0,0,0,0,0,0,...,11,11,11,11,11,11,11,11,11,11
6,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,8479,8479,8492,8531,8537,8537,8537,8555,8581,8581
7,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,9276618,9276618,9276618,9276618,9313453,9313453,9313453,9313453,9313453,9341492
8,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,423006,423006,423006,423006,423044,423044,423044,423044,423044,423044
9,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,139894,140519,140519,141660,142629,143656,144597,145457,146264,147096


# Webscraping

In [32]:
# parsing html files
page = requests.get(
    "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html"
)

In [33]:
soup = BeautifulSoup(page.content, 'html.parser')

In [34]:
html_table = soup.find('table')

In [35]:
all_rows = html_table.find_all('tr')

In [36]:
final_data_list = []

In [37]:
for pos, rows in enumerate(all_rows):
    col_list = [
        each_col.get_text(strip=True) for each_col in rows.find_all('td')
    ]
    final_data_list.append(col_list)

In [38]:
pd_daily_status = pd.DataFrame(final_data_list).dropna().rename(
    columns={
        0: 'state',
        1: 'cases',
        2: 'changes',
        3: 'cases_per_100k',
        4: 'fatal',
        5: 'comment'
    })  # to obtain correct table from scraped data

In [39]:
pd_daily_status.head()

Unnamed: 0,state,cases,changes,cases_per_100k,fatal,comment
2,Baden-Württem­berg,3.756.405,0,30.514,2748,16.225
3,Bayern,5.019.301,0,46.998,3577,24.232
4,Berlin,1.075.132,0,10.776,2941,4.636
5,Branden­burg,806.027,0,7.123,2814,5.703
6,Bremen,208.715,248,3.632,5340,782.0


# REST API Calls
* We use German Corona hub

In [4]:
data = requests.get(
    'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronafälle_in_den_Bundesländern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json'
)

In [5]:
json_object = json.loads(data.content)

In [6]:
type(json_object)

dict

In [7]:
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'geometryType', 'spatialReference', 'fields', 'features'])

In [44]:
full_list = []
for pos, each_dict in enumerate(json_object['features'][:]):
    full_list.append(each_dict['attributes'])

In [45]:
pd_full_list = pd.DataFrame(full_list)
pd_full_list.head()

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death,cases7_bl_per_100k,cases7_bl,death7_bl,cases7_bl_per_100k_txt,AdmUnitId
0,1,1,Schleswig-Holstein,Land,2910875,15,786715,1655676000000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,27026.753124,45737310000.0,2881496.0,2586,681.204105,19829,0,6812,1
1,2,2,Hamburg,Freie und Hansestadt,1852478,6,608227,1655676000000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,32833.156453,2089396000.0,418800.2,2695,415.011676,7688,2,4150,2
2,3,3,Niedersachsen,Land,8003421,9,2496938,1655676000000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,31198.383791,129983600000.0,4008988.0,9492,655.357253,52451,2,6554,3
3,4,4,Bremen,Freie Hansestadt,680130,5,208715,1655676000000,4,4132268b-54de-4327-ac1e-760e915112f1,30687.515622,1119157000.0,335717.7,782,534.015556,3632,0,5340,4
4,5,5,Nordrhein-Westfalen,Land,17925570,10,5502630,1655676000000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,30697.099172,87829360000.0,2648673.0,25588,475.460474,85229,5,4755,5


In [46]:
pd_full_list.to_csv('../data/raw/NPGEO/GER_state_data.csv', sep=';')

# API access via REST service for US Data
`https://rapidapi.com/SmartableAI/api/coronavirus-smartable/`

In [47]:
url = 'https://coronavirus-smartable.p.rapidapi.com/stats/v1/US/'

headers = {
    "X-RapidAPI-Key": "50a5b7f985mshd7c81a83a533eabp18ea2ejsnd71a5169c77e",
    "X-RapidAPI-Host": "coronavirus-smartable.p.rapidapi.com"
}

response = requests.request("GET", url, headers=headers)

In [48]:
print(response)

<Response [200]>


In [49]:
US_dict = json.loads(response.content)  #imports string
with open('../data/raw/SMARTABLE/US_data.txt', 'w') as outfile:
    json.dump(US_dict, outfile, indent=2)