In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

## South Korea

In [3]:
response = requests.get('https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_South_Korea')
soup = BeautifulSoup(response.content, 'lxml')
jan = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-jan'})
feb = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-feb'})
march = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-mar'})
april = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-apr'})
april_2 = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-apr-l15'})

In [40]:
df_dict = dict()
df_dict['date'] = list()
df_dict['cases'] = list()
df_dict['deaths'] = list()
df_dict['recovered'] = list()

def populate_dict(dataset, country = None):
    for data in dataset:
        date = data.find('td').get_text().replace(',', '')
        if len(data.find_all('td')) >= 3:
            cases = data.find_all('td')[2].find('span', attrs={'class':'cbs-ibr'}).get_text().replace(',', '')
            if len(data.find_all('td')) >= 4:
                deaths = data.find_all('td')[3].find('span', attrs={'class':'cbs-ibr'}).get_text().replace(',', '')
            else:
                deaths = '0'
        else:
            cases = '0'
            deaths = '0'
            
        
        div_ = data.find_all('div')
        if len(div_) >= 3:
            if country == 'new_zealand':
                deaths = div_[0].get('title').replace(',', '') 
            recovered = div_[1].get('title').replace(',', '')
        else:
            recovered = '0'
        df_dict['date'].append(date)
        df_dict['cases'].append(cases)
        df_dict['deaths'].append(deaths)
        df_dict['recovered'].append(recovered)


In [None]:
populate_dict(jan)
populate_dict(feb)
populate_dict(march)
populate_dict(april)
populate_dict(april_2)

In [5]:
south_korea = pd.DataFrame(df_dict)
def populate_df(df):
    df['date'] = pd.to_datetime(df['date'])
    df['recovered'] = df['recovered'].replace('', '0').astype(int)
    df['cases'] = df['cases'].replace('', '0').astype(int)
    df['deaths'] = df['deaths'].replace('', '0').astype(int)
    df['active'] = df['cases'] - df['recovered'] - df['deaths']

    df = df.set_index('date')
    return df

In [35]:
south_korea = south_korea[~(south_korea['date'] == '⋮')]

south_korea = populate_df(south_korea)

idx = pd.date_range('01-20-2020', '04-20-2020')

south_korea = south_korea.reindex(idx)

south_korea = south_korea.fillna(method='ffill')

### save df

In [49]:
south_korea.to_csv('data/master_data/south_korea/from_wiki.csv')

## New York

In [50]:
response = requests.get('https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_New_York_(state)')
soup = BeautifulSoup(response.content, 'lxml')
march = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-mar'})
april = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-apr'})
april_2 = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-apr-l15'})

In [55]:
df_dict = dict()
df_dict['date'] = list()
df_dict['cases'] = list()
df_dict['deaths'] = list()
df_dict['recovered'] = list()

populate_dict(march)
populate_dict(april)
populate_dict(april_2)

In [56]:
new_york = pd.DataFrame(df_dict)

In [57]:
new_york = populate_df(new_york)

### Save df

In [59]:
new_york.to_csv('data/master_data/new_york/from_wiki.csv')

## India

In [60]:
response = requests.get('https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_India')
soup = BeautifulSoup(response.content, 'lxml')
jan = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-jan'})
feb = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-feb'})
march = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-mar'})
april = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-apr'})
april_2 = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-apr-l15'})

In [64]:
df_dict = dict()
df_dict['date'] = list()
df_dict['cases'] = list()
df_dict['deaths'] = list()
df_dict['recovered'] = list()

populate_dict(jan)
populate_dict(feb)
populate_dict(march)
populate_dict(april)
populate_dict(april_2)
india = pd.DataFrame(df_dict)

In [65]:
india = india[~(india['date'] == '⋮')]

india = populate_df(india)

idx = pd.date_range('01-30-2020', '04-20-2020')

india = india.reindex(idx)

india = india.fillna(method='ffill')
india = india.astype(int)

### Save df

In [69]:
india.to_csv('data/master_data/india/from_wiki.csv')

### New Zealand Data

In [16]:
url = 'https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_New_Zealand'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')
feb = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-feb'})
march = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-mar'})
april = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-apr'})
april_2 = soup.find_all('tr', attrs={'id': 'mw-customcollapsible-apr-l15'})

In [41]:
df_dict = dict()
df_dict['date'] = list()
df_dict['cases'] = list()
df_dict['deaths'] = list()
df_dict['recovered'] = list()

populate_dict(feb, country='new_zealand')
populate_dict(march, country='new_zealand')
populate_dict(april, country='new_zealand')
populate_dict(april_2, country='new_zealand')
new_zealand = pd.DataFrame(df_dict)

In [25]:
def preprocess(df, idx_range):
    df = df[~(df['date'] == '⋮')]

    df = populate_df(df)
    
    df=df[~df.index.duplicated(keep='last')]

    idx = pd.date_range(idx_range[0], idx_range[1])

    df = df.reindex(idx)

    df = df.fillna(method='ffill')
    df = df.fillna(method='bfill')
    df = df.astype(int)
    return df

In [42]:
idx_range = ('02-28-2020', '04-22-2020')
new_zealand = preprocess(new_zealand, idx_range)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

### save df

In [44]:
new_zealand.to_csv('data/master_data/new_zealand/from_wiki.csv')

In [45]:
new_zealand

Unnamed: 0,cases,deaths,recovered,active
2020-02-28,1,0,0,1
2020-02-29,1,0,0,1
2020-03-01,1,0,0,1
2020-03-02,1,0,0,1
2020-03-03,1,0,0,1
2020-03-04,2,0,0,2
2020-03-05,4,0,0,4
2020-03-06,4,0,0,4
2020-03-07,5,0,0,5
2020-03-08,5,0,0,5
