In [1]:
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Example Case: COVID-19 data extraction.

## Get data from JOHNS HOPKINS UNIVERSITY

In [2]:
# GitHub directory page that lists the daily report CSV files
url = (
    'https://github.com/CSSEGISandData/COVID-19/tree/master/'
    'csse_covid_19_data/csse_covid_19_daily_reports'
)

In [3]:
# Download the directory listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the run on an
# HTTP error instead of letting an error page be parsed further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [4]:
# Body of the response, handed to BeautifulSoup in the next cell
html = response.content

In [5]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns about it), so results could differ between machines.
soup = BeautifulSoup(html, 'html.parser')
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars0.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars1.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars2.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars3.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-feecb8f4bc5dce34742f7eae4fa0a799.css" integrity="sha512-/uy49LxdzjR0L36uT6CnmV1omP/8ZHxvOg4zq/dczzABHq9atntjJDmo5B7sV0J+AwVmv0fR0ZyW3EQawzdLFA==" media="all" rel="stylesheet"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/site-d03abe05da78c48128510e9df11cc331.css" integrity="sha512-0Dq+Bdp4xIEoUQ6d8RzDMUJdGejYZh

In [6]:
# Peek at the href of the first entry in the listing
# (it points at the GitHub file page, not at the raw file)
entry_links = soup.find_all('a', class_='js-navigation-open link-gray-dark')
entry_links[0]['href']

'/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/.gitignore'

In [7]:
# Collect the absolute URL of every listing entry that is a CSV file
csv_files = []
for entry in soup.find_all('a', class_='js-navigation-open link-gray-dark'):
    href = entry['href']
    if href.endswith('.csv'):
        csv_files.append('https://github.com' + href)
csv_files

['https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-23-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-24-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-25-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-26-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-27-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-28-2020.csv',
 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-29-2020.csv',
 'https://github.com/CSSEGISandD

### Get data (only one day) from GitHub

In [8]:
# Work with the first daily report only
file = csv_files[0]
file

'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv'

In [9]:
# Fetch the GitHub page of that single report. Fail fast on a stalled
# connection or an HTTP error, and parse with an explicitly named parser so the
# result does not depend on which optional parsers are installed.
response = requests.get(file, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')

In [10]:
# The first link inside the first button group carries the href of the raw file
button_groups = soup.find_all('div', class_='BtnGroup')
button_groups[0].find_all('a')[0]['href']

'/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv'

In [11]:
# Turn the site-relative href of the raw file into an absolute URL
raw_href = soup.find_all('div', class_='BtnGroup')[0].find_all('a')[0]['href']
csv_url = 'https://github.com' + raw_href
csv_url

'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv'

In [12]:
# Load the CSV straight from its URL and preview the first rows
df = pd.read_csv(csv_url)
df.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,1/22/2020 17:00,1.0,,
1,Beijing,Mainland China,1/22/2020 17:00,14.0,,
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,,
3,Fujian,Mainland China,1/22/2020 17:00,1.0,,
4,Gansu,Mainland China,1/22/2020 17:00,,,


### Filename considering date

In [13]:
# Pull the MM-DD-YYYY stamp out of the URL and swap the dashes for underscores.
# The pattern is a raw string: in a normal string '\d' is an invalid escape
# sequence, which newer Python versions warn about.
date = re.findall(r'\d{2}-\d{2}-\d{4}', csv_url)[0].replace('-', '_')
date

'01_22_2020'

In [14]:
# File name that carries the report date
filename = f'corona_{date}.csv'
filename

'corona_01_22_2020.csv'

## Some simple data cleaning (TRANSFORM)

In [15]:
# Short names for the two geography columns; all other labels stay as they are
column_aliases = {
    'Province/State': 'province',
    'Country/Region': 'country',
}
colnames = df.rename(columns=column_aliases).columns
colnames

Index(['province', 'country', 'Last Update', 'Confirmed', 'Deaths',
       'Recovered'],
      dtype='object')

In [16]:
# Apply the renamed labels to the frame, lower-cased
df.columns = list(map(str.lower, colnames))
df.head(2)

Unnamed: 0,province,country,last update,confirmed,deaths,recovered
0,Anhui,Mainland China,1/22/2020 17:00,1.0,,
1,Beijing,Mainland China,1/22/2020 17:00,14.0,,


In [17]:
# Collapse every country value that mentions China (e.g. 'Mainland China')
# into plain 'China'. na=False treats a missing country as "no match"; without
# it the mask would contain NaN and .loc would refuse it.
is_china = df['country'].str.contains('China', na=False)
df.loc[is_china, 'country'] = 'China'
df.head(2)

Unnamed: 0,province,country,last update,confirmed,deaths,recovered
0,Anhui,China,1/22/2020 17:00,1.0,,
1,Beijing,China,1/22/2020 17:00,14.0,,


### pd.to_datetime( )

In [18]:
# normalize date format: parse the text timestamps into datetime values
parsed_updates = pd.to_datetime(df['last update'])
df['last update'] = parsed_updates

In [19]:
# Parsed timestamps expose their parts as attributes, e.g. the month
first_update = df['last update'][0]
first_update.month

1

In [20]:
# Same month as text, left-padded with a zero to two characters
first_update = df['last update'][0]
str(first_update.month).zfill(2)

'01'

In [21]:
# create column: date of the last update as a 'YYYY-MM-DD' string.
# .dt.strftime formats the whole column at once and leaves missing timestamps
# missing, instead of building the text by hand row by row (which turns a
# missing timestamp into the string 'nan-nan-nan').
df['anomesdia'] = df['last update'].dt.strftime('%Y-%m-%d')
df.head()

Unnamed: 0,province,country,last update,confirmed,deaths,recovered,anomesdia
0,Anhui,China,2020-01-22 17:00:00,1.0,,,2020-01-22
1,Beijing,China,2020-01-22 17:00:00,14.0,,,2020-01-22
2,Chongqing,China,2020-01-22 17:00:00,6.0,,,2020-01-22
3,Fujian,China,2020-01-22 17:00:00,1.0,,,2020-01-22
4,Gansu,China,2020-01-22 17:00:00,,,,2020-01-22


## Store file

### logging

In [22]:
import logging

In [23]:
# Do basic configuration for the logging system, with the root level at INFO.
# Note: basicConfig does nothing when the root logger already has handlers.
logging.basicConfig(level=logging.INFO)

In [24]:
# Log levels are plain integers; show the one behind INFO
logging.INFO

20

In [25]:
# Look up the logger named 'Leonardo'; it is created on first request
logger = logging.getLogger(name='Leonardo')

In [26]:
# Emit one message at INFO severity through the named logger
logger.info('Testing simple log.')

INFO:Leonardo:Testing simple log.


### Logging with time is important

In [27]:
# reset config: basicConfig is a no-op while the root logger still has
# handlers, so remove the existing ones first. Each handler is also closed, so
# that a file handler left over from an earlier run does not stay open.
for old_handler in logging.root.handlers[:]:
    logging.root.removeHandler(old_handler)
    old_handler.close()

# Same INFO level as before, now with a timestamp (including milliseconds),
# the module and the calling function in front of every message.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

In [28]:
# Switch to a logger named 'test_log' for the following examples
logger = logging.getLogger(name='test_log')

In [29]:
# Emit an INFO message to see the format configured for the root logger
logger.info('Testing log. This log should show time information')

2020-07-03 11:58:46.757 INFO <ipython-input-29-5fc59035c136> - <module>: Testing log. This log should show time information


### Logging to file is also important.

In [30]:
logging.root.handlers = []
logging.basicConfig(filename='test.log',
                    level=logging.INFO,
                    format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

logger = logging.getLogger('test_log')

logger.info('Testing - this log should go to a file.')

## Connect to database

In [31]:
from sqlalchemy import create_engine

In [32]:
# Read the connection URL from the environment so the database password is not
# stored in the notebook, e.g.
#   export CORONA_DB_URL='postgresql+psycopg2://<user>:<password>@localhost/corona'
db_url = os.environ.get('CORONA_DB_URL')
if not db_url:
    raise RuntimeError('Set the CORONA_DB_URL environment variable to the database URL.')

engine = create_engine(db_url)
conn = engine.connect()

### Table name

In [33]:
# Show the file name built earlier; the table name is derived from it
filename

'corona_01_22_2020.csv'

In [34]:
# The table name is the file name up to (not including) its first dot
table_name = filename.partition('.')[0]
table_name

'corona_01_22_2020'

In [35]:
import os

In [36]:
# Flag checked by the write cell below: when True, the write is attempted
# even if the table already exists
rerun = False

In [37]:
# Write the frame only when the table is missing, unless a rerun is forced.
# A forced rerun has to replace the table: with if_exists='fail', to_sql raises
# as soon as the table is already there, which is exactly the rerun case.
# NOTE(review): Engine.has_table is deprecated from SQLAlchemy 1.4 on; switch
# to sqlalchemy.inspect(engine).has_table(table_name) when upgrading.
table_exists = conn.engine.has_table(table_name)

if not table_exists or rerun:
    logger.info(f'Writing {table_name} to database.')
    df.to_sql(table_name, conn, if_exists='replace' if rerun else 'fail', index=False)
    logger.info(f'{table_name} written successfully.')
else:
    logger.info(f'Table {table_name} already in our database')

In [None]:
# Read the stored table back and preview it to check what was written
pd.read_sql_table(table_name, conn).head()