## Data Ingestion Notebook

In [1]:
!pip install seaborn geopy
from dateutil.relativedelta import relativedelta
import requests
import zipfile
import joblib
import datetime
import os

Collecting seaborn
  Downloading https://files.pythonhosted.org/packages/10/01/dd1c7838cde3b69b247aaeb61016e238cafd8188a276e366d36aa6bcdab4/seaborn-0.8.1.tar.gz (178kB)
[K    100% |████████████████████████████████| 184kB 5.8MB/s 
[?25hCollecting geopy
  Downloading https://files.pythonhosted.org/packages/02/99/e26ad8405591d55416e3bfb811a98edb016ad48f4b7fce381b8d9e311673/geopy-1.13.0-py2.py3-none-any.whl (73kB)
[K    100% |████████████████████████████████| 81kB 11.4MB/s 
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/5b/ac/4f348828091490d77899bc74e92238e2b55c59392f21948f296e94e50e2b/geographiclib-1.49.tar.gz
Building wheels for collected packages: seaborn, geographiclib
  Running setup.py bdist_wheel for seaborn ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/26/0a/44/53ddd89769e62f7c6691976375b86c6492e7dd20a2d3970e32
  Running setup.py bdist_wheel for geographiclib ... [?25ldone
[?25h  Stored in direct

In [2]:
def get_periods(start, end=None):
    """
    Returns a list of monthly dates in the specific format required to download files.

    Args:
        start: Tuple: Form (YYYY, MM, DD). First date in the list.
        end: Tuple: Form (YYYY, MM, DD). Exclusive upper bound; when omitted
            (or otherwise falsy) today's date is used.

    Returns:
        dates: List: Dates in the format 'YYYYMMDD', one per month, starting
            at `start` and strictly before `end`. Empty when `start` is not
            before `end`.
    """
    start_date = datetime.date(*start)
    end_date = datetime.date(*end) if end else datetime.date.today()

    dates = []
    months_elapsed = 0
    current = start_date

    while current < end_date:
        dates.append(current.strftime('%Y%m%d'))
        months_elapsed += 1
        # Offset from the fixed start date instead of repeatedly adding one
        # month to the previous value: relativedelta clamps the day to the
        # end of short months, so chained additions would permanently drift
        # (Jan 31 -> Feb 28 -> Mar 28 ...) for start days after the 28th.
        current = start_date + relativedelta(months=months_elapsed)

    return dates

In [3]:
# Download window; each tuple has the form (YYYY, MM, DD).
start = (2010, 1, 1)
end = (2018, 1, 1)  # exclusive: get_periods stops before reaching this date

dates = get_periods(start, end)

In [4]:
# Display the generated 'YYYYMMDD' period strings to confirm the range.
dates

['20100101',
 '20100201',
 '20100301',
 '20100401',
 '20100501',
 '20100601',
 '20100701',
 '20100801',
 '20100901',
 '20101001',
 '20101101',
 '20101201',
 '20110101',
 '20110201',
 '20110301',
 '20110401',
 '20110501',
 '20110601',
 '20110701',
 '20110801',
 '20110901',
 '20111001',
 '20111101',
 '20111201',
 '20120101',
 '20120201',
 '20120301',
 '20120401',
 '20120501',
 '20120601',
 '20120701',
 '20120801',
 '20120901',
 '20121001',
 '20121101',
 '20121201',
 '20130101',
 '20130201',
 '20130301',
 '20130401',
 '20130501',
 '20130601',
 '20130701',
 '20130801',
 '20130901',
 '20131001',
 '20131101',
 '20131201',
 '20140101',
 '20140201',
 '20140301',
 '20140401',
 '20140501',
 '20140601',
 '20140701',
 '20140801',
 '20140901',
 '20141001',
 '20141101',
 '20141201',
 '20150101',
 '20150201',
 '20150301',
 '20150401',
 '20150501',
 '20150601',
 '20150701',
 '20150801',
 '20150901',
 '20151001',
 '20151101',
 '20151201',
 '20160101',
 '20160201',
 '20160301',
 '20160401',
 '20160501',

### Download load data
Download electricity load data for the specified periods. Files that already exist on disk are skipped; only missing files are downloaded. Each zip archive is extracted immediately after it is downloaded.

In [6]:
def download_load_data(dates, save_path='/volumes/data/downloaded'):
    """
    Download load data from the internet.

    For every period the monthly zip archive is fetched into `save_path` and
    extracted into the `load_data` sub-directory. Periods whose archive is
    already present in `save_path` are skipped.

    Args:
        dates: List: Dates to be used in the url substitution to download the data for the
            specific period.
        save_path: String: Path to save the downloaded files to. Created
            (including missing parent directories) when it does not exist.

    Returns:
        None

    Raises:
        requests.exceptions.HTTPError: The server answered with an error status.
        zipfile.BadZipFile: The downloaded payload is not a valid zip archive.
    """
    # makedirs creates missing parents too and tolerates an existing directory.
    os.makedirs(save_path, exist_ok=True)
    extract_dir = os.path.join(save_path, 'load_data')

    for period in dates:
        zip_path = os.path.join(save_path, f'{period}pal_csv.zip')

        if os.path.exists(zip_path):
            print('File ' + os.path.basename(zip_path) + ' already exists. Not Downloading.')
            continue

        url = f'http://mis.nyiso.com/public/csv/pal/{period}pal_csv.zip'

        print(f"Retrieving load data...for period {period}")
        result = requests.get(url)
        # Fail before writing anything: otherwise an HTML error page would be
        # saved under the .zip name and treated as "already downloaded" later.
        result.raise_for_status()

        with open(zip_path, 'wb') as f:
            f.write(result.content)

        print("Extracting zipped contents...")
        try:
            # The context manager closes the archive even if extraction fails.
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
        except zipfile.BadZipFile:
            # Drop the corrupt archive so the next run retries this period
            # instead of skipping it because the file exists.
            os.remove(zip_path)
            raise
        print("Done!")
        
# Fetch and extract the load archives for every generated period (default save_path).
download_load_data(dates)

File 20100101pal_csv.zip already exists. Not Downloading.
Retrieving load data...for period 20100201
Extracting zipped contents...
Done!
Retrieving load data...for period 20100301
Extracting zipped contents...
Done!
Retrieving load data...for period 20100401
Extracting zipped contents...
Done!
Retrieving load data...for period 20100501
Extracting zipped contents...
Done!
Retrieving load data...for period 20100601
Extracting zipped contents...
Done!
Retrieving load data...for period 20100701
Extracting zipped contents...
Done!
Retrieving load data...for period 20100801
Extracting zipped contents...
Done!
Retrieving load data...for period 20100901
Extracting zipped contents...
Done!
Retrieving load data...for period 20101001
Extracting zipped contents...
Done!
Retrieving load data...for period 20101101
Extracting zipped contents...
Done!
Retrieving load data...for period 20101201
Extracting zipped contents...
Done!
Retrieving load data...for period 20110101
Extracting zipped contents...


### Download Weather data (and Station data)

Download weather data for the same periods as the load data. Files that already exist on disk are skipped.
Each zip archive is extracted immediately after it is downloaded.

In [None]:
def download_weather_data(dates, save_path='/volumes/data/downloaded'):
    """
    Download weather data from the internet.

    For every period the monthly zip archive is fetched into `save_path` and
    extracted into the `weather_data` sub-directory. Periods whose archive is
    already present in `save_path` are skipped.

    Args:
        dates: List: Dates to be used in the url substitution to download the data for the
            specific period. Only the first six characters of each entry are used.
        save_path: String: Path to save the downloaded files to. Created
            (including missing parent directories) when it does not exist.

    Returns:
        None

    Raises:
        requests.exceptions.HTTPError: The server answered with an error status.
        zipfile.BadZipFile: The downloaded payload is not a valid zip archive.
    """
    # makedirs creates missing parents too and tolerates an existing directory.
    os.makedirs(save_path, exist_ok=True)
    extract_dir = os.path.join(save_path, 'weather_data')

    for period in dates:
        # Only the leading six characters of the period go into the file name and URL.
        month = period[:6]
        # NOTE(review): the local name says 'QCLD' while the remote file is
        # 'QCLCD'; kept as-is so archives cached by earlier runs are still found.
        zip_path = os.path.join(save_path, f'QCLD{month}.zip')

        if os.path.exists(zip_path):
            print('File ' + os.path.basename(zip_path) + ' already exists. Not Downloading.')
            continue

        url = f'https://www.ncdc.noaa.gov/orders/qclcd/QCLCD{month}.zip'

        print(f"Retrieving weather data...for period {month}")
        result = requests.get(url)
        # Fail before writing anything: otherwise an HTML error page would be
        # saved under the .zip name and treated as "already downloaded" later.
        result.raise_for_status()

        with open(zip_path, 'wb') as f:
            f.write(result.content)

        print("Extracting zipped contents...")
        try:
            # The context manager closes the archive even if extraction fails.
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
        except zipfile.BadZipFile:
            # Drop the corrupt archive so the next run retries this period
            # instead of skipping it because the file exists.
            os.remove(zip_path)
            raise
        print("Done!")
        
# Fetch and extract the weather archives for every generated period (default save_path).
download_weather_data(dates)

Retrieving weather data...for period 201001
Extracting zipped contents...
Done!
Retrieving weather data...for period 201002
Extracting zipped contents...
Done!
Retrieving weather data...for period 201003
Extracting zipped contents...
Done!
Retrieving weather data...for period 201004
Extracting zipped contents...
Done!
Retrieving weather data...for period 201005
Extracting zipped contents...
Done!
Retrieving weather data...for period 201006
Extracting zipped contents...
Done!
Retrieving weather data...for period 201007
Extracting zipped contents...
Done!
Retrieving weather data...for period 201008
Extracting zipped contents...
Done!
Retrieving weather data...for period 201009
Extracting zipped contents...
Done!
Retrieving weather data...for period 201010
Extracting zipped contents...
Done!
Retrieving weather data...for period 201011
Extracting zipped contents...
Done!
Retrieving weather data...for period 201012
Extracting zipped contents...
Done!
Retrieving weather data...for period 201