In [None]:
import os
import sys
import pandas as pd
import requests
import glob
import numpy as np

In [None]:
# Add the relative ../../src directory to the import search path. The two
# imports below are presumably resolved from there -- confirm against the
# repository layout. The path is relative, so it depends on the working
# directory the notebook is run from.
sys.path.append("../../src")
from filepaths import downloads_folder
from download_data import download_helper

In [None]:
# Base URL that each zip archive name is appended to when downloading.
DOWNLOAD_BASE_URL = "https://open-grid-emissions.s3.amazonaws.com"

# Data years to fetch.
YEARS = [2021, 2020, 2019]

# Archive filename templates. Each has two positional placeholders that are
# filled with (year, units) via str.format.
ZIP_FILE_PATHS = [
    "{}_power_sector_data_hourly_{}_units.zip",
    "{}_carbon_accounting_hourly_{}_units.zip",
]

In [None]:
# Fetch every archive: one per (year, filename template, unit system).

os.makedirs(downloads_folder('oge'), exist_ok=True)

# Expand the templates up front. The iteration order matches nested loops
# over year -> template -> units.
archive_names = [
    template.format(data_year, unit_system)
    for data_year in YEARS
    for template in ZIP_FILE_PATHS
    for unit_system in ('us', 'metric')
]

for archive_name in archive_names:
    # The extraction target is the archive name with the '.zip' suffix removed.
    extracted_name = archive_name.replace('.zip', '')
    archive_url = f'{DOWNLOAD_BASE_URL}/{archive_name}'
    print(archive_url)
    download_helper(
        archive_url,
        downloads_folder(f'oge/{archive_name}'),
        output_path=downloads_folder(f'oge/{extracted_name}'),
        requires_unzip=True,
    )

In [None]:
# Make sure that all CSV files are readable and not empty.


def _check_csv(csv_path):
    """Raise if ``csv_path`` cannot be parsed, is empty, or has an overlong line.

    Args:
        csv_path: Path of the CSV file to check.

    Raises:
        ValueError: If the parsed file has no data rows, or if any line after
            the header is more than twice the median line length.
        Exception: Whatever ``pd.read_csv`` or ``open`` raise is propagated
            unchanged.
    """
    if len(pd.read_csv(csv_path)) == 0:
        raise ValueError('FAIL: File is empty')

    # pd.read_csv decodes as UTF-8 by default; read the raw lines the same way
    # instead of relying on the platform's locale encoding.
    with open(csv_path, 'r', encoding='utf-8') as f:
        line_lengths = [len(line) for line in f]

    median_line_length = np.median(line_lengths)

    # Line numbers are 1-based so they match what a text editor shows.
    # Line 1 (the header) is excluded from the check -- presumably because its
    # length is not comparable to the data rows.
    for line_number, line_length in enumerate(line_lengths, start=1):
        if line_number > 1 and line_length > 2 * median_line_length:
            raise ValueError(
                'FAIL: Line {} is too long ({} chars). '
                'This is probably a corrupted line.'.format(line_number, line_length)
            )


csv_files = glob.glob(downloads_folder('oge/*/*.csv'))

if not csv_files:
    # Without this, an empty or missing download folder would look like a pass.
    print('WARNING: No CSV files found to check')

failed_files = []
for csv_file in csv_files:
    try:
        _check_csv(csv_file)
    # Catch Exception rather than BaseException so that interrupting the
    # kernel (KeyboardInterrupt) stops the loop instead of skipping one file.
    except Exception as e:
        print(e)
        print(csv_file)
        failed_files.append(csv_file)

print(f'Checked {len(csv_files)} files, {len(failed_files)} failed')
print('Done')
