In [9]:
import os
import logging
from datetime import datetime, timedelta
from sqlalchemy import create_engine
import pyarrow.csv as pv
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pandas as pd

URL_PREFIX = 'https://noaa-ghcn-pds.s3.amazonaws.com'
TEMP_STORAGE_PATH = os.getenv('TEMP_STORAGE_PATH', './data')
START_YEAR = int(os.getenv("START_YEAR", "1760"))
PG_SCHEMA = 'ghcnd'

csv_schema = {
    'id': 'string',
    'date': 'string',
    'element': 'string',
    'value': 'int32',
    'm_flag': 'string',
    'q_flag': 'string',
    's_flag': 'string',
    'obs_time': 'int16',
}

def format_to_parquet(**kwargs):
    year = kwargs['year']
    src_file = kwargs['src_file']
    header = kwargs['column_names']
    column_types = kwargs['column_types']
    if not src_file.endswith('.csv'):
        logging.error("Can only accept source files in CSV format, for the moment")
        return
    table = pv.read_csv(
      src_file,
      read_options = pv.ReadOptions(column_names=column_names),
      convert_options = pv.ConvertOptions(column_types=column_types)
    )
    table = table \
        .append_column('parsed_date', pc.strptime(table.column("date"), format='%Y%m%d', unit='s').cast('date32')) \
        .drop(['date']) \
        .rename_columns(['id','element','value','m_flag','q_flag','s_flag','obs_time','date'])
    pq.write_table(table, src_file.replace('.csv', '.parquet'))

def parquet_to_pg(**kwargs):

    src_file = kwargs['src_file']
    table_name = kwargs['table_name']
    df = pq.read_table(src_file).to_pandas()
    print('read done')
    print(df.dtypes)
    dbschema = PG_SCHEMA
    user = "root"
    password = "root"
    host = "localhost"
    port = 5432
    db = "ghcn-d"
    cmd = f'postgresql://{user}:{password}@{host}:{port}/{db}'
    engine = create_engine(cmd, connect_args={'options': f'-csearch_path={dbschema}'})
    df.to_sql(table_name, engine, if_exists='replace', chunksize=1000, index=True, method='multi')


In [2]:
year = 2022
column_names = ['id','date','element','value','m_flag','q_flag','s_flag','obs_time']
csv_file_name = f'/{year}.csv'
dataset_url = URL_PREFIX + '/csv' + csv_file_name
csv_file_path = TEMP_STORAGE_PATH + csv_file_name
parquet_file_name = csv_file_name.replace('.csv', '.parquet')
parquet_file_path = TEMP_STORAGE_PATH + parquet_file_name
table_name = f"{year}"

In [None]:
f"if ! [ -f {csv_file_path} ] ; then  curl -sS {dataset_url} > {csv_file_path};   fi"

In [None]:
format_to_parquet(**{"src_file": csv_file_path,
            "column_names": column_names,
            "column_types": csv_schema,
            "year": year})

In [None]:
parquet_to_pg(**{"src_file": parquet_file_path,
          "table_name": table_name})

In [3]:
df = pd.read_parquet(parquet_file_path)

In [34]:
a = df[0:100000]

In [35]:
len(a)

100000

In [36]:
dbschema = PG_SCHEMA
user = "root"
password = "root"
host = "localhost"
port = 5432
db = "ghcn-d"
cmd = f'postgresql://{user}:{password}@{host}:{port}/{db}'
engine = create_engine(cmd, connect_args={'options': f'-csearch_path={dbschema}'})
engine.connect()

<sqlalchemy.engine.base.Connection at 0x266ac0bc370>

In [38]:
%time a.to_sql(table_name, engine, if_exists='replace', index=True)

CPU times: total: 2.73 s
Wall time: 5.15 s


1000

In [33]:
len(df)

10229506

In [24]:
a.size

800000