In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from pathlib import Path

import dask.dataframe as dd
import pandas as pd



In [3]:
# Layout of the fixed-width station file: {column name: (start, end)} character
# extents, half-open, in the form `pd.read_fwf(colspecs=...)` expects.
columns = {"ID": (0,11), "LATITUDE": (12, 20), "LONGITUDE": (21, 30), "ELEVATION": (31, 37),"STATE": (38, 40),
           "NAME": (41, 71), "GSN FLAG": (72, 75), "HCN/CRN FLAG": (76, 79),"WMO ID": (80, 85)}
# Fetch the station list over HTTPS (the same public bucket, but with transport
# integrity) and discard rows that have no STATE value.
df = pd.read_fwf("https://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt",
                 colspecs=list(columns.values()), names=list(columns.keys())).dropna(subset=['STATE'])

In [4]:
# Boolean mask of rows whose STATE begins with "NY" (str.match anchors the
# pattern at the start of the string), then select those rows.
is_ny_station = df['STATE'].str.match("NY")
nydf = df.loc[is_ny_station]

In [5]:
# Preview the first rows of the NY station table.
nydf.head()

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEVATION,STATE,NAME,GSN FLAG,HCN/CRN FLAG,WMO ID
82048,US1NYAB0001,42.667,-74.0509,445.0,NY,ALTAMONT 2.7 SSW,,,
82049,US1NYAB0006,42.7198,-73.9304,88.7,NY,SCHENECTADY 5.6 SSW,,,
82050,US1NYAB0010,42.5455,-74.1475,488.3,NY,RENSSELAERVILLE 2.1 NNW,,,
82051,US1NYAB0016,42.759,-73.737,104.2,NY,LATHAM 1.1 NNE,,,
82052,US1NYAB0017,42.6678,-73.7856,65.5,NY,ALBANY 0.7 E,,,


In [6]:
# Year and element codes to pull from the by-year parquet archive.
YEAR = 2021
elements = ["TAVG", "PRCP"]

# Build one Dask DataFrame per element, reading the public bucket anonymously.
dfs = {}
for var in elements:
    dfs[var] = dd.read_parquet(
        f"s3://noaa-ghcn-pds/parquet/by_year/YEAR={YEAR}/ELEMENT={var}/",
        storage_options={"anon": True},
    )

In [7]:
# Display the per-element dict of Dask DataFrames (structure only).
dfs

{'TAVG': Dask DataFrame Structure:
                    ID    DATE DATA_VALUE  M_FLAG  Q_FLAG  S_FLAG OBS_TIME             YEAR          ELEMENT
 npartitions=4                                                                                              
                object  object      int64  object  object  object   object  category[known]  category[known]
                   ...     ...        ...     ...     ...     ...      ...              ...              ...
                   ...     ...        ...     ...     ...     ...      ...              ...              ...
                   ...     ...        ...     ...     ...     ...      ...              ...              ...
                   ...     ...        ...     ...     ...     ...      ...              ...              ...
 Dask Name: read-parquet, 1 graph layer,
 'PRCP': Dask DataFrame Structure:
                     ID    DATE DATA_VALUE  M_FLAG  Q_FLAG  S_FLAG OBS_TIME             YEAR          ELEMENT
 npartitions=23 

In [8]:
# For every element, keep only the observations whose station ID appears in
# the NY station table.
nyds = {}
for var in elements:
    element_obs = dfs[var]
    nyds[var] = element_obs[element_obs['ID'].isin(nydf['ID'])]

In [9]:
# Step 1: pair the TAVG and PRCP observations that share station, date and year.
# Non-key columns present on both sides receive the default _x (TAVG) and
# _y (PRCP) suffixes.
tavg_prcp = nyds['TAVG'].merge(nyds['PRCP'], on=['ID', 'DATE', 'YEAR'])
# Step 2: attach the station metadata by station ID.
data = tavg_prcp.merge(nydf, on=['ID'])

In [10]:
# List the columns of the merged frame, including the _x/_y suffixed pairs.
data.columns

Index(['ID', 'DATE', 'DATA_VALUE_x', 'M_FLAG_x', 'Q_FLAG_x', 'S_FLAG_x',
       'OBS_TIME_x', 'YEAR', 'ELEMENT_x', 'DATA_VALUE_y', 'M_FLAG_y',
       'Q_FLAG_y', 'S_FLAG_y', 'OBS_TIME_y', 'ELEMENT_y', 'LATITUDE',
       'LONGITUDE', 'ELEVATION', 'STATE', 'NAME', 'GSN FLAG', 'HCN/CRN FLAG',
       'WMO ID'],
      dtype='object')

In [11]:
# Columns to keep: join keys, both raw values with their element labels, and
# the station metadata.
keep_cols = ['ID', 'DATE', 'DATA_VALUE_x', 'ELEMENT_x', 'DATA_VALUE_y', 'ELEMENT_y',
             'LATITUDE', 'LONGITUDE', 'ELEVATION', 'STATE', 'NAME']
# Evaluate the selection with .compute().
# NOTE(review): this rebinds `df`, which earlier held the station table; if the
# NY-filter cell is re-run after this one it will operate on this frame instead.
df = data[keep_cols].compute()

In [19]:
# List the columns of the computed frame.
# NOTE(review): the saved output already includes TAVG and PRCP, which are only
# added by the cells below -- this cell was executed out of order.
df.columns

Index(['ID', 'DATE', 'DATA_VALUE_x', 'ELEMENT_x', 'DATA_VALUE_y', 'ELEMENT_y',
       'LATITUDE', 'LONGITUDE', 'ELEVATION', 'STATE', 'NAME', 'TAVG', 'PRCP'],
      dtype='object')

In [14]:
# Scale the raw TAVG value down by 10, then apply the Celsius -> Fahrenheit
# formula (x * 9/5 + 32).
# NOTE(review): presumably the raw values are tenths of a degree C -- confirm
# against the GHCN-Daily documentation.
df['TAVG'] = df['DATA_VALUE_x'].astype(float).div(10).mul(9/5).add(32)

In [16]:
# Scale the raw PRCP value down by 10, then multiply by 0.039370 (inches per
# millimetre).
# NOTE(review): presumably the raw values are tenths of a millimetre -- confirm
# against the GHCN-Daily documentation.
df['PRCP'] = df['DATA_VALUE_y'].astype(float).div(10).mul(0.039370)

In [25]:
# Write the station identity, location, date and the two converted
# measurements to a parquet file named after the year.
export_cols = ['ID', 'NAME',  'LATITUDE', 'LONGITUDE',  'DATE', 'TAVG', 'PRCP']
out_path = f"nydata_{YEAR}.parquet"
df[export_cols].to_parquet(out_path)

  if _pandas_api.is_sparse(col):


In [22]:
# List the working directory (bare `ls` relies on IPython automagic).
# NOTE(review): looks like a leftover check on the written parquet file; the
# saved output exposes a local directory listing -- consider removing.
ls

 Volume in drive C is Windows
 Volume Serial Number is E60F-42E6

 Directory of C:\Users\story\Projects\team

03/20/2024  06:53 PM    <DIR>          .
12/03/2023  04:54 PM    <DIR>          ..
08/16/2023  04:18 PM               659 .gitignore
03/19/2024  09:52 PM    <DIR>          .ipynb_checkpoints
06/06/2023  06:59 PM    <DIR>          .vscode
06/07/2023  12:13 AM    <DIR>          _minted-main
03/06/2024  04:43 PM               188 environment.yml
05/17/2022  08:30 PM    <DIR>          matplottoy
03/20/2024  06:53 PM            23,856 nydata..parquet
03/19/2024  09:53 PM    <DIR>          paper
05/08/2023  03:39 AM                80 README.md
03/06/2023  03:19 PM    <DIR>          slides
03/06/2024  04:51 PM             3,173 team.code-workspace
11/15/2021  07:20 PM               475 todo.md
               6 File(s)         28,431 bytes
               8 Dir(s)  109,802,700,800 bytes free


In [24]:
# List the working directory again (bare `ls` relies on IPython automagic).
# NOTE(review): duplicate of the earlier listing cell; the saved output exposes
# a local directory listing -- consider removing.
ls

 Volume in drive C is Windows
 Volume Serial Number is E60F-42E6

 Directory of C:\Users\story\Projects\team

03/20/2024  06:54 PM    <DIR>          .
12/03/2023  04:54 PM    <DIR>          ..
08/16/2023  04:18 PM               659 .gitignore
03/19/2024  09:52 PM    <DIR>          .ipynb_checkpoints
06/06/2023  06:59 PM    <DIR>          .vscode
06/07/2023  12:13 AM    <DIR>          _minted-main
03/06/2024  04:43 PM               188 environment.yml
05/17/2022  08:30 PM    <DIR>          matplottoy
03/20/2024  06:53 PM            23,856 nydata..parquet
03/20/2024  06:54 PM            23,856 nydata.parquet
03/19/2024  09:53 PM    <DIR>          paper
05/08/2023  03:39 AM                80 README.md
03/06/2023  03:19 PM    <DIR>          slides
03/06/2024  04:51 PM             3,173 team.code-workspace
11/15/2021  07:20 PM               475 todo.md
               7 File(s)         52,287 bytes
               8 Dir(s)  109,802,676,224 bytes free
