In [4]:
import io
import os
import re
import urllib
import gzip
import datetime
import glob

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import mplleaflet


In [24]:
# Plot the station coordinates on an interactive Leaflet map.
# NOTE: `df` is produced by the `df = read_temps(...)` cell further down in
# this notebook; that cell has to be executed before this one.
fig, ax = plt.subplots(1, 1, figsize=(12,10))

# read_stations() names the coordinate columns 'LAT' and 'LON' (upper case),
# so the frame has no `Lat`/`Lon` attributes.
ax.plot(df.LON, df.LAT, 'o', markersize=14)
mplleaflet.show()

In [20]:
def get_vf(l, i):
    """Split the daily part of a fixed-width record into 31 tuples.

    Starting at offset ``i`` of the string ``l``, the text is read as 31
    consecutive 8-character groups: a 5-character integer value followed by
    three 1-character flags (called MF, QF and SF by the caller --
    presumably the GHCN measurement, quality and source flags).

    Returns a list of ``(day, value, mf, qf, sf)`` tuples with ``day``
    running from 1 to 31. Flags are whitespace-stripped, so a blank flag
    becomes ``''``.

    Raises ``ValueError`` when a value field is not a valid integer (for
    example when the line is too short).
    """
    records = []
    for day in range(1, 32):
        # Each day occupies 8 characters: 5 for the value + 3 single flags.
        start = i + (day - 1) * 8
        value = int(l[start:start + 5].strip())
        mflag, qflag, sflag = (
            l[start + offset:start + offset + 1].strip() for offset in (5, 6, 7)
        )
        records.append((day, value, mflag, qflag, sflag))
    return records

def get_data(l):
    """Expand one fixed-width monthly record into a list of per-day dicts.

    The line is sliced as: station id (chars 0-10), year (11-14),
    month (15-16), element code (17-20), then 31 daily value/flag groups
    starting at offset 21, which are parsed by ``get_vf``.

    Returns one dict per calendar day that exists in the given month, with
    the keys ``ID``, ``DATE`` (a ``datetime.date``), ``ELEMENT``, ``VALUE``,
    ``MF``, ``QF`` and ``SF``. Slots for days beyond the end of the month
    (e.g. 30 and 31 February) are dropped.

    Raises ``ValueError`` when the year, month or a daily value cannot be
    parsed, or when the year/month do not form a valid date.

    NOTE(review): values are passed through untouched; the GHCN-Daily readme
    appears to use -9999 as the missing-value sentinel -- confirm that
    callers filter it.
    """
    station_id = l[:11].strip()
    year = int(l[11:15])
    month = int(l[15:17])
    element = l[17:21].strip()

    # Parse the daily slots first so that malformed values are reported
    # before any date validation happens.
    daily = get_vf(l, 21)

    # Determine the month length up front instead of inspecting the text of
    # a ValueError: the exception message is not a stable contract and its
    # wording can differ between Python versions.
    first_of_month = datetime.date(year, month, 1)  # ValueError if year/month invalid
    if month == 12:
        # December always has 31 days; avoids building a date in year + 1.
        days_in_month = 31
    else:
        days_in_month = (datetime.date(year, month + 1, 1) - first_of_month).days

    return [
        {
            'ID': station_id,
            'DATE': datetime.date(year, month, day),
            'ELEMENT': element,
            'VALUE': value,
            'MF': mf,
            'QF': qf,
            'SF': sf
        }
        for day, value, mf, qf, sf in daily
        if day <= days_in_month
    ]
        
def read_dly(f):
    """Read the file at path ``f`` line by line into a DataFrame.

    Every line is handed to ``get_data``; the per-day dicts it returns for
    all lines are concatenated in file order and turned into one
    ``pandas.DataFrame`` (one row per dict).
    """
    with open(f) as handle:
        records = [row for line in handle for row in get_data(line)]
    return pd.DataFrame(records)

def read_stations(f):
    """Read a fixed-width station list into a DataFrame.

    ``f`` is anything ``pandas.read_fwf`` accepts (a path or a file-like
    object). The column positions are taken from the format table below,
    which gives 1-based inclusive character ranges; they are converted to
    the 0-based half-open ranges that ``read_fwf`` expects.

    Returns a DataFrame with the columns ``ID``, ``LAT``, ``LON``, ``ELEV``,
    ``STATE``, ``NAME``, ``GSNF``, ``HCNF`` and ``WMOID``. Columns marked
    ``Character`` in the table are read as strings, so all-digit codes such
    as a WMO id of ``01001`` keep their leading zeros instead of being
    inferred as floats. Blank fields are still returned as missing values.
    """
    #ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
    fmt = '''
        ID            1-11   Character
        LATITUDE     13-20   Real
        LONGITUDE    22-30   Real
        ELEVATION    32-37   Real
        STATE        39-40   Character
        NAME         42-71   Character
        GSN FLAG     73-75   Character
        HCN/CRN FLAG 77-79   Character
        WMO ID       81-85   Character
    '''
    # Each match is (first column, last column, declared type).
    fields = re.findall(r'(\d+)-(\d+)\s+(\w+)', fmt)
    colspecs = [(int(first) - 1, int(last)) for first, last, _ in fields]
    names = ['ID', 'LAT', 'LON', 'ELEV', 'STATE', 'NAME', 'GSNF', 'HCNF', 'WMOID']
    # Pin the declared text columns to str; numeric columns stay inferred.
    text_columns = {
        name: str
        for name, (_, _, kind) in zip(names, fields)
        if kind == 'Character'
    }
    stations_df = pd.read_fwf(
        f, colspecs=colspecs, header=None, names=names, dtype=text_columns
    )
    return stations_df

# Load the full station list, then keep only the rows whose STATE is 'AZ'
# and whose NAME contains 'CHANDLER'.
stations_df = read_stations('data/ghcnd-stations.txt')
stations_df_ch = stations_df.loc[
    lambda s: (s.STATE == 'AZ') & (s.NAME.str.contains('CHANDLER'))
]

In [22]:
def read_temps(year, stations_df):
    """Load TMAX/TMIN rows for the given stations from a yearly gzip CSV.

    Reads ``data/<year>.csv.gz`` (header-less CSV) with the column names
    ``ID, DATE, ELEM, VALUE, F1..F5``, keeps only rows whose ``ELEM`` is
    ``TMAX`` or ``TMIN`` and whose ``ID`` occurs in ``stations_df.ID``, and
    merges the result with ``stations_df`` using ``pandas.merge`` defaults
    (inner join on the columns both frames share).

    Returns the merged DataFrame.
    """
    columns = ['ID', 'DATE', 'ELEM', 'VALUE', 'F1', 'F2', 'F3', 'F4', 'F5']
    path = 'data/{}.csv.gz'.format(year)
    with gzip.open(path) as stream:
        observations = pd.read_csv(stream, names=columns, header=None)

    is_temperature = observations.ELEM.isin(['TMAX', 'TMIN'])
    is_requested_station = observations.ID.isin(stations_df.ID)
    selected = observations[is_temperature & is_requested_station]
    return pd.merge(selected, stations_df)
# Load the 2017 TMAX/TMIN observations for the stations selected in `stations_df_ch`.
df = read_temps(2017, stations_df_ch)

In [24]:
# Display the merged observation/station frame returned by read_temps().
df

Unnamed: 0,DATE,ELEM,VALUE,F1,F2,F3,F4,F5,ID,LAT,LON,ELEV,STATE,NAME,GSNF,HCNF,WMOID


In [16]:
# NOTE(review): `df1` is not assigned in any cell visible here -- presumably it
# was created in an earlier kernel session. Confirm where it comes from;
# otherwise this cell fails on Restart & Run All.
df1.head()

Unnamed: 0,ID,DATE,ELEM,VALUE,F1,F2,F3,F4,F5,LAT,LON,ELEV,STATE,NAME,GSNF,HCNF,WMOID
0,USC00026653,20170101,TMAX,67,,,7,1600.0,,34.1003,-110.9658,1579.8,AZ,PLEASANT VALLEY,,,
1,USC00026653,20170101,TMIN,0,,,7,1600.0,,34.1003,-110.9658,1579.8,AZ,PLEASANT VALLEY,,,
2,USC00026653,20170102,TMAX,44,,,7,1600.0,,34.1003,-110.9658,1579.8,AZ,PLEASANT VALLEY,,,
3,USC00026653,20170102,TMIN,-39,,,7,1600.0,,34.1003,-110.9658,1579.8,AZ,PLEASANT VALLEY,,,
4,USC00026653,20170103,TMAX,94,,,7,1600.0,,34.1003,-110.9658,1579.8,AZ,PLEASANT VALLEY,,,
