## Southeast USA Daily Normals
Data Source: https://www.ncdc.noaa.gov/cdo-web/search?datasetid=NORMAL_DLY

References
* https://stackoverflow.com/questions/31511997/pandas-dataframe-replace-all-values-in-a-column-based-on-condition
* https://jakevdp.github.io/WhirlwindTourOfPython/14-strings-and-regular-expressions.html
* https://towardsdatascience.com/reverse-geocoding-in-python-a915acf29eb6

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import re
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import requests
import json

# Google developer API key
from config import gkey

### Functions

In [2]:
# This function takes in the station name as a string and extracts the state that the station is in
# This function returns the state two-letter code as a string
def findState(string):
    regex = re.compile(',\s([A-Z][A-Z])')
    state = regex.findall(string)
    return state[0]

### Cleaning Dataset(s)

In [3]:
# Read in first dataset
df1 = pd.read_csv('daily-normals-SE-1.csv')
df1.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
0,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-01,,,,,,
1,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-02,,,,,,
2,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-03,,,,,,
3,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-04,,,,,,
4,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-05,,,,,,


In [4]:
# Read in second dataset
df2 = pd.read_csv('daily-normals-SE-2.csv')
df2.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
0,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-01,,,,,,
1,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-02,,,,,,
2,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-03,,,,,,
3,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-04,,,,,,
4,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-05,,,,,,


In [5]:
# Read in third dataset
df3 = pd.read_csv('daily-normals-SE-3.csv')
df3.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
0,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-01,36.0,10.1,45.7,11.0,26.4,10.9
1,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-02,36.0,10.1,45.6,11.0,26.3,10.9
2,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-03,35.9,10.1,45.6,10.9,26.2,10.9
3,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-04,35.9,10.0,45.6,10.9,26.2,10.9
4,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-05,35.8,10.0,45.6,10.9,26.1,10.9


In [6]:
# Combine datasets
df = pd.concat([df1, df2, df3])
df

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
0,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-01,,,,,,
1,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-02,,,,,,
2,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-03,,,,,,
3,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-04,,,,,,
4,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-05,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
218103,USC00017947,"SULLIGENT, AL US",33.8998,-88.1326,106.7,12-27,,,,,,
218104,USC00017947,"SULLIGENT, AL US",33.8998,-88.1326,106.7,12-28,,,,,,
218105,USC00017947,"SULLIGENT, AL US",33.8998,-88.1326,106.7,12-29,,,,,,
218106,USC00017947,"SULLIGENT, AL US",33.8998,-88.1326,106.7,12-30,,,,,,


In [7]:
# Check for missing values
df.count()

STATION            418300
NAME               418300
LATITUDE           418300
LONGITUDE          418300
ELEVATION          418300
DATE               418300
DLY-TAVG-NORMAL    342576
DLY-TAVG-STDDEV    261690
DLY-TMAX-NORMAL    342576
DLY-TMAX-STDDEV    261690
DLY-TMIN-NORMAL    342576
DLY-TMIN-STDDEV    261690
dtype: int64

In [8]:
# Drop all rows that are missing daily normals (avg, max, min)
df_clean_1 = df.dropna(subset=['DLY-TAVG-NORMAL', 'DLY-TMAX-NORMAL', 'DLY-TMIN-NORMAL'])

In [9]:
# Check for additional missing values
df_clean_1.count()

STATION            342576
NAME               342576
LATITUDE           342576
LONGITUDE          342576
ELEVATION          342576
DATE               342576
DLY-TAVG-NORMAL    342576
DLY-TAVG-STDDEV    261690
DLY-TMAX-NORMAL    342576
DLY-TMAX-STDDEV    261690
DLY-TMIN-NORMAL    342576
DLY-TMIN-STDDEV    261690
dtype: int64

In [10]:
df_clean_1

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
366,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4
367,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5
368,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6
369,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6
370,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7
...,...,...,...,...,...,...,...,...,...,...,...,...
217371,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-27,38.3,,47.8,,28.7,
217372,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-28,38.1,,47.7,,28.6,
217373,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-29,38.0,,47.6,,28.5,
217374,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-30,37.9,,47.4,,28.4,


### Adding State Column

In [11]:
# Pull state out of station name and add to state column
df_clean_2 = df_clean_1.copy()
df_clean_2['STATE'] = df_clean_2['NAME'].apply(findState)
df_clean_2

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE
366,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4,WV
367,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5,WV
368,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6,WV
369,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6,WV
370,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7,WV
...,...,...,...,...,...,...,...,...,...,...,...,...,...
217371,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-27,38.3,,47.8,,28.7,,TN
217372,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-28,38.1,,47.7,,28.6,,TN
217373,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-29,38.0,,47.6,,28.5,,TN
217374,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-30,37.9,,47.4,,28.4,,TN


In [12]:
# Review sample to verify accurate state assignment
df_test_states = df_clean_2.sample(20)
df_test_states

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE
60753,USW00073803,"TALLADEGA 10 NNE, AL US",33.5721,-86.0573,160.0,01-07,42.3,,53.2,,31.4,,AL
186713,USC00409155,"TULLAHOMA, TN US",35.34528,-86.20889,311.5,03-18,49.8,9.0,61.4,10.5,38.1,9.6,TN
168155,USC00154208,"JAMESTOWN, KY US",37.0055,-85.0616,271.3,06-30,74.5,4.7,86.4,5.1,62.6,5.7,KY
180771,USC00013154,"GADSDEN, AL US",34.0219,-85.9878,172.2,12-21,42.9,9.5,53.1,10.6,32.7,10.2,AL
50700,USC00318500,"TARBORO 1 S, NC US",35.8847,-77.5386,10.7,07-13,79.7,4.0,89.4,5.1,70.1,4.5,NC
95046,USC00443267,"GALAX RADIO WBRF, VA US",36.6633,-80.9139,726.9,09-15,63.3,5.5,74.3,6.3,52.2,7.1,VA
40290,USC00155694,"MURRAY, KY US",36.6122,-88.3083,160.6,02-07,39.1,10.7,47.9,12.0,30.4,10.9,KY
104868,USC00150940,"BRADFORDSVILLE, KY US",37.495,-85.1516,201.2,07-24,76.4,4.4,87.9,5.2,64.9,5.2,KY
68406,USC00150402,"BARDWELL 2 E, KY US",36.883,-88.9961,125.0,12-06,40.0,9.4,49.8,10.3,30.2,10.4,KY
24644,USC00406162,"MONTEAGLE, TN US",35.2243,-85.8414,563.9,05-07,61.7,6.9,71.3,7.5,52.0,7.5,TN


In [13]:
# Determine states included in dataset
states = df_clean_2['STATE'].unique()
states

array(['WV', 'VA', 'SC', 'MD', 'NC', 'TN', 'GA', 'AL', 'KY', 'IN', 'IL'],
      dtype=object)

In [14]:
# Drop rows associated with states not in the Southwest
for state in ['MD', 'IN', 'IL']:
    df_clean_2.drop(df_clean_2[df_clean_2['STATE'] == state].index, inplace=True)

# Check that unwanted states were dropped
df_clean_2['STATE'].unique()

array(['WV', 'VA', 'SC', 'NC', 'TN', 'GA', 'AL', 'KY'], dtype=object)

## Find Counties and Zip Codes for Stations

In [15]:
# Pull out dataframe of stations metadata
stations = df_clean_2[['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'STATE']].drop_duplicates()
stations

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE
366,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,WV
732,USC00442009,"CORBIN, VA US",38.2022,-77.3747,VA
1098,USC00440187,"AMELIA COURTHOUSE 1, VA US",37.3451,-77.9781,VA
1464,USC00442245,"DANVILLE, VA US",36.5869,-79.3886,VA
2196,USC00466212,"MORGANTOWN LOCK AND DAM, WV US",39.6203,-79.9698,WV
...,...,...,...,...,...
214083,USC00014798,"LIVINGSTON, AL US",32.5811,-88.1897,AL
215180,USC00152214,"DIX DAM, KY US",37.7858,-84.7077,KY
215912,USC00150397,"BARDSTOWN 5 E, KY US",37.8194,-85.3847,KY
216278,USC00152575,"ETOILE, KY US",36.8276,-85.8975,KY


In [16]:
# Create column that has coordinates combined in a string
stations['COORD'] = stations['LATITUDE'].map(str) + ',' + stations['LONGITUDE'].map(str)
stations

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE,COORD
366,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,WV,"39.5064,-78.3158"
732,USC00442009,"CORBIN, VA US",38.2022,-77.3747,VA,"38.2022,-77.3747"
1098,USC00440187,"AMELIA COURTHOUSE 1, VA US",37.3451,-77.9781,VA,"37.3451,-77.9781"
1464,USC00442245,"DANVILLE, VA US",36.5869,-79.3886,VA,"36.5869,-79.3886"
2196,USC00466212,"MORGANTOWN LOCK AND DAM, WV US",39.6203,-79.9698,WV,"39.6203,-79.9698"
...,...,...,...,...,...,...
214083,USC00014798,"LIVINGSTON, AL US",32.5811,-88.1897,AL,"32.5811,-88.1897"
215180,USC00152214,"DIX DAM, KY US",37.7858,-84.7077,KY,"37.7858,-84.7077"
215912,USC00150397,"BARDSTOWN 5 E, KY US",37.8194,-85.3847,KY,"37.8194,-85.3847"
216278,USC00152575,"ETOILE, KY US",36.8276,-85.8975,KY,"36.8276,-85.8975"


In [17]:
# Set up reverse geocoding function
locator = Nominatim(user_agent='myGeocoder', timeout=10)
rgeocode = RateLimiter(locator.reverse, min_delay_seconds=0.001)

# Create columns for county and zip code
stations['COUNTY'] = ''
stations['ZIP'] = ''

# Pull county and zip code for each station
for index, row in stations.iterrows():
    try:
        coord = row['COORD']
        address = rgeocode(coord)
        county = address.raw['address']['county']
        zipcode = address.raw['address']['postcode']
        stations.loc[index, 'COUNTY'] = county
        stations.loc[index, 'ZIP'] = zipcode
        print(f"{row['NAME']} complete.")
    except:
        print(f"Cannot find {row['NAME']}.")
    
print('Reverse geocoding complete.')

Cannot find CACAPON STATE PARK 2, WV US.
CORBIN, VA US complete.
AMELIA COURTHOUSE 1, VA US complete.
Cannot find DANVILLE, VA US.
MORGANTOWN LOCK AND DAM, WV US complete.
ROWLESBURG 1, WV US complete.
HUNTINGTON SWG PLANT, WV US complete.
LONDON LOCKS, WV US complete.
BREAKS INTERSTATE PARK, VA US complete.
BOSTON 4 SE, VA US complete.
AMELIA 8 NE, VA US complete.
CHARLESTON INTL. AIRPORT, SC US complete.
MUSTOE 1 SW, VA US complete.
SANDHILL RESEARCH, SC US complete.
FLAT TOP, WV US complete.
OCEANA NAS, VA US complete.
GREENVILLE DOWNTOWN AIRPORT, SC US complete.
COLUMBIA METROPOLITAN AIRPORT, SC US complete.
FENTRESS NAVAL AUXILIARY FIELD, VA US complete.
BUCKEYE, WV US complete.
CHARLESTON WSFO, WV US complete.
CLINTWOOD 1 W, VA US complete.
SOUTH BOSTON, VA US complete.
BIG STONE GAP, VA US complete.
GREENBAY 3 NE, VA US complete.
PULASKI 2 E, VA US complete.
STONEWALL JACKSON DAM, WV US complete.
FREDERICKSBURG SEWAGE, VA US complete.
RIPLEY, WV US complete.
WILLIAMSON, WV US co

RADFORD 3 N, VA US complete.
LOST RIVER, WV US complete.
MARTINSVILLE FILTER PLANT, VA US complete.
ANSTEAD HAWKS NEST STATE PARK, WV US complete.
Cannot find RICHWOOD 1 SSE, WV US.
Cannot find JOCASSEE 8 WNW, SC US.
RICHLANDS, VA US complete.
STAFFORDSVILLE 3 ENE, VA US complete.
MORGANTOWN HART FIELD, WV US complete.
MARTINSBURG EASTERN WEST VIRGINIA REGIONAL AIRPORT, WV US complete.
LYNCHBURG INTERNATIONAL AIRPORT, VA US complete.
ORANGEBURG 2, SC US complete.
CHATHAM, VA US complete.
BLACKSBURG NATIONAL WEATHER SERVICE OFFICE, VA US complete.
ROCK CAVE 2 NE, WV US complete.
HAMLIN, WV US complete.
PENNINGTON GAP, VA US complete.
FARMVILLE 2 N, VA US complete.
CROZIER, VA US complete.
WAKEFIELD 1 NW, VA US complete.
STONY CREEK 2 N, VA US complete.
LEWISBURG 3 N, WV US complete.
SUMMERSVILLE LAKE, WV US complete.
WINTHROP UNIVERSITY, SC US complete.
WISE 1 SE, VA US complete.
RIDGEVILLE, SC US complete.
MCCORMICK, SC US complete.
YEMASSEE 1 N, SC US complete.
Cannot find LANGLEY AIR

MOUNTAIN CITY 2, TN US complete.
Cannot find HUNTSVILLE INTERNATIONAL AIRPORT JONES FIELD, AL US.
MONTEREY, TN US complete.
QUITMAN 2 NW, GA US complete.
CLARKESVILLE, GA US complete.
LAWRENCEBURG FILTER PLANT, TN US complete.
COLLIERVILLE, TN US complete.
TROY, AL US complete.
SAVANNAH 6 SW, TN US complete.
THORSBY EXPERIMENTAL STATION, AL US complete.
ELIZABETHTON, TN US complete.
LONDON CORBIN AIRPORT, KY US complete.
COLUMBIA 3 WNW, TN US complete.
STANTON 2 W, KY US complete.
MANCHESTER 4 W, KY US complete.
WAYCROSS WARE CO AIRPORT, GA US complete.
OAK RIDGE ATDD, TN US complete.
CODEN, AL US complete.
MONTEAGLE, TN US complete.
PRINCETON 1 SE, KY US complete.
CROSSVILLE MEMORIAL AIRPORT, TN US complete.
ALBANY SW GEORGIA REGIONAL AIRPORT, GA US complete.
Cannot find ASHBURN 3 ENE, GA US.
PARSONS WATER PLANT, TN US complete.
WARTRACE 6 E, TN US complete.
WARNER PARK, TN US complete.
CEDARTOWN, GA US complete.
ANNISTON METROPOLITAN AIRPORT, AL US complete.
Cannot find ALMA BACON CO

CORNELIA, GA US complete.
BEAVER DAM, KY US complete.
HENDERSON 8 SSW, KY US complete.
FARMERS 2 S, KY US complete.
MC MINNVILLE, TN US complete.
GLADEVILLE, TN US complete.
HAZLEHURST, GA US complete.
DICKSON, TN US complete.
LEITCHFIELD 2 N, KY US complete.
Cannot find FOLKSTON 9 SW, GA US.
Cannot find COOKEVILLE, TN US.
BRENTWOOD, TN US complete.
DAYTON 2 SE, TN US complete.
LYONS, GA US complete.
Cannot find JESUP 8 S, GA US.
HELENA, AL US complete.
MABLETON 1 N, GA US complete.
Cannot find CENTRE, AL US.
HUNTINGDON WATER PLA, TN US complete.
ALEXANDER CITY, AL US complete.
BROWNSVILLE, TN US complete.
CORDELE, GA US complete.
SAUTEE 3 W, GA US complete.
LOUISVILLE WEATHER FORECAST OFFICE, KY US complete.
GERMANTOWN 4 SE, TN US complete.
HEIDELBERG 2 N, KY US complete.
ROCK ISLAND STATE PARK, TN US complete.
Cannot find TOWNSEND 5 S, TN US.
DECATUR 5 SE, AL US complete.
TOCCOA, GA US complete.
HANCEVILLE, AL US complete.
LAFAYETTE 2 W, AL US complete.
HEFLIN, AL US complete.
TROY 2

In [40]:
# Preview dataframe
stations

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE,COORD,COUNTY,ZIP
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,WV,"39.5064,-78.3158",Alamance County,27258
1,USC00442009,"CORBIN, VA US",38.2022,-77.3747,VA,"38.2022,-77.3747",Avery County,28604
2,USC00440187,"AMELIA COURTHOUSE 1, VA US",37.3451,-77.9781,VA,"37.3451,-77.9781",Pitt County,27834
3,USC00442245,"DANVILLE, VA US",36.5869,-79.3886,VA,"36.5869,-79.3886",New Hanover County,28409
4,USC00466212,"MORGANTOWN LOCK AND DAM, WV US",39.6203,-79.9698,WV,"39.6203,-79.9698",Beaufort County,27889
...,...,...,...,...,...,...,...,...
927,USC00014798,"LIVINGSTON, AL US",32.5811,-88.1897,AL,"32.5811,-88.1897",Sumter County,35470
928,USC00152214,"DIX DAM, KY US",37.7858,-84.7077,KY,"37.7858,-84.7077",,
929,USC00150397,"BARDSTOWN 5 E, KY US",37.8194,-85.3847,KY,"37.8194,-85.3847",Nelson County,40004
930,USC00152575,"ETOILE, KY US",36.8276,-85.8975,KY,"36.8276,-85.8975",Barren County,42141


In [17]:
# Extract out stations that are missing county and zip code
missing_geocode = stations.loc[stations['COUNTY'].isnull()]
missing_geocode

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE,COORD,COUNTY,ZIP
47,USC00440720,"BIG MEADOWS, VA US",38.52160,-78.43550,VA,"38.5216,-78.4355",,
52,USC00461900,"COOPERS ROCK STATE FOREST, WV US",39.67722,-79.77194,WV,"39.67722,-79.77194",,
55,USC00464971,"KUMBRABOW, WV US",38.63110,-80.08560,WV,"38.6311,-80.0856",,
56,USC00443310,"GATHRIGHT DAM, VA US",37.94580,-79.95420,VA,"37.9458,-79.9542",,
62,USC00468662,"SUTTON LAKE, WV US",38.66110,-80.69700,WV,"38.6611,-80.697",,
...,...,...,...,...,...,...,...,...
912,USW00003811,"JACKSON MCKELLAR SIPES REGIONAL AIRPORT, TN US",35.59300,-88.91670,TN,"35.593,-88.9167",,
917,USW00093808,"BOWLING GREEN WARREN CO AIRPORT, KY US",36.96470,-86.42380,KY,"36.9647,-86.4238",,
920,USC00016988,"ROBERTSDALE, AL US",30.56540,-87.70180,AL,"30.5654,-87.7018",,
925,USC00403938,"HARTSVILLE, TN US",36.37556,-86.18083,TN,"36.37556,-86.18083",,


In [47]:
# Create empty list to store any calls that don't work
missing_stations = []

# Use reverse geocoding using Google's Geocoding API to find the rest of the zipcodes and counties
for coord in missing_geocode['COORD']:
    # Assemble url and make API call based on coordinates
    url = f'https://maps.googleapis.com/maps/api/geocode/json?latlng={coord}&key={gkey}'
    response = requests.get(url)
    geocoding = response.json()
    
    try:
        # Find zipcode and county within JSON response
        for dict in geocoding['results'][0]['address_components']:
            if 'administrative_area_level_2' in dict['types']:
                county = dict['long_name']
            if 'postal_code' in dict['types']:
                zipcode = dict['long_name']
        # Add zipcode and county to dataframe
        stations.loc[stations['COORD'] == coord, 'COUNTY'] = county
        stations.loc[stations['COORD'] == coord, 'ZIP'] = zipcode

        print(f'Added {coord} to DataFrame.')
    
    except:
        # Store any problematic coordinates
        missing_dict = {}
        missing_dict['JSON'] = geocoding
        missing_dict['COORD'] = coord
        missing_stations.append(missing_dict)
        print(f'Could not add {coord} to DataFrame')

print('Finished Processing')

Added 38.5216,-78.4355 to DataFrame.
Added 39.67722,-79.77194 to DataFrame.
Added 38.6311,-80.0856 to DataFrame.
Added 37.9458,-79.9542 to DataFrame.
Added 38.6611,-80.697 to DataFrame.
Added 33.1532,-79.3637 to DataFrame.
Added 37.3169,-79.9741 to DataFrame.
Added 33.5214,-79.0975 to DataFrame.
Added 37.7836,-81.123 to DataFrame.
Added 39.47,-80.8571 to DataFrame.
Added 37.13194,-76.49306 to DataFrame.
Added 38.9817,-81.5659 to DataFrame.
Added 38.57,-79.2758 to DataFrame.
Added 37.0375,-77.9462 to DataFrame.
Added 37.3818,-79.233 to DataFrame.
Added 36.7297,-76.6015 to DataFrame.
Added 39.0736,-79.634 to DataFrame.
Added 38.6844,-82.1837 to DataFrame.
Added 35.0316,-81.4927 to DataFrame.
Added 36.5955,-76.4386 to DataFrame.
Added 38.2869,-79.8181 to DataFrame.
Added 38.7334,-77.4925 to DataFrame.
Added 36.6633,-80.9139 to DataFrame.
Added 36.6542,-80.9183 to DataFrame.
Added 32.48333,-80.71667 to DataFrame.
Added 32.6064,-80.3267 to DataFrame.
Added 37.2992,-77.2775 to DataFrame.
Add

In [50]:
# Check if any json requests were unsuccessful
missing_stations

[{'JSON': {'error_message': 'You have exceeded your rate-limit for this API.',
   'results': [],
   'status': 'OVER_QUERY_LIMIT'},
  'COORD': '35.5453,-82.6987'},
 {'JSON': {'error_message': 'You have exceeded your rate-limit for this API.',
   'results': [],
   'status': 'OVER_QUERY_LIMIT'},
  'COORD': '34.15,-85.6846'}]

In [51]:
# Retry API calls which didn't work

# Create empty list to store any API calls that don't work
missing_stations_2 = []

for entry in missing_stations:
    coord = entry['COORD']
    # Assemble url and make API call based on coordinates
    url = f'https://maps.googleapis.com/maps/api/geocode/json?latlng={coord}&key={gkey}'
    response = requests.get(url)
    geocoding = response.json()
    
    try:
        # Find zipcode and county within JSON response
        for dict in geocoding['results'][0]['address_components']:
            if 'administrative_area_level_2' in dict['types']:
                county = dict['long_name']
            if 'postal_code' in dict['types']:
                zipcode = dict['long_name']
        # Add zipcode and county to dataframe
        stations.loc[stations['COORD'] == coord, 'COUNTY'] = county
        stations.loc[stations['COORD'] == coord, 'ZIP'] = zipcode

        print(f'Added {coord} to DataFrame.')
    
    except:
        # Store any problematic coordinates
        missing_dict = {}
        missing_dict['JSON'] = geocoding
        missing_dict['COORD'] = coord
        missing_stations_2.append(missing_dict)
        print(f'Could not add {coord} to DataFrame')

print('Finished Processing')

Added 35.5453,-82.6987 to DataFrame.
Added 34.15,-85.6846 to DataFrame.
Finished Processing


In [52]:
# Preview DataFrame
stations

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE,COORD,COUNTY,ZIP
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,WV,"39.5064,-78.3158",Alamance County,27258
1,USC00442009,"CORBIN, VA US",38.2022,-77.3747,VA,"38.2022,-77.3747",Avery County,28604
2,USC00440187,"AMELIA COURTHOUSE 1, VA US",37.3451,-77.9781,VA,"37.3451,-77.9781",Pitt County,27834
3,USC00442245,"DANVILLE, VA US",36.5869,-79.3886,VA,"36.5869,-79.3886",New Hanover County,28409
4,USC00466212,"MORGANTOWN LOCK AND DAM, WV US",39.6203,-79.9698,WV,"39.6203,-79.9698",Beaufort County,27889
...,...,...,...,...,...,...,...,...
927,USC00014798,"LIVINGSTON, AL US",32.5811,-88.1897,AL,"32.5811,-88.1897",Sumter County,35470
928,USC00152214,"DIX DAM, KY US",37.7858,-84.7077,KY,"37.7858,-84.7077",Mercer County,40330
929,USC00150397,"BARDSTOWN 5 E, KY US",37.8194,-85.3847,KY,"37.8194,-85.3847",Nelson County,40004
930,USC00152575,"ETOILE, KY US",36.8276,-85.8975,KY,"36.8276,-85.8975",Barren County,42141


In [53]:
# Check that there are no missing values
stations.count()

STATION      932
NAME         932
LATITUDE     932
LONGITUDE    932
STATE        932
COORD        932
COUNTY       932
ZIP          932
dtype: int64

In [54]:
# Store station metadata in csv file
stations.to_csv('stations-metadata.csv', index=False)

In [15]:
# Merge station metadata with initial data
stations_subset = stations[['NAME', 'COUNTY', 'ZIP']]
df_clean_3 = pd.merge(df_clean_2, stations_subset, on='NAME')
df_clean_3

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE,COUNTY,ZIP
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4,WV,Alamance County,27258
1,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5,WV,Alamance County,27258
2,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6,WV,Alamance County,27258
3,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6,WV,Alamance County,27258
4,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7,WV,Alamance County,27258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341114,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-27,38.3,,47.8,,28.7,,TN,Rutherford County,37060
341115,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-28,38.1,,47.7,,28.6,,TN,Rutherford County,37060
341116,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-29,38.0,,47.6,,28.5,,TN,Rutherford County,37060
341117,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-30,37.9,,47.4,,28.4,,TN,Rutherford County,37060


In [16]:
# Store cleaned dataset in csv file
df_clean_3.to_csv('se-weather-cleaned.csv', index=False)