## Southeast USA Daily Normals
Data Source: https://www.ncdc.noaa.gov/cdo-web/search?datasetid=NORMAL_DLY

References
* https://stackoverflow.com/questions/31511997/pandas-dataframe-replace-all-values-in-a-column-based-on-condition
* https://jakevdp.github.io/WhirlwindTourOfPython/14-strings-and-regular-expressions.html
* https://towardsdatascience.com/reverse-geocoding-in-python-a915acf29eb6

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import re
import requests
import json
import time

# Google developer API key
from config import gkey

### Functions

In [2]:
# This function takes in the station name as a string and extracts the state that the station is in
# This function returns the state two-letter code as a string
def findState(string):
    regex = re.compile(',\s([A-Z][A-Z])')
    state = regex.findall(string)
    return state[0]

### Cleaning Dataset(s)

In [3]:
# Read in first dataset
df1 = pd.read_csv('daily-normals-SE-1.csv')
df1.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
0,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-01,,,,,,
1,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-02,,,,,,
2,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-03,,,,,,
3,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-04,,,,,,
4,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-05,,,,,,


In [4]:
# Read in second dataset
df2 = pd.read_csv('daily-normals-SE-2.csv')
df2.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
0,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-01,,,,,,
1,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-02,,,,,,
2,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-03,,,,,,
3,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-04,,,,,,
4,USC00312827,"ENFIELD, NC US",36.1686,-77.675,33.5,01-05,,,,,,


In [5]:
# Read in third dataset
df3 = pd.read_csv('daily-normals-SE-3.csv')
df3.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
0,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-01,36.0,10.1,45.7,11.0,26.4,10.9
1,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-02,36.0,10.1,45.6,11.0,26.3,10.9
2,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-03,35.9,10.1,45.6,10.9,26.2,10.9
3,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-04,35.9,10.0,45.6,10.9,26.2,10.9
4,USC00406271,"MORRISTOWN RADIO WCR, TN US",36.2067,-83.3325,409.7,01-05,35.8,10.0,45.6,10.9,26.1,10.9


In [6]:
# Combine datasets
df = pd.concat([df1, df2, df3])
df

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
0,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-01,,,,,,
1,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-02,,,,,,
2,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-03,,,,,,
3,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-04,,,,,,
4,USC00441955,"CONCORD 4 SSW, VA US",37.2819,-78.9591,248.4,01-05,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
218103,USC00017947,"SULLIGENT, AL US",33.8998,-88.1326,106.7,12-27,,,,,,
218104,USC00017947,"SULLIGENT, AL US",33.8998,-88.1326,106.7,12-28,,,,,,
218105,USC00017947,"SULLIGENT, AL US",33.8998,-88.1326,106.7,12-29,,,,,,
218106,USC00017947,"SULLIGENT, AL US",33.8998,-88.1326,106.7,12-30,,,,,,


In [7]:
# Check for missing values
df.count()

STATION            418300
NAME               418300
LATITUDE           418300
LONGITUDE          418300
ELEVATION          418300
DATE               418300
DLY-TAVG-NORMAL    342576
DLY-TAVG-STDDEV    261690
DLY-TMAX-NORMAL    342576
DLY-TMAX-STDDEV    261690
DLY-TMIN-NORMAL    342576
DLY-TMIN-STDDEV    261690
dtype: int64

In [8]:
# Drop all rows that are missing daily normals (avg, max, min)
df_clean_1 = df.dropna(subset=['DLY-TAVG-NORMAL', 'DLY-TMAX-NORMAL', 'DLY-TMIN-NORMAL'])

In [9]:
# Check for additional missing values
df_clean_1.count()

STATION            342576
NAME               342576
LATITUDE           342576
LONGITUDE          342576
ELEVATION          342576
DATE               342576
DLY-TAVG-NORMAL    342576
DLY-TAVG-STDDEV    261690
DLY-TMAX-NORMAL    342576
DLY-TMAX-STDDEV    261690
DLY-TMIN-NORMAL    342576
DLY-TMIN-STDDEV    261690
dtype: int64

In [10]:
df_clean_1

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV
366,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4
367,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5
368,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6
369,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6
370,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7
...,...,...,...,...,...,...,...,...,...,...,...,...
217371,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-27,38.3,,47.8,,28.7,
217372,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-28,38.1,,47.7,,28.6,
217373,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-29,38.0,,47.6,,28.5,
217374,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-30,37.9,,47.4,,28.4,


### Adding State Column

In [11]:
# Pull state out of station name and add to state column
df_clean_2 = df_clean_1.copy()
df_clean_2['STATE'] = df_clean_2['NAME'].apply(findState)
df_clean_2

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE
366,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4,WV
367,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5,WV
368,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6,WV
369,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6,WV
370,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7,WV
...,...,...,...,...,...,...,...,...,...,...,...,...,...
217371,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-27,38.3,,47.8,,28.7,,TN
217372,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-28,38.1,,47.7,,28.6,,TN
217373,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-29,38.0,,47.6,,28.5,,TN
217374,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-30,37.9,,47.4,,28.4,,TN


In [12]:
# Review sample to verify accurate state assignment
df_test_states = df_clean_2.sample(20)
df_test_states

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE
182616,USC00401310,"BYRDSTOWN, TN US",36.58028,-85.12556,268.2,01-05,35.2,11.5,45.5,12.8,25.0,11.7,TN
170322,USC00091340,"BRUNSWICK, GA US",31.1681,-81.5022,4.0,06-01,78.5,4.2,87.8,5.5,69.2,4.4,GA
61208,USC00463215,"FRANKLIN 2 NE, WV US",38.6756,-79.3092,579.1,03-29,46.3,9.2,59.9,11.8,32.7,9.2,WV
35442,USC00317516,"ROXBORO 7 ESE, NC US",36.3464,-78.8858,216.4,11-02,52.4,7.9,66.7,9.0,38.0,9.4,NC
121619,USW00003804,"PARKERSBURG AIRPORT, WV US",39.2,-81.27,253.3,04-23,56.5,8.7,68.7,10.5,44.2,8.9,WV
118190,USC00403679,"GREENEVILLE EXPERIMENTAL STATION, TN US",36.1056,-82.8436,402.3,12-20,37.5,10.3,49.0,11.5,25.9,11.1,TN
163855,USC00404858,"KINGSPORT, TN US",36.5194,-82.5275,391.4,09-29,64.0,6.3,76.5,6.9,51.4,7.7,TN
82758,USC00409709,"WHITE HOUSE, TN US",36.4511,-86.6455,249.9,02-22,40.9,,51.6,,30.2,,TN
148695,USC00013645,"HAMILTON 3 S, AL US",34.0966,-87.9913,132.6,04-27,62.4,7.2,78.0,8.0,46.8,9.0,AL
115334,USC00467649,"ROCK CAVE 2 NE, WV US",38.8561,-80.3074,533.1,02-20,33.5,11.3,42.1,13.1,24.8,11.1,WV


In [13]:
# Determine states included in dataset
states = df_clean_2['STATE'].unique()
states

array(['WV', 'VA', 'SC', 'MD', 'NC', 'TN', 'GA', 'AL', 'KY', 'IN', 'IL'],
      dtype=object)

In [14]:
# Drop rows associated with states not in the Southwest
for state in ['MD', 'IN', 'IL']:
    df_clean_2.drop(df_clean_2[df_clean_2['STATE'] == state].index, inplace=True)

# Check that unwanted states were dropped
df_clean_2['STATE'].unique()

array(['WV', 'VA', 'SC', 'NC', 'TN', 'GA', 'AL', 'KY'], dtype=object)

## Find Counties and Zip Codes for Stations

In [15]:
# Pull out dataframe of stations metadata
stations = df_clean_2[['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'STATE']].drop_duplicates()
stations

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE
366,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,WV
732,USC00442009,"CORBIN, VA US",38.2022,-77.3747,VA
1098,USC00440187,"AMELIA COURTHOUSE 1, VA US",37.3451,-77.9781,VA
1464,USC00442245,"DANVILLE, VA US",36.5869,-79.3886,VA
2196,USC00466212,"MORGANTOWN LOCK AND DAM, WV US",39.6203,-79.9698,WV
...,...,...,...,...,...
214083,USC00014798,"LIVINGSTON, AL US",32.5811,-88.1897,AL
215180,USC00152214,"DIX DAM, KY US",37.7858,-84.7077,KY
215912,USC00150397,"BARDSTOWN 5 E, KY US",37.8194,-85.3847,KY
216278,USC00152575,"ETOILE, KY US",36.8276,-85.8975,KY


In [16]:
# Create column that has coordinates combined in a string
stations['COORD'] = stations['LATITUDE'].map(str) + ',' + stations['LONGITUDE'].map(str)
stations

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE,COORD
366,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,WV,"39.5064,-78.3158"
732,USC00442009,"CORBIN, VA US",38.2022,-77.3747,VA,"38.2022,-77.3747"
1098,USC00440187,"AMELIA COURTHOUSE 1, VA US",37.3451,-77.9781,VA,"37.3451,-77.9781"
1464,USC00442245,"DANVILLE, VA US",36.5869,-79.3886,VA,"36.5869,-79.3886"
2196,USC00466212,"MORGANTOWN LOCK AND DAM, WV US",39.6203,-79.9698,WV,"39.6203,-79.9698"
...,...,...,...,...,...,...
214083,USC00014798,"LIVINGSTON, AL US",32.5811,-88.1897,AL,"32.5811,-88.1897"
215180,USC00152214,"DIX DAM, KY US",37.7858,-84.7077,KY,"37.7858,-84.7077"
215912,USC00150397,"BARDSTOWN 5 E, KY US",37.8194,-85.3847,KY,"37.8194,-85.3847"
216278,USC00152575,"ETOILE, KY US",36.8276,-85.8975,KY,"36.8276,-85.8975"


In [17]:
# Create columns for county and zip code
stations['COUNTY'] = ''
stations['ZIP'] = ''

In [43]:
# Empty list for any stations missing
missing_stations = []

# Create columns for county and zip code
stations_others = stations.loc[stations['COUNTY'] == '']

In [44]:
# Use reverse geocoding using Google's Geocoding API to find the zipcodes and counties
for index, row in stations_others.iterrows():
    coord = row["COORD"]
    # Assemble url and make API call based on coordinates
    url = f'https://maps.googleapis.com/maps/api/geocode/json?latlng={coord}&key={gkey}'
    response = requests.get(url)
    geocoding = response.json()
    
    try:
        # Find zipcode and county within JSON response
        for dict in geocoding['results'][0]['address_components']:
            if 'administrative_area_level_2' in dict['types']:
                county = dict['long_name']
            if 'postal_code' in dict['types']:
                zipcode = dict['long_name']
        # Add zipcode and county to dataframe
        stations.loc[stations['COORD'] == coord, 'COUNTY'] = county
        stations.loc[stations['COORD'] == coord, 'ZIP'] = zipcode

        print(f'Added {coord} to DataFrame.')
        time.sleep(1)
    
    except:
        # Store any problematic coordinates
        missing_dict = {}
        missing_dict['JSON'] = geocoding
        missing_dict['COORD'] = coord
        missing_stations.append(missing_dict)
        print(f'Could not add {coord} to DataFrame')

print('Finished Processing')

Added 39.5064,-78.3158 to DataFrame.
Added 38.2022,-77.3747 to DataFrame.
Added 37.3451,-77.9781 to DataFrame.
Added 36.5869,-79.3886 to DataFrame.
Added 39.6203,-79.9698 to DataFrame.
Added 39.3419,-79.6697 to DataFrame.
Added 38.4014,-82.5266 to DataFrame.
Added 38.1935,-81.3701 to DataFrame.
Added 37.2889,-82.2928 to DataFrame.
Added 38.5459,-78.0981 to DataFrame.
Added 37.4255,-77.8915 to DataFrame.
Added 38.3311,-79.6586 to DataFrame.
Added 37.5894,-81.0925 to DataFrame.
Added 36.81667,-76.03333 to DataFrame.
Added 36.695,-76.13556 to DataFrame.
Added 38.1873,-80.1336 to DataFrame.
Added 38.3138,-81.7186 to DataFrame.
Added 37.1453,-82.4911 to DataFrame.
Added 36.6954,-78.8807 to DataFrame.
Added 36.8628,-82.7711 to DataFrame.
Added 37.1756,-78.2786 to DataFrame.
Added 37.0556,-80.7842 to DataFrame.
Added 39.0046,-80.47399999999999 to DataFrame.
Added 38.2875,-77.4507 to DataFrame.
Added 38.8172,-81.7119 to DataFrame.
Added 37.6731,-82.2762 to DataFrame.
Added 36.6002,-78.3011 to 

Added 32.5575,-82.9036 to DataFrame.
Added 32.8711,-86.1758 to DataFrame.
Added 35.0711,-89.4117 to DataFrame.
Added 31.1902,-84.2036 to DataFrame.
Added 34.7582,-84.7642 to DataFrame.
Added 35.414,-86.8086 to DataFrame.
Added 35.0311,-85.2014 to DataFrame.
Added 34.8879,-83.3966 to DataFrame.
Added 34.5686,-85.6063 to DataFrame.
Added 37.16861,-84.93916999999999 to DataFrame.
Added 36.6736,-84.4791 to DataFrame.
Added 33.3644,-81.9633 to DataFrame.
Added 32.13,-81.21 to DataFrame.
Added 36.5877,-86.5258 to DataFrame.
Added 35.2803,-85.2414 to DataFrame.
Added 32.2003,-83.2058 to DataFrame.
Added 34.51,-87.7319 to DataFrame.
Added 36.471,-86.8415 to DataFrame.
Added 35.0564,-89.9865 to DataFrame.
Added 32.411,-87.0144 to DataFrame.
Added 35.8181,-83.9858 to DataFrame.
Added 38.3636,-85.4186 to DataFrame.
Added 37.0563,-88.7744 to DataFrame.
Added 36.11889,-86.68916999999999 to DataFrame.
Added 35.2191,-84.7921 to DataFrame.
Added 34.7441,-87.5997 to DataFrame.
Added 32.2997,-86.4075 to

Added 36.2872,-88.2958 to DataFrame.
Added 37.3475,-87.5238 to DataFrame.
Added 37.6497,-86.4308 to DataFrame.
Added 36.4259,-84.94200000000001 to DataFrame.
Added 33.3952,-87.0077 to DataFrame.
Added 31.182,-87.439 to DataFrame.
Added 35.7635,-86.9321 to DataFrame.
Added 35.1615,-86.0315 to DataFrame.
Added 38.0945,-84.7465 to DataFrame.
Added 35.9833,-83.2008 to DataFrame.
Added 32.6089,-85.0756 to DataFrame.
Added 36.3444,-88.8636 to DataFrame.
Added 34.5528,-86.445 to DataFrame.
Added 34.1736,-86.8133 to DataFrame.
Added 37.495,-85.1516 to DataFrame.
Added 36.7458,-86.2258 to DataFrame.
Added 37.85889,-87.40861 to DataFrame.
Added 30.5468,-87.8807 to DataFrame.
Added 31.8709,-85.4501 to DataFrame.
Added 34.8544,-83.9444 to DataFrame.
Added 33.8705,-83.1135 to DataFrame.
Added 35.0097,-85.3444 to DataFrame.
Added 34.4546,-85.39 to DataFrame.
Added 37.2504,-86.2325 to DataFrame.
Added 35.5567,-87.5414 to DataFrame.
Added 34.3328,-84.4703 to DataFrame.
Added 37.955999999999996,-86.116

Added 35.49222,-86.4775 to DataFrame.
Added 34.5859,-83.7658 to DataFrame.
Added 35.7047,-86.4869 to DataFrame.
Added 31.615,-85.0494 to DataFrame.
Added 33.3329,-83.6975 to DataFrame.
Added 31.03833,-85.87083 to DataFrame.
Added 31.3781,-82.1292 to DataFrame.
Added 36.1422,-82.4261 to DataFrame.
Added 33.988,-84.7475 to DataFrame.
Added 31.3972,-81.2811 to DataFrame.
Added 34.2834,-86.9142 to DataFrame.
Added 35.2622,-88.9891 to DataFrame.
Added 31.8453,-83.9409 to DataFrame.
Added 34.165,-84.73 to DataFrame.
Added 35.5983,-85.1939 to DataFrame.
Added 36.0968,-86.1397 to DataFrame.
Added 35.593,-88.9167 to DataFrame.
Added 32.6847,-83.6527 to DataFrame.
Added 36.2836,-88.7063 to DataFrame.
Added 34.4758,-84.4461 to DataFrame.
Added 33.2119,-87.6161 to DataFrame.
Added 36.9647,-86.4238 to DataFrame.
Added 31.767220000000002,-84.79306 to DataFrame.
Added 37.2791,-86.2491 to DataFrame.
Added 30.5654,-87.7018 to DataFrame.
Added 36.8825,-83.8819 to DataFrame.
Added 36.4583,-86.324 to Data

In [45]:
# Check if any json requests were unsuccessful
missing_stations

[]

In [46]:
# Check for any empty county/zip entries
stations.loc[stations["COUNTY"] == ""]

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE,COORD,COUNTY,ZIP


In [36]:
# Retry API calls which didn't work

# Create empty list to store any API calls that don't work
missing_stations_2 = []

for entry in missing_stations:
    coord = entry['COORD']
    # Assemble url and make API call based on coordinates
    url = f'https://maps.googleapis.com/maps/api/geocode/json?latlng={coord}&key={gkey}'
    response = requests.get(url)
    geocoding = response.json()
    
    try:
        # Find zipcode and county within JSON response
        for dict in geocoding['results'][0]['address_components']:
            if 'administrative_area_level_2' in dict['types']:
                county = dict['long_name']
            if 'postal_code' in dict['types']:
                zipcode = dict['long_name']
        # Add zipcode and county to dataframe
        stations.loc[stations['COORD'] == coord, 'COUNTY'] = county
        stations.loc[stations['COORD'] == coord, 'ZIP'] = zipcode

        print(f'Added {coord} to DataFrame.')
    
    except:
        # Store any problematic coordinates
        missing_dict = {}
        missing_dict['JSON'] = geocoding
        missing_dict['COORD'] = coord
        missing_stations_2.append(missing_dict)
        print(f'Could not add {coord} to DataFrame')

print('Finished Processing')

Added 33.46167,-80.85806 to DataFrame.
Added 34.7055,-79.5683 to DataFrame.
Added 33.3169,-79.3227 to DataFrame.
Finished Processing


In [47]:
# Preview DataFrame
stations.loc[stations['COUNTY'] != '']

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE,COORD,COUNTY,ZIP
366,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,WV,"39.5064,-78.3158",Morgan County,25411
732,USC00442009,"CORBIN, VA US",38.2022,-77.3747,VA,"38.2022,-77.3747",Caroline County,22580
1098,USC00440187,"AMELIA COURTHOUSE 1, VA US",37.3451,-77.9781,VA,"37.3451,-77.9781",Amelia County,23002
1464,USC00442245,"DANVILLE, VA US",36.5869,-79.3886,VA,"36.5869,-79.3886",Amelia County,24541
2196,USC00466212,"MORGANTOWN LOCK AND DAM, WV US",39.6203,-79.9698,WV,"39.6203,-79.9698",Monongalia County,26501
...,...,...,...,...,...,...,...,...
214083,USC00014798,"LIVINGSTON, AL US",32.5811,-88.1897,AL,"32.5811,-88.1897",Sumter County,35470
215180,USC00152214,"DIX DAM, KY US",37.7858,-84.7077,KY,"37.7858,-84.7077",Mercer County,40330
215912,USC00150397,"BARDSTOWN 5 E, KY US",37.8194,-85.3847,KY,"37.8194,-85.3847",Nelson County,40004
216278,USC00152575,"ETOILE, KY US",36.8276,-85.8975,KY,"36.8276,-85.8975",Barren County,42141


In [48]:
missing_stations_2

[]

In [49]:
# Check that there are no missing values
stations.count()

STATION      932
NAME         932
LATITUDE     932
LONGITUDE    932
STATE        932
COORD        932
COUNTY       932
ZIP          932
dtype: int64

In [50]:
# Store station metadata in csv file
stations.to_csv('../data/stations-metadata.csv', index=False)

In [51]:
# Merge station metadata with initial data
stations_subset = stations[['NAME', 'COUNTY', 'ZIP']]
df_clean_3 = pd.merge(df_clean_2, stations_subset, on='NAME')
df_clean_3

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE,COUNTY,ZIP
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4,WV,Morgan County,25411
1,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5,WV,Morgan County,25411
2,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6,WV,Morgan County,25411
3,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6,WV,Morgan County,25411
4,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7,WV,Morgan County,25411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341114,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-27,38.3,,47.8,,28.7,,TN,Rutherford County,37060
341115,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-28,38.1,,47.7,,28.6,,TN,Rutherford County,37060
341116,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-29,38.0,,47.6,,28.5,,TN,Rutherford County,37060
341117,USC00402711,"EAGLEVILLE 1 SW, TN US",35.7285,-86.6435,243.8,12-30,37.9,,47.4,,28.4,,TN,Rutherford County,37060


In [52]:
# Store cleaned dataset in csv file
df_clean_3.to_csv('../data/se-weather-cleaned.csv', index=False)

## Formatting for MongoDB

In [53]:
weather = pd.read_csv("../data/se-weather-cleaned.csv")
weather.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE,COUNTY,ZIP
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4,WV,Morgan County,25411
1,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5,WV,Morgan County,25411
2,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6,WV,Morgan County,25411
3,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6,WV,Morgan County,25411
4,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7,WV,Morgan County,25411


In [54]:
weather.columns

Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE',
       'DLY-TAVG-NORMAL', 'DLY-TAVG-STDDEV', 'DLY-TMAX-NORMAL',
       'DLY-TMAX-STDDEV', 'DLY-TMIN-NORMAL', 'DLY-TMIN-STDDEV', 'STATE',
       'COUNTY', 'ZIP'],
      dtype='object')

In [55]:
stations = pd.read_csv("../data/stations-metadata.csv")
stations.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE,COORD,COUNTY,ZIP
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,WV,"39.5064,-78.3158",Morgan County,25411
1,USC00442009,"CORBIN, VA US",38.2022,-77.3747,VA,"38.2022,-77.3747",Caroline County,22580
2,USC00440187,"AMELIA COURTHOUSE 1, VA US",37.3451,-77.9781,VA,"37.3451,-77.9781",Amelia County,23002
3,USC00442245,"DANVILLE, VA US",36.5869,-79.3886,VA,"36.5869,-79.3886",Amelia County,24541
4,USC00466212,"MORGANTOWN LOCK AND DAM, WV US",39.6203,-79.9698,WV,"39.6203,-79.9698",Monongalia County,26501


In [56]:
stations.columns

Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'STATE', 'COORD', 'COUNTY',
       'ZIP'],
      dtype='object')

In [57]:
zips = pd.read_csv("../data/zipcodes_master.csv")
zips.head()

Unnamed: 0,ZIP,STATE,LATITUDE,LONGITUDE,CLOSEST-STATION
0,26031,WV,40.01,-80.73,"WHEELING, WV US"
1,26032,WV,40.2,-80.56,"WHEELING OHIO CO AIRPORT, WV US"
2,26033,WV,39.82,-80.57,"MOUNDSVILLE, WV US"
3,26034,WV,40.61,-80.56,"WHEELING OHIO CO AIRPORT, WV US"
4,26035,WV,40.34,-80.55,"WHEELING OHIO CO AIRPORT, WV US"


In [58]:
zips_new = zips.rename(columns={'Zipcode': 'ZIP', 'State': 'STATE', 'Lat': 'LATITUDE', 'Long': 'LONGITUDE', 'Closest Weather Station': 'CLOSEST-STATION'})

In [59]:
zips_new.columns

Index(['ZIP', 'STATE', 'LATITUDE', 'LONGITUDE', 'CLOSEST-STATION'], dtype='object')

In [60]:
zips_new.to_csv("../data/zipcodes_master.csv", index=False)

## Add a faux year to each calendar date 
*Allows for date filtering in MongoDB

In [14]:
# Read in full dataset
df = pd.read_csv("../data/se-weather-cleaned.csv")
df.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE,COUNTY,ZIP,DATE_FILTER
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4,WV,Morgan County,25411,01-01-2008
1,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5,WV,Morgan County,25411,01-02-2008
2,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6,WV,Morgan County,25411,01-03-2008
3,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6,WV,Morgan County,25411,01-04-2008
4,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7,WV,Morgan County,25411,01-05-2008


In [15]:
# This function takes in the station name as a string and extracts the state that the station is in
# This function returns the state two-letter code as a string
def createDate(string):
    date = string + '-2008'
    return date

In [16]:
df['DATE_FILTER'] = df['DATE'].apply(createDate)

In [17]:
df.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE,COUNTY,ZIP,DATE_FILTER
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4,WV,Morgan County,25411,01-01-2008
1,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5,WV,Morgan County,25411,01-02-2008
2,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6,WV,Morgan County,25411,01-03-2008
3,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6,WV,Morgan County,25411,01-04-2008
4,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7,WV,Morgan County,25411,01-05-2008


In [18]:
# Store cleaned dataset in csv file
df.to_csv('../data/se-weather-cleaned.csv', index=False)

## Make dataset leaner to save space in MongoDB

In [2]:
# Read in full dataset
df = pd.read_csv("../data/se-weather-cleaned.csv")
df.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TAVG-STDDEV,DLY-TMAX-NORMAL,DLY-TMAX-STDDEV,DLY-TMIN-NORMAL,DLY-TMIN-STDDEV,STATE,COUNTY,ZIP,DATE_FILTER
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,10.3,38.6,11.8,21.0,10.4,WV,Morgan County,25411,01-01-2008
1,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,10.3,38.5,11.8,20.9,10.5,WV,Morgan County,25411,01-02-2008
2,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,10.4,38.4,11.9,20.9,10.6,WV,Morgan County,25411,01-03-2008
3,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,10.4,38.4,11.9,20.8,10.6,WV,Morgan County,25411,01-04-2008
4,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,10.5,38.3,11.9,20.8,10.7,WV,Morgan County,25411,01-05-2008


In [3]:
del df['DLY-TAVG-STDDEV']
del df['DLY-TMAX-STDDEV']
del df['DLY-TMIN-STDDEV']
df.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DLY-TAVG-NORMAL,DLY-TMAX-NORMAL,DLY-TMIN-NORMAL,STATE,COUNTY,ZIP,DATE_FILTER
0,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-01,29.8,38.6,21.0,WV,Morgan County,25411,01-01-2008
1,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-02,29.7,38.5,20.9,WV,Morgan County,25411,01-02-2008
2,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-03,29.7,38.4,20.9,WV,Morgan County,25411,01-03-2008
3,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-04,29.6,38.4,20.8,WV,Morgan County,25411,01-04-2008
4,USC00461324,"CACAPON STATE PARK 2, WV US",39.5064,-78.3158,289.6,01-05,29.5,38.3,20.8,WV,Morgan County,25411,01-05-2008


In [4]:
# Store cleaned dataset in csv file
df.to_csv('../data/se-weather-cleaned-lean.csv', index=False)

## Inaccurate Geocoding (not in deployed version)

In [None]:
# Dependencies
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [17]:
# Set up reverse geocoding function
locator = Nominatim(user_agent='myGeocoder', timeout=10)
rgeocode = RateLimiter(locator.reverse, min_delay_seconds=0.001)

# Create columns for county and zip code
stations['COUNTY'] = ''
stations['ZIP'] = ''

# Pull county and zip code for each station
for index, row in stations.iterrows():
    try:
        coord = row['COORD']
        address = rgeocode(coord)
        county = address.raw['address']['county']
        zipcode = address.raw['address']['postcode']
        stations.loc[index, 'COUNTY'] = county
        stations.loc[index, 'ZIP'] = zipcode
        print(f"{row['NAME']} complete.")
    except:
        print(f"Cannot find {row['NAME']}.")
    
print('Reverse geocoding complete.')

Cannot find CACAPON STATE PARK 2, WV US.
CORBIN, VA US complete.
AMELIA COURTHOUSE 1, VA US complete.
Cannot find DANVILLE, VA US.
MORGANTOWN LOCK AND DAM, WV US complete.
ROWLESBURG 1, WV US complete.
HUNTINGTON SWG PLANT, WV US complete.
LONDON LOCKS, WV US complete.
BREAKS INTERSTATE PARK, VA US complete.
BOSTON 4 SE, VA US complete.
AMELIA 8 NE, VA US complete.
CHARLESTON INTL. AIRPORT, SC US complete.
MUSTOE 1 SW, VA US complete.
SANDHILL RESEARCH, SC US complete.
FLAT TOP, WV US complete.
OCEANA NAS, VA US complete.
GREENVILLE DOWNTOWN AIRPORT, SC US complete.
COLUMBIA METROPOLITAN AIRPORT, SC US complete.
FENTRESS NAVAL AUXILIARY FIELD, VA US complete.
BUCKEYE, WV US complete.
CHARLESTON WSFO, WV US complete.
CLINTWOOD 1 W, VA US complete.
SOUTH BOSTON, VA US complete.
BIG STONE GAP, VA US complete.
GREENBAY 3 NE, VA US complete.
PULASKI 2 E, VA US complete.
STONEWALL JACKSON DAM, WV US complete.
FREDERICKSBURG SEWAGE, VA US complete.
RIPLEY, WV US complete.
WILLIAMSON, WV US co

RADFORD 3 N, VA US complete.
LOST RIVER, WV US complete.
MARTINSVILLE FILTER PLANT, VA US complete.
ANSTEAD HAWKS NEST STATE PARK, WV US complete.
Cannot find RICHWOOD 1 SSE, WV US.
Cannot find JOCASSEE 8 WNW, SC US.
RICHLANDS, VA US complete.
STAFFORDSVILLE 3 ENE, VA US complete.
MORGANTOWN HART FIELD, WV US complete.
MARTINSBURG EASTERN WEST VIRGINIA REGIONAL AIRPORT, WV US complete.
LYNCHBURG INTERNATIONAL AIRPORT, VA US complete.
ORANGEBURG 2, SC US complete.
CHATHAM, VA US complete.
BLACKSBURG NATIONAL WEATHER SERVICE OFFICE, VA US complete.
ROCK CAVE 2 NE, WV US complete.
HAMLIN, WV US complete.
PENNINGTON GAP, VA US complete.
FARMVILLE 2 N, VA US complete.
CROZIER, VA US complete.
WAKEFIELD 1 NW, VA US complete.
STONY CREEK 2 N, VA US complete.
LEWISBURG 3 N, WV US complete.
SUMMERSVILLE LAKE, WV US complete.
WINTHROP UNIVERSITY, SC US complete.
WISE 1 SE, VA US complete.
RIDGEVILLE, SC US complete.
MCCORMICK, SC US complete.
YEMASSEE 1 N, SC US complete.
Cannot find LANGLEY AIR

MOUNTAIN CITY 2, TN US complete.
Cannot find HUNTSVILLE INTERNATIONAL AIRPORT JONES FIELD, AL US.
MONTEREY, TN US complete.
QUITMAN 2 NW, GA US complete.
CLARKESVILLE, GA US complete.
LAWRENCEBURG FILTER PLANT, TN US complete.
COLLIERVILLE, TN US complete.
TROY, AL US complete.
SAVANNAH 6 SW, TN US complete.
THORSBY EXPERIMENTAL STATION, AL US complete.
ELIZABETHTON, TN US complete.
LONDON CORBIN AIRPORT, KY US complete.
COLUMBIA 3 WNW, TN US complete.
STANTON 2 W, KY US complete.
MANCHESTER 4 W, KY US complete.
WAYCROSS WARE CO AIRPORT, GA US complete.
OAK RIDGE ATDD, TN US complete.
CODEN, AL US complete.
MONTEAGLE, TN US complete.
PRINCETON 1 SE, KY US complete.
CROSSVILLE MEMORIAL AIRPORT, TN US complete.
ALBANY SW GEORGIA REGIONAL AIRPORT, GA US complete.
Cannot find ASHBURN 3 ENE, GA US.
PARSONS WATER PLANT, TN US complete.
WARTRACE 6 E, TN US complete.
WARNER PARK, TN US complete.
CEDARTOWN, GA US complete.
ANNISTON METROPOLITAN AIRPORT, AL US complete.
Cannot find ALMA BACON CO

CORNELIA, GA US complete.
BEAVER DAM, KY US complete.
HENDERSON 8 SSW, KY US complete.
FARMERS 2 S, KY US complete.
MC MINNVILLE, TN US complete.
GLADEVILLE, TN US complete.
HAZLEHURST, GA US complete.
DICKSON, TN US complete.
LEITCHFIELD 2 N, KY US complete.
Cannot find FOLKSTON 9 SW, GA US.
Cannot find COOKEVILLE, TN US.
BRENTWOOD, TN US complete.
DAYTON 2 SE, TN US complete.
LYONS, GA US complete.
Cannot find JESUP 8 S, GA US.
HELENA, AL US complete.
MABLETON 1 N, GA US complete.
Cannot find CENTRE, AL US.
HUNTINGDON WATER PLA, TN US complete.
ALEXANDER CITY, AL US complete.
BROWNSVILLE, TN US complete.
CORDELE, GA US complete.
SAUTEE 3 W, GA US complete.
LOUISVILLE WEATHER FORECAST OFFICE, KY US complete.
GERMANTOWN 4 SE, TN US complete.
HEIDELBERG 2 N, KY US complete.
ROCK ISLAND STATE PARK, TN US complete.
Cannot find TOWNSEND 5 S, TN US.
DECATUR 5 SE, AL US complete.
TOCCOA, GA US complete.
HANCEVILLE, AL US complete.
LAFAYETTE 2 W, AL US complete.
HEFLIN, AL US complete.
TROY 2

In [17]:
# Extract out stations that are missing county and zip code
missing_geocode = stations.loc[stations['COUNTY'].isnull()]
missing_geocode

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,STATE,COORD,COUNTY,ZIP
47,USC00440720,"BIG MEADOWS, VA US",38.52160,-78.43550,VA,"38.5216,-78.4355",,
52,USC00461900,"COOPERS ROCK STATE FOREST, WV US",39.67722,-79.77194,WV,"39.67722,-79.77194",,
55,USC00464971,"KUMBRABOW, WV US",38.63110,-80.08560,WV,"38.6311,-80.0856",,
56,USC00443310,"GATHRIGHT DAM, VA US",37.94580,-79.95420,VA,"37.9458,-79.9542",,
62,USC00468662,"SUTTON LAKE, WV US",38.66110,-80.69700,WV,"38.6611,-80.697",,
...,...,...,...,...,...,...,...,...
912,USW00003811,"JACKSON MCKELLAR SIPES REGIONAL AIRPORT, TN US",35.59300,-88.91670,TN,"35.593,-88.9167",,
917,USW00093808,"BOWLING GREEN WARREN CO AIRPORT, KY US",36.96470,-86.42380,KY,"36.9647,-86.4238",,
920,USC00016988,"ROBERTSDALE, AL US",30.56540,-87.70180,AL,"30.5654,-87.7018",,
925,USC00403938,"HARTSVILLE, TN US",36.37556,-86.18083,TN,"36.37556,-86.18083",,
