# Determine the Representative Weather Station for Each Zipcode
Data Source: http://federalgovernmentzipcodes.us/download.html

References
* https://www.kite.com/python/answers/how-to-filter-a-pandas-dataframe-with-a-list-by-%60in%60-or-%60not-in%60-in-python
* https://stackoverflow.com/questions/41336756/find-the-closest-latitude-and-longitude

In [10]:
# Dependencies
import pandas as pd
from math import cos, asin, sqrt

In [2]:
# Read in datasets: the zipcode table and the weather-station metadata,
# both loaded from CSV files addressed by relative path.
# NOTE(review): Zipcode is read with pandas' default dtype inference and the
# preview shows values such as 705, so leading zeros are not preserved here --
# confirm whether that matters for consumers of this data.
zipcodes = pd.read_csv('all-us-zipcodes.csv')
stations = pd.read_csv('stations-metadata.csv')
zipcodes

Unnamed: 0,Zipcode,ZipCodeType,City,State,LocationType,Lat,Long,Location,Decommisioned,TaxReturnsFiled,EstimatedPopulation,TotalWages
0,705,STANDARD,AIBONITO,PR,PRIMARY,18.14,-66.26,NA-US-PR-AIBONITO,False,,,
1,610,STANDARD,ANASCO,PR,PRIMARY,18.28,-67.14,NA-US-PR-ANASCO,False,,,
2,611,PO BOX,ANGELES,PR,PRIMARY,18.28,-66.79,NA-US-PR-ANGELES,False,,,
3,612,STANDARD,ARECIBO,PR,PRIMARY,18.45,-66.73,NA-US-PR-ARECIBO,False,,,
4,601,STANDARD,ADJUNTAS,PR,PRIMARY,18.16,-66.72,NA-US-PR-ADJUNTAS,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
42517,34009,MILITARY,APO,AA,PRIMARY,,,SA-GY-NEW HORIZON-GUYANA,True,,,
42518,34010,MILITARY,APO,AA,PRIMARY,,,CA-HN-NEW HORIZON LA CEIBA - COMAYAGUA,True,,,
42519,34001,MILITARY,APO,AA,PRIMARY,,,CA-NI-JTF JUNTOS PODEMOS BASE CAMP,True,,,
42520,34071,MILITARY,APO,AA,PRIMARY,,,SA-PY-JTF NEW HORIZON-CONCEPCION,True,,,


In [3]:
# Inspect the distinct values of ZipCodeType before filtering on it
zipcodes['ZipCodeType'].unique()

array(['STANDARD', 'PO BOX', 'UNIQUE', 'MILITARY'], dtype=object)

In [4]:
# Keep only the rows whose ZipCodeType is exactly 'STANDARD'
standard_mask = zipcodes['ZipCodeType'].eq('STANDARD')
zip_clean_1 = zipcodes[standard_mask]
zip_clean_1

Unnamed: 0,Zipcode,ZipCodeType,City,State,LocationType,Lat,Long,Location,Decommisioned,TaxReturnsFiled,EstimatedPopulation,TotalWages
0,705,STANDARD,AIBONITO,PR,PRIMARY,18.14,-66.26,NA-US-PR-AIBONITO,False,,,
1,610,STANDARD,ANASCO,PR,PRIMARY,18.28,-67.14,NA-US-PR-ANASCO,False,,,
3,612,STANDARD,ARECIBO,PR,PRIMARY,18.45,-66.73,NA-US-PR-ARECIBO,False,,,
4,601,STANDARD,ADJUNTAS,PR,PRIMARY,18.16,-66.72,NA-US-PR-ADJUNTAS,False,,,
6,602,STANDARD,AGUADA,PR,PRIMARY,18.38,-67.18,NA-US-PR-AGUADA,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
42502,98442,STANDARD,TACOMA,WA,PRIMARY,47.14,-122.43,NA-US-WA-TACOMA,True,,,
42507,98492,STANDARD,TACOMA,WA,PRIMARY,47.12,-122.55,NA-US-WA-TACOMA,True,,,
42509,98929,STANDARD,NACHES,WA,PRIMARY,46.74,-121.14,NA-US-WA-NACHES,True,,,
42510,99165,STANDARD,PULLMAN,WA,PRIMARY,46.73,-117.17,NA-US-WA-PULLMAN,True,,,


In [5]:
# Determine list of states of interest: the distinct STATE values that
# appear in the station metadata
states = stations['STATE'].unique()
states

array(['WV', 'VA', 'SC', 'NC', 'TN', 'GA', 'AL', 'KY'], dtype=object)

In [6]:
# Restrict the STANDARD zipcodes to the states listed in `states`
in_state_of_interest = zip_clean_1['State'].isin(states)
zip_clean_2 = zip_clean_1.loc[in_state_of_interest]
zip_clean_2.head()

Unnamed: 0,Zipcode,ZipCodeType,City,State,LocationType,Lat,Long,Location,Decommisioned,TaxReturnsFiled,EstimatedPopulation,TotalWages
6440,26031,STANDARD,BENWOOD,WV,PRIMARY,40.01,-80.73,NA-US-WV-BENWOOD,False,944.0,1545.0,23193242.0
6441,26032,STANDARD,BETHANY,WV,PRIMARY,40.2,-80.56,NA-US-WV-BETHANY,False,279.0,485.0,8190519.0
6442,26033,STANDARD,CAMERON,WV,PRIMARY,39.82,-80.57,NA-US-WV-CAMERON,False,1314.0,2388.0,40507175.0
6443,26034,STANDARD,CHESTER,WV,PRIMARY,40.61,-80.56,NA-US-WV-CHESTER,False,2217.0,3931.0,68828644.0
6444,26035,STANDARD,COLLIERS,WV,PRIMARY,40.34,-80.55,NA-US-WV-COLLIERS,False,1197.0,2181.0,37341493.0


In [7]:
# Keep just the zipcode, its state, and its coordinates
columns_of_interest = ['Zipcode', 'State', 'Lat', 'Long']
zip_clean_3 = zip_clean_2[columns_of_interest]
zip_clean_3.head()

Unnamed: 0,Zipcode,State,Lat,Long
6440,26031,WV,40.01,-80.73
6441,26032,WV,40.2,-80.56
6442,26033,WV,39.82,-80.57
6443,26034,WV,40.61,-80.56
6444,26035,WV,40.34,-80.55


In [8]:
# Determines distance between two sets of coordinates
def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometres.

    Uses the haversine formula written in terms of cosines.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance as a float. If any input is NaN the result is NaN.
    """
    p = 0.017453292519943295  # pi / 180: converts degrees to radians
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
    # Floating-point rounding can leave `a` a few ulps above 1 for
    # (near-)antipodal points, which would make asin() raise ValueError.
    # A plain comparison is used (not min()) so that NaN still propagates.
    if a > 1.0:
        a = 1.0
    # 12742 km is twice the 6371 km mean Earth radius.
    return 12742 * asin(sqrt(a))

# Determines closest weather station based on given coordinates
def closest(data, v):
    """Return the NAME of the station in ``data`` nearest to the point ``v``.

    Args:
        data: DataFrame with 'LATITUDE', 'LONGITUDE' and 'NAME' columns.
        v: Mapping with 'lat' and 'lon' keys giving the query point in degrees.

    Returns:
        The 'NAME' value of the row with the smallest ``distance`` to ``v``.
        When several rows are equally close, the first one in ``data`` wins.

    Raises:
        ValueError: If ``data`` has no row with both coordinates present.
    """
    # Rows without coordinates can never be the nearest station, and their NaN
    # distances cannot be ordered by min(), so leave them out up front.
    candidates = data.dropna(subset=['LATITUDE', 'LONGITUDE'])
    lat, lon = v['lat'], v['lon']
    # NOTE(review): if `v` itself holds NaN, every distance is NaN and the
    # first candidate is returned -- callers may want to drop such rows first.
    # NAME travels with the coordinates so the answer comes from `data` itself
    # rather than from a float-equality lookup in the global `stations`.
    nearest = min(
        zip(candidates['LATITUDE'], candidates['LONGITUDE'], candidates['NAME']),
        key=lambda station: distance(lat, lon, station[0], station[1]),
    )
    return nearest[2]

In [11]:
# Work on a copy so zip_clean_3 keeps only its original columns
zip_clean_4 = zip_clean_3.copy()

# Build the whole column in a single assignment instead of writing it
# cell-by-cell with .loc while iterating over the same frame with iterrows().
# Values are produced in row order, so they line up with the frame positionally.
zip_clean_4['Closest Weather Station'] = [
    closest(stations, {'lat': lat, 'lon': lon})
    for lat, lon in zip(zip_clean_4['Lat'], zip_clean_4['Long'])
]

zip_clean_4

Unnamed: 0,Zipcode,State,Lat,Long,Closest Weather Station
6440,26031,WV,40.01,-80.73,"WHEELING, WV US"
6441,26032,WV,40.20,-80.56,"WHEELING OHIO CO AIRPORT, WV US"
6442,26033,WV,39.82,-80.57,"MOUNDSVILLE, WV US"
6443,26034,WV,40.61,-80.56,"WHEELING OHIO CO AIRPORT, WV US"
6444,26035,WV,40.34,-80.55,"WHEELING OHIO CO AIRPORT, WV US"
...,...,...,...,...,...
42375,36501,AL,31.46,-87.73,"JACKSON, AL US"
42380,37245,TN,36.16,-86.78,"NASHVILLE INTERNATIONAL AIRPORT, TN US"
42381,37247,TN,36.16,-86.78,"NASHVILLE INTERNATIONAL AIRPORT, TN US"
42382,37248,TN,36.16,-86.78,"NASHVILLE INTERNATIONAL AIRPORT, TN US"


In [12]:
# Save the zipcode-to-station table as CSV, omitting the DataFrame index
zip_clean_4.to_csv('zipcodes_master.csv', index=False)