In [47]:
import pandas as pd
import numpy as np
from glob import glob

import pdb

In [30]:
# Load the RIDB recreation-area attribute table; the cells below match and filter against it.
df_rec = pd.read_csv('../Data/RIDB_attributes/RecAreas_API_v1.csv')

In [15]:
# Use the Wikipedia list of National Park names and check whether each park shows up
# among the RIDB recreation-area names.
nps_names = pd.read_csv("../Data/OtherSource/wikipedia_nps_names.csv")

# Trim non-breaking spaces (\xa0) from both ends of each name. str.strip already
# handles the left and the right side, so no separate lstrip is needed.
nps_names_lst = np.array([x.strip('\xa0') for x in nps_names['NationalParks_clean'].values])

# Join every rec-area name into one lowercase blob so each park needs only a single
# substring test; this is intended to be faster than scanning df_rec['RecAreaName']
# once per park. Try %timeit with a few alternatives to compare.
all_names = ", ".join([str(x).lower() for x in df_rec['RecAreaName'].values])

# Parks with no match in the rec-area names. The blob is lowercase, so the park name
# is lowercased for the comparison too; otherwise a capitalised park name would
# always be reported as missing.
[x for x in nps_names_lst if x.lower() not in all_names]


['haleakalā', 'hawaiʻi volcanoes']

In [None]:
# Great: since we know the names are in both sets, let's see if we can create a
# dictionary to help us get a clearer read of the data

In [60]:
def combine_lsts(nps, rec_area):
    """Map each recreation-area name to the park names it contains.

    The match is a case-insensitive substring test: a park matches an area when the
    lowercased park name occurs anywhere inside the lowercased area name.

    Parameters
    ----------
    nps : iterable of str
        Park names to look for.
    rec_area : iterable of str
        Recreation-area names to search in.

    Returns
    -------
    dict
        ``{area: 'park1, park2'}`` for every area that contains at least one park.
        Parks are listed in the order they appear in ``nps`` and spelled as given.
        Areas without any match are omitted.
    """
    # Lowercase every park once instead of once per (area, park) pair. Blank names
    # are dropped: an empty string is a substring of every area name and would
    # otherwise "match" everything. Building a list also lets ``nps`` be a one-shot
    # iterator.
    parks = [(park, park.lower()) for park in nps if park.strip()]

    ret_dict = {}
    for area in rec_area:
        area_lower = area.lower()
        matches = [park for park, park_lower in parks if park_lower in area_lower]
        if matches:
            ret_dict[area] = ', '.join(matches)

    return ret_dict

In [76]:
# Build the area -> matching-park-names lookup, then peek at ten entries from the middle of it
dct = combine_lsts(nps=nps_names_lst, rec_area=df_rec['RecAreaName'].astype(str).values)
list(dct)[50:60]

# Observation: all the national parks have "National Park" in the RecAreaName, which is going to really help us

['Gates Of The Arctic National Park & Preserve',
 'Glacier National Park',
 'Glacier Bay National Park & Preserve',
 'Great Basin National Park',
 'Grand Canyon National Park',
 'Great Sand Dunes National Park & Preserve',
 'Great Smoky Mountains National Park',
 'Guadalupe Mountains National Park',
 'Hot Springs National Park',
 'Indiana Dunes National Park']

In [133]:
# Flag the rec areas whose name contains the literal text "National Park".
# na=False makes missing names count as non-matches directly (no fillna on the mask),
# and regex=False keeps this a plain substring test.
is_national_park = df_rec['RecAreaName'].str.contains('National Park', na=False, regex=False)

count_np = int(is_national_park.sum())
print(f"We find {count_np} rec areas, even though we know there are 62")

# Take an explicit copy so that later assignments on df_rec_np are unambiguous and
# do not raise SettingWithCopyWarning.
df_rec_np = df_rec[is_national_park].copy()

# A quick look through the matching IDs
df_rec_np['RecAreaID'].values

We find 66 rec areas, even though we know there are 62


array(['13525', '13951', '140042', '2554', '2573', '2578', '2584', '2588',
       '2592', '2599', '2616', '2617', '2622', '2631', '2644', '2647',
       '2652', '2658', '2662', '2665', '2677', '2716', '2725', '2726',
       '2732', '2733', '2738', '2739', '2744', '2751', '2753', '2760',
       '2767', '2769', '2782', '2786', '2787', '2795', '2799', '2803',
       '2818', '2824', '2835', '2845', '2847', '2856', '2881', '2893',
       '2907', '2917', '2931', '2933', '2949', '2967', '2970', '2979',
       '2980', '2986', '2988', '2991', '2994', '3134', '5190', '74275',
       '74292', '74998'], dtype=object)

In [136]:
# Looking at the list, the author identified a few entries as duplicates that need to be removed.
# Drop them by RecAreaID rather than by row position: positional edits hit different
# rows (or raise IndexError) when this cell is run a second time, because the frame
# has already shrunk. These IDs are the entries at positions 61, 64 and 65 of the
# RecAreaID array displayed by the previous cell.
duplicate_rec_area_ids = ['3134', '74292', '74998']

# Now filter out the parks we don't want (astype(str) keeps the comparison valid
# whether the IDs were read as strings or as integers).
df_rec_np = df_rec_np[~df_rec_np['RecAreaID'].astype(str).isin(duplicate_rec_area_ids)]

In [137]:
# All right, now let's see if we can figure out how to match this to the reservations data.
# It looks like the facility ID might do the trick - and there is a mapping table to make this work.

In [148]:
# Mapping table between rec areas and facilities
df_fac_rec = pd.read_csv('../Data/RIDB_attributes/RecAreaFacilities_API_v1.csv')

print(df_fac_rec[['RecAreaID', "FacilityID"]].dtypes)

# The rec-area frame holds its IDs as strings, so cast both ID columns here to make the join keys comparable
df_fac_rec = df_fac_rec.astype({'RecAreaID': str, 'FacilityID': str})

# Left join: every selected park is kept, whether or not it has facilities
rec_facilities = df_rec_np.merge(df_fac_rec, how='left', on='RecAreaID')

RecAreaID     int64
FacilityID    int64
dtype: object


In [141]:
# Parks that picked up no facility in the left join have a null FacilityID
rec_facilities.loc[rec_facilities['FacilityID'].isna()]


# Checking the API - they really don't have good data here for these three rec areas. That is sad because I really enjoyed camping at the Channel Islands.
# We are going to have to exclude these national parks for now.


Unnamed: 0,RecAreaID,OrgRecAreaID,ParentOrgID,RecAreaName,RecAreaDescription,RecAreaUseFeeDescription,RecAreaDirections,RecAreaPhone,RecAreaEmail,RecAreaReservationURL,RecAreaMapURL,RecAreaLongitude,RecAreaLatitude,StayLimit,Keywords,Reservable,Enabled,LastUpdatedDate,FacilityID
15,140042,,128,Channel Islands National Park,Channel Islands National Park,,,,,,,0.0,0.0,,,True,True,2020-03-19,
499,5190,NPNH,128,National Parks of New York Harbor,These 11 sites preserve more than 400 years of...,,See the websites for each of the ten national ...,(212) 668-5180,Minerva_Anderson@nps.gov,,,-74.045105,40.663192,,"Arts and Culture,Biking,Boating,Camping,Fishin...",False,True,2020-03-19,
500,74275,,128,Catoctin National Park,Catoctin National Park,,,,,,,0.0,0.0,,,True,True,2020-03-19,


In [176]:
# Distinct (RecAreaID, FacilityID) pairs, skipping parks that have no facility
rec_fac_df = rec_facilities[['RecAreaID', 'FacilityID']].dropna().drop_duplicates()

# Same pairs as a plain list of tuples; show the first two as a sanity check
rec_fac_lst = list(zip(rec_fac_df['RecAreaID'], rec_fac_df['FacilityID']))
rec_fac_lst[:2]

[('13525', '247661'), ('13525', '247663')]

In [151]:
# what is the size of facilites we are thinknig about?
facilites_scope = rec_facilities['FacilityID'].dropna().unique().tolist()
print("We are going to include {0} facilities in our analysis".format(len(facilites_scope)))

# I am going to pass this list to my other notebooks to temporary look at reservations data
%store facilites_scope

We are going to include 498 facilities in our analysis
Stored 'facilites_scope' (list)


In [155]:
# List the available RIDB attribute files. glob returns paths in arbitrary order,
# so sort them to get the same listing on every run and every machine.
sorted(glob('../Data/RIDB_attributes/*'))

['../Data/RIDB_attributes/EntityActivities_API_v1.csv',
 '../Data/RIDB_attributes/PermitEntranceZones_API_v1.csv',
 '../Data/RIDB_attributes/Media_API_v1.csv',
 '../Data/RIDB_attributes/FacilityAddresses_API_v1.csv',
 '../Data/RIDB_attributes/Facilities_API_v1.csv',
 '../Data/RIDB_attributes/Activities_API_v1.csv',
 '../Data/RIDB_attributes/Organizations_API_v1.csv',
 '../Data/RIDB_attributes/PermitEntrances_API_v1.csv',
 '../Data/RIDB_attributes/OrgEntities_API_v1.csv',
 '../Data/RIDB_attributes/Tours_API_v1.csv',
 '../Data/RIDB_attributes/CampsiteAttributes_API_v1.csv',
 '../Data/RIDB_attributes/Events_API_v1.csv',
 '../Data/RIDB_attributes/RecAreaFacilities_API_v1.csv',
 '../Data/RIDB_attributes/PermitEntranceAttributes_API_v1.csv',
 '../Data/RIDB_attributes/Links_API_v1.csv',
 '../Data/RIDB_attributes/TourAttributes_API_v1.csv',
 '../Data/RIDB_attributes/Campsites_API_v1.csv',
 '../Data/RIDB_attributes/RecAreaAddresses_API_v1.csv',
 '../Data/RIDB_attributes/MemberTours_API_v1.csv',

In [165]:
# Load every campsite record, then count the campsites that belong to each facility
campsites = pd.read_csv("../Data/RIDB_attributes/Campsites_API_v1.csv")
fac_camp_count = (
    campsites.groupby('FacilityID')['CampsiteID']
    .count()
    .rename('count_campsites')
    .reset_index()
)

In [166]:
# Show the per-facility campsite counts (pandas abbreviates long frames in the display)
fac_camp_count

Unnamed: 0,FacilityID,count_campsites
0,72417,12
1,72481,1
2,118290,14
3,118440,28
4,118990,1
...,...,...
3494,10040545,1
3495,10040547,17
3496,10040565,1
3497,10040567,1


In [167]:
# Show the raw campsites table read from Campsites_API_v1.csv
campsites

Unnamed: 0,CampsiteID,FacilityID,CampsiteName,CampsiteType,TypeOfUse,Loop,CampsiteAccessible,CampsiteLongitude,CampsiteLatitude,CreatedDate,LastUpdatedDate
0,1,232446,065,STANDARD NONELECTRIC,Overnight,LOOP C,False,-119.683927,37.550130,2014-05-02,2020-02-07
1,10,232446,075,STANDARD NONELECTRIC,Overnight,LOOP C,False,-119.684614,37.550142,2014-05-02,2020-02-07
2,100,232447,044,STANDARD NONELECTRIC,Overnight,UP1,False,-119.565053,37.737630,2014-05-02,2020-02-07
3,1000,232451,078,STANDARD NONELECTRIC,Overnight,Hodgdon Campground,False,-119.866554,37.799595,2014-05-02,2020-02-07
4,10000,232709,009,STANDARD ELECTRIC,Overnight,REYNOLDS CREEK,False,-97.264983,31.586164,2014-05-02,2019-08-16
...,...,...,...,...,...,...,...,...,...,...,...
99622,999990060,256367,Masse Homestead,HIKE TO,Overnight,Masse Homestead,False,0.000000,0.000000,2018-04-04,2019-07-17
99623,999990061,251865,Presque Isle,STANDARD NONELECTRIC,Overnight,Stockton,False,0.000000,0.000000,2018-04-04,2020-01-07
99624,999990062,251865,Outer Primitive Zone,Zone,Overnight,Outer,False,0.000000,0.000000,2018-04-04,2019-12-03
99625,999990063,251865,Oak Primitive Zone,Zone,Overnight,Oak,False,0.000000,0.000000,2018-04-04,2019-12-03
