In [1]:
import numpy as np
import pandas as pd
import requests
import json
import os
import math

In [2]:
# lets write a quick request to get all of the national park rec areas
endpoint = "https://ridb.recreation.gov/api/v1/recareas"
key=os.getenv("RIDBKEY")
parameters = {"offset":0 ,"apikey":key}  #
response = requests.get(url=endpoint, params=parameters)
data = json.loads(response.content)

In [3]:
# looks like the response comes back with two keys
print(data.keys(), "\n")

# Lets see what we got in this query
print(data['METADATA'])

# Just like the NPS data it looks like we get a list back for the data with a dictionary in the list
print("we get a back a {} for each rec area\n".format(type(data['RECDATA'][0])))

# for each rec area we get the info below
print("the dictionary contains the\n{}".format(data['RECDATA'][0].keys()))

math.ceil(data['METADATA']['RESULTS']['TOTAL_COUNT'] / 50)

dict_keys(['RECDATA', 'METADATA']) 

{'RESULTS': {'CURRENT_COUNT': 50, 'TOTAL_COUNT': 3751}, 'SEARCH_PARAMETERS': {'QUERY': '', 'LIMIT': 50, 'OFFSET': 0}}
we get a back a <class 'dict'> for each rec area

the dictionary contains the
dict_keys(['RecAreaID', 'OrgRecAreaID', 'ParentOrgID', 'RecAreaName', 'RecAreaDescription', 'RecAreaFeeDescription', 'RecAreaDirections', 'RecAreaPhone', 'RecAreaEmail', 'RecAreaReservationURL', 'RecAreaMapURL', 'GEOJSON', 'RecAreaLongitude', 'RecAreaLatitude', 'StayLimit', 'Keywords', 'Reservable', 'Enabled', 'LastUpdatedDate'])


76

In [4]:
# ok lets reuse the same 3 core formulas from before
def response_generator(endpoint, key, start, query=None):
    """This function will take in the parameters, key, and endpoint and return a response"""
    
    if query:
        parameters = {"query":"National_Park", "offset":start, "apikey":key}
    else: 
        parameters = {"offset":start, "apikey":key}
    

    response = requests.get(url=endpoint, params=parameters)
    
    return response

In [5]:
def parse_dict(tgt_dict, scope: list):
    
    """This is a quick group of formatting functions to take in a specific dictionary and give back a data frame"""
    
    count_parks = len(tgt_dict['RECDATA'])
    
    parks_loop = []
    
    for i in range(count_parks):

        # for each campsite we will add what we care about to a list
        specific_park = []
        
        for item in scope:
            
            # if the item is in the dictionary, add it to the list, if not add none
            if item in tgt_dict['RECDATA'][i].keys(): 
                specific_park.append(tgt_dict['RECDATA'][i][item])
            
            else:
                 specific_park.append(None)
        
        parks_loop.append(specific_park)

        
    return pd.DataFrame(parks_loop, columns=scope)



In [6]:
def request_loop(endpoint, key, scope,  size=10, query=False):
    
    df_lst_parks = []
    
    for i in range(size):

        #print("Getting {0} loop of {1}".format(i+1, size))
        
        response = response_generator(endpoint=endpoint, key=key, start=(i*50))
        
        if response.status_code == 200:
        
            # if we have a valid response we will convert it to a df
            response_dct = json.loads(response.content)
            
            df = parse_dict(response_dct, scope)
            
            # lets add it to a list of data frames
            df_lst_parks.append(df)
        
            # clear the dictionary to release memory and help with potential collision. 
            response_dct.clear()
        
        
        
    # last we will concat all the data frames and return them
    return pd.concat(df_lst_parks).reset_index().drop('index', axis=1)

In [7]:
endpoint = "https://ridb.recreation.gov/api/v1/recareas"
key = os.getenv("RIDBKEY")
scope = ['RecAreaID', 'OrgRecAreaID', 'ParentOrgID', 'RecAreaName', 'RecAreaDescription', 'RecAreaDirections', 
         'RecAreaPhone', 'RecAreaEmail', 'RecAreaReservationURL', 'RecAreaMapURL', 'GEOJSON', 
            'RecAreaLongitude', 'RecAreaLatitude', 'StayLimit', 'Keywords', 'Reservable', 'Enabled', 
            'LastUpdatedDate']




In [8]:
df_rec = request_loop(endpoint=endpoint, key=key, scope=scope, size=76, query=False)

In [9]:
# from our NPS learnings we know that these parks contain the string "National Park"
df_rec_nps = df_rec[df_rec.RecAreaName.str.lower().str.contains('national park')]
df_rec_nps.shape[0]

df_rec_nps.loc[:,'RecAreaID'] = df_rec_nps['RecAreaID'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [10]:
#Damm looks like the reservation data doesnt have the rec-area, but it does have the facility. 
# Looks like we will need ot make another api call to get all of the facilites

facility_lst = []

for rec_id in df_rec_nps.RecAreaID.values:

    endpoint = "https://ridb.recreation.gov/api/v1/recareas/{0}/facilities?limit=50&offset=0&full=true".format(rec_id)
    key=os.getenv("RIDBKEY")
    parameters = {"offset":0 ,"apikey":key}  #
    response = requests.get(url=endpoint, params=parameters)
    
    data = json.loads(response.content)

    dct = json.loads(response.content)
    
    try:
        dct['RECDATA'][0].keys()

        scope = ['FacilityID', 'LegacyFacilityID', 'OrgFacilityID', 'ParentRecAreaID','FacilityName', 
                 'FacilityTypeDescription',  'GEOJSON', 'CAMPSITE', 'RECAREA', 'Reservable', 'LastUpdatedDate', 'MEDIA']

        df = parse_dict(tgt_dict=dct, scope=scope)
        
        facility_lst.append(df)
    except:
        pass


In [11]:
# lets concat the list into a data frame
df_facilities = pd.concat(facility_lst)

# then convert the data
df_facilities['RecAreaID'] = [int(x[0]['RecAreaID']) for x in df_facilities.RECAREA.values]
df_facilities['RecAreaName'] = [x[0]['RecAreaName'] for x in df_facilities.RECAREA.values]

In [12]:
df_facilities.head()

Unnamed: 0,FacilityID,LegacyFacilityID,OrgFacilityID,ParentRecAreaID,FacilityName,FacilityTypeDescription,GEOJSON,CAMPSITE,RECAREA,Reservable,LastUpdatedDate,MEDIA,RecAreaID,RecAreaName
0,258775,,119,2716,Anaktuvuk Pass Ranger Station,Facility,"{'TYPE': 'Point', 'COORDINATES': [0, 0]}",[],"[{'RecAreaID': '2716', 'RecAreaName': 'Gates O...",False,2020-03-20,[],2716,Gates Of The Arctic National Park & Preserve
1,258777,,146,2716,Fairbanks Alaska Public Lands Information Center,Facility,"{'TYPE': 'Point', 'COORDINATES': [0, 0]}",[],"[{'RecAreaID': '2716', 'RecAreaName': 'Gates O...",False,2020-03-20,[],2716,Gates Of The Arctic National Park & Preserve
2,258774,,110,2716,Bettles Ranger Station and Visitor Center,Facility,"{'TYPE': 'Point', 'COORDINATES': [0, 0]}",[],"[{'RecAreaID': '2716', 'RecAreaName': 'Gates O...",False,2020-03-20,[],2716,Gates Of The Arctic National Park & Preserve
3,258776,,122,2716,Arctic Interagency Visitor Center,Facility,"{'TYPE': 'Point', 'COORDINATES': [0, 0]}",[],"[{'RecAreaID': '2716', 'RecAreaName': 'Gates O...",False,2020-03-20,[],2716,Gates Of The Arctic National Park & Preserve
0,232497,70979.0,AN370979,2631,SANTA ROSA ISLAND,Campground,"{'TYPE': 'Point', 'COORDINATES': [-120.0481472...",[],"[{'RecAreaID': '2631', 'RecAreaName': 'Channel...",True,2020-06-11,[{'EntityMediaID': 'c39e4668-4fbc-499e-bba9-4b...,2631,Channel Islands National Park


In [13]:
# lets grab a list of all the FacilitieIDs we care about
faciltieIds = df_facilities['FacilityID'].unique()

In [15]:
# ok time to look at the reservation data

# I put all of the reservations csvs from ridb online into a single directory Data/Reservations
df = pd.read_csv('../Data/Reservations/2017.csv', nrows=10) 

#I bet there are some dates in this frame that we could try to see if pandas with parese for us
print("date cols {0}".format([(i, x) for i,x in enumerate(list(df.columns)) if "Date" in x]))


# looking at these columns we will need to set up a few quick data types, mainly columns 34 and after

dct = {}

for item in [x for x in df.iloc[0:20,34:].columns]:
    dct[item] = str

# the easiest way I have found to combine dictionaries with unique keys is just to unpack them with **
dtype_dct = {**{"UseType":str, "FacilityZIP":str, "TotalPaid":str, 
                "StartDate":str, "EndDate":str, "OrderDate":str }, **dct}


date cols [(30, 'StartDate'), (31, 'EndDate'), (32, 'OrderDate')]


In [22]:
# lets check to make sure that all of the files have same columns
from glob import glob

# this generates a list of all the reservations csvs 
files = glob("../Data/Reservations/*")

cols_match = {}

for file in files:

    df_col = pd.read_csv(files[0], nrows=10, usecols=range(56))
    df_col2 = pd.read_csv(file, nrows=10, usecols=range(56))
    cols_match[file] = (all(df_col.columns.values == df_col2.columns.values), df_col2.columns.values)
    
print(cols_match[list(cols_match.keys())[0]])

(True, array(['HistoricalReservationID', 'OrderNumber', 'Agency', 'OrgID',
       'CodeHierarchy', 'RegionCode', 'RegionDescription',
       'ParentLocationID', 'ParentLocation', 'LegacyFacilityID', 'Park',
       'SiteType', 'UseType', 'ProductID', 'EntityType', 'EntityID',
       'FacilityID', 'FacilityZIP', 'FacilityState', 'FacilityLongitude',
       'FacilityLatitude', 'CustomerZIP', 'CustomerState',
       'CustomerCountry', 'Tax', 'UseFee', 'TranFee', 'AttrFee',
       'TotalBeforeTax', 'TotalPaid', 'StartDate', 'EndDate', 'OrderDate',
       'NumberOfPeople', 'Tent', 'Popup', 'Trailer', 'RVMotorhome',
       'Boat', 'HorseTrailer', 'Car', 'FifthWheel', 'Van', 'CanoeKayak',
       'BoatTrailer', 'Motorcycle', 'Truck', 'Bus', 'Bicycle',
       'Snowmobile', 'OffRoadlAllTerrainVehicle', 'PowerBoat',
       'PickupCamper', 'LargeTentOver9x12', 'SmallTent', 'Marinaboat'],
      dtype=object))


In [23]:
reservation_frames = []

for file in files:

    print(file)
    
    # So normally we would parse the data in the read_csv, but because there is many more lines 
    # than we need in the file lets do it after we read the data
    df = pd.read_csv(file, usecols=range(35), dtype=dtype_dct)
    
    # there is some bad data in the totalpaid column lets convert the data here and where errors insert nan
    facilites_reservations = df[df['FacilityID'].isin(faciltieIds)]
    overnight = facilites_reservations[facilites_reservations['UseType'].str.lower().str.contains("overnight").fillna(False)]
    
    # lets convert a few columns to float
    for col in ['UseFee', 'TranFee', 'AttrFee']:
        overnight.loc[:,col] = overnight[col].astype(dtype=float, errors='ignore')

    # lets convert some things to integers
    for col in ['NumberOfPeople', 'Tent', 'FacilityID']:    
        overnight.loc[:,col] = overnight[col].astype(dtype=int, errors='ignore')

    # lets convert the dates
    for col in ['StartDate', 'EndDate', 'OrderDate']:
        overnight.loc[:,col] = pd.to_datetime(overnight[col], errors='coerce', format="%Y-%m-%d")

    
    reservation_frames.append(overnight)
    
    # lets clear some extra data from memory
    df = pd.DataFrame()
    
res_nps = pd.concat(reservation_frames)

../Data/Reservations/2008.csv
../Data/Reservations/2009.csv
../Data/Reservations/2018.csv
../Data/Reservations/2015.csv
../Data/Reservations/2014.csv
../Data/Reservations/2016.csv
../Data/Reservations/2017.csv
../Data/Reservations/2013.csv
../Data/Reservations/2007.csv
../Data/Reservations/2006.csv
../Data/Reservations/2012.csv
../Data/Reservations/2010.csv
../Data/Reservations/2011.csv


In [24]:
# so over the last 13 years we have about million reservations. That seems very light for 62 parks
res_nps = res_nps.drop_duplicates(subset=['OrderNumber'], keep='first')
res_nps.shape[0]

4886956

In [25]:
# first lets rename/ drop some columns so we dont have repeats

df_facilities.rename(columns={"GEOJSON": "facilityGEOJSON", "Reservable": "ReservableFacility",
                             "LastUpdatedDate":"LastUpdatedDateFacility"}, inplace=True)

df_facilities2 = df_facilities.drop("RecAreaName", axis=1)

# and lets join together the rec areas with their facility information
df_nps_facilites = pd.merge(left = df_rec_nps, right = df_facilities2, how='left', on = ['RecAreaID'])

In [26]:
# lets convert facilityID to an interger so we can join our data together
df_nps_facilites['FacilityID'] = df_nps_facilites['FacilityID'].fillna(0).astype(int)

In [31]:
# Great lets join in the resevations and see what we have going on
df_nps_reservations = pd.merge(left=df_nps_facilites, right=res_nps, how='inner', on=['FacilityID'])

# lets also add the number of nights for each reservation
days_diff = df_nps_reservations.EndDate - df_nps_reservations.StartDate
df_nps_reservations['Nights'] = days_diff.dt.days


df_nps_reservations['StartYear'] = df_nps_reservations['StartDate'].dt.year
df_nps_reservations['StartMonth'] = df_nps_reservations['StartDate'].dt.month
df_nps_reservations['StartMonthYear'] = df_nps_reservations['StartYear'].astype(str) + "-" + \
                            df_nps_reservations['StartMonth'].astype(str).apply(lambda x: x.zfill(2))


In [32]:
df_nps_reservations.columns.values

array(['RecAreaID', 'OrgRecAreaID', 'ParentOrgID', 'RecAreaName',
       'RecAreaDescription', 'RecAreaDirections', 'RecAreaPhone',
       'RecAreaEmail', 'RecAreaReservationURL', 'RecAreaMapURL',
       'GEOJSON', 'RecAreaLongitude', 'RecAreaLatitude', 'StayLimit',
       'Keywords', 'Reservable', 'Enabled', 'LastUpdatedDate',
       'FacilityID', 'LegacyFacilityID_x', 'OrgFacilityID',
       'ParentRecAreaID', 'FacilityName', 'FacilityTypeDescription',
       'facilityGEOJSON', 'CAMPSITE', 'RECAREA', 'ReservableFacility',
       'LastUpdatedDateFacility', 'MEDIA', 'HistoricalReservationID',
       'OrderNumber', 'Agency', 'OrgID', 'CodeHierarchy', 'RegionCode',
       'RegionDescription', 'ParentLocationID', 'ParentLocation',
       'LegacyFacilityID_y', 'Park', 'SiteType', 'UseType', 'ProductID',
       'EntityType', 'EntityID', 'FacilityZIP', 'FacilityState',
       'FacilityLongitude', 'FacilityLatitude', 'CustomerZIP',
       'CustomerState', 'CustomerCountry', 'Tax', 'UseFee', '

In [96]:
df_nps_totals = df_nps_reservations.groupby(by='RecAreaName').agg({'Nights':'sum', 'OrderNumber':'count'}).reset_index()
df_nps_totals.rename(columns={'OrderNumber':'OrdersCount'}, inplace=True)
df_nps_totals = df_nps_totals.sort_values(by=['Nights'], ascending=[False])
df_nps_totals.head(10)

Unnamed: 0,RecAreaName,Nights,OrdersCount
34,Yosemite National Park,2172771.0,938880
12,Grand Canyon National Park,1150558.0,648669
16,Great Smoky Mountains National Park,992735.0,371242
35,Zion National Park,631801.0,306713
0,Acadia National Park,609079.0,199244
28,Rocky Mountain National Park,579603.0,272288
29,Sequoia & Kings Canyon National Parks,537067.0,230746
17,Joshua Tree National Park,360109.0,175561
11,Glacier National Park,351207.0,161441
30,Shenandoah National Park,328279.0,154146


In [43]:
df_nps_totals.tail(10)

Unnamed: 0,RecAreaName,Nights,OrdersCount
5,Canyonlands National Park,7721.0,3624
9,Congaree National Park,6768.0,4826
14,Great Basin National Park,4825.0,16560
19,Kenai Fjords National Park,591.0,299
20,Lake Clark National Park & Preserve,211.0,90
23,Mesa Verde National Park,0.0,5060
13,Grand Teton National Park,0.0,23709
32,Voyageurs National Park,0.0,19552
33,White Sands National Park,0.0,526
7,Carlsbad Caverns National Park,0.0,94838


In [44]:
df_nps_totals.shape[0]

36

In [101]:
# lets chart some of this data to see what is going on for a top rec area and bottom rec area
import plotly.express as px

def plot_park(park='Yosemite National Park'):

    park_res = df_nps_reservations.loc[df_nps_reservations.RecAreaName == park, :]

    # yearly Totals
    park_res_year = park_res.groupby(by='StartYear').agg({'Nights':'sum', 'OrderNumber':'count'}).reset_index()

    # year month Totals
    park_res_year_month = park_res.groupby(by=['StartYear','StartMonthYear', 'StartMonth']).agg({'Nights':'sum', 'OrderNumber':'count'}).reset_index()

    # Max Month 
    park_month = park_res.groupby(by=['StartMonth']).agg({'Nights':'sum', 'OrderNumber':'count'}).reset_index()
    max_month = park_month.loc[park_month.Nights == park_month.Nights.max(),:]['StartMonth'].values[0]

    # year month only max month
    max_month_res = park_res_year_month.loc[park_res_year_month['StartMonth'] == max_month]

    # Lets see some quick trends
    fig = px.bar(park_res_year, x='StartYear', y='Nights', 
                 title=f"{park} - Year Nights Outside", height = 300, width=800)
    fig.show()

    fig2 = px.bar(park_res_year_month, x='StartMonthYear', y='Nights', 
                  title=f"{park} - Seasonality Nights Outside", height = 400, width=800)
    fig2.show()

    fig4 = px.histogram(park_res_year_month, x='Nights', title=f"{park} - Hist Months Nights outside", 
                        height=400, width=900, marginal='box', ) #, histnorm='percent'
    fig4.show()

    
    fig3 = px.bar(max_month_res, x='StartMonthYear', y='Nights', 
                  title=f"{park} - Month {max_month} Nights Outside", height = 400, width=800)
    fig3.show()
    


In [102]:
plot_park()

In [103]:
plot_park('Canyonlands National Park')

In [108]:
top10_parks = df_nps_totals.head(10)['RecAreaName'].values
top10_parks_res = df_nps_reservations.loc[df_nps_reservations.RecAreaName.isin(top10_parks),:]

In [113]:
toppark_month = top10_parks_res.groupby(by=['StartYear','RecAreaName']).agg({'Nights':'sum', 'OrderNumber':'count'}).reset_index()

In [133]:
import plotly.graph_objects as go
StartYear = toppark_month.StartYear.unique()

data = []

for park in top10_parks:

    df = toppark_month[toppark_month['RecAreaName'] == park]
    y = [round(x/1000,1) for x in df['Nights'].values]
    data.append(go.Bar(name=park, x=df['StartYear'].values, y=y, text=y, 
                                   textposition='auto'))
    
fig6 = go.Figure(data)
fig6.update_layout(barmode='stack',  title_text='Annual Nights top 10 parks', height=900)
fig6.show()    
