# U.S. National Park Service Analysis of Gigi Jones
## Project 1 partner: Pratixa Shah 
This notebook shows Gigi's share of the analysis used for the project 1. Another notebook, which has the other half of the analysis, represents Pratixa's work. We found that it was not necessary to combine our notebooks into one since we each completed separate, independent analyses. However, we were in constant communication by  discussing our progress, coding, analytical approach, and the story direction to ensure we completed this project on time. Given that we had less than a week and half to complete this project, we believe we met our goal of applying the coding/analytical lessons we have learned so far in order to tell an interesting, compelling story with the available data on the U.S. National Parks.


In [1]:
# Getting dependencies.
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches

# Set global Jupyter Notebook preferences and formatting
plt.ioff()
pd.options.display.float_format = '{:20,.0f}'.format

# Read files into Jupyter Notebook
file_path="Data_Resources/park_type_and_region.csv"
parkcode_df = pd.read_csv(file_path)
#parkcode_df.head()

file_path="Data_Resources/monthly_park_visits.csv"
monthly_visits_df = pd.read_csv(file_path)
monthly_visits_df.isnull().sum()

parkCode      0
Park Name     0
JAN_2019     10
FEB_2019      6
MAR_2019      5
APR_2019      5
MAY_2019      2
JUN_2019      0
JUL_2019      0
AUG_2019      0
SEP_2019      0
OCT_2019      0
NOV_2019      0
DEC_2019      0
JAN_2020     10
FEB_2020      6
MAR_2020      5
APR_2020      5
MAY_2020      2
JUN_2020      0
JUL_2020      0
AUG_2020      0
SEP_2020      0
OCT_2020      0
NOV_2020      0
DEC_2020      0
dtype: int64

In [2]:
# Start cleaning data.
monthly_visits_df = monthly_visits_df.fillna(0)
monthly_visits_df.isnull().sum()

parkCode     0
Park Name    0
JAN_2019     0
FEB_2019     0
MAR_2019     0
APR_2019     0
MAY_2019     0
JUN_2019     0
JUL_2019     0
AUG_2019     0
SEP_2019     0
OCT_2019     0
NOV_2019     0
DEC_2019     0
JAN_2020     0
FEB_2020     0
MAR_2020     0
APR_2020     0
MAY_2020     0
JUN_2020     0
JUL_2020     0
AUG_2020     0
SEP_2020     0
OCT_2020     0
NOV_2020     0
DEC_2020     0
dtype: int64

In [3]:
# Take a look at the dataframe.
monthly_visits_df.columns

Index(['parkCode', 'Park Name', 'JAN_2019', 'FEB_2019', 'MAR_2019', 'APR_2019',
       'MAY_2019', 'JUN_2019', 'JUL_2019', 'AUG_2019', 'SEP_2019', 'OCT_2019',
       'NOV_2019', 'DEC_2019', 'JAN_2020', 'FEB_2020', 'MAR_2020', 'APR_2020',
       'MAY_2020', 'JUN_2020', 'JUL_2020', 'AUG_2020', 'SEP_2020', 'OCT_2020',
       'NOV_2020', 'DEC_2020'],
      dtype='object')

In [4]:
# Merging data files of visits by month and park characteristics.
month_df = pd.merge(monthly_visits_df, parkcode_df, on="parkCode", how="outer")
month_df

Unnamed: 0,parkCode,Park Name,JAN_2019,FEB_2019,MAR_2019,APR_2019,MAY_2019,JUN_2019,JUL_2019,AUG_2019,...,MAY_2020,JUN_2020,JUL_2020,AUG_2020,SEP_2020,OCT_2020,NOV_2020,DEC_2020,parkType,Region
0,ABLI,Abraham Lincoln Birthplace NHP,1939,6534,14566,16496,24646,26854,37334,34816,...,21102,29111,37249,31290,29134,18615,12147,7472,National Historical Park,Southeast
1,ACAD,Acadia NP,8532,9154,20161,68046,226156,490720,756439,759735,...,103120,201156,493971,681746,500320,480859,76251,21260,National Park,Northeast
2,ADAM,Adams NHP,27,179,253,537,13187,23963,75333,26011,...,743,759,460,520,575,500,491,544,National Historical Park,Northeast
3,AFBG,African Burial Ground NM,96,1571,1395,5022,5482,4621,5520,4332,...,0,0,0,0,0,446,446,116,National Monument,Northeast
4,AGFO,Agate Fossil Beds NM,0,94,401,391,2188,3276,4222,2775,...,310,834,2377,1686,1558,978,252,121,National Monument,Midwest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,YELL,Yellowstone NP,33896,31650,28695,48150,434385,781853,936062,820006,...,46219,573205,955645,881829,837114,359889,24132,28643,National Park,Intermountain
375,YOSE,Yosemite NP,116746,111665,173610,297207,393004,496625,717462,703153,...,0,236534,501068,375226,258561,249827,136799,117454,National Park,Pacific West
376,YUCH,Yukon-Charley Rivers NPRES,6,72,46,37,14,182,354,246,...,25,72,149,164,88,89,8,20,National Preserve,Alaska
377,ZION,Zion NP,98828,102269,296316,372349,529553,594896,629802,535322,...,178194,377425,449518,450183,520987,559342,319853,227244,National Park,Intermountain


In [5]:
# Removed that last park in the list with NaN.
month_df = month_df.iloc[:-1]
month_df

Unnamed: 0,parkCode,Park Name,JAN_2019,FEB_2019,MAR_2019,APR_2019,MAY_2019,JUN_2019,JUL_2019,AUG_2019,...,MAY_2020,JUN_2020,JUL_2020,AUG_2020,SEP_2020,OCT_2020,NOV_2020,DEC_2020,parkType,Region
0,ABLI,Abraham Lincoln Birthplace NHP,1939,6534,14566,16496,24646,26854,37334,34816,...,21102,29111,37249,31290,29134,18615,12147,7472,National Historical Park,Southeast
1,ACAD,Acadia NP,8532,9154,20161,68046,226156,490720,756439,759735,...,103120,201156,493971,681746,500320,480859,76251,21260,National Park,Northeast
2,ADAM,Adams NHP,27,179,253,537,13187,23963,75333,26011,...,743,759,460,520,575,500,491,544,National Historical Park,Northeast
3,AFBG,African Burial Ground NM,96,1571,1395,5022,5482,4621,5520,4332,...,0,0,0,0,0,446,446,116,National Monument,Northeast
4,AGFO,Agate Fossil Beds NM,0,94,401,391,2188,3276,4222,2775,...,310,834,2377,1686,1558,978,252,121,National Monument,Midwest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,WUPA,Wupatki NM,724,4243,17447,14767,14117,27113,26993,24528,...,0,17432,19276,22804,24198,17215,12067,8339,National Monument,Intermountain
374,YELL,Yellowstone NP,33896,31650,28695,48150,434385,781853,936062,820006,...,46219,573205,955645,881829,837114,359889,24132,28643,National Park,Intermountain
375,YOSE,Yosemite NP,116746,111665,173610,297207,393004,496625,717462,703153,...,0,236534,501068,375226,258561,249827,136799,117454,National Park,Pacific West
376,YUCH,Yukon-Charley Rivers NPRES,6,72,46,37,14,182,354,246,...,25,72,149,164,88,89,8,20,National Preserve,Alaska


In [6]:
# Make new column that sums the months of 2019 for each park.
month_df['total_2019']=month_df.iloc[:, 2:14].sum(axis=1)
month_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  month_df['total_2019']=month_df.iloc[:, 2:14].sum(axis=1)


Unnamed: 0,parkCode,Park Name,JAN_2019,FEB_2019,MAR_2019,APR_2019,MAY_2019,JUN_2019,JUL_2019,AUG_2019,...,JUN_2020,JUL_2020,AUG_2020,SEP_2020,OCT_2020,NOV_2020,DEC_2020,parkType,Region,total_2019
0,ABLI,Abraham Lincoln Birthplace NHP,1939,6534,14566,16496,24646,26854,37334,34816,...,29111,37249,31290,29134,18615,12147,7472,National Historical Park,Southeast,238226
1,ACAD,Acadia NP,8532,9154,20161,68046,226156,490720,756439,759735,...,201156,493971,681746,500320,480859,76251,21260,National Park,Northeast,3437286
2,ADAM,Adams NHP,27,179,253,537,13187,23963,75333,26011,...,759,460,520,575,500,491,544,National Historical Park,Northeast,187400
3,AFBG,African Burial Ground NM,96,1571,1395,5022,5482,4621,5520,4332,...,0,0,0,0,446,446,116,National Monument,Northeast,47427
4,AGFO,Agate Fossil Beds NM,0,94,401,391,2188,3276,4222,2775,...,834,2377,1686,1558,978,252,121,National Monument,Midwest,16657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,WUPA,Wupatki NM,724,4243,17447,14767,14117,27113,26993,24528,...,17432,19276,22804,24198,17215,12067,8339,National Monument,Intermountain,187059
374,YELL,Yellowstone NP,33896,31650,28695,48150,434385,781853,936062,820006,...,573205,955645,881829,837114,359889,24132,28643,National Park,Intermountain,4020288
375,YOSE,Yosemite NP,116746,111665,173610,297207,393004,496625,717462,703153,...,236534,501068,375226,258561,249827,136799,117454,National Park,Pacific West,4422861
376,YUCH,Yukon-Charley Rivers NPRES,6,72,46,37,14,182,354,246,...,72,149,164,88,89,8,20,National Preserve,Alaska,1114


In [7]:
# Make new column that sums the months of 2020 for each park.
month_df['total_2020']=month_df.iloc[:, 14:27].sum(axis=1)
month_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  month_df['total_2020']=month_df.iloc[:, 14:27].sum(axis=1)


Unnamed: 0,parkCode,Park Name,JAN_2019,FEB_2019,MAR_2019,APR_2019,MAY_2019,JUN_2019,JUL_2019,AUG_2019,...,JUL_2020,AUG_2020,SEP_2020,OCT_2020,NOV_2020,DEC_2020,parkType,Region,total_2019,total_2020
0,ABLI,Abraham Lincoln Birthplace NHP,1939,6534,14566,16496,24646,26854,37334,34816,...,37249,31290,29134,18615,12147,7472,National Historical Park,Southeast,238226,228141
1,ACAD,Acadia NP,8532,9154,20161,68046,226156,490720,756439,759735,...,493971,681746,500320,480859,76251,21260,National Park,Northeast,3437286,2669034
2,ADAM,Adams NHP,27,179,253,537,13187,23963,75333,26011,...,460,520,575,500,491,544,National Historical Park,Northeast,187400,6937
3,AFBG,African Burial Ground NM,96,1571,1395,5022,5482,4621,5520,4332,...,0,0,0,446,446,116,National Monument,Northeast,47427,7908
4,AGFO,Agate Fossil Beds NM,0,94,401,391,2188,3276,4222,2775,...,2377,1686,1558,978,252,121,National Monument,Midwest,16657,8722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,WUPA,Wupatki NM,724,4243,17447,14767,14117,27113,26993,24528,...,19276,22804,24198,17215,12067,8339,National Monument,Intermountain,187059,146074
374,YELL,Yellowstone NP,33896,31650,28695,48150,434385,781853,936062,820006,...,955645,881829,837114,359889,24132,28643,National Park,Intermountain,4020288,3806306
375,YOSE,Yosemite NP,116746,111665,173610,297207,393004,496625,717462,703153,...,501068,375226,258561,249827,136799,117454,National Park,Pacific West,4422861,2268313
376,YUCH,Yukon-Charley Rivers NPRES,6,72,46,37,14,182,354,246,...,149,164,88,89,8,20,National Preserve,Alaska,1114,666


In [8]:
# ANALYSIS 1: Sum the Total Visits for 2019 v. 2020

total_visit2019 = month_df['total_2019'].sum()
total_visit2019


327516619.0

In [9]:
total_visit2020 = month_df['total_2020'].sum()
total_visit2020

235972454.0

In [10]:
# ANALYSIS 2: Impact of COVID-19 on Summer Visits
# Extract data for 2019 summer months: June, July, and August.
jun2019 = month_df.groupby('parkType')['JUN_2019'].sum()
jul2019 = month_df.groupby('parkType')['JUL_2019'].sum()
aug2019 = month_df.groupby('parkType')['AUG_2019'].sum()

summer2019 = jun2019 + jul2019 + aug2019
summer2019.sort_values()

parkType
International Historic Site                   6,972
National Reserve                             54,177
National Wild & Scenic River                732,025
National Battlefield                        784,855
National Preserve                         1,015,178
National Battlefield Park                 1,152,967
National Military Park                    1,332,896
National River                            1,688,629
National Lakeshore                        1,802,531
National Historic Site                    3,093,903
Park (Other)                              3,201,810
National Seashore                         7,960,714
National Monument                        10,030,522
National Parkway                         10,402,600
National Memorial                        12,552,395
National Historical Park                 13,095,280
National Recreation Area                 18,612,413
National Park                            40,226,378
dtype: float64

In [11]:
# Extract data for 2020 summer months: June, July, and August.
jun2020 = month_df.groupby('parkType')['JUN_2020'].sum()
jul2020 = month_df.groupby('parkType')['JUL_2020'].sum()
aug2020 = month_df.groupby('parkType')['AUG_2020'].sum()


summer2020 = jun2020 + jul2020 + aug2020
summer2020.sort_values()

parkType
International Historic Site                   3,923
National Reserve                             48,187
National Battlefield                        603,402
National Wild & Scenic River                869,903
National Military Park                      925,031
National Preserve                         1,025,094
National Historic Site                    1,080,205
National Battlefield Park                 1,081,845
Park (Other)                              1,566,027
National River                            1,943,731
National Lakeshore                        2,148,018
National Monument                         2,908,608
National Memorial                         4,441,835
National Historical Park                  5,739,228
National Seashore                         7,961,010
National Parkway                          9,823,600
National Recreation Area                 17,871,213
National Park                            29,664,949
dtype: float64

In [12]:
# Create a multi-bar chart, originally thought to add the Total 
# as another bar but the other bars looked too small in comparison. 

w = 0.4

x = ["Park", "Recreation Area", "Historical Park", "Monument"]
data2019 = [38466358, 18612413, 13095280, 12552395]
data2020 = [28387823, 17871213, 5739228, 2908608]

bar1 = np.arange(len(x))
bar2 = [i+w for i in bar1]

plt.bar(bar1, data2019,w,label="2019", color="tan")
plt.bar(bar2, data2020,w,label="2020", color="forestgreen")

plt.xlabel("Park Types",fontsize=16, color='black',fontweight='bold')
plt.ylabel("Number of Visits",fontsize=16, color='black',fontweight='bold')
plt.title("Summer Visits 2019 v. 2020",fontsize=16, color='black',fontweight='bold')
plt.xticks(bar1+w/2, x)
plt.gcf().axes[0].yaxis.get_major_formatter().set_scientific(False)
plt.legend()
plt.show()

<IPython.core.display.Javascript object>

In [13]:
# ANALYSIS 3: Impact of COVID-19 on Monthly Visits by Region

x_labels = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
years = {'2019': 'tan', '2020': 'forestgreen'}

regions = month_df['Region'].unique().tolist()

region_data = {}
for c_year in years:
    for x_label in x_labels:
        month_year = x_label+'_'+c_year
        region_data[month_year] = month_df.groupby('Region')[month_year].sum()

for region in regions:
    handles = []
    for c_year in years:
        y_data = []
        for x_label in x_labels:
            month_year = x_label+'_'+c_year
            y_data.append(region_data[month_year][region])
        color = years[c_year]
        plt.plot(y_data, color=color)
        handles.append(mpatches.Patch(color=color, label=c_year))

    plt.gcf().axes[0].yaxis.get_major_formatter().set_scientific(False)
    plt.title(region, fontsize=16, color='black',fontweight='bold')
    plt.xlabel('Month',fontsize=16, color='black',fontweight='bold')
    plt.ylabel('Number of Visits',fontsize=16, color='black',fontweight='bold')
    plt.xticks(np.arange(len(x_labels)), x_labels)
    plt.legend(handles=handles)
    plt.rcParams['figure.figsize'] = [8.5,5]
    plt.grid()
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
# Read in a new data file that provides historical park visits for the past 10 years

file_path="Data_Resources/Annual Visitation By Park (1979 - Last Calendar Year).xlsx"

annual_visitation_df = pd.read_excel(file_path)
#annual_visitation_df.dtypes

file_path="Data_Resources/park_type_and_region.csv"

park_info_df = pd.read_csv(file_path)
#park_info_df.dtypes

# MERGE data files together
df = pd.merge(annual_visitation_df, park_info_df, on="parkCode", how='outer')
#df

In [15]:
# Convert select data years an convert into floats

annual_visitation_df['park_2011'] = annual_visitation_df['park_2011'].astype(float)
annual_visitation_df['park_2012'] = annual_visitation_df['park_2012'].astype(float)
annual_visitation_df['park_2013'] = annual_visitation_df['park_2013'].astype(float)
annual_visitation_df['park_2014'] = annual_visitation_df['park_2014'].astype(float)
annual_visitation_df['park_2015'] = annual_visitation_df['park_2015'].astype(float)
annual_visitation_df['park_2016'] = annual_visitation_df['park_2016'].astype(float)
annual_visitation_df['park_2017'] = annual_visitation_df['park_2017'].astype(float)
annual_visitation_df['park_2018'] = annual_visitation_df['park_2018'].astype(float)
annual_visitation_df['park_2019'] = annual_visitation_df['park_2019'].astype(float)
annual_visitation_df.dtypes

parkCode      object
Park Name     object
park_2010    float64
park_2011    float64
park_2012    float64
park_2013    float64
park_2014    float64
park_2015    float64
park_2016    float64
park_2017    float64
park_2018    float64
park_2019    float64
Average      float64
dtype: object

In [16]:
# ANALYSIS 4: Extract a sorted list of park types for Average Yearly Visits over a 10-year Period (Pre-COVID)
    
sum10_df = df.groupby('parkType').sum()

sum10_df['10YearAvg'] = 0

for y in range(2010, 2020):
    column = f'park_{y}'
    sum10_df['10YearAvg'] += sum10_df[column]

sum10_df['10YearAvg'] /= 10
sum10_df.sort_values(by='10YearAvg', ascending=False)

Unnamed: 0_level_0,park_2010,park_2011,park_2012,park_2013,park_2014,park_2015,park_2016,park_2017,park_2018,park_2019,Average,10YearAvg
parkType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
National Park,71079019,68607024,71065451,69136047,73927020,80305331,87619211,89391395,87714135,91010784,78985542,78985542
National Recreation Area,49044088,47365896,44679478,43997058,45218953,46230396,50331201,52107818,51022484,52110142,48210751,48210751
National Memorial,29736288,30080078,34694611,33739378,35833896,40418147,42183414,40645496,38111959,37653264,36639522,36309653
National Historical Park,29776071,29408774,30392954,30321111,32264620,32287173,34641708,35241069,33161620,34732138,32436762,32222724
National Parkway,28576098,29713173,29389054,27479002,28525215,29557215,32802051,31415912,29846041,30186057,29748982,29748982
National Monument,20014861,19550656,18402571,15452762,21378782,21820871,22303110,22719726,21577687,22406247,21251303,20562727
National Seashore,18118155,18801331,18877720,18664804,18546676,18706214,19797234,18472367,18325744,20273975,18858422,18858422
National Historic Site,9014086,8857717,9355960,8831518,9080406,9199517,10449789,9668115,8664509,8850017,9208213,9197163
Park (Other),8132297,7762848,6651772,7337542,8642253,8496867,8942596,9334814,8691360,8489113,8248146,8248146
National Military Park,4565799,5053113,5072289,5050057,4444083,4546993,4807742,4698928,4347012,4380576,4696659,4696659


In [17]:
# Read in a new data file that provides the overnight stays for 2019

file_path="Data_Resources/2019Fiscal_Year_Visitation_Report.csv"

overnight_stays_df = pd.read_csv(file_path)
#overnight_stays_df


In [18]:
# ANALYSIS 5: Examine the overnight stays for 2019 with a Pie Chart
# Rename the column names for chart readibility

renamed_stays_df = overnight_stays_df.rename(columns={'TotalConcessionerLodging2019': 'Lodging', 'TotalConcessionerCamping2019': 'Camping',
                                   'TotalTentOvernights2019': 'Tent', 'TotalRecreationVehicleOvernights2019': 'RV', 
                                   'TotalBackcountryOvernights2019': 'Backcountry',
                                   'TotalMiscellaneousOvernights2019': 'Misc.',
                                   'TotalNonRecreationOvernights2019': 'Non-Recreation'})

# renamed_stays_df.head()


In [19]:
# Create a pie chart on the overnight stays

%matplotlib notebook

# Import our dependencies
import matplotlib.pyplot as plt
import numpy as np

# Extract the last seven columns to be charted
labels = renamed_stays_df.columns.values.tolist()[-7:]
sizes = renamed_stays_df.loc[0].values.tolist()[-7:]
sizes = [int(i.replace(',','')) for i in sizes]

# The colors of each section of the pie chart
colors = ["tan", "sienna", "forestgreen", "oldlace","linen", "bisque", "black"]

# Tells matplotlib to seperate the "Humans" section from the others
explode = (0, 0, 0.10, 0, 0, 0, 0)

# Creates the pie chart based upon the values above
# Automatically finds the percentages of each part of the pie chart
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", radius=1*.4, shadow=False, startangle=140)
plt.title("Percentage of Overnight Recreational Stays, 2019\nTotal: 13,619,350",fontsize=15,color="black",fontweight="bold")

# Tells matplotlib that we want a pie chart with equal axes
plt.axis("equal")

<IPython.core.display.Javascript object>

(-0.4456822526416237,
 0.4870443763371684,
 -0.5325895312282032,
 0.44797186186975846)

In [20]:
# ANALYSIS 6A: Identify the Top 10 Visits by Park (2010-2019 averaged * Pre-COVID19)
annual_visitation_df.sort_values(by=['Average'], ascending=False).head(10)


# Top 10
# 1. Golden Gate NRA (14,840,800)
# 2. Blue Ridge PKWY (14,791,419)
# 3. Great Smoky Mountains NP (10,494,546)
# 4. Gateway NRA (7,665,918)
# 5. George Washington MEM PKWY (7,654,910)
# 6. Lincoln Memorial (7,131,717)
# 7. Lake Mead NRA (7,048,517)
# 8. Natchez Trace PKWY (5,975,784)
# 9. Grand Canyon NP (5,252,922)
# 10. Vietnam Veterans MEM (4,681,521)

Unnamed: 0,parkCode,Park Name,park_2010,park_2011,park_2012,park_2013,park_2014,park_2015,park_2016,park_2017,park_2018,park_2019,Average
149,GOGA,Golden Gate NRA,14271503,14567487,14540338,14289121,15004420,14888537,15638777,14981897,15223697,15002227,14840800
32,BLRI,Blue Ridge PKWY,14517118,15382447,15205059,12877368,13941749,15054603,15175578,16093765,14690418,14976085,14791419
158,GRSM,Great Smoky Mountains NP,9463538,9008830,9685829,9354695,10099276,10712674,11312786,11338893,11421200,12547743,10494546
137,GATE,Gateway NRA,8820757,7697727,5043863,6191246,6021713,6392565,8651770,9190610,9243305,9405622,7665918
143,GWMP,George Washington MEM PKWY,6925099,7417397,7425577,7360392,7472150,7286463,10323339,7562793,7288623,7487265,7654910
216,LINC,Lincoln Memorial,6042315,5971220,6191361,6546518,7139072,7941771,7915934,7956117,7804683,7808182,7131717
207,LAKE,Lake Mead NRA,7080758,6396682,6285439,6344714,6942873,7298465,7175891,7882339,7578958,7499049,7048517
248,NATR,Natchez Trace PKWY,5910950,5765343,5560668,6012740,5846474,5785812,5891315,6326062,6362439,6296041,5975784
152,GRCA,Grand Canyon NP,4388386,4298178,4421352,4564840,4756771,5520736,5969811,6254238,6380495,5974411,5252922
353,VIVE,Vietnam Veterans MEM,4555371,4020127,4424407,4142721,4403467,5597077,5299713,5072589,4719148,4580587,4681521


In [21]:
# ANALYSIS 6B: Identify the Bottom 10 Visits by Park (2010-2020 averaged * Pre-COVID19)
annual_visitation_df.sort_values(by=['Average'], ascending=True).head(10)

# Bottom 10
# 1. Aniakchak NM & PRES (96)
# 2. Rio Grande W&SR (533)
# 3. Port Chicago Naval Magazine NMEM (892)
# 4. Thaddeus Kosciuszko NMEM (2,039)
# 5. Carter G. Woodson Home NHS (2,073)*
# 6. Yukon-Charley Rivers NPRES (2,118)
# 7. Bering Land Bridge NPRES (2,566)
# 8. Nicodemus NHS (3,230)
# 9. Eugene O'Neill NHS (3,249)
# 10. Sand Creek Massacre NHS (5,555)
# # *only three years of data - new park

Unnamed: 0,parkCode,Park Name,park_2010,park_2011,park_2012,park_2013,park_2014,park_2015,park_2016,park_2017,park_2018,park_2019,Average
10,ANIA,Aniakchak NM & PRES,62,57,19,134,134,153,100,100,100,100,96
295,RIGR,Rio Grande W&SR,1103,873,694,703,321,120,463,399,330,324,533
286,POCH,Port Chicago Naval Magazine NMEM,984,545,533,596,786,963,1942,1086,653,830,892
333,THKO,Thaddeus Kosciuszko NMEM,2888,1949,2045,1682,1475,1261,2794,2293,2077,1921,2038
54,CAWO,Carter G. Woodson Home NHS,0,0,0,0,0,0,0,1884,1954,2381,2073
377,YUCH,Yukon-Charley Rivers NPRES,6211,1718,1393,3914,2329,1133,1146,952,1272,1114,2118
23,BELA,Bering Land Bridge NPRES,2642,1890,2642,2642,2636,2642,2642,2642,2642,2642,2566
258,NICO,Nicodemus NHS,3448,2681,3505,3241,3374,3306,3552,2916,2738,3540,3230
100,EUON,Eugene O'Neill NHS,2445,2593,2789,2929,3202,3942,4287,3931,3432,2944,3249
316,SAND,Sand Creek Massacre NHS,4063,3935,4384,4795,7402,5887,6847,6535,6006,5701,5556


In [22]:
# ANALYSIS 7: FIND THE AVERAGE, MIN, AND MAX COST TO ENTER A PARK?

# Read in api file with cost data
api_file_path ="Data_Resources/nps_api_data"
nps_api_df = pd.read_csv(api_file_path)

#Read in characteristics of the state
state_file_path="Data_Resources/StateNameList.csv"
state_df = pd.read_csv(state_file_path)


In [23]:
# Merge the dataframes

park_df = pd.merge(nps_api_df, parkcode_df, on="parkCode", how='left')
#park_df

data_df = pd.merge(park_df,state_df, on='state', how='left')
#data_df

In [24]:
# Create cost dataframe, clean it, and find the summary statistics table through .describe()

cost_df = data_df.copy()

cost_df["parkType"].fillna("Unknown", inplace = True)
cost_df["Region"].fillna("Unknown", inplace = True)

cost_df["parkType"].replace({'Park (Other)':'Park Other/Unknown',
                                   'Unknown': 'Park Other/Unknown'}, inplace=True)
# park_region_df["parkType"].value_counts()
# park_region_df

cost_df['cost']=cost_df['cost'].astype(float)
cost_df['cost'].describe()

count                    468
mean                       5
std                       10
min                        0
25%                        0
50%                        0
75%                        6
max                       40
Name: cost, dtype: float64