In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import histogram, linspace
from bokeh.plotting import figure, show, output_file
import scipy.stats as stats



In [53]:
# Read the two annual park report workbooks (*_1979_1997.xlsx and *_1998_2017.xlsx).
# skiprows=9 makes pandas skip the first nine rows of each sheet
# (presumably report preamble above the header row -- confirm against the files).
park_1979_1997, park_1998_2017 = (
    pd.read_excel(workbook_path, skiprows=9)
    for workbook_path in (
        "../data/changed_parked_data/Annual_All_Report_By_Park_1979_1997.xlsx",
        "../data/changed_parked_data/Annual_All_Report_By_Park_1998_2017.xlsx",
    )
)


In [54]:
# Join both periods into one wide frame keyed on 'Park Name'. The outer join
# keeps parks that appear in only one of the two workbooks.
merged_dataset = park_1998_2017.merge(park_1979_1997, how='outer', on=['Park Name'])

In [56]:
# Split the merged frame into one frame per metric:
#   * Recreational Visits
#   * Non Recreational Visits
#   * Misc Overnights
#   * Concessioner_Camping
#   * Tent_Overnights
#   * RV_Overnights
#   * BackCountry_Overnights
#   * Non_Recreational_Overnights
#   * Non_Recreational_Hours
#   * Recreational_Hours
#
# Each filter keeps the columns whose name contains "<digits>_<metric>" plus any
# column whose name contains "Park" (i.e. 'Park Name'). Because the pattern is
# searched anywhere in the name, the "Average_<from>_<to>_<metric>" columns are
# selected as well; the cells that use these frames drop them explicitly.
#
# The patterns are raw strings so that "\d" reaches the regex engine as written
# instead of being an invalid string escape sequence.
recreational_visits = merged_dataset.filter(regex=r"(\d+_Recreational_Visits)|(Park)")
non_recreational_visits = merged_dataset.filter(regex=r"(\d+_Non_Recreational_Visits)|(Park)")
misc_overnights = merged_dataset.filter(regex=r"(\d+_Misc_Overnights)|(Park)")
concessioner_camping = merged_dataset.filter(regex=r"(\d+_Concessioner_Camping)|(Park)")
tent_overnights = merged_dataset.filter(regex=r"(\d+_Tent_Overnights)|(Park)")
rv_overnights = merged_dataset.filter(regex=r"(\d+_RV_Overnights)|(Park)")
backcountry_overnights = merged_dataset.filter(regex=r"(\d+_BackCountry_Overnights)|(Park)")
non_recreational_overnights = merged_dataset.filter(regex=r"(\d+_Non_Recreational_Overnights)|(Park)")
non_recreational_hours = merged_dataset.filter(regex=r"(\d+_Non_Recreational_Hours)|(Park)")
recreational_hours = merged_dataset.filter(regex=r"(\d+_Recreational_Hours)|(Park)")


In [33]:
# Recreational visits: the two per-period average columns are not needed for
# the yearly series, so they are removed first. The remaining column labels
# are then put in sorted order; the year is prefixed to each yearly column
# name, which makes a plain sort an easy way to order them.
rec_visits = recreational_visits.drop(
    ['Average_1998_2017_Recreational_Visits', 'Average_1979_1997_Recreational_Visits'],
    axis=1,
)
rec_visits_sorted = rec_visits.reindex(columns=sorted(rec_visits.columns))


In [34]:
# Build the recreational-visit data for graphing: one single-column frame per park.
#
# The parks were chosen by the author, after some analysis on the National Park
# Service website, as the most visited parks from 1979 to 1997
# (NOTE(review): the workbooks loaded in this notebook run through 2017 --
# confirm the intended range):
#   Yosemite, Zion, Yellowstone, Grand Canyon, Rocky Mountain, Great Smoky Mountains
#
# Each resulting frame is indexed by the yearly column labels and has a single
# column named after the park, so `frame['<park name>']` yields the yearly series.
# The park's row(s) are selected explicitly and averaged over numeric columns
# only (presumably there is exactly one row per park, in which case the mean is
# simply that row); labels with no value are dropped.
visit_frames = []
for park_name in (
    'Yosemite NP',
    'Zion NP',
    'Yellowstone NP',
    'Grand Canyon NP',
    'Rocky Mountain NP',
    'Great Smoky Mountains NP',
):
    park_rows = rec_visits_sorted[rec_visits_sorted['Park Name'] == park_name]
    if park_rows.empty:
        # Fail loudly instead of silently plotting an empty series.
        raise KeyError("No rows found for park: {}".format(park_name))
    yearly_values = park_rows.mean(numeric_only=True).dropna()
    visit_frames.append(yearly_values.to_frame(name=park_name))

# Unpack in the same order as the tuple above (names kept as used by later cells).
(yosemite_visits, zion_visits, yellowstone_visits,
 grandcanyon_visits, rockymoutain_visits, smokymountain_visits) = visit_frames




In [35]:
# Plot the recreational visits for the top parks using bokeh and write the
# chart to recreational_visits.html.

TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"

p = figure(tools=TOOLS,title="Park Recreational Visits",plot_width=800, plot_height=800) # Title is the title of the  plot
p.xaxis.axis_label = 'Year' # X-axis label
p.yaxis.axis_label = 'Total Visits' # Y-axis label

# One x value per yearly column. The workbooks cover 1979-1997 and 1998-2017,
# i.e. 1979 through 2017 inclusive, so the (exclusive) range end must be 2018.
year = list(range(1979, 2018))

# Pull each park's yearly series out of its single-column frame.
yosemite_visit_data = yosemite_visits['Yosemite NP']
yellowstone_visit_data = yellowstone_visits['Yellowstone NP']
zion_visit_data = zion_visits['Zion NP']
grandcanyon_visit_data = grandcanyon_visits['Grand Canyon NP']
rockymountain_visit_data = rockymoutain_visits['Rocky Mountain NP']
smokymountain_visit_data = smokymountain_visits['Great Smoky Mountains NP']

# Add one line renderer per park: (series, line colour, legend text).
for park_series, park_color, park_legend in (
    (yosemite_visit_data, "orange", "Yosemite"),
    (yellowstone_visit_data, "red", "Yellow Stone"),
    (zion_visit_data, "blue", "Zion"),
    (grandcanyon_visit_data, "brown", "Grand Canyon"),
    (rockymountain_visit_data, "purple", "Rocky Mountain"),
    (smokymountain_visit_data, "green", "Great Smoky Mountains"),
):
    p.line(year, park_series, line_color=park_color, line_width=2, alpha=0.7, legend=park_legend)

# Disables the scientific numbers on both axes
p.left[0].formatter.use_scientific = False
p.below[0].formatter.use_scientific = False

output_file("recreational_visits.html", title="Recreational Visits")

show(p)



In [58]:
# Tent overnights: drop the two per-period average columns because they are not
# required, then sort the columns. The year is prefixed to the yearly column
# names, so sorting the labels is an easy way to order them.
tent_overnights_new = tent_overnights.drop(columns=['Average_1998_2017_Tent_Overnights', 'Average_1979_1997_Tent_Overnights'])

tent_overnights_sorted = tent_overnights_new.reindex(sorted(tent_overnights_new.columns), axis=1)

# Build the tent-overnight data for graphing: one single-column frame per park.
#
# The parks were chosen by the author, after some analysis on the National Park
# Service website, as the most visited parks from 1979 to 1997
# (NOTE(review): the workbooks loaded in this notebook run through 2017 --
# confirm the intended range):
#   Yosemite, Zion, Yellowstone, Grand Canyon, Rocky Mountain, Great Smoky Mountains
#
# Each resulting frame is indexed by the yearly column labels and has a single
# column named after the park, so `frame['<park name>']` yields the yearly series.
# The park's row(s) are selected explicitly and averaged over numeric columns
# only (presumably there is exactly one row per park, in which case the mean is
# simply that row); labels with no value are dropped.
tent_frames = []
for park_name in (
    'Yosemite NP',
    'Zion NP',
    'Yellowstone NP',
    'Grand Canyon NP',
    'Rocky Mountain NP',
    'Great Smoky Mountains NP',
):
    park_rows = tent_overnights_sorted[tent_overnights_sorted['Park Name'] == park_name]
    if park_rows.empty:
        # Fail loudly instead of silently plotting an empty series.
        raise KeyError("No rows found for park: {}".format(park_name))
    yearly_values = park_rows.mean(numeric_only=True).dropna()
    tent_frames.append(yearly_values.to_frame(name=park_name))

# Unpack in the same order as the tuple above (names kept as used by later cells).
(yosemite_tents, zion_tents, yellowstone_tents,
 grandcanyon_tents, rockymoutain_tents, smokymountain_tents) = tent_frames



In [59]:
# Plot the tent camping overnights for the top parks using bokeh and write the
# chart to tent_overnight.html.

TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"

p = figure(tools=TOOLS,title="Tent Overnight Camping",plot_width=800, plot_height=800) # Title is the title of the  plot
p.xaxis.axis_label = 'Year' # X-axis label
p.yaxis.axis_label = 'Total People Tent Camping' # Y-axis label

# One x value per yearly column. The workbooks cover 1979-1997 and 1998-2017,
# i.e. 1979 through 2017 inclusive, so the (exclusive) range end must be 2018.
year = list(range(1979, 2018))

# Pull each park's yearly series out of its single-column frame.
yosemite_tent_data = yosemite_tents['Yosemite NP']
yellowstone_tent_data = yellowstone_tents['Yellowstone NP']
zion_tent_data = zion_tents['Zion NP']
grandcanyon_tent_data = grandcanyon_tents['Grand Canyon NP']
rockymountain_tent_data = rockymoutain_tents['Rocky Mountain NP']
# The frame built by the preparation cell is `smokymountain_tents` (plural).
smokymountain_tent_data = smokymountain_tents['Great Smoky Mountains NP']

# Add one line renderer per park: (series, line colour, legend text).
for park_series, park_color, park_legend in (
    (yosemite_tent_data, "orange", "Yosemite"),
    (yellowstone_tent_data, "red", "Yellow Stone"),
    (zion_tent_data, "blue", "Zion"),
    (grandcanyon_tent_data, "brown", "Grand Canyon"),
    (rockymountain_tent_data, "purple", "Rocky Mountain"),
    (smokymountain_tent_data, "green", "Great Smoky Mountains"),
):
    p.line(year, park_series, line_color=park_color, line_width=2, alpha=0.7, legend=park_legend)

# Disables the scientific numbers on both axes
p.left[0].formatter.use_scientific = False
p.below[0].formatter.use_scientific = False

output_file("tent_overnight.html", title="Tent Overnight Camping")

show(p)



In [62]:
# Back-country overnights: drop the average column because it is not required,
# then sort the columns. The year is prefixed to the yearly column names, so
# sorting the labels is an easy way to order them.
backcountry_overnights_new = backcountry_overnights.drop(columns=['Average_1979_1997_BackCountry_Overnights'])
back_country_overnights_sorted = backcountry_overnights_new.reindex(sorted(backcountry_overnights_new.columns), axis=1)

# Build the back-country data for graphing: one single-column frame per park.
#
# The parks were chosen by the author, after some analysis on the National Park
# Service website, as the most visited parks from 1979 to 1997
# (NOTE(review): the workbooks loaded in this notebook run through 2017 --
# confirm the intended range):
#   Yosemite, Zion, Yellowstone, Grand Canyon, Rocky Mountain, Great Smoky Mountains
#
# Each resulting frame is indexed by the yearly column labels and has a single
# column named after the park, so `frame['<park name>']` yields the yearly series.
# The park's row(s) are selected explicitly and averaged over numeric columns
# only (presumably there is exactly one row per park, in which case the mean is
# simply that row); labels with no value are dropped.
back_country_frames = []
for park_name in (
    'Yosemite NP',
    'Zion NP',
    'Yellowstone NP',
    'Grand Canyon NP',
    'Rocky Mountain NP',
    'Great Smoky Mountains NP',
):
    park_rows = back_country_overnights_sorted[back_country_overnights_sorted['Park Name'] == park_name]
    if park_rows.empty:
        # Fail loudly instead of silently plotting an empty series.
        raise KeyError("No rows found for park: {}".format(park_name))
    yearly_values = park_rows.mean(numeric_only=True).dropna()
    back_country_frames.append(yearly_values.to_frame(name=park_name))

# Unpack in the same order as the tuple above (names kept as used by later cells).
(yosemite_back_country, zion_back_country, yellowstone_back_country,
 grandcanyon_back_country, rockymoutain_back_country, smokymountain_back_country) = back_country_frames



In [63]:
# Plot the back-country camping overnights for the top parks using bokeh and
# write the chart to back_country_overnight.html.

TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"

p = figure(tools=TOOLS,title="Back Country Camping",plot_width=800, plot_height=800) # Title is the title of the  plot
p.xaxis.axis_label = 'Year' # X-axis label
p.yaxis.axis_label = 'Total People Back Country Camping' # Y-axis label

# One x value per yearly column. The workbooks cover 1979-1997 and 1998-2017,
# i.e. 1979 through 2017 inclusive, so the (exclusive) range end must be 2018.
year = list(range(1979, 2018))

# Pull each park's yearly series out of its single-column frame.
yosemite_back_country_data = yosemite_back_country['Yosemite NP']
yellowstone_back_country_data = yellowstone_back_country['Yellowstone NP']
zion_back_country_data = zion_back_country['Zion NP']
grandcanyon_back_country_data = grandcanyon_back_country['Grand Canyon NP']
rockymountain_back_country_data = rockymoutain_back_country['Rocky Mountain NP']
smokymountain_back_country_data = smokymountain_back_country['Great Smoky Mountains NP']

# Add one line renderer per park: (series, line colour, legend text).
for park_series, park_color, park_legend in (
    (yosemite_back_country_data, "orange", "Yosemite"),
    (yellowstone_back_country_data, "red", "Yellow Stone"),
    (zion_back_country_data, "blue", "Zion"),
    (grandcanyon_back_country_data, "brown", "Grand Canyon"),
    (rockymountain_back_country_data, "purple", "Rocky Mountain"),
    (smokymountain_back_country_data, "green", "Great Smoky Mountains"),
):
    p.line(year, park_series, line_color=park_color, line_width=2, alpha=0.7, legend=park_legend)

# Disables the scientific numbers on both axes
p.left[0].formatter.use_scientific = False
p.below[0].formatter.use_scientific = False

output_file("back_country_overnight.html", title="Back Country Overnight Camping")

show(p)



In [64]:
# Print the mean of Yosemite's yearly back-country overnight series.
print(yosemite_back_country_data.mean())

120676.46153846153


In [65]:
# Display labels for the six parks, in the same order as the per-park means
# collected in back_country_data / tent_data:
# Zion, Grand Canyon, Yellowstone, Yosemite, Rocky Mountain, Great Smoky Mountains.
parks = ['Zion', 'Grand Canyon', 'Yellow Stone', 'Yosemite', 'Rocky Mountain',
         'Great Smoky Mountains']

In [70]:
# Per-park mean of the yearly series, always in the order:
#   Zion, Grand Canyon, Yellowstone, Yosemite, Rocky Mountain, Great Smoky Mountains
back_country_data = [
    park_series.mean()
    for park_series in (
        zion_back_country_data,
        grandcanyon_back_country_data,
        yellowstone_back_country_data,
        yosemite_back_country_data,
        rockymountain_back_country_data,
        smokymountain_back_country_data,
    )
]
tent_data = [
    park_series.mean()
    for park_series in (
        zion_tent_data,
        grandcanyon_tent_data,
        yellowstone_tent_data,
        yosemite_tent_data,
        rockymountain_tent_data,
        smokymountain_tent_data,
    )
]

# 2 x 6 table for the chi-square test: row 0 is back-country, row 1 is tent.
chi_square_data = [back_country_data, tent_data]



In [71]:
# Display the table passed to the chi-square test
# (first row: back-country means, second row: tent means).
chi_square_data

[[20878.53846153846,
  199852.10256410256,
  40493.07692307692,
  120676.46153846153,
  39581.5641025641,
  82005.92307692308],
 [104329.79487179487,
  135386.41025641025,
  142106.87179487178,
  547776.282051282,
  101238.56410256411,
  175953.6923076923]]

In [73]:
# Run the chi-square test on the back-country / tent table and report the
# statistic, p-value, degrees of freedom and expected frequencies.
# NOTE(review): chi2_contingency treats the table entries as observed
# frequencies; the entries here are per-park means -- confirm that is intended.
# NOTE(review): `p` below rebinds the name the plotting cells use for the bokeh figure.
g, p, dof, expctd = stats.chi2_contingency(chi_square_data)

for report_template, report_value in (
    ("Test Statistic : {}", g),
    ("P-value of the chi-square test : {}", p),
    ("Degrees of Freedom : {}", dof),
):
    print(report_template.format(report_value))

print("Expected Frequencies: ")
print(expctd)

Test Statistic : 203937.53703383773
P-value of the chi-square test : 0.0
Degrees of Freedom : 5
Expected Frequencies: 
[[ 36859.9750104   98690.58133848  53755.44396655 196785.2360015
   41455.91805606  75940.51229367]
 [ 88348.35832293 236547.93148203 128844.5047514  471667.50758825
   99364.21014906 182019.10309094]]


 Cross-checked the test statistic (with 5 degrees of freedom) using the online chi-square p-value calculator at
 https://www.socscistatistics.com/pvalues/chidistribution.aspx

 The calculator reports a p-value < .00001, so the result is significant at p < .05.