In [48]:
!pip install folium
!pip install boto3
!pip install censusgeocode

import numpy as np
import pandas as pd
import folium
from folium.plugins import MarkerCluster
import boto3
import pyspark as ps# for the pyspark suite
import census_tract as a
import src.api2 as ap
import src.co_pipeline3 as pipe




# Local Spark session (master "local[4]") shared by the rest of the notebook,
# plus a handle on its SparkContext.
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("case study")
    .getOrCreate()
)
sc = spark.sparkContext

# Instantiate the boto3 S3 resource and client for downloading/uploading files.
s3_connection = boto3.resource('s3')
s3_client = boto3.client('s3')




In [13]:
# S3 bucket used for every load and export in this notebook.
bucket = 'capstone.1'

# Load the site/reservation table that the cells below group and map.
# The loader is called through the `pipe` alias: the bare name
# load_csv_from_s3 is never defined or imported in the visible cells, so it
# raises NameError on a fresh kernel.
# NOTE(review): assumes load_csv_from_s3 lives in src.co_pipeline3 next to
# export() — confirm before merging.
co_sites_df = pipe.load_csv_from_s3(bucket, 'co_sites.csv')

In [17]:
## Site by number of reservations

# One group per (Park, FacLong, FacLat) combination; the grouping object is
# reused by the distance cell further down.
grpby_obj = co_sites_df.groupby(['Park', 'FacLong', 'FacLat'])

# Per-group row counts. The Res_ID count is kept as `num_res`; the listed
# columns are dropped from the result.
co_sites = (
    grpby_obj.count()
    .reset_index()
    .drop(
        columns=[
            'Location', 'SiteType', 'UseType', 'FacState', 'CustZIP',
            'CustCountry', 'CustSize', 'CustLat', 'CustLong', 'CustState',
            'Dist',
        ]
    )
    .rename(columns={'Res_ID': 'num_res'})
)

# NOTE(review): 40 and 'blue' are presumably a marker scale and colour —
# confirm against pipe.all_sites_mapper.
site_map = pipe.all_sites_mapper(co_sites, 'num_res', 40, 'blue')
site_map.save('all_sites.html')


In [21]:
## Sites by mean distance traveled to get to the site

# Min / max / mean of the Dist column per group, reusing `grpby_obj` from the
# reservations-count cell; the group keys come back as ordinary columns.
cust_dist = (
    grpby_obj['Dist']
    .agg(['min', 'max', 'mean'])
    .reset_index()
)

# Same mapper as the reservations map, this time driven by the 'mean' column.
cust_dist_map = pipe.all_sites_mapper(cust_dist, 'mean', 40, 'green')
cust_dist_map.save('cust_dist.html')


In [28]:
# Build and save the customer map for 'Mesa Verde National Park'.
# NOTE(review): presumably site_selector narrows co_sites_df to this park's
# rows and site_customer_mapper plots them using the 'Res_Size' column —
# confirm against src.co_pipeline3.
mesa_zips = pipe.site_selector('Mesa Verde National Park', co_sites_df)
mesa_customers = pipe.site_customer_mapper(mesa_zips, 'Res_Size')
# Relative path: the HTML file is written wherever the kernel is running.
mesa_customers.save('mesa_customers.html')

In [22]:
# Select the rows for the 'OHaver Lake --- FPIN46' site; `ohaver_zips` feeds
# both the customer map and the census enrichment in later cells.
# NOTE(review): the site name must match the 'Park' value exactly, including
# the ' --- FPIN46' suffix — presumably; confirm against pipe.site_selector.
ohaver_zips = pipe.site_selector('OHaver Lake --- FPIN46', co_sites_df)

In [25]:
# Build and save the customer map for OHaver Lake, mirroring the Mesa Verde
# cell (same mapper, same 'Res_Size' column).
ohaver_customers = pipe.site_customer_mapper(ohaver_zips, 'Res_Size')
# Relative path: the HTML file is written wherever the kernel is running.
ohaver_customers.save('ohaver_customers.html')

In [29]:
## Census Variable Clusters

# Variable-name tables for the economic, social and demographic census
# groups, loaded from S3 and converted to NumPy arrays for
# ap.cluster_variables.
# The loader is called through the `pipe` alias: the bare name
# load_csv_from_s3 is never defined or imported in the visible cells, so it
# raises NameError on a fresh kernel.
# NOTE(review): assumes load_csv_from_s3 lives in src.co_pipeline3 next to
# export() — confirm before merging.
econ_df = pipe.load_csv_from_s3(bucket, 'econ_var_names.csv')
social_df = pipe.load_csv_from_s3(bucket, 'social_var_names.csv')
demo_df = pipe.load_csv_from_s3(bucket, 'demo_var_names.csv')
econ_var_names = econ_df.to_numpy()
social_var_names = social_df.to_numpy()
demo_var_names = demo_df.to_numpy()

# Each *_clusters dict maps a label to the (start, end) pair that is passed
# to ap.cluster_variables together with the matching name array.
# NOTE(review): the pairs are presumably row positions in the name arrays;
# whether `end` is inclusive is not visible here — confirm in src.api2.
econ_clusters = {
    'Industry': (31, 43),
    'Commute': (17, 22),
    'Income_Benefits': (50, 59),
    'Health_Insurance': (94, 96),
}
industry = ap.cluster_variables(econ_var_names, econ_clusters['Industry'])
commute = ap.cluster_variables(econ_var_names, econ_clusters['Commute'])
income_benefits = ap.cluster_variables(econ_var_names, econ_clusters['Income_Benefits'])
health = ap.cluster_variables(econ_var_names, econ_clusters['Health_Insurance'])

# NOTE(review): 'Veteran_Status' is (67, 67); if `end` is exclusive this
# selects nothing — confirm.
social_clusters = {
    'Internet': (149, 150),
    'Language': (109, 119),
    'Education': (57, 65),
    'Veteran_Status': (67, 67),
}
internet = ap.cluster_variables(social_var_names, social_clusters['Internet'])
language = ap.cluster_variables(social_var_names, social_clusters['Language'])
education = ap.cluster_variables(social_var_names, social_clusters['Education'])
vet_status = ap.cluster_variables(social_var_names, social_clusters['Veteran_Status'])

# NOTE(review): 'Latino' is defined here but, unlike every other entry, is
# never passed to ap.cluster_variables — intentional or an omission?
demo_clusters = {
    'Age': (3, 15),
    'Gender': (0, 1),
    'Race': (35, 54),
    'Latino': (71, 74),
}
age = ap.cluster_variables(demo_var_names, demo_clusters['Age'])
gender = ap.cluster_variables(demo_var_names, demo_clusters['Gender'])
race = ap.cluster_variables(demo_var_names, demo_clusters['Race'])

In [40]:
# Enrich the OHaver Lake selection with census data; the result is the input
# to both rdd_to_data calls below.
# NOTE(review): the later helper is named rdd_to_data, so this presumably
# returns a Spark RDD — confirm against pipe.add_census_data.
ohaver_census = pipe.add_census_data(ohaver_zips)

In [40]:
# Extract the health-insurance variable cluster from the OHaver census data.
# Called through `pipe`, matching the commute cell below: the bare name
# rdd_to_data is never defined or imported in the visible cells, so the
# unqualified call only worked from leftover kernel state.
ohaver_health = pipe.rdd_to_data(ohaver_census, health)

In [46]:
# Convert the extracted health data to a DataFrame and export it to the S3
# bucket as 'ohaver_health_data.csv'.
# Both helpers are called through `pipe`, matching the commute export cell
# below: the bare names arr_to_pandas and export are never defined or
# imported in the visible cells, so the unqualified calls raise NameError on
# a fresh kernel.
ohaver_health_df = pipe.arr_to_pandas(ohaver_health, health)
pipe.export(ohaver_health_df, 'ohaver_health_data.csv', bucket)

In [41]:
# Extract the commute variable cluster from the OHaver census data, using the
# same helper as the health extraction but with the `commute` cluster.
ohaver_commute = pipe.rdd_to_data(ohaver_census, commute)

In [49]:
# Convert the extracted commute data to a DataFrame and export it to the S3
# bucket as 'ohaver_commute_data.csv'.
# NOTE(review): arr_to_pandas presumably uses `commute` to label the columns
# — confirm against src.co_pipeline3.
ohaver_commute_df = pipe.arr_to_pandas(ohaver_commute, commute)
pipe.export(ohaver_commute_df, 'ohaver_commute_data.csv', bucket)