In [1]:
from pyspark import SparkContext as sc
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.types import *
from IPython.display import display
import pandas as pd
import math, re, urllib, requests

In [2]:
sc = sc(appName="attraction")
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName('attraction analysis').getOrCreate()

In [3]:
det_path = 'outputs/attraction_details'
rev_path = 'outputs/attraction_reviews'

In [4]:
det_df = spark.read.parquet(det_path)
det_df.createOrReplaceTempView('det_df')

In [5]:
display(det_df.count())
display(det_df.toPandas().head())

3637

Unnamed: 0,attraction_id,city,country,location,name,price,province,rating,category
0,914,vancouver_island,canada,nil,the_robert_bateman_centre_admission_ticket,10.5,british_columbia,-1.0,sightseeing_tickets_&_passes
1,915,vancouver_island,canada,nil,victoria_butterfly_gardens,17.33,british_columbia,-1.0,sightseeing_tickets_&_passes
2,916,kamloops,canada,"{""lat"":50.6545725,""lng"":-120.0789983}",kamloops_bc_wildlife_park_admission,12.45,british_columbia,4.5,sightseeing_tickets_&_passes
3,917,montreal,canada,nil,clip_'n_climb_laval_climbing_session_ticket,18.4,quebec,-1.0,sightseeing_tickets_&_passes
4,918,things_to_do_in_hopewell_cape,canada,nil,hopewell_rocks_admission,10.0,new_brunswick,5.0,sightseeing_tickets_&_passes


In [6]:
city_udf = functions.udf(lambda x: re.sub('things_to_do_in_','',x),StringType())
det_df = det_df.withColumn('city',city_udf(det_df.city))

In [7]:
def myround(x, base=.5):
    return float(round(x/base)*base)

avg_rat_df = spark.sql("SELECT province, category, AVG(rating) as avg_rating FROM det_df WHERE rating != -1 GROUP BY province, category")
round_udf = functions.udf(lambda x: myround(x), FloatType())
avg_rat_df = avg_rat_df.withColumn('updated_rating',round_udf(avg_rat_df.avg_rating)).drop('avg_rating')

det_df = det_df.join(avg_rat_df, ['province','category'],'left_outer').orderBy('attraction_id')
det_df = det_df.withColumn("rating", functions.when(det_df["rating"] == -1, det_df["updated_rating"]).otherwise(det_df["rating"])).drop('updated_rating')

display(det_df.toPandas().head(11))

Unnamed: 0,province,category,attraction_id,city,country,location,name,price,rating
0,british_columbia,featured_tours_and_tickets,0,vancouver,canada,"{""lat"":49.1978322,""lng"":-123.0649959}",vancouver_city_sightseeing_tour,80.0,4.5
1,british_columbia,featured_tours_and_tickets,1,vancouver,canada,"{""lat"":49.1978322,""lng"":-123.0649959}",vancouver_to_victoria_and_butchart_gardens_tou...,210.0,5.0
2,quebec,featured_tours_and_tickets,2,montreal,canada,"{""lat"":45.5001458,""lng"":-73.5720264}",quebec_city_and_montmorency_falls_day_trip_fro...,115.0,4.5
3,ontario,featured_tours_and_tickets,3,toronto,canada,"{""lat"":43.6561507,""lng"":-79.3842642}",niagara_falls_day_trip_from_toronto,169.0,5.0
4,ontario,featured_tours_and_tickets,4,niagara_falls,canada,"{""lat"":43.0857136,""lng"":-79.0824311}","best_of_niagara_falls_tour_from_niagara_falls,...",158.0,5.0
5,ontario,featured_tours_and_tickets,5,niagara_falls,canada,"{""lat"":43.102436,""lng"":-78.961638}",niagara_falls_in_one_day:_deluxe_sightseeing_t...,204.42,5.0
6,british_columbia,featured_tours_and_tickets,6,vancouver,canada,"{""lat"":49.1978322,""lng"":-123.0649959}",whistler_small-group_day_trip_from_vancouver,145.0,5.0
7,ontario,featured_tours_and_tickets,7,niagara_falls,canada,"{""lat"":43.0857136,""lng"":-79.0824311}",ultimate_niagara_falls_tour_plus_helicopter_ri...,317.42,5.0
8,british_columbia,local_experiences,8,vancouver_island,canada,nil,"local_food,_craft_beverage_and_estate_winery_t...",150.0,5.0
9,british_columbia,local_experiences,9,vancouver,canada,"{""lat"":49.2869235,""lng"":-123.12216}",private_tour:_vancouver_to_victoria_island,670.0,5.0


In [8]:
def find_loc(x):
    toOut = re.findall('\d+\.\d+',x)
    if len(toOut) == 0:
        return [None,None]
    else:
        return [float(x) for x in toOut]

loc_udf = functions.udf(lambda x: find_loc(x), ArrayType(FloatType()))
det_loc_df = det_df.withColumn('location', loc_udf(det_df.location)).orderBy('attraction_id')
det_loc_df = det_loc_df.withColumn('latitude',det_loc_df.location[0]).withColumn('longitude',det_loc_df.location[1]).drop('location')

display(det_loc_df.toPandas().head(11))

Unnamed: 0,province,category,attraction_id,city,country,name,price,rating,latitude,longitude
0,british_columbia,featured_tours_and_tickets,0,vancouver,canada,vancouver_city_sightseeing_tour,80.0,4.5,49.197834,123.064995
1,british_columbia,featured_tours_and_tickets,1,vancouver,canada,vancouver_to_victoria_and_butchart_gardens_tou...,210.0,5.0,49.197834,123.064995
2,quebec,featured_tours_and_tickets,2,montreal,canada,quebec_city_and_montmorency_falls_day_trip_fro...,115.0,4.5,45.500145,73.572029
3,ontario,featured_tours_and_tickets,3,toronto,canada,niagara_falls_day_trip_from_toronto,169.0,5.0,43.656151,79.384262
4,ontario,featured_tours_and_tickets,4,niagara_falls,canada,"best_of_niagara_falls_tour_from_niagara_falls,...",158.0,5.0,43.085712,79.082428
5,ontario,featured_tours_and_tickets,5,niagara_falls,canada,niagara_falls_in_one_day:_deluxe_sightseeing_t...,204.42,5.0,43.102436,78.961639
6,british_columbia,featured_tours_and_tickets,6,vancouver,canada,whistler_small-group_day_trip_from_vancouver,145.0,5.0,49.197834,123.064995
7,ontario,featured_tours_and_tickets,7,niagara_falls,canada,ultimate_niagara_falls_tour_plus_helicopter_ri...,317.42,5.0,43.085712,79.082428
8,british_columbia,local_experiences,8,vancouver_island,canada,"local_food,_craft_beverage_and_estate_winery_t...",150.0,5.0,,
9,british_columbia,local_experiences,9,vancouver,canada,private_tour:_vancouver_to_victoria_island,670.0,5.0,49.286922,123.122162


In [9]:


det_loc_df.createOrReplaceTempView('det_loc_df')
avg_cc_loc = spark.sql("SELECT city, category, AVG(latitude) as cc_lat, AVG(longitude) as cc_lon FROM det_loc_df WHERE ISNULL(latitude) = false AND ISNULL(longitude) = false GROUP BY city, category")

det_avgloc_df = det_loc_df.join(avg_cc_loc, ['city','category'],'left_outer')
det_avgloc_df = det_avgloc_df.withColumn('latitude', functions.when(det_avgloc_df['latitude'].isNull(),det_avgloc_df['cc_lat']).otherwise(det_avgloc_df['latitude'])).withColumn('longitude', functions.when(det_avgloc_df['longitude'].isNull(),det_avgloc_df['cc_lon']).otherwise(det_avgloc_df['longitude'])).drop(det_avgloc_df['cc_lat']).drop(det_avgloc_df['cc_lon']).orderBy('attraction_id')
display(det_avgloc_df.toPandas().head(11))

Unnamed: 0,city,category,province,attraction_id,country,name,price,rating,latitude,longitude
0,vancouver,featured_tours_and_tickets,british_columbia,0,canada,vancouver_city_sightseeing_tour,80.0,4.5,49.197834,123.064995
1,vancouver,featured_tours_and_tickets,british_columbia,1,canada,vancouver_to_victoria_and_butchart_gardens_tou...,210.0,5.0,49.197834,123.064995
2,montreal,featured_tours_and_tickets,quebec,2,canada,quebec_city_and_montmorency_falls_day_trip_fro...,115.0,4.5,45.500145,73.572029
3,toronto,featured_tours_and_tickets,ontario,3,canada,niagara_falls_day_trip_from_toronto,169.0,5.0,43.656151,79.384262
4,niagara_falls,featured_tours_and_tickets,ontario,4,canada,"best_of_niagara_falls_tour_from_niagara_falls,...",158.0,5.0,43.085712,79.082428
5,niagara_falls,featured_tours_and_tickets,ontario,5,canada,niagara_falls_in_one_day:_deluxe_sightseeing_t...,204.42,5.0,43.102436,78.961639
6,vancouver,featured_tours_and_tickets,british_columbia,6,canada,whistler_small-group_day_trip_from_vancouver,145.0,5.0,49.197834,123.064995
7,niagara_falls,featured_tours_and_tickets,ontario,7,canada,ultimate_niagara_falls_tour_plus_helicopter_ri...,317.42,5.0,43.085712,79.082428
8,vancouver_island,local_experiences,british_columbia,8,canada,"local_food,_craft_beverage_and_estate_winery_t...",150.0,5.0,,
9,vancouver,local_experiences,british_columbia,9,canada,private_tour:_vancouver_to_victoria_island,670.0,5.0,49.286922,123.122162


In [10]:
def get_loc(address, position, maps_key='AIzaSyC2jxjbR_svb9EjCeMBivCNEcCaaxdEYIA'):
    maps_api_url = 'https://maps.googleapis.com/maps/api/geocode/json'
    request_url = maps_api_url + '?' + urllib.parse.urlencode({'address':address,'key':maps_key})
    response = requests.get(request_url)
    resp_json_payload = response.json()
    out = resp_json_payload['results'][0]['geometry']['location']
    if position == 'latitude':
        return float(out['lat'])
    elif position == 'longitude':
        return float(out['lng'])

get_lat_udf = functions.udf(lambda x: get_loc(x,'latitude'), FloatType())
get_lon_udf = functions.udf(lambda x: get_loc(x,'longitude'), FloatType())
det_avgloc_df = det_avgloc_df.withColumn('latitude',functions.when(det_avgloc_df['latitude'].isNull(),get_lat_udf(det_avgloc_df['city']+','+det_avgloc_df['province'])).otherwise(det_avgloc_df['latitude']))
det_avgloc_df = det_avgloc_df.withColumn('longitude',functions.when(det_avgloc_df['longitude'].isNull(),get_lon_udf(det_avgloc_df['city']+','+det_avgloc_df['province'])).otherwise(det_avgloc_df['longitude']))
display(det_avgloc_df.toPandas().head(11))

Unnamed: 0,city,category,province,attraction_id,country,name,price,rating,latitude,longitude
0,vancouver,featured_tours_and_tickets,british_columbia,0,canada,vancouver_city_sightseeing_tour,80.0,4.5,49.197834,123.064995
1,vancouver,featured_tours_and_tickets,british_columbia,1,canada,vancouver_to_victoria_and_butchart_gardens_tou...,210.0,5.0,49.197834,123.064995
2,montreal,featured_tours_and_tickets,quebec,2,canada,quebec_city_and_montmorency_falls_day_trip_fro...,115.0,4.5,45.500145,73.572029
3,toronto,featured_tours_and_tickets,ontario,3,canada,niagara_falls_day_trip_from_toronto,169.0,5.0,43.656151,79.384262
4,niagara_falls,featured_tours_and_tickets,ontario,4,canada,"best_of_niagara_falls_tour_from_niagara_falls,...",158.0,5.0,43.085712,79.082428
5,niagara_falls,featured_tours_and_tickets,ontario,5,canada,niagara_falls_in_one_day:_deluxe_sightseeing_t...,204.42,5.0,43.102436,78.961639
6,vancouver,featured_tours_and_tickets,british_columbia,6,canada,whistler_small-group_day_trip_from_vancouver,145.0,5.0,49.197834,123.064995
7,niagara_falls,featured_tours_and_tickets,ontario,7,canada,ultimate_niagara_falls_tour_plus_helicopter_ri...,317.42,5.0,43.085712,79.082428
8,vancouver_island,local_experiences,british_columbia,8,canada,"local_food,_craft_beverage_and_estate_winery_t...",150.0,5.0,38.825863,-76.919594
9,vancouver,local_experiences,british_columbia,9,canada,private_tour:_vancouver_to_victoria_island,670.0,5.0,49.286922,123.122162
