In [21]:
from pyspark import SparkContext as sc
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.types import *
from IPython.display import display
import pandas as pd
import math, re, urllib, requests
from datetime import datetime as dt

In [2]:
sc = sc(appName="attraction")
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName('attraction analysis').getOrCreate()

In [3]:
det_path = 'outputs/attraction_details'
rev_path = 'outputs/attraction_reviews'

In [4]:
det_df = spark.read.parquet(det_path)
det_df.createOrReplaceTempView('det_df')

In [5]:
display(det_df.count())
display(det_df.orderBy('attraction_id').toPandas().head(11))

3637

Unnamed: 0,attraction_id,city,country,location,name,price,province,rating,category
0,0,vancouver,canada,"{""lat"":49.1978322,""lng"":-123.0649959}",vancouver_city_sightseeing_tour,80.0,british_columbia,4.5,featured_tours_and_tickets
1,1,vancouver,canada,"{""lat"":49.1978322,""lng"":-123.0649959}",vancouver_to_victoria_and_butchart_gardens_tou...,210.0,british_columbia,5.0,featured_tours_and_tickets
2,2,montreal,canada,"{""lat"":45.5001458,""lng"":-73.5720264}",quebec_city_and_montmorency_falls_day_trip_fro...,115.0,quebec,4.5,featured_tours_and_tickets
3,3,toronto,canada,"{""lat"":43.6561507,""lng"":-79.3842642}",niagara_falls_day_trip_from_toronto,169.0,ontario,5.0,featured_tours_and_tickets
4,4,niagara_falls,canada,"{""lat"":43.0857136,""lng"":-79.0824311}","best_of_niagara_falls_tour_from_niagara_falls,...",158.0,ontario,5.0,featured_tours_and_tickets
5,5,niagara_falls,canada,"{""lat"":43.102436,""lng"":-78.961638}",niagara_falls_in_one_day:_deluxe_sightseeing_t...,204.42,ontario,5.0,featured_tours_and_tickets
6,6,vancouver,canada,"{""lat"":49.1978322,""lng"":-123.0649959}",whistler_small-group_day_trip_from_vancouver,145.0,british_columbia,5.0,featured_tours_and_tickets
7,7,niagara_falls,canada,"{""lat"":43.0857136,""lng"":-79.0824311}",ultimate_niagara_falls_tour_plus_helicopter_ri...,317.42,ontario,5.0,featured_tours_and_tickets
8,8,vancouver_island,canada,nil,"local_food,_craft_beverage_and_estate_winery_t...",150.0,british_columbia,5.0,local_experiences
9,9,vancouver,canada,"{""lat"":49.2869235,""lng"":-123.12216}",private_tour:_vancouver_to_victoria_island,670.0,british_columbia,5.0,local_experiences


In [6]:
city_udf = functions.udf(lambda x: re.sub('things_to_do_in_','',x),StringType())
det_df = det_df.withColumn('city',city_udf(det_df.city))

In [7]:
def myround(x, base=.5):
    return float(round(x/base)*base)

avg_rat_df = spark.sql("SELECT province, category, AVG(rating) as avg_rating FROM det_df WHERE rating != -1 GROUP BY province, category")
round_udf = functions.udf(lambda x: myround(x), FloatType())
avg_rat_df = avg_rat_df.withColumn('updated_rating',round_udf(avg_rat_df.avg_rating)).drop('avg_rating')

det_df = det_df.join(avg_rat_df, ['province','category'],'left_outer').orderBy('attraction_id')
det_df = det_df.withColumn("rating", functions.when(det_df["rating"] == -1, det_df["updated_rating"]).otherwise(det_df["rating"])).drop('updated_rating')

display(det_df.count())
display(det_df.toPandas().head(11))

3637

Unnamed: 0,province,category,attraction_id,city,country,location,name,price,rating
0,british_columbia,featured_tours_and_tickets,0,vancouver,canada,"{""lat"":49.1978322,""lng"":-123.0649959}",vancouver_city_sightseeing_tour,80.0,4.5
1,british_columbia,featured_tours_and_tickets,1,vancouver,canada,"{""lat"":49.1978322,""lng"":-123.0649959}",vancouver_to_victoria_and_butchart_gardens_tou...,210.0,5.0
2,quebec,featured_tours_and_tickets,2,montreal,canada,"{""lat"":45.5001458,""lng"":-73.5720264}",quebec_city_and_montmorency_falls_day_trip_fro...,115.0,4.5
3,ontario,featured_tours_and_tickets,3,toronto,canada,"{""lat"":43.6561507,""lng"":-79.3842642}",niagara_falls_day_trip_from_toronto,169.0,5.0
4,ontario,featured_tours_and_tickets,4,niagara_falls,canada,"{""lat"":43.0857136,""lng"":-79.0824311}","best_of_niagara_falls_tour_from_niagara_falls,...",158.0,5.0
5,ontario,featured_tours_and_tickets,5,niagara_falls,canada,"{""lat"":43.102436,""lng"":-78.961638}",niagara_falls_in_one_day:_deluxe_sightseeing_t...,204.42,5.0
6,british_columbia,featured_tours_and_tickets,6,vancouver,canada,"{""lat"":49.1978322,""lng"":-123.0649959}",whistler_small-group_day_trip_from_vancouver,145.0,5.0
7,ontario,featured_tours_and_tickets,7,niagara_falls,canada,"{""lat"":43.0857136,""lng"":-79.0824311}",ultimate_niagara_falls_tour_plus_helicopter_ri...,317.42,5.0
8,british_columbia,local_experiences,8,vancouver_island,canada,nil,"local_food,_craft_beverage_and_estate_winery_t...",150.0,5.0
9,british_columbia,local_experiences,9,vancouver,canada,"{""lat"":49.2869235,""lng"":-123.12216}",private_tour:_vancouver_to_victoria_island,670.0,5.0


In [8]:
def find_loc(x):
    toOut = re.findall('[+,-]\d+\.\d+',x)
    if len(toOut) == 0:
        return [None,None]
    else:
        return [float(x) for x in toOut]

loc_udf = functions.udf(lambda x: find_loc(x), ArrayType(FloatType()))
det_loc_df = det_df.withColumn('location', loc_udf(det_df.location)).orderBy('attraction_id')
det_loc_df = det_loc_df.withColumn('latitude',det_loc_df.location[0]).withColumn('longitude',det_loc_df.location[1]).drop('location')

display(det_loc_df.count())
display(det_loc_df.toPandas().head(11))

3637

Unnamed: 0,province,category,attraction_id,city,country,name,price,rating,latitude,longitude
0,british_columbia,featured_tours_and_tickets,0,vancouver,canada,vancouver_city_sightseeing_tour,80.0,4.5,-123.064995,
1,british_columbia,featured_tours_and_tickets,1,vancouver,canada,vancouver_to_victoria_and_butchart_gardens_tou...,210.0,5.0,-123.064995,
2,quebec,featured_tours_and_tickets,2,montreal,canada,quebec_city_and_montmorency_falls_day_trip_fro...,115.0,4.5,-73.572029,
3,ontario,featured_tours_and_tickets,3,toronto,canada,niagara_falls_day_trip_from_toronto,169.0,5.0,-79.384262,
4,ontario,featured_tours_and_tickets,4,niagara_falls,canada,"best_of_niagara_falls_tour_from_niagara_falls,...",158.0,5.0,-79.082428,
5,ontario,featured_tours_and_tickets,5,niagara_falls,canada,niagara_falls_in_one_day:_deluxe_sightseeing_t...,204.42,5.0,-78.961639,
6,british_columbia,featured_tours_and_tickets,6,vancouver,canada,whistler_small-group_day_trip_from_vancouver,145.0,5.0,-123.064995,
7,ontario,featured_tours_and_tickets,7,niagara_falls,canada,ultimate_niagara_falls_tour_plus_helicopter_ri...,317.42,5.0,-79.082428,
8,british_columbia,local_experiences,8,vancouver_island,canada,"local_food,_craft_beverage_and_estate_winery_t...",150.0,5.0,,
9,british_columbia,local_experiences,9,vancouver,canada,private_tour:_vancouver_to_victoria_island,670.0,5.0,-123.122162,


In [9]:
det_loc_df.createOrReplaceTempView('det_loc_df')
avg_cc_loc = spark.sql("SELECT city, category, AVG(latitude) as cc_lat, AVG(longitude) as cc_lon FROM det_loc_df WHERE ISNULL(latitude) = false AND ISNULL(longitude) = false GROUP BY city, category")

det_avgloc_df = det_loc_df.join(avg_cc_loc, ['city','category'],'left_outer')
det_avgloc_df = det_avgloc_df.withColumn('latitude', functions.when(det_avgloc_df['latitude'].isNull(),det_avgloc_df['cc_lat']).otherwise(det_avgloc_df['latitude'])).withColumn('longitude', functions.when(det_avgloc_df['longitude'].isNull(),det_avgloc_df['cc_lon']).otherwise(det_avgloc_df['longitude'])).drop(det_avgloc_df['cc_lat']).drop(det_avgloc_df['cc_lon']).orderBy('attraction_id')

display(det_avgloc_df.count())
display(det_avgloc_df.toPandas().head(11))

3637

Unnamed: 0,city,category,province,attraction_id,country,name,price,rating,latitude,longitude
0,vancouver,featured_tours_and_tickets,british_columbia,0,canada,vancouver_city_sightseeing_tour,80.0,4.5,-123.064995,
1,vancouver,featured_tours_and_tickets,british_columbia,1,canada,vancouver_to_victoria_and_butchart_gardens_tou...,210.0,5.0,-123.064995,
2,montreal,featured_tours_and_tickets,quebec,2,canada,quebec_city_and_montmorency_falls_day_trip_fro...,115.0,4.5,-73.572029,
3,toronto,featured_tours_and_tickets,ontario,3,canada,niagara_falls_day_trip_from_toronto,169.0,5.0,-79.384262,
4,niagara_falls,featured_tours_and_tickets,ontario,4,canada,"best_of_niagara_falls_tour_from_niagara_falls,...",158.0,5.0,-79.082428,
5,niagara_falls,featured_tours_and_tickets,ontario,5,canada,niagara_falls_in_one_day:_deluxe_sightseeing_t...,204.42,5.0,-78.961639,
6,vancouver,featured_tours_and_tickets,british_columbia,6,canada,whistler_small-group_day_trip_from_vancouver,145.0,5.0,-123.064995,
7,niagara_falls,featured_tours_and_tickets,ontario,7,canada,ultimate_niagara_falls_tour_plus_helicopter_ri...,317.42,5.0,-79.082428,
8,vancouver_island,local_experiences,british_columbia,8,canada,"local_food,_craft_beverage_and_estate_winery_t...",150.0,5.0,,
9,vancouver,local_experiences,british_columbia,9,canada,private_tour:_vancouver_to_victoria_island,670.0,5.0,-123.122162,


In [10]:
def get_loc(address, position, maps_key='AIzaSyC2jxjbR_svb9EjCeMBivCNEcCaaxdEYIA'):
    maps_api_url = 'https://maps.googleapis.com/maps/api/geocode/json'
    request_url = maps_api_url + '?' + urllib.parse.urlencode({'address':address,'key':maps_key})
    response = requests.get(request_url)
    resp_json_payload = response.json()
    out = resp_json_payload['results'][0]['geometry']['location']
    if position == 'latitude':
        return float(out['lat'])
    elif position == 'longitude':
        return float(out['lng'])

get_lat_udf = functions.udf(lambda x: get_loc(x,'latitude'), FloatType())
get_lon_udf = functions.udf(lambda x: get_loc(x,'longitude'), FloatType())
det_avgloc_df = det_avgloc_df.withColumn('latitude',functions.when(det_avgloc_df['latitude'].isNull(),get_lat_udf(det_avgloc_df['city']+','+det_avgloc_df['province'])).otherwise(det_avgloc_df['latitude']))
det_avgloc_df = det_avgloc_df.withColumn('longitude',functions.when(det_avgloc_df['longitude'].isNull(),get_lon_udf(det_avgloc_df['city']+','+det_avgloc_df['province'])).otherwise(det_avgloc_df['longitude']))

display(det_avgloc_df.count())

3637

In [12]:
det_avgloc_df.write.parquet('etl/attractions.json',mode='overwrite')

In [39]:
rev_df = spark.read.parquet(rev_path).repartition(160)
display(rev_df.count())
display(rev_df.show())

33925

+-------------+------+--------------------+-----------------+--------------+
|attraction_id|rating|              review|      review_date|          user|
+-------------+------+--------------------+-----------------+--------------+
|         1202|   5.0|Wicked fun!. My f...|  October 6, 2018|     coasttime|
|         1316|   5.0|An Experience Not...|    June 11, 2018|      rkleinpa|
|         1305|   5.0|Awesome Niagara. ...| November 3, 2017|     lindsay_o|
|         1930|   5.0|Great tour!. The ...|     June 1, 2018|drummercutie04|
|          568|   5.0|Delicious way to ...|   August 5, 2018|    sunmum2014|
|          998|   5.0|Snowmobile tour w...|     May 12, 2018|    trek401613|
|          946|   5.0|Icewalk. Had a gr...|February 20, 2019|       allen_c|
|         1314|   2.0|Timing is everyth...|  October 2, 2018|         kay_s|
|           48|   5.0|Fabulous . We had...| October 18, 2018|       avranga|
|          145|   5.0|make it a Muskoka...|September 2, 2018|  facebook1951|

None

In [40]:
def convert_date(ip_date):
    op_date = dt.strptime(ip_date, "%B %d, %Y").strftime("%d-%m-%Y")
    return op_date

convert_df_udf = functions.udf(lambda x: convert_date(x),StringType())
rev_df = rev_df.withColumn('review_date',convert_df_udf(rev_df['review_date']))
rev_df.createOrReplaceTempView('rev_df')

In [41]:
rev_df.show()

+-------------+------+--------------------+-----------+--------------+
|attraction_id|rating|              review|review_date|          user|
+-------------+------+--------------------+-----------+--------------+
|         1202|   5.0|Wicked fun!. My f...| 06-10-2018|     coasttime|
|         1316|   5.0|An Experience Not...| 11-06-2018|      rkleinpa|
|         1305|   5.0|Awesome Niagara. ...| 03-11-2017|     lindsay_o|
|         1930|   5.0|Great tour!. The ...| 01-06-2018|drummercutie04|
|          568|   5.0|Delicious way to ...| 05-08-2018|    sunmum2014|
|          998|   5.0|Snowmobile tour w...| 12-05-2018|    trek401613|
|          946|   5.0|Icewalk. Had a gr...| 20-02-2019|       allen_c|
|         1314|   2.0|Timing is everyth...| 02-10-2018|         kay_s|
|           48|   5.0|Fabulous . We had...| 18-10-2018|       avranga|
|          145|   5.0|make it a Muskoka...| 02-09-2018|  facebook1951|
|          242|   5.0|Very knowledgeabl...| 28-01-2019| curious190484|
|     

In [46]:
user_rev_count = spark.sql("SELECT user, COUNT(*) as rev_count FROM rev_df GROUP BY user ORDER BY rev_count DESC")
user_rev_count.show()

+----------+---------+
|      user|rev_count|
+----------+---------+
|    paul_s|       29|
|    john_m|       28|
|    dave_m|       21|
|    mike_m|       20|
|  kellyadl|       20|
|    paul_m|       19|
|   david_t|       19|
|   david_m|       18|
|  robert_m|       18|
|   linda_s|       18|
|  andrew_m|       18|
|     bob_c|       18|
|  robert_b|       17|
|    emma_l|       16|
|   74jeffz|       15|
|   diane_o|       15|
| michael_h|       15|
|jennifer_r|       14|
|     ian_h|       14|
|     jim_l|       14|
+----------+---------+
only showing top 20 rows



In [49]:
print( "Reviews are available for {att_no} attractions.".format(att_no = len(rev_df.select('attraction_id').distinct().collect())))
print( "Matrix will be higly sparse as the maximum number of reviews provided by an user is {val}.".format(val=user_rev_count.select('rev_count').limit(1).collect()[0][0]))

Reviews are available for 1619 attractions.
Matrix will be higly sparse as the maximum number of reviews provided by an user is 29


In [54]:
user_rev_count = user_rev_count.withColumn('user_id', functions.monotonically_increasing_id())
rev_etled = rev_df.join(user_rev_count.drop('rev_count'),'user')

In [57]:
user_rev_count.orderBy('user_id').show()

+----------+---------+-------+
|      user|rev_count|user_id|
+----------+---------+-------+
|    paul_s|       29|      0|
|    john_m|       28|      1|
|    dave_m|       21|      2|
|    mike_m|       20|      3|
|  kellyadl|       20|      4|
|    paul_m|       19|      5|
|   david_t|       19|      6|
|  robert_m|       18|      7|
|  andrew_m|       18|      8|
|     bob_c|       18|      9|
|   david_m|       18|     10|
|   linda_s|       18|     11|
|  robert_b|       17|     12|
|    emma_l|       16|     13|
| michael_h|       15|     14|
|   74jeffz|       15|     15|
|   diane_o|       15|     16|
|     ian_h|       14|     17|
|     jim_l|       14|     18|
|jennifer_r|       14|     19|
+----------+---------+-------+
only showing top 20 rows

