In [1]:
import pandas as pd
import re
import pyspark
from pyspark.sql import SQLContext, functions, types
from pyspark.ml.feature import StringIndexer
from pyspark.sql import Row
from geopy.geocoders import Nominatim

In [2]:
# Reuse the already-running SparkContext when this cell is executed again in
# the same kernel; constructing a second SparkContext raises
# "ValueError: Cannot run multiple SparkContexts at once".
# The app name is only applied when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("project"))

# Entry point used by the cells below for spark.sql(...) and
# spark.createDataFrame(...).
spark = SQLContext(sc)

In [3]:
## Reading hotel dataset scraped from TripAdvisor

h_df = pd.read_json('tripadvisor_hotel_output/hotel_info.json')
h1_df = spark.createDataFrame(h_df).cache()
h1_df.createOrReplaceTempView('h1_df')

## Removing duplicates from the hotel dataset
## NOTE: every row whose id occurs more than once is discarded -- no copy of a
## duplicated hotel is kept.
## NOTE(review): if one copy per id is supposed to survive,
## h1_df.dropDuplicates(['id']) would do that -- confirm the intent.

# ids that appear on more than one row (HAVING replaces the former sorted subquery;
# the sort order was never used).
temp = spark.sql("SELECT id FROM h1_df GROUP BY id HAVING COUNT(*) > 1")
temp.createOrReplaceTempView('temp')
# Anti-join: keep only the rows whose id is not in `temp`.
del_dup = spark.sql("SELECT h1_df.* FROM h1_df LEFT JOIN temp ON h1_df.id == temp.id WHERE temp.id IS NULL").cache()
del_dup.createOrReplaceTempView('del_dup')

## Splitting amenities based on ',' and type casting to array of strings

del_dup = del_dup.withColumn("amenities", functions.split(del_dup["amenities"], ",").cast("array<string>"))

## Filling missing prices
## A price counts as missing when it is null or NaN once read as a double.
## The mean is computed inside Spark rather than by collecting the column to the driver.

price_num = functions.col("price").cast("double")
price_missing = price_num.isNull() | functions.isnan(price_num)
avg_price = del_dup.where(~price_missing).agg(functions.avg(price_num)).first()[0]
if avg_price is None:
    # avg() over zero rows yields NULL; fail with a clear message instead of a ZeroDivisionError.
    raise ValueError("cannot fill missing prices: no valid price found in the hotel dataset")
avg_price_df = del_dup.withColumn('price', functions.when(price_missing, functions.lit(avg_price)).otherwise(functions.col("price")))

In [6]:
## Getting city using coordinates

# Module-level geopy Nominatim client; get_cname() calls its reverse() method
# to turn a coordinate string into an address.
# Per geopy's documentation, timeout=None disables the request timeout, so a
# stalled request will block instead of raising a timeout error.
geolocator = Nominatim(user_agent="new_recomm", timeout=None)

def get_cname(x):
    """Return the city (or, failing that, town) name for a coordinate string.

    Parameters
    ----------
    x : str or None
        Coordinate text whose first and last characters are stripped before
        use (assumes something like "[lat, lon]" -- TODO confirm the format
        produced by the scraper). The substring 'nil' marks missing
        coordinates.

    Returns
    -------
    str
        The "city" entry of the reverse-geocoded address, else its "town"
        entry, else the literal string "None". "None" is also returned when
        the input is null, contains 'nil', or the geocoder finds nothing.
    """
    # A null column value reaches a Python UDF as None; slicing it would raise.
    if x is None:
        return "None"

    coords = x[1:-1]
    if 'nil' in coords:
        return "None"

    location = geolocator.reverse(coords, timeout=None)
    # reverse() returns None when no result is found for the coordinates.
    if location is None:
        return "None"

    address = location.raw.get("address", {})
    # Prefer "city"; fall back to "town".
    for key in ("city", "town"):
        if key in address:
            return address[key]
    return "None"
    
# Spark UDF wrapper around get_cname; produces a string column.
get_city = functions.udf(get_cname, types.StringType())

# Add a "city" column derived from each row's "location" value. The result is
# cached; it is used both by the write below and through the temp view.
city_df = avg_price_df.withColumn(
    "city",
    get_city(avg_price_df["location"]),
).cache()

## Saving etled dataset

# Re-register the 'del_dup' view so it now points at the frame that carries
# the city column (this replaces the view registered earlier in the notebook).
city_df.createOrReplaceTempView('del_dup')

# Write as JSON in at most 4 part files, replacing any previous output.
city_df.coalesce(4).write.mode('overwrite').json('etl/del_dup')

In [7]:
## Explode amenities (one row per hotel id / amenity pair) to make predictions
## based on the number of amenities provided by the user

# Reads the 'del_dup' temp view; explode() needs `amenities` to be an array there.
exploded_df = spark.sql("SELECT id,explode(amenities) as amenities FROM del_dup")

##  Removing punctuation from the amenities column


def _strip_punctuation(text):
    """Drop every character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', text)


strip_udf = functions.udf(_strip_punctuation, types.StringType())

newh_df = exploded_df.withColumn("amenities", strip_udf(exploded_df["amenities"]))
newh_df.createOrReplaceTempView('newh_df')

# Write as JSON in at most 4 part files, replacing any previous output.
newh_df.coalesce(4).write.mode('overwrite').json('etl/newh_df')

In [8]:
## Reading the reviews dataset
df = pd.read_json('tripadvisor_hotel_output/reviews.json')

# att_id: integer category code assigned to each distinct value of `id`.
df["att_id"] = df["id"].astype('category').cat.codes

rev_df = spark.createDataFrame(df).cache()
rev_df.createOrReplaceTempView('rev_df')

## ids that occur on more than one row of the reviews table
rev_temp = spark.sql(
    "SELECT df.id FROM "
    "(SELECT id, COUNT(*) as tot_count FROM rev_df GROUP BY id ORDER BY tot_count DESC) df "
    "WHERE df.tot_count>1"
)
rev_temp.createOrReplaceTempView('rev_temp')

## Keep only rows whose id is NOT in rev_temp (every copy of a repeated id is dropped).
## NOTE(review): if `id` identifies the reviewed hotel rather than the review,
## this removes all reviews of any hotel with more than one review -- confirm.
s_df = spark.sql(
    "SELECT rev_df.* FROM rev_df "
    "LEFT JOIN rev_temp ON rev_df.id == rev_temp.id "
    "WHERE rev_temp.id IS NULL"
)
s_df.createOrReplaceTempView('s_df')


## String Indexing user_name

# Fit the indexer on the filtered reviews, then map user_name -> numeric user_id.
indexer = StringIndexer(inputCol="user_name", outputCol="user_id")
indexer_model = indexer.fit(s_df)
indexed = indexer_model.transform(s_df)

# Cast the indexer's output column to an integer type.
u_id_df = indexed.withColumn("user_id", functions.col("user_id").cast("Int")).cache()
u_id_df.createOrReplaceTempView('u_id_df')

# Write as JSON in at most 4 part files, replacing any previous output.
u_id_df.coalesce(4).write.mode('overwrite').json('etl/u_id_df')