In [None]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import SQLContext, functions, types
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql import Row

In [2]:
# Reuse the SparkContext that is already running when this cell is executed
# again; constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("project"))
# SQL entry point used by the rest of the notebook (spark.sql / spark.createDataFrame).
spark = SQLContext(sc)

In [3]:
# Load the hotel metadata with pandas and expose it to Spark SQL as `h1_df`.
h_df = pd.read_json('dataset/hotel_info.json')
h1_df = spark.createDataFrame(h_df)
h1_df.createOrReplaceTempView('h1_df')

# Ids that occur on more than one row. HAVING replaces the former sorted
# subquery: the ORDER BY there had no effect on which ids were selected.
temp = spark.sql("SELECT id FROM h1_df GROUP BY id HAVING COUNT(*) > 1")
temp.createOrReplaceTempView('temp')

# Keep only hotels whose id is unique; every row of a repeated id is dropped
# (none of the copies is kept).
del_dup = spark.sql("SELECT h1_df.* FROM h1_df LEFT JOIN temp ON h1_df.id = temp.id WHERE temp.id IS NULL")

# Turn the amenities string into an array by splitting on commas. split()
# already returns array<string>, so no cast is needed. The tokens are not
# trimmed here: they keep whatever spaces, quotes or brackets surround them
# in the raw string.
del_dup = del_dup.withColumn("amenities", functions.split(del_dup["amenities"], ","))
del_dup.createOrReplaceTempView('del_dup')

# One row per (hotel id, amenity token).
newh_df = spark.sql("SELECT id, explode(amenities) AS amenities FROM del_dup")
newh_df.createOrReplaceTempView('newh_df')

# Frequency of each amenity token across all hotels, most common first.
newh1_df = spark.sql("SELECT amenities, COUNT(amenities) AS tot_count FROM newh_df GROUP BY amenities ORDER BY tot_count DESC")
newh1_df.createOrReplaceTempView('newh1_df')

In [7]:
# Amenities the user cares about, written the way interior tokens look after
# the raw amenities string is split on commas (leading space, single quotes).
amenities_pref = [" 'Non-smoking hotel'"," 'Air conditioning'"," 'Free parking'"," 'Pets Allowed ( Dog / Pet Friendly )'"," 'Free High Speed Internet (WiFi)'"]

pa_df = pd.DataFrame(amenities_pref, columns=["amenities_pref"])

a_df = spark.createDataFrame(pa_df)
a_df.createOrReplaceTempView('a_df')


def _strip_token(col):
    """Return `col` with leading/trailing whitespace, quotes and list brackets removed."""
    return functions.regexp_replace(col, r"^[\s\[\"']+|[\s\]\"']+$", "")


# Match hotel amenity tokens against the preferences on their normalized text.
# A plain equality on the raw tokens misses a preferred amenity whenever it is
# the first or last entry of a hotel's list, because those tokens still carry
# the "[" / "]" of the original list string.
newa_df = newh_df.join(
    a_df,
    _strip_token(newh_df["amenities"]) == _strip_token(a_df["amenities_pref"]),
    "inner",
)

# Collect the matched amenities per hotel and count them.
ameni_comb = newa_df.groupBy(functions.col("id")).agg(functions.collect_list(functions.col("amenities")).alias("amenities"))
amenities_len = ameni_comb.withColumn("ameni_len", functions.size(ameni_comb["amenities"])).orderBy(functions.col("ameni_len"), ascending=False)
amenities_len.createOrReplaceTempView("amenities_len")

# Attach each hotel's full amenity list to its match count, best matches first.
ameni_df = spark.sql("SELECT a.id,h.amenities,a.ameni_len FROM del_dup h INNER JOIN amenities_len a ON h.id=a.id ORDER BY a.ameni_len DESC")
ameni_df.show()

+----+--------------------+---------+
|  id|           amenities|ameni_len|
+----+--------------------+---------+
| 155|[['Pool',  'Free ...|        5|
|5846|[['Pool',  'Free ...|        5|
|1882|[['Restaurant',  ...|        5|
|5871|[['Restaurant',  ...|        5|
|1241|[['Pool',  'Free ...|        5|
|5208|[['Pool',  'Free ...|        5|
|5915|[['Pool',  'Room ...|        5|
|4161|[['Pool',  'Free ...|        5|
|1480|[['Pool',  'Free ...|        5|
|1711|[['Pool',  'Free ...|        5|
|2489|[['Pool',  'Free ...|        5|
|2906|[['Pool',  'Free ...|        5|
|  34|[['Pool',  'Free ...|        5|
| 385|[['Pool',  'Free ...|        5|
|3069|[['Pool',  'Free ...|        5|
|3061|[['Restaurant',  ...|        5|
|1055|[['Pool',  'Free ...|        5|
|4519|[['Pool',  'Room ...|        5|
| 938|[['Restaurant',  ...|        5|
|  22|[['Pool',  'Room ...|        5|
+----+--------------------+---------+
only showing top 20 rows



In [8]:
def get_rating(x, max_count=5):
    """Map a match count onto a 1-5 rating using five equal-width buckets.

    The bucket width is ``max_count / 5``. It was previously derived from
    ``x`` itself, which made every positive count fall through to 5.

    Args:
        x: Number of matched items (e.g. matched amenities). ``None`` is
            passed through so the function is safe as a Spark UDF on
            nullable columns.
        max_count: Largest count that can occur; counts at or above it
            rate 5. Defaults to 5.

    Returns:
        An int from 1 to 5, or ``None`` when ``x`` is ``None``.
    """
    if x is None:
        return None
    step = max_count / 5
    # Walk the upper bucket edges in order; the first edge that covers x wins.
    for rating in (1, 2, 3, 4):
        if x <= rating * step:
            return rating
    return 5

# Spark UDF around get_rating; the result column is typed as an integer.
find_rating = functions.udf(get_rating, types.IntegerType())

# Derive a rating for every hotel from its number of matched amenities.
usr_rating = ameni_df.withColumn("rating", find_rating(ameni_df["ameni_len"]))
usr_rating.show()

+----+--------------------+---------+------+
|  id|           amenities|ameni_len|rating|
+----+--------------------+---------+------+
|2983|[['Pool',  'Free ...|        5|     5|
|2489|[['Pool',  'Free ...|        5|     5|
|5846|[['Pool',  'Free ...|        5|     5|
|1480|[['Pool',  'Free ...|        5|     5|
|1711|[['Pool',  'Free ...|        5|     5|
|5915|[['Pool',  'Room ...|        5|     5|
|1241|[['Pool',  'Free ...|        5|     5|
| 155|[['Pool',  'Free ...|        5|     5|
|4161|[['Pool',  'Free ...|        5|     5|
|5871|[['Restaurant',  ...|        5|     5|
|  22|[['Pool',  'Room ...|        5|     5|
|3061|[['Restaurant',  ...|        5|     5|
|2906|[['Pool',  'Free ...|        5|     5|
| 385|[['Pool',  'Free ...|        5|     5|
|5208|[['Pool',  'Free ...|        5|     5|
| 938|[['Restaurant',  ...|        5|     5|
|1882|[['Restaurant',  ...|        5|     5|
|4519|[['Pool',  'Room ...|        5|     5|
|1055|[['Pool',  'Free ...|        5|     5|
|3069|[['P

In [11]:
# Load the reviews with pandas; att_id is the pandas category code of `id`,
# giving a dense integer key for it.
df = pd.read_json('dataset/reviews.json')
df["att_id"] = df.id.astype('category').cat.codes

rev_df = spark.createDataFrame(df)
rev_df.createOrReplaceTempView('rev_df')

# Ids that occur on more than one review row. HAVING replaces the former
# sorted subquery: the ORDER BY there had no effect on which ids were selected.
rev_temp = spark.sql("SELECT id FROM rev_df GROUP BY id HAVING COUNT(*) > 1")
rev_temp.createOrReplaceTempView('rev_temp')

# Keep only rows whose id is unique; every row of a repeated id is dropped.
# NOTE(review): if `id` identifies the reviewed item rather than the review,
# this removes every item that has more than one review - confirm intent.
s_df = spark.sql("SELECT rev_df.* FROM rev_df LEFT JOIN rev_temp ON rev_df.id = rev_temp.id WHERE rev_temp.id IS NULL")
s_df.createOrReplaceTempView('s_df')

# Index user_name into a numeric user_id and cast it to an integer column.
indexer = StringIndexer(inputCol="user_name", outputCol="user_id")
indexed = indexer.fit(s_df).transform(s_df)
u_id_df = indexed.withColumn("user_id", indexed["user_id"].cast("Int"))
u_id_df.createOrReplaceTempView('u_id_df')

# Number of distinct indexed users.
uid_count = u_id_df.select("user_id").distinct().count()

# String Indexing id 
# str_in = StringIndexer(inputCol="id", outputCol="att_id",stringOrderType='frequencyAsc')
# att_indexed = str_in.fit(u_id_df).transform(u_id_df)
# att_id_df = att_indexed.withColumn("att_id",indexed["id"].cast("Int"))
# att_id_df.createOrReplaceTempView('att_id_df')

+----+------+
|  id|att_id|
+----+------+
| 265|   202|
| 315|   233|
| 340|   244|
| 460|   332|
| 480|   347|
| 830|   613|
| 835|   617|
| 845|   623|
| 955|   694|
|1210|   873|
|1215|   875|
|1275|   925|
|1301|   946|
|1308|   951|
|1383|  1008|
|1445|  1040|
|1625|  1166|
|1790|  1312|
|1815|  1334|
|1980|  1459|
+----+------+
only showing top 20 rows



In [14]:
# Tag every preference-derived rating row with a constant `usr_id` column
# holding uid_count (the number of distinct indexed users).
usrid_df = usr_rating.select("*", functions.lit(uid_count).alias("usr_id"))
# usrid_df.show()

# comb_df = u_id_df.union(usrid_df)
# comb_df.show()

In [None]:
# Split the *indexed* reviews: u_id_df carries the integer user_id column that
# ALS is configured with (userCol="user_id"); s_df does not have that column.
# A fixed seed makes the 80/20 split reproducible across runs.
(training, test) = u_id_df.randomSplit([0.8, 0.2], seed=42)
training.show()

In [None]:
# Try several ALS ranks and keep the model with the lowest test RMSE.
ranks = [4, 8, 12]
error = float("inf")  # best (lowest) RMSE seen so far
best_rank = None

# The evaluator does not depend on the rank, so build it once.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="user_rating", predictionCol="prediction")

for i in ranks:
    als = ALS(maxIter=5, regParam=0.01, rank=i, userCol="user_id", itemCol="att_id", ratingCol="user_rating", coldStartStrategy="drop")
    model = als.fit(training)
    predictions = model.transform(test)
    rmse = evaluator.evaluate(predictions)
    if rmse < error:
        # New best: persist it, overwriting the previously saved model.
        model.write().overwrite().save("model_file")
        print("rank : ", i)
        error = rmse
        best_rank = i

# Report the RMSE of the saved (best) model, not of the last rank tried.
print("RMSE:" + str(error))

In [None]:
# User for whom candidate (att_id, user_id) pairs are built.
target_user_id = 13

# Distinct items reviewed by any other user, each paired with the target user.
# The lookup uses u_id_df because the integer user_id column exists only
# there; s_df has user_name but no user_id.
new_df = (
    u_id_df
    .filter(functions.col("user_id") != target_user_id)
    .select("att_id")
    .distinct()
    .withColumn("user_id", functions.lit(target_user_id))
)

# Reload the model saved by the rank search and show the top recommendation
# for every user. new_df can be scored with als_model.transform(new_df).
als_model = ALSModel.load("model_file")
als_model.recommendForAllUsers(1).show()

In [None]:
# Print the first rows of new_df, the att_id / user_id frame built earlier in the notebook.
new_df.show()