In [36]:
import toLog
# Open the run log through the project-local toLog helper (not shown here);
# presumably this also records the start message -- confirm against toLog.
log = toLog.log('Feature extraction starting')

In [37]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession used by every cell below.
_session_builder = SparkSession.builder.appName('Python spark')
# NOTE(review): this key/value looks like the placeholder from the Spark
# quick-start example -- confirm whether it is needed at all.
_session_builder = _session_builder.config('spark.some.config.option', 'some-value')
spark = _session_builder.getOrCreate()

In [38]:
# Load the cleaned reviews CSV from HDFS. header=True takes the column names
# from the first line; inferSchema=True lets Spark guess each column's type.
df_cleaned = spark.read.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_cleaned.csv", header = True, inferSchema = True)
df_cleaned.printSchema()

root
 |-- id: long (nullable = true)
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: string (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: string (nullable = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- Review: string (nullable = true)
 |-- Hotel_Country: string (nullable = true)



In [43]:
# Sanity check: number of rows whose Review is NULL. A column predicate keeps
# the count inside Spark SQL instead of converting every row to a Python Row
# through .rdd just to test one field.
df_cleaned.filter(df_cleaned['Review'].isNull()).count()

0

In [39]:
# Register df_cleaned as the SQL view 'hotels'. createOrReplaceTempView can be
# re-run safely: it refreshes the view when it already exists and, unlike a
# bare `except: pass`, does not hide unrelated failures.
df_cleaned.createOrReplaceTempView('hotels')

In [40]:
# Keep only the columns used downstream (hotel name, review text and the
# hotel coordinates) from the 'hotels' view, then preview the first rows.
hotels_query = "SELECT Hotel_Name, Review, lat, lng FROM hotels"
df_hotels = spark.sql(hotels_query)
df_hotels.show()

+--------------------+--------------------+----------+----------+
|          Hotel_Name|              Review|       lat|       lng|
+--------------------+--------------------+----------+----------+
|         Hotel Arena|the staff in the ...|52.3605759| 4.9159683|
|         Hotel Arena|the overall hotel...|52.3605759| 4.9159683|
|    K K Hotel George|no bad experience...|51.4918878|-0.1949706|
|    K K Hotel George|the room is bit s...|51.4918878|-0.1949706|
|Apex Temple Court...|my son and his fa...|51.5137335|-0.1087512|
|Apex Temple Court...|nothing to improv...|51.5137335|-0.1087512|
|Apex Temple Court...|my self and my wi...|51.5137335|-0.1087512|
|Apex Temple Court...|expensive but thi...|51.5137335|-0.1087512|
|Apex Temple Court...|nothing not to li...|51.5137335|-0.1087512|
|Apex Temple Court...|rooms are comfort...|51.5137335|-0.1087512|
|The Park Grand Lo...|a socket for plug...|51.5142184|-0.1809032|
|The Park Grand Lo...|staff very friend...|51.5142184|-0.1809032|
|The Park 

In [41]:
# Total number of review rows in df_hotels.
print(df_hotels.count())

515500

In [6]:
# RDD of Rows over the four selected columns; the Python-side feature
# extraction below (getTriples, VADER) is applied to this RDD.
rddHotels = df_hotels.select('Hotel_Name', 'Review', 'lat', 'lng').rdd

# Feature extraction

In [7]:
from getTriples import getTriples

In [8]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single VADER analyzer instance; it is referenced inside the map lambda that
# scores each extracted triple further down.
vader = SentimentIntensityAnalyzer()

In [9]:
def splitTriple(record):
    """Expand one (key, triples) record into a list of (key, triple) pairs.

    record[0] is the key repeated in every pair; record[1] is the iterable of
    triples. An empty iterable gives an empty list.
    """
    return [(record[0], triple) for triple in record[1]]

In [None]:
# Dry run of the extraction: drop rows without a review, extract the triples
# of each review with getTriples, flatten to one (hotel, triple) pair per
# triple and show the first 40 pairs.
reviews_present = rddHotels.filter(lambda row: row['Review'] is not None)
hotel_triples = reviews_present.map(lambda row: (row['Hotel_Name'], getTriples(row['Review'])))
hotel_triples.flatMap(splitTriple).take(40)

In [None]:
# Progress marker written before the triple extraction step.
log.toLog('starting triples extraction')

In [10]:
# Fraction of reviews kept when the .sample() line below is enabled.
# NOTE(review): with that line commented out this constant is unused and the
# full dataset is processed -- confirm which mode is intended.
SAMPLE_PERC = 0.001
# One output row per extracted triple: (hotel, feature, sentiment score).
df_features = (rddHotels
 # Uncomment to work on a random sample (first argument False = without replacement).
 #.sample(False, SAMPLE_PERC) 
 .filter(lambda x: x['Review'] is not None)
 # (hotel name, triples returned by getTriples for that review)
 .map(lambda x: (x['Hotel_Name'], getTriples(x['Review'])))
 # one (hotel, triple) pair per triple
 .flatMap(splitTriple)
 # feature = first element of the triple; score = VADER 'compound' polarity
 # of the triple's elements joined with spaces
 .map(lambda x:( x[0], x[1][0], vader.polarity_scores(" ".join(x[1]))['compound'] )  )
).toDF(["hotel", "feature", "scores"])

In [11]:
# Register df_features as the SQL view 'features'. createOrReplaceTempView
# replaces an existing view in one step, so no bare `except` (which would also
# mask unrelated errors) is needed to handle re-runs.
df_features.createOrReplaceTempView('features')

In [12]:
# Preview the extracted (hotel, feature, score) rows.
spark.sql("SELECT hotel, feature, scores from features").show()

+--------------------+-----------+-------+
|               hotel|    feature| scores|
+--------------------+-----------+-------+
|DoubleTree by Hil...|        bed|-0.1027|
|The Mandeville Hotel|   standard|    0.0|
|The Mandeville Hotel|      staff| 0.5859|
|The Mandeville Hotel|conserierge|    0.0|
|The Mandeville Hotel|      asset|  0.743|
|Best Western Plus...|   location| 0.6249|
|Best Western Plus...|      rooms| 0.5859|
|Shangri La Hotel ...|  furniture| 0.5719|
|Royal Hotel Champ...|    options| -0.296|
|Royal Hotel Champ...|    cutlery|    0.0|
|Intercontinental ...|        bar|    0.0|
|Acad mie H tel Sa...|    nothing|-0.1511|
|Acad mie H tel Sa...|      staff| 0.4215|
|Acad mie H tel Sa...|     dinner|    0.0|
|The Premier Notti...|   bathroom|    0.0|
|The Premier Notti...|        bit|    0.0|
|The Premier Notti...|        bed| 0.5106|
|The Premier Notti...|       room| 0.4215|
|The Premier Notti...|      stuff| 0.4754|
|DoubleTree by Hil...|        pre|    0.0|
+----------

In [13]:
# Average sentiment and number of mentions for every (hotel, feature) pair,
# ordered by hotel and then by ascending average score; show 40 rows.
feature_stats_query = (
    "SELECT hotel, feature, "
    "AVG(scores) as avg_scores, COUNT(scores) as n_scores "
    "FROM features GROUP BY hotel, feature "
    "ORDER BY hotel, avg_scores  "
)
spark.sql(feature_stats_query).show(40)

+--------------------+------------+----------+--------+
|               hotel|     feature|avg_scores|n_scores|
+--------------------+------------+----------+--------+
|                  41|  everything|       0.0|       1|
|                  41|      access|       0.0|       1|
|                  41|          tv|       0.0|       1|
|                  41|        home|       0.0|       1|
|Acad mie H tel Sa...|     nothing|   -0.1511|       1|
|Acad mie H tel Sa...|      dinner|       0.0|       1|
|Acad mie H tel Sa...|       staff|    0.4215|       1|
|    Acevi Villarroel|         caf|       0.0|       1|
|    Acevi Villarroel|         day|       0.0|       1|
|    Acevi Villarroel|         spa|    0.5719|       1|
|  Aloft London Excel|        room|       0.0|       1|
|  Aloft London Excel|        feel|       0.0|       1|
|  Aloft London Excel|    chargers|    0.2732|       1|
|  Aloft London Excel|connectivity|    0.4404|       1|
|    Amadi Park Hotel|        room|       0.0|  

In [14]:
# Number of (hotel, feature, score) rows produced by the extraction.
df_features.count()

1724

# Defining categories using Word2Vec

In [None]:
# Progress marker written before Word2Vec training.
log.toLog('starting Word2Vec')

In [15]:
from pyspark.mllib.feature import Word2Vec

In [19]:
# Tokenize every non-null review into a list of words for Word2Vec training.
rdd_tokens = (df_cleaned.select("Review").rdd
              #.sample(False,0.1)
              .filter(lambda x: x['Review'] is not None)
              # split() with no argument collapses runs of whitespace, so
              # repeated, leading or trailing spaces no longer yield
              # empty-string tokens (split(" ") would pass '' on to Word2Vec
              # as if it were a word).
              .map(lambda x: x['Review'].split())
             )

In [20]:
# Fixed seed so that re-running the notebook trains the same embedding and
# therefore yields the same synonym lists; without setSeed each fit starts
# from a different random seed.
WORD2VEC_SEED = 42
# Ignore words seen fewer than 50 times; 200-dimensional vectors; context
# window of 4 words.
word2Vec = (Word2Vec()
            .setMinCount(50)
            .setVectorSize(200)
            .setWindowSize(4)
            .setSeed(WORD2VEC_SEED))

In [21]:
# Train the Word2Vec model on the tokenized reviews.
model = word2Vec.fit(rdd_tokens)

In [22]:
# Seed words, one per review category; each one is expanded below with its
# nearest Word2Vec neighbours.
categories = ['breakfast', 'staff', 'room', 'internet', 'location', 'bath', 'food']
# Intended topics: food (breakfast), staff (service), room, internet (wi-fi), location, bath

In [23]:
# For every seed word keep its 20 nearest neighbours as {word: similarity}.
all_categories = {}
for cat in categories:
    all_categories[cat] = dict(model.findSynonyms(cat, num = 20))

In [24]:
from pprint import pprint

In [25]:
# Show the raw synonym lists (before any similarity threshold is applied).
pprint(all_categories)

{'bath': {'bathtub': 0.7747597136378125,
          'chair': 0.7057696238180915,
          'curtain': 0.8221979716434152,
          'drain': 0.69020862582995,
          'gel': 0.7941029686718208,
          'hairdryer': 0.7072194399577826,
          'leaked': 0.724425893579412,
          'microwave': 0.6793925850183672,
          'mirror': 0.7003404123037882,
          'rain': 0.7740943176797076,
          'screen': 0.7764742030491375,
          'separate': 0.6809073623380965,
          'shower': 0.7346343788554097,
          'sink': 0.7451760409257452,
          'soap': 0.6941982680161047,
          'toilet': 0.7038944708923538,
          'toiletries': 0.6886999559916872,
          'towel': 0.7442922735865374,
          'tub': 0.8081234815377862,
          'wardrobe': 0.6739209747884857},
 'breakfast': {'ambiance': 0.4006449847052091,
               'breakfast.': 0.42801991514506854,
               'breakfasts': 0.4602912354652437,
               'buffet': 0.4354548126171054,
          

In [27]:
import json

# Persist the {category: {synonym: similarity}} mapping as JSON in the
# working directory.
# NOTE(review): this cell comes before the one that drops synonyms with
# similarity < 0.6, so in top-to-bottom order the file contains the
# unfiltered lists -- confirm that this is intended.
with open('categories.json', 'w') as outfile:
    json.dump(all_categories, outfile)

In [28]:
# Keep only synonyms whose similarity to the seed word is at least 0.6.
# The inner dicts are pruned in place, so all_categories itself is updated.
for dict_feat in all_categories.values():
    # Collect the keys first: a dict cannot be resized while iterating over it.
    weak_words = [word for word, similarity in dict_feat.items() if similarity < 0.6]
    for word in weak_words:
        del dict_feat[word]

pprint(all_categories)

{'bath': {'bathtub': 0.7747597136378125,
          'chair': 0.7057696238180915,
          'curtain': 0.8221979716434152,
          'drain': 0.69020862582995,
          'gel': 0.7941029686718208,
          'hairdryer': 0.7072194399577826,
          'leaked': 0.724425893579412,
          'microwave': 0.6793925850183672,
          'mirror': 0.7003404123037882,
          'rain': 0.7740943176797076,
          'screen': 0.7764742030491375,
          'separate': 0.6809073623380965,
          'shower': 0.7346343788554097,
          'sink': 0.7451760409257452,
          'soap': 0.6941982680161047,
          'toilet': 0.7038944708923538,
          'toiletries': 0.6886999559916872,
          'towel': 0.7442922735865374,
          'tub': 0.8081234815377862,
          'wardrobe': 0.6739209747884857},
 'breakfast': {'food': 0.6664242862539209},
 'food': {'breakfast': 0.666424361023118,
          'dinner': 0.6190873327705948,
          'menu': 0.7152232764402012,
          'quality': 0.61186010288234

In [None]:
# Display the category -> synonyms mapping as it currently stands.
all_categories

In [None]:
# TODO: remove duplicate words shared between different categories

In [29]:
def assign_categories(feat, categories=None):
    """Map an extracted feature word to one of the review categories.

    Args:
        feat: feature word to classify.
        categories: optional mapping {category: {synonym: similarity}}.
            Defaults to the notebook-level ``all_categories``.

    Returns:
        ``feat`` itself when it is a category name; otherwise the first
        category (in mapping order) that lists ``feat`` among its synonyms;
        ``'other'`` when nothing matches.
    """
    if categories is None:
        categories = all_categories

    # An exact category name always wins. Testing it per category inside the
    # loop let an earlier category that merely lists the word as a synonym
    # claim it first (e.g. 'food' is a synonym of 'breakfast', so the feature
    # 'food' was labelled 'breakfast').
    if feat in categories:
        return feat

    for cat, dict_feat in categories.items():
        if feat in dict_feat:
            return cat

    return 'other'
    

In [31]:
def _categorised_row(row):
    """Return (hotel, feature, category, score) for one df_features row."""
    return (row['hotel'], row['feature'], assign_categories(row['feature']), row['scores'])


# Attach a category to every feature row; the 'scores' column is written out
# under the name 'score'.
df_categories = df_features.rdd.map(_categorised_row).toDF(['hotel', 'feature', 'categories', 'score'])

In [32]:
# Inspect the first 10 categorised rows.
df_categories.head(10)

[Row(hotel='DoubleTree by Hilton London Islington', feature='bed', categories='other', score=-0.1027),
 Row(hotel='The Mandeville Hotel', feature='standard', categories='other', score=0.0),
 Row(hotel='The Mandeville Hotel', feature='staff', categories='staff', score=0.5859),
 Row(hotel='The Mandeville Hotel', feature='conserierge', categories='other', score=0.0),
 Row(hotel='The Mandeville Hotel', feature='asset', categories='other', score=0.743),
 Row(hotel='Best Western Plus Hotel Galles', feature='location', categories='location', score=0.6249),
 Row(hotel='Best Western Plus Hotel Galles', feature='rooms', categories='other', score=0.5859),
 Row(hotel='Shangri La Hotel at The Shard London', feature='furniture', categories='other', score=0.5719),
 Row(hotel='Royal Hotel Champs Elys es', feature='options', categories='other', score=-0.296),
 Row(hotel='Royal Hotel Champs Elys es', feature='cutlery', categories='other', score=0.0)]

In [None]:
# Save the categorised features (df_categories) to HDFS as CSV with a header row.
# NOTE(review): the target is named df_features.csv although the frame being
# written is df_categories. Spark's default save mode also raises when the
# path already exists, so a second run would fail here -- confirm both.
df_categories.write.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_features.csv", header = True)

In [33]:
# Register df_categories as the SQL view 'categories'.
# createOrReplaceTempView replaces an existing view in one step, so re-runs
# need no bare `except` (which would also mask unrelated errors).
df_categories.createOrReplaceTempView('categories')

In [34]:
# Preview the categorised rows through the 'categories' view.
spark.sql("SELECT hotel, categories, feature, score from categories").show()

+--------------------+----------+-----------+-------+
|               hotel|categories|    feature|  score|
+--------------------+----------+-----------+-------+
|DoubleTree by Hil...|     other|        bed|-0.1027|
|The Mandeville Hotel|     other|   standard|    0.0|
|The Mandeville Hotel|     staff|      staff| 0.5859|
|The Mandeville Hotel|     other|conserierge|    0.0|
|The Mandeville Hotel|     other|      asset|  0.743|
|Best Western Plus...|  location|   location| 0.6249|
|Best Western Plus...|     other|      rooms| 0.5859|
|Shangri La Hotel ...|     other|  furniture| 0.5719|
|Royal Hotel Champ...|     other|    options| -0.296|
|Royal Hotel Champ...|     other|    cutlery|    0.0|
|Intercontinental ...|     other|        bar|    0.0|
|Acad mie H tel Sa...|     other|    nothing|-0.1511|
|Acad mie H tel Sa...|     staff|      staff| 0.4215|
|Acad mie H tel Sa...|      food|     dinner|    0.0|
|The Premier Notti...|     other|   bathroom|    0.0|
|The Premier Notti...|     o

In [35]:
# Average score and number of mentions per (hotel, category), ordered by hotel
# and then by ascending average. No action such as .show() is called, so the
# cell only displays the resulting DataFrame's schema.
spark.sql(
    "SELECT hotel, categories, AVG(score) as avg_scores, COUNT(score) as n_scores "
    "FROM categories GROUP BY hotel, categories ORDER BY hotel, avg_scores  "
)

DataFrame[hotel: string, categories: string, avg_scores: double, n_scores: bigint]

# Export final dataframe

In [None]:
# Progress marker written before the per-hotel aggregation.
log.toLog('start group by hotel/categories')

In [None]:
import pyspark.sql.functions as func

In [None]:
# One row per hotel and one column per distinct value of 'categories'; each
# cell holds that hotel's average score for the category.
df_final = df_categories.groupBy("hotel").pivot('categories').agg(func.avg('score'))

In [None]:
# Write the per-hotel category averages (df_final) to HDFS as CSV with a header row.
# NOTE(review): the target file is named df_categories.csv although the frame
# written is df_final -- confirm the naming is intended.
df_final.write.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_categories.csv", header = True)

In [None]:
# Record the end of the run, then close the log (toLog is project-local;
# close() presumably releases the underlying log resource -- confirm).
log.toLog('end feature extraction')
log.close()