In [4]:
import toLog
log = toLog.log('Feature extraction starting')

In [2]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName('Python spark')
         .config('spark.some.config.option','some-value')
         .getOrCreate()
         )

In [2]:
# read file from hdfs and infer schema
df_cleaned = spark.read.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_cleaned.csv", header = True, inferSchema = True)
df_cleaned.printSchema()

root
 |-- id: long (nullable = true)
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: string (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: string (nullable = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- Review: string (nullable = true)
 |-- Hotel_Country: string (nullable = true)



In [3]:
df_cleaned.rdd.filter(lambda x: x['Review'] is None).count()

0

In [4]:
# register table (if not exists)
try:
    df_cleaned.createTempView('hotels')
except:
    pass

In [5]:
# selecting only the reviews using spark.sql
df_hotels = spark.sql("SELECT Hotel_Name, Review, lat, lng FROM hotels")
df_hotels.show()

+--------------------+--------------------+----------+----------+
|          Hotel_Name|              Review|       lat|       lng|
+--------------------+--------------------+----------+----------+
|         Hotel Arena|staff in restaura...|52.3605759| 4.9159683|
|         Hotel Arena|. overall hotel w...|52.3605759| 4.9159683|
|    K K Hotel George|no bad experience...|51.4918878|-0.1949706|
|    K K Hotel George|room is small any...|51.4918878|-0.1949706|
|Apex Temple Court...|my son and his fa...|51.5137335|-0.1087512|
|Apex Temple Court...|nothing to improv...|51.5137335|-0.1087512|
|Apex Temple Court...|my and my wife ca...|51.5137335|-0.1087512|
|Apex Temple Court...|expensive but thi...|51.5137335|-0.1087512|
|Apex Temple Court...|nothing not to li...|51.5137335|-0.1087512|
|Apex Temple Court...|. rooms are comfo...|51.5137335|-0.1087512|
|The Park Grand Lo...|socket for pluggi...|51.5142184|-0.1809032|
|The Park Grand Lo...|. staff very frie...|51.5142184|-0.1809032|
|The Park 

In [6]:
print(df_hotels.count())

504989


In [7]:
rddHotels = df_hotels.select('Hotel_Name', 'Review', 'lat', 'lng').rdd

# Features extraction

In [8]:
from getTriples import getTriples

In [9]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

In [10]:
def splitTriple(record):
    res = []
    for tripla in record[1]:
        res.append((record[0],tripla))
        
    return res

In [15]:
(rddHotels
 .map(lambda x: (x['Hotel_Name'], x['Review'] ))
 .map(lambda x: (x[0], getTriples(x[1])))
 .flatMap(splitTriple)).take(40)

[('Hotel Arena', ('building', 'very', 'historic')),
 ('Hotel Arena', ('hotel', '', 'overall')),
 ('Hotel Arena', ('work', '', 'complete')),
 ('K K Hotel George', ('experiences', '', 'bad')),
 ('K K Hotel George', ('nothing', '', 'wrong')),
 ('K K Hotel George', ('hotel', '', 'excellent')),
 ('K K Hotel George', ('room', 'very', 'comfortable')),
 ('K K Hotel George', ('size', '', 'good')),
 ('K K Hotel George', ('hotel', '', 'london')),
 ('K K Hotel George', ('tidy', '', 'clean')),
 ('K K Hotel George', ('staff', 'well', 'equipped')),
 ('K K Hotel George', ('class', '', 'first')),
 ('K K Hotel George', ('menu', '', 'comprehensive')),
 ('K K Hotel George', ('quality', '', 'good')),
 ('K K Hotel George', ('room', '', '')),
 ('K K Hotel George', ('anywaylondon', '', 'small')),
 ('Apex Temple Court Hotel', ('family', '', 'staying')),
 ('Apex Temple Court Hotel', ('rooms', '', 'identical')),
 ('Apex Temple Court Hotel', ('apex', 'directly', '')),
 ('Apex Temple Court Hotel', ('breakfasts', '

In [19]:
log.toLog('starting triples extraction')

In [16]:
SAMPLE_PERC = 0.001
df_features = (rddHotels
#.sample(False, SAMPLE_PERC) 
 .filter(lambda x: x['Review'] is not None)
 .map(lambda x: (x['Hotel_Name'], getTriples(x['Review'])))
 .flatMap(splitTriple)
 .map(lambda x:( x[0], x[1][0], vader.polarity_scores(" ".join(x[1]))['compound'] )  )
).toDF(["hotel", "feature", "scores"])

In [17]:
# register table (if not exists)
try:
    df_features.createTempView('features')
except:
    spark.catalog.dropTempView('features')
    df_features.createTempView('features')

In [18]:
spark.sql("SELECT hotel, feature, scores from features").show()

+--------------------+----------+-------+
|               hotel|   feature| scores|
+--------------------+----------+-------+
|Novotel Paris Cen...|      food|  0.296|
| The Harmonie Vienna|     hotel|    0.0|
| The Harmonie Vienna| proximity| 0.6249|
| The Harmonie Vienna|    center|    0.0|
| The Harmonie Vienna|     hotel|    0.0|
| The Harmonie Vienna|   station|    0.0|
| The Harmonie Vienna|     staff|    0.0|
| The Harmonie Vienna|      room|    0.0|
| The Harmonie Vienna|everything|    0.0|
| The Harmonie Vienna|     staff| 0.3612|
| The Harmonie Vienna|      mile|    0.0|
| The Harmonie Vienna|     hours|    0.0|
|Britannia Interna...|      room|    0.0|
|Britannia Interna...|  building|-0.1779|
|Britannia Interna...|      room|    0.0|
|Britannia Interna...|     value| 0.6486|
|Britannia Interna...|    london| 0.4404|
|Britannia Interna...|      area|    0.0|
| Strand Palace Hotel|     hotel| 0.5563|
| Strand Palace Hotel|     staff| 0.4754|
+--------------------+----------+-

In [19]:
spark.sql("SELECT hotel, feature, \
AVG(scores) as avg_scores, COUNT(scores) as n_scores \
FROM features GROUP BY hotel, feature \
ORDER BY hotel, avg_scores  ").show(40)

+--------------------+----------+----------+--------+
|               hotel|   feature|avg_scores|n_scores|
+--------------------+----------+----------+--------+
|                  41|     staff|       0.0|       1|
|                  41|     money|    0.2263|       1|
|                  41|  location|    0.5574|       1|
|                  41|     hotel|    0.5574|       1|
|ARCOTEL Kaiserwas...|      room|       0.0|       1|
|ARCOTEL Kaiserwas...|     style|       0.0|       1|
|ARCOTEL Kaiserwas...|      beds|       0.0|       1|
|ARCOTEL Kaiserwas...|       bed|       0.0|       1|
|Albus Hotel Amste...|  location|       0.0|       1|
|Albus Hotel Amste...|     hotel|       0.0|       1|
|Albus Hotel Amste...|    market|    0.4576|       1|
|Albus Hotel Amste...|     staff|    0.5413|       1|
|  Aloft London Excel| breakfast|       0.0|       1|
|  Aloft London Excel|      year|       0.0|       1|
|  Aloft London Excel|facilities|       0.0|       1|
|  Aloft London Excel|     v

In [20]:
df_features.count()

1701

In [21]:
import json
# reading the defined categories
with open('final_categories.json') as f:
     categories = json.load(f)

In [22]:
categories

{'bathroom': ['washbasin',
  'restroom',
  'cubicle',
  'bathrooms',
  'ensuite',
  'fixture',
  'bathroom.',
  'toilette',
  'washroom',
  'toilet',
  'bath',
  'shower',
  'bathtub',
  'towels',
  'showerhead',
  'wc'],
 'facilities': ['bar',
  'supplies',
  'facility.',
  'equipment',
  'furniture',
  'ammenities',
  'lift',
  'facilities.',
  'tv',
  'facility',
  'facilties',
  'facilites',
  'spa.',
  'amenities',
  'massages',
  'equipments',
  'pool'],
 'food': ['breakfest',
  'grill',
  'restaurants',
  'menu.',
  'breakfeast',
  'food.',
  'food',
  'resturant',
  'selections',
  'coffee',
  'resteraunt',
  'seafood',
  'menu',
  'sushi',
  'bfast',
  'brekkie',
  'tea',
  'dinner.',
  'breakfast',
  'carvery',
  'foods',
  'breakfast.',
  'bistro',
  'breakfasts',
  'menus',
  'breafast',
  'brekfast',
  'buffet',
  'meals',
  'presentation',
  'beakfast',
  'brakfast',
  'brasserie',
  'drinks',
  'breackfast',
  'steak',
  'assortment',
  'wines'],
 'internet': ['4g',
  'w

In [23]:
def assign_categories(feat):
    for cat,dict_feat in categories.items():
        if feat == cat or feat in dict_feat  :
            return cat
    
    return 'other'
    

In [24]:
df_categories = (df_features.rdd
                 .map(lambda x: (x['hotel'],x['feature'],assign_categories(x['feature']), x['scores'] ))
                 .toDF(['hotel', 'feature', 'categories', 'score'])
                )

In [25]:
df_categories.head(20)

[Row(hotel='Novotel Paris Centre Gare Montparnasse', feature='food', categories='food', score=0.296),
 Row(hotel='The Harmonie Vienna', feature='hotel', categories='other', score=0.0),
 Row(hotel='The Harmonie Vienna', feature='proximity', categories='other', score=0.6249),
 Row(hotel='The Harmonie Vienna', feature='center', categories='other', score=0.0),
 Row(hotel='The Harmonie Vienna', feature='hotel', categories='other', score=0.0),
 Row(hotel='The Harmonie Vienna', feature='station', categories='other', score=0.0),
 Row(hotel='The Harmonie Vienna', feature='staff', categories='staff', score=0.0),
 Row(hotel='The Harmonie Vienna', feature='room', categories='room', score=0.0),
 Row(hotel='The Harmonie Vienna', feature='everything', categories='other', score=0.0),
 Row(hotel='The Harmonie Vienna', feature='staff', categories='staff', score=0.3612),
 Row(hotel='The Harmonie Vienna', feature='mile', categories='other', score=0.0),
 Row(hotel='The Harmonie Vienna', feature='hours', ca

In [None]:
df_categories.write.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_features.csv", header = True)

# Features cleaning

In [3]:
df_features = spark.read.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_features.csv", header = True, inferSchema = True)

In [4]:
df_features.printSchema()

root
 |-- hotel: string (nullable = true)
 |-- feature: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- score: double (nullable = true)



In [5]:
try:
    df_features.createTempView('df_features')
except:
    spark.catalog.dropTempView('df_features')
    df_features.createTempView('df_features')

In [35]:
#remove all features with score 0.0
df_features_temp = spark.sql("SELECT * FROM df_features WHERE score != 0.0 and categories != 'other'")

In [36]:
try:
    df_features_temp.createTempView('temp')
except:
    spark.catalog.dropTempView('temp')
    df_features_temp.createTempView('temp')

In [37]:
spark.sql("SELECT hotel, count(categories) as conto FROM temp WHERE categories != 'other' GROUP BY hotel having conto > 600  order by conto DESC ").show()

+--------------------+-----+
|               hotel|conto|
+--------------------+-----+
|Park Plaza Westmi...| 4141|
| Strand Palace Hotel| 3827|
|Britannia Interna...| 3662|
|Copthorne Tara Ho...| 3259|
|DoubleTree by Hil...| 3233|
|Intercontinental ...| 2631|
|Grand Royale Lond...| 2627|
|Holiday Inn Londo...| 2584|
|Millennium Glouce...| 2324|
|Park Plaza County...| 2229|
|Park Grand Paddin...| 2218|
|Hilton London Met...| 2211|
|DoubleTree by Hil...| 2204|
|Park Grand London...| 2196|
|      Hotel Esther a| 2182|
| Blakemore Hyde Park| 2168|
|Hilton London Wem...| 2060|
|Park Plaza London...| 2019|
|St James Court A ...| 1991|
|The Tower A Guoma...| 1891|
+--------------------+-----+
only showing top 20 rows



In [38]:
df_features_cleaned = spark.sql("SELECT hotel, categories, feature, score, count(categories) over(partition by(hotel) order by score) as conto, hotel FROM temp WHERE categories != 'other' ORDER BY conto DESC").filter('conto > 70')

In [39]:
df_features_cleaned.count()

438259

In [40]:
# register table (if not exists)
try:
    df_features_cleaned.createTempView('categories')
except:
    spark.catalog.dropTempView('categories')
    df_features_cleaned.createTempView('categories')

In [41]:
spark.sql("SELECT hotel, categories, feature, score from categories").show()

+--------------------+----------+----------+------+
|               hotel|categories|   feature| score|
+--------------------+----------+----------+------+
|Park Plaza Westmi...|  location|     views| 0.839|
|Park Plaza Westmi...|     staff|     staff|0.8367|
|Park Plaza Westmi...|  location|     views|0.8316|
|Park Plaza Westmi...|      room|      room|0.8271|
|Park Plaza Westmi...|     staff|     staff|0.8258|
|Park Plaza Westmi...|      room|      room|0.8225|
|Park Plaza Westmi...|  location|     views|0.8211|
|Park Plaza Westmi...|  location|     views|0.8162|
|Park Plaza Westmi...|  location|  location|0.8074|
|Park Plaza Westmi...|  location|     views|0.8074|
|Park Plaza Westmi...|     staff|     staff|0.8074|
|Park Plaza Westmi...|      room|      room|0.7964|
|Park Plaza Westmi...|      room|      room|0.7964|
|Park Plaza Westmi...|      room|      room|0.7964|
|Park Plaza Westmi...|facilities|facilities|0.7964|
|Park Plaza Westmi...|      room|      room|0.7964|
|Park Plaza 

In [42]:
spark.sql("SELECT hotel, categories, AVG(score) as avg_scores, COUNT(score) as n_scores \
FROM categories GROUP BY hotel, categories ORDER BY hotel, avg_scores  ").show()

+--------------------+----------+-------------------+--------+
|               hotel|categories|         avg_scores|n_scores|
+--------------------+----------+-------------------+--------+
|  11 Cadogan Gardens|facilities| 0.5566666666666666|       3|
|  11 Cadogan Gardens|      food| 0.5716857142857144|       7|
|  11 Cadogan Gardens|      room| 0.5717533333333333|      15|
|  11 Cadogan Gardens|  location|          0.5947625|      24|
|  11 Cadogan Gardens|  bathroom|            0.59548|       5|
|  11 Cadogan Gardens|     staff| 0.6027156250000001|      32|
|            1K Hotel|      food|0.48300000000000004|      11|
|            1K Hotel|  bathroom| 0.5076333333333333|       3|
|            1K Hotel|  internet|             0.5106|       1|
|            1K Hotel|     staff|            0.51512|      25|
|            1K Hotel|      room| 0.5401684210526316|      19|
|            1K Hotel|  location| 0.5581916666666668|      36|
|            1K Hotel|facilities|            0.61005|  

# Export final dataframe

In [None]:
log.toLog('start group by hotel/categories')

In [43]:
import pyspark.sql.functions as func

In [44]:
df_final = df_features_cleaned.groupBy("hotel").pivot('categories').agg(func.avg('score'))

In [45]:
df_final.write.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_categories.csv", header = True)

In [None]:
log.toLog('end feature extraction')
log.close()