In [4]:
import toLog
# Create the run logger with its first message. The same `log` object is used
# later in this notebook through log.toLog(...) and, at the very end, log.close().
log = toLog.log('Feature extraction starting')

In [5]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session that every cell below works through.
# The former .config('spark.some.config.option', 'some-value') call was the
# placeholder pair from the Spark getting-started example; it is not a real
# Spark setting, so it has been dropped.
spark = (SparkSession.builder
         .appName('Python spark')
         .getOrCreate()
         )

In [6]:
# Load the cleaned reviews CSV from HDFS. The first line supplies the column
# names and the column types are inferred by Spark instead of being declared.
df_cleaned = spark.read.csv(
    path="hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_cleaned.csv",
    inferSchema=True,
    header=True,
)
df_cleaned.printSchema()

root
 |-- id: long (nullable = true)
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: string (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: string (nullable = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- Review: string (nullable = true)
 |-- Hotel_Country: string (nullable = true)



In [7]:
# Count the reviews whose text is missing. The null test is written as a column
# predicate so Spark evaluates it itself, instead of turning every row into a
# Python object through .rdd only to compare one field with None.
df_cleaned.filter(df_cleaned['Review'].isNull()).count()

0

In [8]:
# Register df_cleaned as the SQL view 'hotels'.
# createOrReplaceTempView can be re-run safely: an already registered 'hotels'
# view is replaced by the current DataFrame, and genuine failures are no longer
# hidden behind a bare `except: pass`.
df_cleaned.createOrReplaceTempView('hotels')

In [9]:
# Select the hotel name, the review text and the coordinates (lat, lng) from
# the 'hotels' view with Spark SQL, then preview the first rows.
df_hotels = spark.sql("SELECT Hotel_Name, Review, lat, lng FROM hotels")
df_hotels.show()

+--------------------+--------------------+----------+----------+
|          Hotel_Name|              Review|       lat|       lng|
+--------------------+--------------------+----------+----------+
|         Hotel Arena|the staff in the ...|52.3605759| 4.9159683|
|         Hotel Arena|. the overall hot...|52.3605759| 4.9159683|
|    K K Hotel George|no bad experience...|51.4918878|-0.1949706|
|    K K Hotel George|the room is bit s...|51.4918878|-0.1949706|
|Apex Temple Court...|my son and his fa...|51.5137335|-0.1087512|
|Apex Temple Court...|nothing to improv...|51.5137335|-0.1087512|
|Apex Temple Court...|my self and my wi...|51.5137335|-0.1087512|
|Apex Temple Court...|expensive but thi...|51.5137335|-0.1087512|
|Apex Temple Court...|nothing not to li...|51.5137335|-0.1087512|
|Apex Temple Court...|. rooms are comfo...|51.5137335|-0.1087512|
|The Park Grand Lo...|a socket for plug...|51.5142184|-0.1809032|
|The Park Grand Lo...|. staff very frie...|51.5142184|-0.1809032|
|The Park 

In [10]:
# Number of rows in df_hotels. Left as the cell's last expression so the
# notebook displays it as the cell result, without a print() wrapper.
df_hotels.count()

505009


In [11]:
# RDD of Row objects over df_hotels. That DataFrame is built from
# "SELECT Hotel_Name, Review, lat, lng", so it already holds exactly these four
# columns and no further projection is applied before taking .rdd.
rddHotels = df_hotels.rdd

# Feature extraction

In [12]:
from getTriples import getTriples

In [13]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# One shared VADER analyzer. It is referenced inside the lambda that scores each
# triple in the feature-extraction cell (vader.polarity_scores(...)['compound']).
vader = SentimentIntensityAnalyzer()

In [14]:
def splitTriple(record):
    """Expand one (key, triples) record into a list of (key, triple) pairs.

    record[0] is the key copied onto every pair; record[1] is an iterable of
    triples. The returned list keeps the order of record[1] and is empty when
    record[1] is empty.
    """
    key, triples = record[0], record[1]
    return [(key, triple) for triple in triples]

In [18]:
# Preview of the extraction: for each review that has text, pair the hotel name
# with every triple getTriples returns, and show the first 40 pairs.
(rddHotels
 .filter(lambda row: row['Review'] is not None)
 .map(lambda row: (row['Hotel_Name'], getTriples(row['Review'])))
 .flatMap(splitTriple)
 .take(40))

[('Hotel Arena', ('building', 'very', 'historic')),
 ('Hotel Arena', ('hotel', '', 'overall')),
 ('Hotel Arena', ('work', '', 'complete')),
 ('K K Hotel George', ('experiences', '', 'bad')),
 ('K K Hotel George', ('nothing', '', 'wrong')),
 ('K K Hotel George', ('hotel', '', 'excellent')),
 ('K K Hotel George', ('room', 'very', 'comfortable')),
 ('K K Hotel George', ('size', '', 'good')),
 ('K K Hotel George', ('hotel', '', 'london')),
 ('K K Hotel George', ('tidy', '', 'clean')),
 ('K K Hotel George', ('staff', 'well', 'equipped')),
 ('K K Hotel George', ('class', '', 'first')),
 ('K K Hotel George', ('menu', '', 'comprehensive')),
 ('K K Hotel George', ('quality', '', 'good')),
 ('K K Hotel George', ('room', '', '')),
 ('K K Hotel George', ('anywaylondon', '', 'small')),
 ('Apex Temple Court Hotel', ('family', '', 'staying')),
 ('Apex Temple Court Hotel', ('rooms', '', 'identical')),
 ('Apex Temple Court Hotel', ('apex', 'directly', '')),
 ('Apex Temple Court Hotel', ('breakfasts', '

In [19]:
# Record in the run log that the triple-extraction stage is about to start.
log.toLog('starting triples extraction')

In [20]:
# Fraction of reviews to keep (0.001 keeps about 0.1 % of the rows); despite the
# name it is a fraction in [0, 1], not a percentage.
SAMPLE_PERC = 0.001
# Fixed seed for the sampling step. Without one, RDD.sample picks a new random
# seed each time this cell is executed, so the tables below could not be
# reproduced after a kernel restart.
SAMPLE_SEED = 42

# For each sampled review with text: extract its triples, emit one record per
# triple, keep the triple's first element as the feature name, and score the
# triple's words (joined with spaces) with VADER's 'compound' polarity.
# NOTE(review): the result is not cached, so each action on df_features
# presumably re-runs the extraction; consider persisting it for a full-size run.
df_features = (rddHotels
 .sample(False, SAMPLE_PERC, SAMPLE_SEED)
 .filter(lambda x: x['Review'] is not None)
 .map(lambda x: (x['Hotel_Name'], getTriples(x['Review'])))
 .flatMap(splitTriple)
 .map(lambda x: (x[0], x[1][0], vader.polarity_scores(" ".join(x[1]))['compound']))
).toDF(["hotel", "feature", "scores"])

In [21]:
# Register df_features as the SQL view 'features', replacing any view of that
# name left over from an earlier run. This is the single-call equivalent of the
# previous try / drop / re-create sequence, without a bare `except` that would
# also have caught unrelated errors.
df_features.createOrReplaceTempView('features')

In [22]:
# Preview the 'features' view: one row per extracted triple, with its hotel,
# feature name and sentiment score.
spark.sql("SELECT hotel, feature, scores from features").show()

+--------------------+------------+------+
|               hotel|     feature|scores|
+--------------------+------------+------+
|  ME London by Melia|    everyone|   0.0|
|  ME London by Melia|       lover|0.5859|
|  ME London by Melia|    waitress|   0.0|
|  ME London by Melia|         bar|   0.0|
|  ME London by Melia|      charge|   0.0|
|  ME London by Melia|   refilling|   0.0|
|  ME London by Melia|       radio|   0.0|
|  ME London by Melia|         bar|0.5574|
|  ME London by Melia|       staff|0.5267|
|  ME London by Melia|       floor|   0.0|
|  ME London by Melia|architecture|   0.0|
|  ME London by Melia|         wow| 0.836|
|The Cumberland A ...|         bit|   0.0|
|The Cumberland A ...|    location|0.6249|
|The Cumberland A ...|      decent|   0.0|
|Mercure Amsterdam...|          tv|   0.0|
|Mercure Amsterdam...|        work|   0.0|
|Mercure Amsterdam...|     station|   0.0|
|Mercure Amsterdam...|    location|0.6249|
|Mercure Amsterdam...|        room|0.6124|
+----------

In [23]:
# Per (hotel, feature): mean sentiment score and number of scored triples,
# ordered by hotel and then by ascending mean score; the first 40 rows are shown.
spark.sql(
    "SELECT hotel, feature, "
    "AVG(scores) as avg_scores, COUNT(scores) as n_scores "
    "FROM features GROUP BY hotel, feature "
    "ORDER BY hotel, avg_scores  "
).show(40)

+--------------------+-----------+----------+--------+
|               hotel|    feature|avg_scores|n_scores|
+--------------------+-----------+----------+--------+
|25hours Hotel bei...|   internet|   -0.5423|       1|
|25hours Hotel bei...|      guest|       0.0|       1|
|25hours Hotel bei...|       wifi|       0.0|       1|
|25hours Hotel bei...|     waiter|       0.0|       1|
|25hours Hotel bei...|        bit|       0.0|       2|
|25hours Hotel bei...|    terrace|       0.0|       1|
|25hours Hotel bei...|      style|       0.0|       1|
|25hours Hotel bei...|      staff|   0.48465|       2|
|25hours Hotel bei...|  breakfast|   0.50615|       2|
|25hours Hotel bei...|       view|    0.5574|       1|
|25hours Hotel bei...|   location|    0.6249|       1|
|   9Hotel Republique|      staff|     -0.43|       1|
|   9Hotel Republique|       city|       0.0|       1|
|   9Hotel Republique|        bed|    0.5563|       1|
|   9Hotel Republique|     design|    0.6249|       1|
|   9Hotel

In [24]:
# Number of (hotel, feature, score) rows obtained from the sampled reviews.
df_features.count()

1968

In [25]:
import json
# Read the predefined categories (category name -> list of feature words) from
# final_categories.json. The encoding is given explicitly because open()
# otherwise uses the platform's locale encoding, whereas JSON text is UTF-8.
with open('final_categories.json', encoding='utf-8') as f:
    categories = json.load(f)

In [26]:
# Display the loaded mapping (category name -> list of feature words).
categories

{'bathroom': ['washbasin',
  'restroom',
  'cubicle',
  'bathrooms',
  'ensuite',
  'fixture',
  'bathroom.',
  'toilette',
  'washroom',
  'toilet',
  'bath',
  'shower',
  'bathtub',
  'towels',
  'showerhead',
  'wc'],
 'facilities': ['bar',
  'supplies',
  'facility.',
  'equipment',
  'furniture',
  'ammenities',
  'lift',
  'facilities.',
  'tv',
  'facility',
  'facilties',
  'facilites',
  'spa.',
  'amenities',
  'massages',
  'equipments',
  'pool'],
 'food': ['breakfest',
  'grill',
  'restaurants',
  'menu.',
  'breakfeast',
  'food.',
  'food',
  'resturant',
  'selections',
  'coffee',
  'resteraunt',
  'seafood',
  'menu',
  'sushi',
  'bfast',
  'brekkie',
  'tea',
  'dinner.',
  'breakfast',
  'carvery',
  'foods',
  'breakfast.',
  'bistro',
  'breakfasts',
  'menus',
  'breafast',
  'brekfast',
  'buffet',
  'meals',
  'presentation',
  'beakfast',
  'brakfast',
  'brasserie',
  'drinks',
  'breackfast',
  'steak',
  'assortment',
  'wines'],
 'internet': ['4g',
  'w

In [27]:
def assign_categories(feat, category_map=None):
    """Return the category a feature word belongs to, or 'other'.

    Parameters
    ----------
    feat : str
        Feature word to classify.
    category_map : dict, optional
        Mapping of category name -> collection of feature words. When omitted,
        the notebook-level `categories` dict is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    str
        The first category, in the mapping's iteration order, whose name equals
        `feat` or whose word collection contains `feat`; 'other' if none match.
    """
    if category_map is None:
        # Fall back to the mapping loaded from final_categories.json.
        category_map = categories
    for cat, members in category_map.items():
        if feat == cat or feat in members:
            return cat
    return 'other'
    

In [28]:
def _row_with_category(row):
    """Turn a df_features row into (hotel, feature, category, score)."""
    feature = row['feature']
    return (row['hotel'], feature, assign_categories(feature), row['scores'])


# Attach a category to every extracted feature; the 'scores' column of
# df_features is carried over under the name 'score'.
df_categories = df_features.rdd.map(_row_with_category).toDF(
    ['hotel', 'feature', 'categories', 'score']
)

In [30]:
# Inspect the first 20 categorized rows as Row objects.
df_categories.head(20)

[Row(hotel='ME London by Melia', feature='everyone', categories='other', score=0.0),
 Row(hotel='ME London by Melia', feature='lover', categories='other', score=0.5859),
 Row(hotel='ME London by Melia', feature='waitress', categories='other', score=0.0),
 Row(hotel='ME London by Melia', feature='bar', categories='facilities', score=0.0),
 Row(hotel='ME London by Melia', feature='charge', categories='other', score=0.0),
 Row(hotel='ME London by Melia', feature='refilling', categories='other', score=0.0),
 Row(hotel='ME London by Melia', feature='radio', categories='other', score=0.0),
 Row(hotel='ME London by Melia', feature='bar', categories='facilities', score=0.5574),
 Row(hotel='ME London by Melia', feature='staff', categories='staff', score=0.5267),
 Row(hotel='ME London by Melia', feature='floor', categories='other', score=0.0),
 Row(hotel='ME London by Melia', feature='architecture', categories='other', score=0.0),
 Row(hotel='ME London by Melia', feature='wow', categories='other

In [None]:
# Write the categorized features (df_categories) to HDFS as CSV with a header.
# Note that the target is named df_features.csv although it holds df_categories.
# NOTE(review): no save mode is passed, so Spark's default applies; presumably
# this fails if the path already exists. Confirm that is intended for re-runs.
df_categories.write.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_features.csv", header = True)

In [31]:
# Register df_categories as the SQL view 'categories', replacing any view of
# that name left over from an earlier run. This is the single-call equivalent of
# the previous try / drop / re-create sequence, without a bare `except` that
# would also have caught unrelated errors.
df_categories.createOrReplaceTempView('categories')

In [32]:
# Preview the 'categories' view: hotel, assigned category, feature and score.
spark.sql("SELECT hotel, categories, feature, score from categories").show()

+--------------------+----------+------------+------+
|               hotel|categories|     feature| score|
+--------------------+----------+------------+------+
|  ME London by Melia|     other|    everyone|   0.0|
|  ME London by Melia|     other|       lover|0.5859|
|  ME London by Melia|     other|    waitress|   0.0|
|  ME London by Melia|facilities|         bar|   0.0|
|  ME London by Melia|     other|      charge|   0.0|
|  ME London by Melia|     other|   refilling|   0.0|
|  ME London by Melia|     other|       radio|   0.0|
|  ME London by Melia|facilities|         bar|0.5574|
|  ME London by Melia|     staff|       staff|0.5267|
|  ME London by Melia|     other|       floor|   0.0|
|  ME London by Melia|     other|architecture|   0.0|
|  ME London by Melia|     other|         wow| 0.836|
|The Cumberland A ...|     other|         bit|   0.0|
|The Cumberland A ...|  location|    location|0.6249|
|The Cumberland A ...|     other|      decent|   0.0|
|Mercure Amsterdam...|facili

In [35]:
# Per (hotel, category): mean sentiment score and number of scored triples,
# ordered by hotel and then by ascending mean score. No action is called, so the
# cell only displays the resulting DataFrame's schema summary.
spark.sql(
    "SELECT hotel, categories, AVG(score) as avg_scores, COUNT(score) as n_scores "
    "FROM categories GROUP BY hotel, categories ORDER BY hotel, avg_scores  "
)

DataFrame[hotel: string, categories: string, avg_scores: double, n_scores: bigint]

# Export final dataframe

In [None]:
# Record in the run log that the per-hotel / per-category aggregation starts.
log.toLog('start group by hotel/categories')

In [None]:
import pyspark.sql.functions as func

In [None]:
# Final table: one row per hotel and one column per distinct value of
# 'categories', each cell holding the mean 'score' of that hotel's rows in that
# category.
# NOTE(review): pivot() is given no explicit value list, so the set of output
# columns depends on which categories occur in the data. Confirm that is acceptable.
df_final = df_categories.groupBy("hotel").pivot('categories').agg(func.avg('score'))

In [None]:
# Write the per-hotel category averages (df_final) to HDFS as CSV with a header.
# Note that the target is named df_categories.csv although it holds df_final.
# NOTE(review): no save mode is passed, so Spark's default applies; presumably
# this fails if the path already exists. Confirm that is intended for re-runs.
df_final.write.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_categories.csv", header = True)

In [None]:
# Write the final log message, then call close() on the logger created in the
# first cell.
log.toLog('end feature extraction')
log.close()