In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the SparkSession used by the rest of the notebook.
# The builder is configured step by step instead of in one chained expression.
session_builder = SparkSession.builder.appName('ddam_project')
session_builder = session_builder.config('spark.some.config.option', 'some-value')
spark = session_builder.getOrCreate()

In [2]:
# Load the hotel-reviews CSV from HDFS: the first line is used as the header
# and Spark infers the column types. The schema is printed as a quick check.
reviews_csv_path = "hdfs://masterbig-1.itc.unipi.it:54310/user/student18/Hotel_Reviews.csv"
df_raw = spark.read.csv(reviews_csv_path, header=True, inferSchema=True)
df_raw.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: string (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: string (nullable = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)



In [3]:
from pyspark.sql.functions import monotonically_increasing_id

In [4]:
# Add an 'id' column so that each review row can be referenced later
# (the cleaned review text is joined back onto these rows by 'id').
# NOTE(review): per the PySpark docs, monotonically_increasing_id() yields
# unique, increasing but non-consecutive values that depend on partitioning;
# the later join assumes the ids are identical every time df_raw_id is
# re-evaluated -- TODO confirm, or cache/persist df_raw_id.
df_raw_id = df_raw.withColumn('id', monotonically_increasing_id())
# Show the schema (now including 'id') and one sample row.
df_raw_id.printSchema()
df_raw_id.take(1)

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: string (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: string (nullable = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- id: long (nullable = false)



[Row(Hotel_Address=' s Gravesandestraat 55 Oost 1092 AA Amsterdam Netherlands', Additional_Number_of_Scoring=194, Review_Date='8/3/2017', Average_Score=7.7, Hotel_Name='Hotel Arena', Reviewer_Nationality=' Russia ', Negative_Review=' I am so angry that i made this post available via all possible sites i use when planing my trips so no one will make the mistake of booking this place I made my booking via booking com We stayed for 6 nights in this hotel from 11 to 17 July Upon arrival we were placed in a small room on the 2nd floor of the hotel It turned out that this was not the room we booked I had specially reserved the 2 level duplex room so that we would have a big windows and high ceilings The room itself was ok if you don t mind the broken window that can not be closed hello rain and a mini fridge that contained some sort of a bio weapon at least i guessed so by the smell of it I intimately asked to change the room and after explaining 2 times that i booked a duplex btw it costs t

## Reviews transformation

In [5]:
def catReviews(row):
    """Merge a row's negative and positive review into one lower-cased string.

    The placeholder values "No Negative" / "No Positive" mean that the
    reviewer left that part blank. A null (None) cell is handled the same way,
    so a missing value no longer raises AttributeError on ``.lower()``.

    Args:
        row: any mapping-like object that supports ``row["Negative_Review"]``
            and ``row["Positive_Review"]`` (e.g. a Spark Row or a dict).

    Returns:
        - "EMPTY" when neither part contains a review,
        - the lower-cased text of the only part that is present, or
        - ``negative.lower() + ". " + positive.lower()`` when both are present.
    """
    negative = row["Negative_Review"]
    positive = row["Positive_Review"]

    # Both columns are nullable, so guard against None before calling .lower().
    has_negative = negative is not None and negative != "No Negative"
    has_positive = positive is not None and positive != "No Positive"

    if has_negative and has_positive:
        return negative.lower() + ". " + positive.lower()
    if has_negative:
        return negative.lower()
    if has_positive:
        return positive.lower()
    return "EMPTY"
        

In [6]:
import re

# Token as it appears in the review text -> corrected spelling.
_CONTRACTION_FIXES = {
    "don t": "don't",
    "didn t": "didn't",
    "haven t": "haven't",
    "hadn t": "hadn't",
    "isn t": "isn't",
    "weren t": "weren't",
    "wasn t": "wasn't",
    "dont": "don't",
    "didnt": "didn't",
    "i": "I",
}

# A token only matches when it is delimited by whitespace or by the start/end
# of the string. The lookarounds do not consume the delimiter, so adjacent
# tokens that share a single space are all corrected.
_CONTRACTION_RE = re.compile(
    r"(?<!\S)(?:" + "|".join(re.escape(token) for token in _CONTRACTION_FIXES) + r")(?!\S)"
)


def correction(row):
    """Restore apostrophes in common contractions and capitalise a lone "i".

    Unlike a chain of ``str.replace(" token ", ...)`` calls, this also fixes
    tokens at the very beginning or end of the text and tokens that directly
    follow each other (e.g. " i i " becomes " I I ").

    Args:
        row: the review text (a str), despite the parameter name.

    Returns:
        The text with every whitespace-delimited occurrence of a known token
        replaced by its corrected form; everything else is left untouched.
    """
    return _CONTRACTION_RE.sub(lambda match: _CONTRACTION_FIXES[match.group(0)], row)

In [53]:
#counting rows with empty reviews
#df_raw_id.rdd.map(lambda x: (x['id'], catReviews(x))
#                  .filter(lambda x: x[1] == "EMPTY").count().take(1) )

PythonRDD[72] at RDD at PythonRDD.scala:48

In [7]:
# Pair every row id with its merged review text, discard the rows that carry
# no review at all ("EMPTY"), then repair contractions in the remaining text.
rdd_reviews = (
    df_raw_id.rdd
    .map(lambda record: (record['id'], catReviews(record)))
    .filter(lambda id_text: id_text[1] != "EMPTY")
    .map(lambda id_text: (id_text[0], correction(id_text[1])))
)

In [None]:
#rdd_reviews.take(3)

# Keep only English reviews

In [8]:
import langdetect as ld

In [9]:
#detect english reviews


#reviews_rdd.map(ld.detect).take(10)
def detect_Eng(review, min_length=100):
    """Return True when a review should be kept as an English review.

    Texts shorter than ``min_length`` characters are kept without running the
    detector. If the detector raises, the review is also kept (best effort).

    NOTE(review): the langdetect README states that detection can give
    different results between runs unless ``DetectorFactory.seed`` is set --
    consider seeding it for reproducible filtering.

    Args:
        review: the review text.
        min_length: texts with fewer characters than this skip detection and
            are accepted as-is (default 100, the previously hard-coded value).

    Returns:
        True if the text is short, is detected as 'en', or detection failed;
        False if it is detected as any other language.
    """
    if len(review) < min_length:
        return True
    try:
        return ld.detect(review) == 'en'
    except Exception:
        # Best effort: keep the review when detection fails. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt / SystemExit
        # propagate instead of being silently swallowed.
        return True

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 9.3 µs


In [32]:
# print the reviews written in a language other than English
#rdd_reviews.filter(lambda x: not detect_Eng(x[1])).take(30)

In [10]:
# Keep only the reviews that detect_Eng accepts as English, then build a new
# DataFrame (df_revs) with the columns id and Review and print its schema.
df_revs = (
    rdd_reviews
    .filter(lambda id_text: detect_Eng(id_text[1]))
    .toDF(['id', 'Review'])
)
df_revs.printSchema()

root
 |-- id: long (nullable = true)
 |-- Review: string (nullable = true)



In [22]:
#df_revs.take(3)

In [11]:
# Build a new DataFrame that carries the rewritten review text: join the raw
# rows with df_revs on id, then drop the two "old" review columns.
df_cleaned = (
    df_raw_id
    .join(df_revs, on=['id'])
    .drop("Positive_Review", "Negative_Review")
)

In [13]:
# Fetch the first three joined rows as a quick sanity check of df_cleaned.
df_cleaned.take(3)

[Row(id=26, Hotel_Address=' s Gravesandestraat 55 Oost 1092 AA Amsterdam Netherlands', Additional_Number_of_Scoring=194, Review_Date='5/25/2017', Average_Score=7.7, Hotel_Name='Hotel Arena', Reviewer_Nationality=' United Kingdom ', Review_Total_Negative_Word_Counts=51, Total_Number_of_Reviews=1403, Review_Total_Positive_Word_Counts=134, Total_Number_of_Reviews_Reviewer_Has_Given=2, Reviewer_Score=9.6, Tags="[' Leisure trip ', ' Group ', ' Duplex Double Room ', ' Stayed 2 nights ']", days_since_review='70 days', lat='52.3605759', lng='4.9159683', Review=' nothing at all to do with the hotel of course but people tend to slam their bedroom doors as they leave so if you re thinking of having a little lay in just be prepared for a slam to awake you other than that nothing to fault at all .  the hotel itself is in a lovely location a 5min if that tram ride into the center train right outside the hotel easy access to everywhere the staff are super friendly and always on hand to help and advic

In [None]:
 # Persist the cleaned DataFrame to HDFS in CSV format, including a header row.
 # NOTE(review): no save mode is given, so re-running this cell presumably
 # fails if the target path already exists -- TODO confirm the intended mode.
 df_cleaned.write.csv("hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_cleaned.csv", header = True)