In [1]:
from pyspark.sql import SparkSession
# Create the notebook's Spark session, or reuse one that already exists.
# NOTE(review): 'spark.some.config.option' / 'some-value' looks like a
# placeholder setting — confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName('ddam_project')
    .config('spark.some.config.option', 'some-value')
    .getOrCreate()
)

In [2]:
# Load the cleaned reviews CSV from HDFS (first row is the header, column
# types are inferred), then print the resulting schema.
df_cleaned = spark.read.csv(
    "hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_cleaned.csv",
    header=True,
    inferSchema=True,
)
df_cleaned.printSchema()

root
 |-- id: long (nullable = true)
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: string (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: string (nullable = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- Review: string (nullable = true)
 |-- Hotel_Country: string (nullable = true)



In [3]:
# Number of rows in the cleaned dataset.
df_cleaned.count()

504989

# VADER evaluation

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [5]:
import numpy as np

In [6]:
# Single analyzer instance; the scoring code in this notebook calls
# vader.polarity_scores(...) on each review text.
vader = SentimentIntensityAnalyzer()

In [7]:
# Score every review with VADER and rescale the 'compound' value to [0, 10]
# via (compound + 1) * 5. Each record becomes (id, vader_score, reviewer_score).
# NOTE: rows are not filtered on a null Review before scoring.
rdd_scores = (
    df_cleaned
    .select("id", "Review", "Reviewer_Score")
    .rdd
    .map(lambda row: (
        row["id"],
        (vader.polarity_scores(row["Review"])['compound'] + 1) * 5,
        row["Reviewer_Score"],
    ))
)

#.toDF(["id", "Vader_scores", "User_scores"])
#df_scores.printSchema()

In [8]:
# Preview the first 10 (id, VADER score, reviewer score) tuples.
rdd_scores.take(10)

[(14, 9.1835, 8.8),
 (374, 8.591999999999999, 9.2),
 (681, 9.8475, 8.8),
 (860, 5.0, 9.6),
 (1014, 5.0, 9.2),
 (1123, 9.643999999999998, 10.0),
 (1162, 3.4684999999999997, 5.0),
 (1318, 9.907, 9.2),
 (1375, 9.5775, 9.6),
 (1705, 8.7125, 9.2)]

# Evaluation

In [9]:
def errors(rdd, ix, iy):
    '''
    Compute the RMSE and MAE between two numeric fields of an RDD's records.

    rdd: rdd composed of lists/tuples (records indexable by ix and iy)
    ix: index of the first variable
    iy: index of the second variable
    return: dictionary with
    {'rmse': root mean squared error
     'mae' : mean absolute error
    }
    Both values are NaN when the rdd contains no records.
    '''
    # Accumulator layout: (count, sum of absolute errors, sum of squared errors).
    # Float zeros keep the sums (and the divisions below) in floating point.
    zero = (0, 0.0, 0.0)

    def mergeValue(acc, value):
        # Fold one absolute error into an accumulator.
        return (acc[0] + 1, acc[1] + value, acc[2] + value ** 2)

    def mergeAccum(acc1, acc2):
        # Combine two accumulators element-wise.
        return (acc1[0] + acc2[0], acc1[1] + acc2[1], acc1[2] + acc2[2])

    count, abs_sum, sq_sum = (rdd.map(lambda x: np.abs(x[ix] - x[iy]))
                              .aggregate(zero, mergeValue, mergeAccum))

    # No records: the metrics are undefined, so report NaN instead of
    # failing with a ZeroDivisionError.
    if count == 0:
        nan = float('nan')
        return {'rmse': nan, 'mae': nan}

    return {'rmse': np.sqrt(sq_sum / count), 'mae': abs_sum / count}
    
    
    

In [10]:
# Preview the first 20 (id, VADER score, reviewer score) tuples.
rdd_scores.take(20)

[(14, 9.1835, 8.8),
 (374, 8.591999999999999, 9.2),
 (681, 9.8475, 8.8),
 (860, 5.0, 9.6),
 (1014, 5.0, 9.2),
 (1123, 9.643999999999998, 10.0),
 (1162, 3.4684999999999997, 5.0),
 (1318, 9.907, 9.2),
 (1375, 9.5775, 9.6),
 (1705, 8.7125, 9.2),
 (2500, 9.749, 8.3),
 (3001, 9.338, 9.6),
 (3056, 7.1075, 9.2),
 (3422, 5.0, 9.6),
 (3530, 5.7655, 7.1),
 (3676, 3.9715, 7.1),
 (3739, 9.1125, 9.6),
 (3890, 8.5015, 8.3),
 (4181, 8.8515, 7.5),
 (4230, 2.213, 8.3)]

In [11]:
# Baseline errors of the uncorrected VADER scores: field 1 of each record is
# the VADER score, field 2 the reviewer's score.
vader_errors = errors(rdd_scores, ix=1, iy=2)
vader_errors

{'mae': 1.6846593123810762, 'rmse': 2.3336413418498716}

# Nationality Bias

### Mean bias correction
First we add the mean bias of each reviewer nationality to the VADER score.  
Does correcting the score with the mean bias improve VADER's performance?


In [12]:
# Keep only the review text, the nationality (join key) and the reviewer score.
# NOTE(review): this rebinds df_cleaned and drops "id", so the earlier cell that
# selects "id" to build rdd_scores cannot be re-run after this one without
# reloading df_cleaned.
df_cleaned = df_cleaned.select( "Review", "Reviewer_Nationality", "Reviewer_Score")

In [13]:
# Load the mean-bias table from HDFS and keep only the join key and the
# Bias column.
df_bias_mean = (
    spark.read
    .csv(
        "hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_bias_mean.csv",
        header=True,
        inferSchema=True,
    )
    .select("Reviewer_Nationality", "Bias")
)

In [14]:
# Attach the mean bias to every review; a left join keeps reviews whose
# nationality has no entry in df_bias_mean (their Bias is null).
df_joined = df_cleaned.join(
    df_bias_mean,
    on='Reviewer_Nationality',
    how='left',
)

In [15]:
# Sanity check: a left join keeps every df_cleaned row, so this count should
# only differ from df_cleaned's if df_bias_mean repeats a nationality.
print(df_joined.count())
df_joined.show(10)

504989
+--------------------+--------------------+--------------+--------------------+
|Reviewer_Nationality|              Review|Reviewer_Score|                Bias|
+--------------------+--------------------+--------------+--------------------+
|              Canada|staff in restaura...|           8.8|  0.1549723388282711|
|      United Kingdom|. overall hotel w...|           9.2| 0.08905211107880007|
|      United Kingdom|no bad experience...|           8.8| 0.08905211107880007|
|      United Kingdom|room is small any...|           9.6| 0.08905211107880007|
|      United Kingdom|my son and his fa...|           9.2| 0.08905211107880007|
|           Australia|nothing to improv...|          10.0| 0.19915391522263093|
|      United Kingdom|my and my wife ca...|           5.0| 0.08905211107880007|
|      United Kingdom|expensive but thi...|           9.2| 0.08905211107880007|
|      United Kingdom|nothing not to li...|           9.6| 0.08905211107880007|
|              Brazil|. rooms are

In [16]:
# Register df_joined as the SQL temp view 'bias_mean'.
# createOrReplaceTempView overwrites an existing view of the same name, so the
# cell can be re-run without a bare `except` that would also swallow
# unrelated errors.
df_joined.createOrReplaceTempView('bias_mean')

In [17]:
# Review counts per nationality for rows whose Bias is null after the left join
# (presumably nationalities absent from df_bias_mean — verify).
spark.sql("SELECT Reviewer_Nationality, count(*) FROM bias_mean WHERE Bias is null GROUP BY Reviewer_Nationality ").show()

+--------------------+--------+
|Reviewer_Nationality|count(1)|
+--------------------+--------+
|Turks Caicos Islands|      14|
|            Paraguay|      28|
|            Anguilla|       1|
|               Yemen|      16|
|          St Maarten|      11|
|             Senegal|      22|
|            Kiribati|       2|
|              Guyana|       5|
|             Eritrea|       2|
|            Djibouti|       2|
|                Fiji|      11|
|              Malawi|      10|
|Northern Mariana ...|       1|
|             Comoros|       1|
|            Cambodia|      32|
|         Afghanistan|       7|
|              Crimea|       6|
|            Maldives|      47|
|              Rwanda|      12|
|         Ivory Coast|      19|
+--------------------+--------+
only showing top 20 rows



In [18]:
def vader_bias(x, bias_col="Bias"):
    '''
    Score one review with VADER and shift the score by the row's bias.

    x: row-like record providing "Review", "Reviewer_Score" and the bias field
    bias_col: name of the field holding the bias correction (default "Bias")
    return: tuple (reviewer score, corrected VADER score limited to [0, 10])
    '''
    # Rescale VADER's 'compound' value to the [0, 10] scale.
    score = (vader.polarity_scores(x["Review"])['compound'] + 1) * 5

    # A null bias (no match in the left join) means no correction is applied.
    bias = x[bias_col]
    if bias is not None:
        score = score + bias

    # handling scores 'overflow'
    if score > 10:
        score = 10
    elif score < 0:
        score = 0

    return (x['Reviewer_Score'], score)

In [19]:
# Apply the mean-bias-corrected scoring to every joined row, giving
# (reviewer score, corrected VADER score) tuples, and preview 20 of them.
# NOTE: rows are not filtered on a null Review before scoring.
rdd_bias_mean = df_joined.rdd.map(vader_bias)
rdd_bias_mean.take(20)

[(8.8, 9.338472338828272),
 (9.2, 8.681052111078799),
 (8.8, 9.9365521110788),
 (9.6, 5.0890521110788),
 (9.2, 5.0890521110788),
 (10.0, 9.84315391522263),
 (5.0, 3.5575521110787998),
 (9.2, 9.9960521110788),
 (9.6, 9.6665521110788),
 (9.2, 8.752844487213734),
 (8.3, 9.478764599475605),
 (9.6, 9.4270521110788),
 (9.2, 7.1965521110788),
 (9.6, 4.487335634316863),
 (7.1, 5.8545521110788),
 (7.1, 3.6853268377741637),
 (9.6, 9.2015521110788),
 (8.3, 8.277453199096339),
 (7.5, 8.482835961193082),
 (8.3, 2.3020521110788)]

In [21]:
# Errors after the mean bias correction: field 0 of each record is the
# reviewer's score, field 1 the corrected VADER score.
vader_errors_bias_mean = errors(rdd_bias_mean, ix=0, iy=1)

In [22]:
# Errors after the mean bias correction.
vader_errors_bias_mean

{'mae': 1.6907125757447694, 'rmse': 2.3470459578524867}

In [23]:
# Baseline errors (uncorrected VADER), shown again for comparison.
vader_errors

{'mae': 1.6846593123810762, 'rmse': 2.3336413418498716}

The performance does not improve with the mean bias correction (both MAE and RMSE get slightly worse), so let's try the median instead.

### Median bias correction

In [24]:
# Load the median-bias table from HDFS, keep the nationality, the median bias
# and the N column, and sort by ascending median bias before displaying it.
df_bias_median = (
    spark.read
    .csv(
        "hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_bias_median.csv",
        header=True,
        inferSchema=True,
    )
    .select("Reviewer_Nationality", "Bias_median", "N")
    .orderBy('Bias_median')
)
df_bias_median.show(100)

+--------------------+-------------------+------+
|Reviewer_Nationality|        Bias_median|     N|
+--------------------+-------------------+------+
|               Macau|-0.9000000000000004|   105|
|          Azerbaijan|-0.9000000000000004|   262|
|          Bangladesh|-0.9000000000000004|   140|
|            Pakistan|-0.9000000000000004|   874|
|                Iran|-0.9000000000000004|  1033|
|               Ghana|               -0.5|   137|
|           Hong Kong|               -0.5|  2967|
|           Singapore|               -0.5|  3047|
|                Oman|               -0.5|  1295|
|                null|               -0.5|   511|
|           Indonesia|               -0.5|  1493|
|        Saudi Arabia|               -0.5|  8413|
|         Switzerland|               -0.5|  8555|
|             Belgium|               -0.5|  5918|
|United Arab Emirates|               -0.5|  9779|
|               Qatar|               -0.5|  2583|
|               Kenya|               -0.5|   258|


In [25]:
# Put the mean and median bias side by side for nationalities present in both
# tables, highest median bias first.
df_bias = (
    df_bias_mean
    .join(df_bias_median, on='Reviewer_Nationality', how='inner')
    .orderBy('Bias_median', ascending=False)
)

In [26]:
# Compare the mean bias (Bias) with the median bias (Bias_median) for each
# nationality; show up to 200 rows.
df_bias.show(200)

+--------------------+--------------------+-------------------+------+
|Reviewer_Nationality|                Bias|        Bias_median|     N|
+--------------------+--------------------+-------------------+------+
|         Puerto Rico| 0.36745775855778184| 0.7999999999999989|   169|
|          Costa Rica| 0.14354340693808254| 0.3999999999999986|   118|
|              Israel| 0.30183631013680845| 0.3999999999999986|  6456|
|United States of ...| 0.39490620877812255| 0.3999999999999986| 34887|
|         New Zealand| 0.26012307325992623| 0.3999999999999986|  3195|
|      United Kingdom| 0.08905211107880007| 0.3999999999999986|241061|
|           Australia| 0.19915391522263093| 0.3999999999999986| 21412|
|              Panama| 0.39599161786087045| 0.3999999999999986|   117|
|              Mexico| 0.14564439563864973| 0.3999999999999986|   576|
|         South Korea|-0.00681528818721...| 0.3999999999999986|  1049|
|              Canada|  0.1549723388282711| 0.3999999999999986|  7792|
|     

In [27]:
# Attach the median bias to every review; a left join keeps reviews whose
# nationality has no match in df_bias_median (their Bias_median is null).
df_joined_median = df_cleaned.join(
    df_bias_median,
    on='Reviewer_Nationality',
    how='left',
)

In [28]:
# Register df_joined_median as the SQL temp view 'bias_median'.
# createOrReplaceTempView overwrites an existing view of the same name, so the
# cell can be re-run without a bare `except` that would also swallow
# unrelated errors.
df_joined_median.createOrReplaceTempView('bias_median')

In [29]:
def vader_bias_median(x):
    '''
    Score one review with VADER and shift the score by the row's median bias.

    x: row-like record providing "Review", "Reviewer_Score" and "Bias_median"
    return: tuple (reviewer score, corrected VADER score limited to [0, 10])
    '''
    # Rescale VADER's 'compound' value to the [0, 10] scale.
    score = (vader.polarity_scores(x["Review"])['compound'] + 1) * 5

    # A null median bias (no match in the left join) means no correction.
    bias = x["Bias_median"]
    if bias is not None:
        score = score + bias

    # handling scores 'overflow'
    if score > 10:
        score = 10
    elif score < 0:
        score = 0

    return (x['Reviewer_Score'], score)

In [30]:
# Apply the median-bias-corrected scoring to every joined row, giving
# (reviewer score, corrected VADER score) tuples, and preview 20 of them.
# NOTE: rows are not filtered on a null Review before scoring.
rdd_bias_median = df_joined_median.rdd.map(vader_bias_median)
rdd_bias_median.take(20)

[(8.8, 9.583499999999999),
 (9.2, 8.991999999999997),
 (8.8, 10),
 (9.6, 5.399999999999999),
 (9.2, 5.399999999999999),
 (10.0, 10),
 (5.0, 3.8684999999999983),
 (9.2, 10),
 (9.6, 9.9775),
 (9.2, 8.7125),
 (8.3, 9.249),
 (9.6, 9.737999999999998),
 (9.2, 7.5074999999999985),
 (9.6, 4.5),
 (7.1, 6.165499999999999),
 (7.1, 3.4715),
 (9.6, 9.5125),
 (8.3, 8.5015),
 (7.5, 8.3515),
 (8.3, 2.6129999999999987)]

In [35]:
# Errors after the median bias correction: field 0 of each record is the
# reviewer's score, field 1 the corrected VADER score.
vader_errors_bias_median = errors(rdd_bias_median, ix=0, iy=1)

In [36]:
# Errors after the median bias correction.
vader_errors_bias_median

{'mae': 1.6590399068098154, 'rmse': 2.307100807665857}

In [37]:
# Errors after the mean bias correction, shown again for comparison.
vader_errors_bias_mean

{'mae': 1.6907125757447694, 'rmse': 2.3470459578524867}

In [38]:
# Baseline errors (uncorrected VADER), shown again for comparison.
vader_errors

{'mae': 1.6846593123810762, 'rmse': 2.3336413418498716}

The performance is practically the same, only slightly improved by applying the median bias.  

Overall there are no significant improvements.  

Does the performance improve for individual countries?



