# Reading datasets in Google Colab

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory so that relative paths used
# by later cells (e.g. 'Datasets/reviews.csv') resolve against it.
%cd /content/drive/My Drive/SIADS_591/Project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/SIADS_591/Project


# Install & Load Packages

In [None]:
pip install altair vega_datasets



In [None]:
pip install plotly==5.5.0



In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 39 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 68.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805911 sha256=aa27ca072f630eca659699016676cbfa29c9f18e3c7b30f026ab5265c42aa9b3
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [None]:
# Load packages (third-party, alphabetised: plain imports first, then from-imports)
import altair as alt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from nltk.corpus import stopwords
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, udf
from pyspark.sql.types import ArrayType, BooleanType, StringType
from scipy.stats import pearsonr

# Fetch the NLTK stopword corpus that `stopwords` reads from
nltk.download('stopwords')

# Lift pandas' display limits: show every column and every row
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Getting the top 5 interesting words from the review dataset to form a word cloud

In [None]:
from pyspark.sql.functions import lit

# Initialize Spark (getOrCreate reuses an already-running session if there is one)
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('London')
    .getOrCreate()
)

# Read reviews csv into a spark dataframe.
# No header/schema options are given, so columns carry Spark's positional names
# (_c0, _c1, ...). Below, _c0 is matched against listing ids and _c5 is
# tokenized (presumably the review comment text - confirm against the csv).
rev_spark_df = spark.read.csv('Datasets/reviews.csv')

# Rating thresholds used to pick the two groups of listings
NEG_RATING_UPPER = 3   # negative: average rating strictly between 0 and this value
POS_RATING = 4.99      # positive: exactly this rating (including 5 stars is too much data)

# NOTE(review): Listings_df is not created in this section; it must already be
# loaded by an earlier cell as a pandas DataFrame with 'id' and
# 'review_scores_rating' columns.
Listings_df_reduced = Listings_df[['id', 'review_scores_rating']]
rating = Listings_df_reduced['review_scores_rating']

# Negative listings are listings whose average rating is below 3 stars
neg_Listings = Listings_df_reduced[(rating > 0) & (rating < NEG_RATING_UPPER)]

# Positive listings are listings whose average rating is 4.99 stars
pos_Listings = Listings_df_reduced[rating == POS_RATING]

# Make the positive and negative listings IDs into a list
pos_ids = pos_Listings['id'].tolist()
neg_ids = neg_Listings['id'].tolist()

# Filter the spark dataframe to match the positive and negative IDs from the above list
pos_df = rev_spark_df.filter(rev_spark_df._c0.isin(pos_ids))
neg_df = rev_spark_df.filter(rev_spark_df._c0.isin(neg_ids))

# establish english stopwords into 'stopword' (kept as a list for any later use);
# the frozenset copy gives constant-time membership tests inside the UDF
stopword = stopwords.words('english')
stopword_set = frozenset(stopword)

## All user-defined functions
# wordsplit splits the text on whitespace; a null text yields an empty list
# (instead of raising on None.split()), which explode() then simply drops
wordsplit = udf(lambda x: x.split() if x is not None else [], ArrayType(StringType()))
# stops returns a boolean of whether or not the (lower-cased) word is a stopword
stops = udf(lambda x: x.lower() in stopword_set, BooleanType())


def count_review_words(reviews_df):
    """Count the non-stopword tokens found in column ``_c5`` of ``reviews_df``.

    Parameters
    ----------
    reviews_df : pyspark.sql.DataFrame
        Reviews frame with positional column names; only ``_c5`` is read.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``indiv_words`` and ``count``, sorted by ``count`` descending.
        The sort has no secondary key, so words with equal counts are in no
        guaranteed relative order.
    """
    # One row per whitespace-separated token
    tokens = reviews_df.select(explode(wordsplit('_c5')).alias('indiv_words'))
    # Flag stopwords, then keep only the tokens that are not stopwords
    flagged = tokens.withColumn('T_F', stops('indiv_words'))
    kept = flagged.filter(~flagged['T_F'])
    return kept.groupBy('indiv_words').count().sort('count', ascending=False)


neg_words = count_review_words(neg_df)
pos_words = count_review_words(pos_df)

# Bring the (small) aggregated results back to pandas
neg_final = neg_words.toPandas()
pos_final = pos_words.toPandas()

print(neg_final.head(5))
print(pos_final.head(5))

  indiv_words  count
0        host    367
1       place    238
2        room    233
3        stay    224
4       would    186
  indiv_words  count
0        stay   1395
1       place   1167
2       great   1114
3        room    851
4      lovely    711


In [None]:
# The most interesting words, hand-picked from the top 100 most frequent words.
# They are selected by value rather than by row position, so the result does not
# depend on the exact ordering of words that share the same count.
pos_interesting_words = ['location', 'close', 'walk', 'station', 'tube',
                         'minutes', 'area', 'bus', 'central']
pos_final[pos_final['indiv_words'].isin(pos_interesting_words)]

Unnamed: 0,indiv_words,count
8,location,595
24,close,373
26,walk,349
51,station,242
62,tube,220
81,minutes,173
83,area,171
87,bus,166
92,central,158


In [None]:
# The most interesting words, hand-picked from the top 100 most frequent words.
# They are selected by value rather than by row position, so the result does not
# depend on the exact ordering of words that share the same count.
# 'arrival.' keeps its trailing period because tokens are split on whitespace only.
neg_interesting_words = ['host', 'reservation', 'location', 'canceled',
                         'arrival.', 'area']
neg_final[neg_final['indiv_words'].isin(neg_interesting_words)]

Unnamed: 0,indiv_words,count
0,host,367
19,reservation,112
24,location,104
29,canceled,98
30,arrival.,95
86,area,47
