In [1]:
import os
import numpy as np
import pandas as pd
from time import time
from utils import aspect_tools

# Root directory that holds the Airbnb review data.
# Set the AIRBNB_DATA_DIR environment variable to point the notebook at another
# location (e.g. "D:/") instead of editing this cell; when the variable is not
# set, the original local path is used.
data_dir = os.environ.get("AIRBNB_DATA_DIR", "/home/stavros/DATA/AirbnbReviews")

In [3]:
# Load the pickled Athens reviews and keep only the rows that have both a
# listing id and a comment. The shape is printed before and after filtering.
# NOTE: unpickling executes code stored in the file; only load trusted pickles.
area_dir = os.path.join(data_dir, "athens")
reviews_path = os.path.join(area_dir, "reviews_with_aspects_379118samples.pkl")
reviews = pd.read_pickle(reviews_path)
print(reviews.shape)
reviews = reviews.dropna(subset=["listing_id", "comments"])
print(reviews.shape)

(379118, 8)
(294403, 8)


In [51]:
# Load the listing metadata stored next to the reviews.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the recorded output of this cell ends
# with a truncated warning, presumably pandas' mixed-dtype DtypeWarning, which
# this option avoids - confirm on the next run.
listings = pd.read_csv(os.path.join(area_dir, "listings.csv.gz"), low_memory=False)
print(listings.shape)

(11338, 106)


  interactivity=interactivity, compiler=compiler, result=result)


In [58]:
# Print every column name of the listings table, one per line.
for column_name in listings.columns:
    print(column_name)

id
listing_url
scrape_id
last_scraped
name
summary
space
description
experiences_offered
neighborhood_overview
notes
transit
access
interaction
house_rules
thumbnail_url
medium_url
picture_url
xl_picture_url
host_id
host_url
host_name
host_since
host_location
host_about
host_response_time
host_response_rate
host_acceptance_rate
host_is_superhost
host_thumbnail_url
host_picture_url
host_neighbourhood
host_listings_count
host_total_listings_count
host_verifications
host_has_profile_pic
host_identity_verified
street
neighbourhood
neighbourhood_cleansed
neighbourhood_group_cleansed
city
state
zipcode
market
smart_location
country_code
country
latitude
longitude
is_location_exact
property_type
room_type
accommodates
bathrooms
bedrooms
beds
bed_type
amenities
square_feet
price
weekly_price
monthly_price
security_deposit
cleaning_fee
guests_included
extra_people
minimum_nights
maximum_nights
minimum_minimum_nights
maximum_minimum_nights
minimum_maximum_nights
maximum_maximum_nights
minimum_ni

In [52]:
import collections
# Number of reviews per listing id, computed once and reused both for the
# Counter and for the value displayed by the cell.
listing_review_counts = reviews["listing_id"].value_counts()
reviews_per_listing = collections.Counter(dict(listing_review_counts.items()))
listing_review_counts

1177492     554
1079291     520
2306865     507
1484797     464
3431705     464
           ... 
34097757      1
13617762      1
24426143      1
32656017      1
19618064      1
Name: listing_id, Length: 8731, dtype: int64

In [106]:
# A listing is kept only when it has strictly more than `n_reviews` reviews.
n_reviews = 300
listing_review_counts = reviews["listing_id"].value_counts()
valid_listings = set(listing_review_counts[listing_review_counts > n_reviews].index)
# Vectorised membership test instead of a per-row Python lambda.
valid_listings_data = listings[listings["id"].isin(valid_listings)]
print(valid_listings_data.shape)

(30, 106)


In [107]:
# Listing id followed by the review-score columns to display.
cols_to_print = [
    "id",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
]

# Show the selected listings whose overall rating is below 95.
rating_below_95 = valid_listings_data["review_scores_rating"] < 95
valid_listings_data.loc[rating_below_95, cols_to_print]

Unnamed: 0,id,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
193,1079291,88.0,9.0,9.0,9.0,9.0,9.0,9.0
276,1746907,91.0,9.0,10.0,10.0,10.0,10.0,9.0
277,1751520,91.0,10.0,10.0,10.0,10.0,10.0,9.0
319,2306865,91.0,9.0,9.0,9.0,9.0,10.0,9.0
325,2428866,92.0,10.0,9.0,10.0,10.0,10.0,9.0
336,2579525,93.0,10.0,10.0,10.0,10.0,10.0,10.0
421,3462705,93.0,10.0,9.0,10.0,10.0,10.0,10.0
658,6101639,94.0,10.0,10.0,10.0,10.0,9.0,10.0
779,7215898,90.0,9.0,9.0,10.0,10.0,10.0,9.0
955,9337716,91.0,9.0,9.0,9.0,9.0,10.0,9.0


### Tools

In [91]:
import collections

def collect_listing_aspects(listing_aspect_column, use_score=False):
    """Sum per-word aspect contributions over an iterable of aspect mappings.

    Each item of `listing_aspect_column` maps a phrase to a score. Phrases
    that are not strings are ignored. Every phrase is split on single spaces
    and each resulting word receives the same contribution:

    * the raw score when `use_score` is true;
    * otherwise +1 when the score is greater than zero and -1 in any other case.

    Returns a `collections.Counter` mapping each word to its summed total.
    """
    totals = collections.Counter()
    for review_aspects in listing_aspect_column:
        for phrase, score in review_aspects.items():
            if isinstance(phrase, str):
                weight = score if use_score else (1 if score > 0 else -1)
                for token in phrase.split(" "):
                    totals[token] += weight
    return totals
        
                
def make_aspects_single_words(aspects):
    """Re-key a phrase -> score mapping by the individual words of each phrase.

    Phrases that are not strings are skipped. Each phrase is split on single
    spaces and every word is assigned the phrase's score; when a word occurs
    in several phrases, the score of the last such phrase is kept.

    Returns a `collections.Counter` mapping word -> score.
    """
    word_score_pairs = (
        (token, score)
        for phrase, score in aspects.items()
        if isinstance(phrase, str)
        for token in phrase.split(" ")
    )
    # dict() keeps the last value seen for a repeated key, i.e. plain
    # assignment semantics (scores are not summed).
    return collections.Counter(dict(word_score_pairs))

### Explore a particular listing

In [109]:
listing_id = 1079291

listing_reviews = reviews[reviews["listing_id"] == listing_id]
print(listing_reviews.shape)

listing_word_aspects = listing_reviews["aspects"].map(make_aspects_single_words)
all_listing_aspects = collect_listing_aspects(listing_word_aspects)

(520, 8)


In [154]:
def classify_aspects(aspects):
    """Classify one review's aspect mapping by the signs of its scores.

    Returns 1 when no score is negative (this includes an empty mapping),
    -1 when there is a negative score and no positive one, and 0 when both
    positive and negative scores are present.
    """
    scores = list(aspects.values())
    has_negative = any(score < 0 for score in scores)
    if not has_negative:
        return 1
    has_positive = any(score > 0 for score in scores)
    return 0 if has_positive else -1


def contains_negative(aspects):
    """Return True when at least one score in `aspects` is below zero."""
    return any(score < 0 for score in aspects.values())

# Label every review of the selected listing: a sentiment class from
# classify_aspects and a flag for the presence of any negative aspect.
listing_review_aspects = listing_reviews["aspects"]
review_sentiment = listing_review_aspects.map(classify_aspects)
reviews_with_negative = listing_review_aspects.map(contains_negative)

In [155]:
# Number of reviews per sentiment class produced by classify_aspects:
# 1 = no negative aspect, 0 = both positive and negative, -1 = negative only.
review_sentiment.value_counts()

 1    352
 0    162
-1      6
Name: aspects, dtype: int64

In [156]:
# Number of reviews with (True) and without (False) at least one negative aspect.
reviews_with_negative.value_counts()

False    352
True     168
Name: aspects, dtype: int64

In [110]:
# Split the per-word totals by sign. Negative totals are stored as positive
# magnitudes so both counters can be ranked with most_common(); words whose
# total is exactly zero are left out of both.
positive_aspects = collections.Counter()
negative_aspects = collections.Counter()
for word, total in all_listing_aspects.items():
    if total > 0:
        positive_aspects[word] = total
    elif total < 0:
        negative_aspects[word] = -total

In [161]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Slice of the ranking (most common first) shown in each panel.
bar_plot_word_index0 = 0
bar_plot_word_index1 = 10


def _aspect_bar_trace(aspect_counts, trace_name):
    """Build a horizontal bar trace for the selected rank range of a Counter.

    Bar lengths are the word totals multiplied by 100 and divided by the
    number of reviews of the selected listing.
    """
    ranked = aspect_counts.most_common()[bar_plot_word_index0: bar_plot_word_index1]
    words = [word for word, _ in ranked]
    percentages = np.array([count for _, count in ranked]) * 100.0 / len(listing_reviews)
    return go.Bar(y=words, x=percentages, orientation="h", name=trace_name)


# One panel per polarity; both are built by the same helper.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Positive", "Negative"))
fig.add_trace(_aspect_bar_trace(positive_aspects, "Positive"), row=1, col=1)
fig.add_trace(_aspect_bar_trace(negative_aspects, "Negative"), row=1, col=2)

# Figure size, font, bar spacing, and one annotation placed below the panels
# that serves as the x-axis caption for both of them.
fig.update_layout(width=1000,
                  height=500,
                  font_size=16,
                  annotations=[
                        go.layout.Annotation(
                            x=0.45,
                            y=-0.15,
                            showarrow=False,
                            text="Aspect occurence per review (%)")],
                  bargap=0.15, # gap between bars of adjacent location coordinates.
                  bargroupgap=0.1, # gap between bars of the same location coordinate.
                  showlegend=False)
fig.show()

In [163]:
# Pie chart of the selected listing's reviews:
#   "Negative" - at least one negative aspect,
#   "Positive" - at least one aspect and none of them negative,
#   "N/A"      - no aspect at all.
# Previously "Positive" was every review without a negative aspect, so the
# "N/A" count was always zero and aspect-less reviews were shown as positive.
# NOTE(review): "N/A" is interpreted here as "no aspect extracted" - confirm.
labels = ["Positive", "Negative", "N/A"]
has_aspects = listing_reviews["aspects"].map(len) > 0
n_neg = int(reviews_with_negative.sum())
n_pos = int((has_aspects & ~reviews_with_negative).sum())
n_rest = len(listing_reviews) - n_pos - n_neg
values = [n_pos, n_neg, n_rest]
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()

Find the review that mentions "cockroach"

In [48]:
# Keep the rows whose "word_aspects" entry contains "cockroach".
# NOTE(review): `top_listing_reviews` and its "word_aspects" column are not
# created in any cell visible here, so this cell depends on earlier kernel
# state - confirm where they are defined before re-running.
top_listing_reviews[top_listing_reviews["word_aspects"].map(lambda x: "cockroach" in x)]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,processed_comments,aspects,word_aspects
245412,1177492,174520205,2017-07-26,32385519,Heather,We unfortunately only spent one of three night...,We unfortunately only spent one of three night...,"{'bug': -1, 'baby cockroach': -1, 'location': ...","{'bug': -1, 'baby': -1, 'cockroach': -1, 'loca..."


In [50]:
# Full comment text of the review at index label 245412 (the row found above).
top_listing_reviews.loc[245412, "comments"]

'We unfortunately only spent one of three nights that we had planned at this apartment. We found two live cockroaches in the bathroom, and then a dead bug in the bed which strongly resembled photos of bedbugs. (We also both had bites after our stay). While bedbugs can happen anywhere, it was very concerning to me that the host did not take the situation seriously. He claimed the bug was a baby cockroach, and offered no type of solution for either type of bug.\n\nThe apartment was in a great location, on a quiet street, and Rio was quick to answer our messages. If he deals with the bug issues, this would be a nice apartment to stay in. Unfortunately, that is the best review I can give.\n\n(It should be noted that Rio gave a full refund for the two nights which we did not stay (but not for the first night). However, we lost a half day of our vacation in Athens, spent at the laundromat washing our things.)'