## Purpose
- **Input:** Airbnb - listings, reviews, single location.
- **Output:** Creating the below files to generate dialogues, train data for conversation recommendation module.
    - listings_info_filter
    - ratings_filter
    - listings_slot_value_filter
    
- 16k listings | 397k reviews

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
ls ../Data/Airbnb

listing_Template.csv  listings3.csv         reviews2.csv
listings.csv          reviews.csv           reviews3.csv
listings2.csv         reviews1.csv


In [3]:
listings = pd.read_csv('../Data/Airbnb/listings.csv')

### Compute unique slot categories and listing jsons.

In [4]:
## These are the columns that will be used in the dialog generation.
listingTemplate = pd.read_csv('../Data/Airbnb/listing_Template.csv')
list(listingTemplate.columns)

['id',
 'picture_url',
 'host_identity_verified',
 'neighbourhood',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location']

In [5]:
listings = listings[listingTemplate.columns]
listings.dropna()
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16116 entries, 0 to 16115
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           16116 non-null  int64  
 1   picture_url                  16116 non-null  object 
 2   host_identity_verified       16111 non-null  object 
 3   neighbourhood                10405 non-null  object 
 4   property_type                16116 non-null  object 
 5   room_type                    16116 non-null  object 
 6   accommodates                 16116 non-null  int64  
 7   bathrooms_text               16095 non-null  object 
 8   bedrooms                     15218 non-null  float64
 9   beds                         16019 non-null  float64
 10  amenities                    16116 non-null  object 
 11  price                        16116 non-null  object 
 12  review_scores_rating         14029 non-null  float64
 13  review_scores_ac

In [6]:
len(listings)

16116

In [7]:
# https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
def to_1D(series):
    return pd.Series([x for _list in series for x in _list])

In [8]:
listings["amenities"] = listings["amenities"].apply(eval)
amenities = to_1D(listings["amenities"]).unique()

In [9]:
def convertList2Dict(lst):
    dict_ret = dict() 
    for index,value in enumerate(lst):
        dict_ret[value] = index
    return dict_ret

In [10]:
neighbourhood = listings["neighbourhood"].dropna().str.split(",", n = 2, expand = True)

listings["City"]= neighbourhood[0]
listings["State"]= neighbourhood[1]
listings["Country"]= neighbourhood[2]

City = listings["City"].dropna().unique()
State = listings["State"].dropna().unique()
Country = listings["Country"].dropna().unique()

In [11]:
listings["price"].min()
listings["price"].max()
listings["price"] = listings["price"].replace('[\$,]', '', regex=True).astype(float)
listings["price"] = pd.cut(listings['price'], [0, 250, 500,750,999], labels=['low', 'average', 'high','expensive'])
price = listings["price"].dropna().unique()

In [12]:
listings["review_scores_rating"].dropna()
listings["review_scores_rating"] = pd.cut(listings['review_scores_rating'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
rating = listings["review_scores_rating"].unique()

In [13]:
listings["review_scores_accuracy"].dropna()
listings["review_scores_accuracy"] = pd.cut(listings['review_scores_accuracy'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
acc_rating = listings["review_scores_accuracy"].unique()

In [14]:
listings["review_scores_cleanliness"].dropna()
listings["review_scores_cleanliness"] = pd.cut(listings['review_scores_cleanliness'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
clean_rating = listings["review_scores_cleanliness"].unique()

In [15]:
listings["review_scores_checkin"].dropna()
listings["review_scores_checkin"] = pd.cut(listings['review_scores_checkin'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','smooth'])
checkin_rating = listings["review_scores_checkin"].unique()

In [16]:
listings["review_scores_communication"].dropna()
listings["review_scores_communication"] = pd.cut(listings['review_scores_communication'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
communication_rating = listings["review_scores_communication"].unique()

In [17]:
listings["review_scores_location"].dropna()
listings["review_scores_location"] = pd.cut(listings['review_scores_location'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
location_rating = listings["review_scores_location"].unique()

In [18]:
host_identity_verified = listings["host_identity_verified"].dropna().unique()    
property_type = listings["property_type"].dropna().unique()    
room_type = listings["room_type"].dropna().unique()    
accommodates = listings["accommodates"].dropna().unique().astype(float)    
bathrooms = listings["bathrooms_text"].dropna().unique()    
bedrooms = listings["bedrooms"].dropna().unique()    
beds = listings["beds"].dropna().unique()    

In [19]:
accommodates=np.delete(accommodates,np.where(accommodates == 0))

In [20]:
amenities_dict = convertList2Dict(amenities)
City_dict = convertList2Dict(City)
State_dict = convertList2Dict(State)
Country_dict = convertList2Dict(Country)
price_dict = convertList2Dict(price)
rating_dict = convertList2Dict(rating)
acc_rating_dict = convertList2Dict(acc_rating)
clean_rating_dict = convertList2Dict(clean_rating)
checkin_rating_dict = convertList2Dict(checkin_rating)
communication_rating_dict = convertList2Dict(communication_rating)
location_rating_dict = convertList2Dict(location_rating)
host_identity_verified_dict = convertList2Dict(host_identity_verified)
property_type_dict = convertList2Dict(property_type)
room_type_dict = convertList2Dict(room_type)
accommodates_dict = convertList2Dict(accommodates)
bathrooms_dict = convertList2Dict(bathrooms)
bedrooms_dict = convertList2Dict(bedrooms)
beds_dict = convertList2Dict(beds)

In [21]:
import json 
dictionary = {
 "amenities":amenities_dict,
 "City": City_dict,
 "State":State_dict,
 "Country":Country_dict,
 "price":price_dict,
 "rating":rating_dict,
 "accuracy":acc_rating_dict,
 "cleanliness":clean_rating_dict,
 "checkin":checkin_rating_dict,
 "communication":communication_rating_dict,
 "location":location_rating_dict,
 "host_identity_verified":host_identity_verified_dict,
 "property_type":property_type_dict,
 "room_type":room_type_dict,
 "accommodates":accommodates_dict, 
 "bathrooms":bathrooms_dict,
 "bedrooms":bedrooms_dict,
 "beds":beds_dict
}
with open('/Users/sudhavijayakumar/Documents/GitHub/conv_rec_sys/data/Airbnb/listings_slot_value_filter.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(dictionary, ensure_ascii=False))

In [22]:
with open('/Users/sudhavijayakumar/Documents/GitHub/conv_rec_sys/data/Airbnb/listings_info_filter.json', 'w', encoding='utf-8') as f:
    f.write(listings.to_json(orient = 'records'))

In [23]:
reviews0 = pd.read_csv('../Data/Airbnb/reviews.csv')
reviews0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397185 entries, 0 to 397184
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     397185 non-null  int64 
 1   id             397185 non-null  int64 
 2   date           397185 non-null  object
 3   reviewer_id    397185 non-null  int64 
 4   reviewer_name  397185 non-null  object
 5   comments       396896 non-null  object
dtypes: int64(3), object(3)
memory usage: 18.2+ MB


In [24]:
reviews = reviews0[['listing_id','id','reviewer_id','comments']]

In [25]:
reviews.head(1)

Unnamed: 0,listing_id,id,reviewer_id,comments
0,2818,1191,10952,Daniel is really cool. The place was nice and ...


In [26]:
#test #comment later
# reviews = reviews[:100]

#### Compute rating from review : StandfordNLP Sentiment Analysis

In [27]:
# !pip install pycorenlp
# https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024

In [28]:
from pycorenlp import StanfordCoreNLP
# Need to start the java server.stanford-corenlp-4.3.1 % java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 50000
nlp_wrapper = StanfordCoreNLP('http://localhost:9000')

In [29]:
def computeRatingfromReview(doc):
    overall_sentiment=0
    count=0
    
    annot_doc = nlp_wrapper.annotate(doc,
    properties={
       'annotators': 'sentiment',
       'outputFormat': 'json',
       'timeout': 100000,
    })
    
    try:
        for s in annot_doc["sentences"]:
            sentiment = s["sentiment"]
            if(sentiment=='Verypositive'):
                overall_sentiment = overall_sentiment+5
            elif(sentiment=='Positive'):
                overall_sentiment = overall_sentiment+4
            elif(sentiment=='Neutral'):
                overall_sentiment = overall_sentiment+3
            elif(sentiment=='Negative'):
                overall_sentiment = overall_sentiment+2
            elif(sentiment=='Verynegative'):
                overall_sentiment = overall_sentiment+1
            count+=1

    except:
            return 0
        
    return ((int)(overall_sentiment/count))

In [30]:
len(reviews)

397185

In [31]:
# reviews=reviews[:100]
import time
st = time.time()

In [None]:
reviews['rating'] = reviews['comments']
for x in range(len(reviews.index)):
    try:
        reviews['rating'].iloc[x] = computeRatingfromReview(reviews['comments'].iloc[x])
    except:
        reviews['rating'].iloc[x] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['rating'] = reviews['comments']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
reviews = reviews[['listing_id','reviewer_id','rating']]
reviews = reviews.rename(columns={"listing_id": "business_id", "reviewer_id": "user_id","rating":"stars"})

In [None]:
reviews.head()

In [None]:
reviews.to_csv('/Users/sudhavijayakumar/Documents/GitHub/conv_rec_sys/data/Airbnb/ratings_filter.csv',index=False)

In [None]:
end = time.time()

In [None]:
print(end-start)