## Purpose
- **Input:** Airbnb listings and reviews for a single location.
- **Output:** The files below, which are used to generate dialogues and training data for the conversational recommendation module.
    - listings_info_filter
    - ratings_filter
    - listings_slot_value_filter
    - Neo4j_nodes.csv
    
- 5.5k listings | 250k reviews

### References:

- https://neo4j.com/developer/guide-import-csv/

### Import libraries, raw data

In [50]:
import pandas as pd
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [51]:
ls

1a_Get_AirbnbData.ipynb
1b_quora-scrapper[inActive].ipynb
1c_Twitter-read.ipynb
2a_SentimentAnalyzer.ipynb
2b_FAQ_Questgen_NLP.ipynb
2c_Neo4jData_Gen.ipynb
3a_EmbeddingsAndModels.ipynb
4a_GenerateRASA_Conversational_Data.ipynb
5a_offlineRecommendation.ipynb
5b_realTimeRecommendation.ipynb
6a_realTimeRecommendation.ipynb
Data/
Documents/
LICENSE
Papers/
Personas/
RASA/
README.md
Twitter_APIKey.ipynb


In [52]:
# Directory the raw input files are read from.
root = './Data/raw/'
# Directory that holds templates and receives the generated files.
processor = './Data/processing/'

In [53]:
# Load the gzip-compressed listings export from the raw data directory.
listings = pd.read_csv(root + 'listings.csv.gz')


In [54]:
# The template's column names define which listing columns are used in the
# dialog generation.
listingTemplate = pd.read_csv(processor + 'listing_Template.csv')

### Compute unique entities, create Neo4j nodes & edge map.

In [55]:
# Restrict the raw listings to the columns named in the dialogue template.
# .copy() makes this an independent frame, so the column assignments made in
# later cells do not write into a selection of the raw data.
listings = listings[listingTemplate.columns].copy()
# NOTE(review): the original cell called `listings.dropna()` here without
# assigning the result, so no rows were ever removed. The discarded call is
# dropped; assign its result if filtering incomplete rows is really intended.
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5402 entries, 0 to 5401
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5402 non-null   int64  
 1   picture_url                  5402 non-null   object 
 2   host_identity_verified       5401 non-null   object 
 3   neighbourhood                3908 non-null   object 
 4   neighbourhood_cleansed       5402 non-null   object 
 5   property_type                5402 non-null   object 
 6   room_type                    5402 non-null   object 
 7   accommodates                 5402 non-null   int64  
 8   bathrooms_text               5388 non-null   object 
 9   bedrooms                     5081 non-null   float64
 10  beds                         5163 non-null   float64
 11  amenities                    5402 non-null   object 
 12  price                        5402 non-null   object 
 13  review_scores_rati

In [56]:
# Display the number of listing rows currently held.
len(listings)

5402

In [57]:
### Add Geo-location, check more efficient way to add location.
# City and country are constants: every row gets the same value.
listings['City'] = 'Amsterdam'
listings['Country'] = 'Netherlands'
# (The original cell evaluated a filter on missing `neighbourhood_cleansed`
# here and discarded the result; that statement had no effect and is removed.)
# Use `neighbourhood` where present, otherwise fall back to
# `neighbourhood_cleansed`, then trim surrounding whitespace.
listings['neighbourhood'] = listings['neighbourhood'].fillna(listings['neighbourhood_cleansed'])
listings['neighbourhood'] = listings['neighbourhood'].str.strip()
# The neighbourhood is reused as the "State" level.
listings['State'] = listings['neighbourhood']

#### Add all unique entities

In [58]:
# https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
def to_1D(series):
    """Flatten a series of iterables into one Series of their items, in order."""
    flattened = []
    for _list in series:
        flattened.extend(_list)
    return pd.Series(flattened)

In [59]:
import ast

# The amenities column holds each list as text. The original parsed it with
# eval(), which executes arbitrary code found in the data file;
# ast.literal_eval accepts only Python literals and yields the same lists.
listings["amenities"] = listings["amenities"].apply(ast.literal_eval)
# Distinct amenity names across all listings.
amenities = to_1D(listings["amenities"]).unique()

In [60]:
### Add listings text description
# Columns serialised, in this order, into each listing's text description.
col_parse = ['id',
 'picture_url',
 'host_identity_verified',
 'neighbourhood',
 'neighbourhood_cleansed',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'price',
 'City',
 'Country',
 'State']

# Build "<City><column>:<value>;..." for every listing in a single pass.
# The original appended to listings['Listing_Text'][i] inside iterrows(),
# i.e. chained assignment, which is slow and is not guaranteed to write back
# to the frame. The resulting text is the same: each value goes through str().
listings['Listing_Text'] = [
    city + ''.join(c_name + ':' + str(value) + ';'
                   for c_name, value in zip(col_parse, row_values))
    for city, row_values in zip(
        listings['City'],
        listings[col_parse].itertuples(index=False, name=None),
    )
]

# Kept from the original cell: snapshot of the column names (now including
# Listing_Text).
col = list(listings.columns.values)

In [61]:
# Column-wise accumulators for the combined node table: one entry is appended
# to every list for each node that gets registered.
#[_id,_labels,name]
_id=[]
_labels=[]
name=[]
_start = []
_end = []
_rating = []
_relationship=[]

_listing_url=[]
_picture_url=[]
_host_identity_verified=[]
_accomodates=[]
_bedrooms=[]
_bathrooms=[]
_beds=[]
_price=[]
_review_scores_rating=[]

def convertList2Dict(lst,prev_index,label):
    """Assign consecutive ids to the values of ``lst`` and register them as nodes.

    Ids start at ``prev_index``. For every value one row is appended to the
    module-level accumulator lists: id, ``label`` and the value itself are
    stored, all remaining columns are filled with NaN.

    Args:
        lst: Sized iterable of hashable values (they become dict keys).
        prev_index: Id given to the first value.
        label: Label stored in ``_labels`` for every value.

    Returns:
        ``(mapping, last_id)``: ``mapping`` maps each value to its id and
        ``last_id`` is the last id handed out. For an empty ``lst`` nothing is
        registered and ``last_id`` is ``prev_index - 1``, so the usual
        ``last_id + 1`` chaining continues without a gap (the original raised
        UnboundLocalError in that case).
    """
    # Columns that carry no information for plain entity nodes.
    blank_columns = (_start, _end, _relationship, _rating, _listing_url,
                     _picture_url, _host_identity_verified, _accomodates,
                     _bedrooms, _bathrooms, _beds, _price,
                     _review_scores_rating)

    dict_ret = {}
    rec_index = prev_index - 1
    for index, value in enumerate(lst):
        rec_index = index + prev_index
        dict_ret[value] = int(rec_index)

        _id.append(int(rec_index))
        _labels.append(label)
        name.append(value)
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        for column in blank_columns:
            column.append(np.nan)

    return dict_ret, int(rec_index)

In [62]:
# Distinct non-null values of the location columns and listing descriptions.
City = pd.unique(listings["City"].dropna())
State = pd.unique(listings["State"].dropna())
Country = pd.unique(listings["Country"].dropna())
Listing_Text = pd.unique(listings["Listing_Text"].dropna())

In [63]:
# Disabled experiment: convert the price text to float and bucket it into
# low/average/high/expensive. While it stays commented out, the price column
# keeps its original values.
# listings["price"].min()
# listings["price"].max()
# listings["price"] = listings["price"].replace('[\$,]', '', regex=True).astype(float)
# listings["price"] = pd.cut(listings['price'], [0, 250, 500,750,999], labels=['low', 'average', 'high','expensive'])
# Distinct non-null price values.
price = listings["price"].dropna().unique()

In [64]:
# Bucket the overall rating into text labels. pd.cut bins are right-closed,
# (0, 1], (1, 2], ... (4, 5]; anything outside them (including 0 and missing
# scores) becomes NaN.
# The original cell first called .dropna() on the column and discarded the
# result; that statement had no effect and is removed.
listings["review_scores_rating"] = pd.cut(listings['review_scores_rating'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
# Distinct labels present (NaN included, since nothing is dropped).
rating = listings["review_scores_rating"].unique()

In [65]:
# Bucket the accuracy score into text labels. pd.cut bins are right-closed,
# (0, 1], (1, 2], ... (4, 5]; anything outside them (including 0 and missing
# scores) becomes NaN.
# The original cell first called .dropna() on the column and discarded the
# result; that statement had no effect and is removed.
listings["review_scores_accuracy"] = pd.cut(listings['review_scores_accuracy'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
# Distinct labels present (NaN included, since nothing is dropped).
acc_rating = listings["review_scores_accuracy"].unique()

In [66]:
# Bucket the cleanliness score into text labels. pd.cut bins are right-closed,
# (0, 1], (1, 2], ... (4, 5]; anything outside them (including 0 and missing
# scores) becomes NaN.
# The original cell first called .dropna() on the column and discarded the
# result; that statement had no effect and is removed.
listings["review_scores_cleanliness"] = pd.cut(listings['review_scores_cleanliness'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
# Distinct labels present (NaN included, since nothing is dropped).
clean_rating = listings["review_scores_cleanliness"].unique()

In [67]:
# Bucket the check-in score into text labels (the top bucket is called
# 'smooth' here). pd.cut bins are right-closed, (0, 1], (1, 2], ... (4, 5];
# anything outside them (including 0 and missing scores) becomes NaN.
# The original cell first called .dropna() on the column and discarded the
# result; that statement had no effect and is removed.
listings["review_scores_checkin"] = pd.cut(listings['review_scores_checkin'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','smooth'])
# Distinct labels present (NaN included, since nothing is dropped).
checkin_rating = listings["review_scores_checkin"].unique()

In [68]:
# Bucket the communication score into text labels. pd.cut bins are
# right-closed, (0, 1], (1, 2], ... (4, 5]; anything outside them (including 0
# and missing scores) becomes NaN.
# The original cell first called .dropna() on the column and discarded the
# result; that statement had no effect and is removed.
listings["review_scores_communication"] = pd.cut(listings['review_scores_communication'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
# Distinct labels present (NaN included, since nothing is dropped).
communication_rating = listings["review_scores_communication"].unique()

In [69]:
# Bucket the location score into text labels. pd.cut bins are right-closed,
# (0, 1], (1, 2], ... (4, 5]; anything outside them (including 0 and missing
# scores) becomes NaN.
# The original cell first called .dropna() on the column and discarded the
# result; that statement had no effect and is removed.
listings["review_scores_location"] = pd.cut(listings['review_scores_location'], [0, 1, 2, 3,4,5], labels=['horrible', 'bad', 'average','good','very good'])
# Distinct labels present (NaN included, since nothing is dropped).
location_rating = listings["review_scores_location"].unique()

In [70]:
# Distinct non-null values of each categorical/numeric listing attribute.
host_identity_verified = pd.unique(listings["host_identity_verified"].dropna())
property_type = pd.unique(listings["property_type"].dropna())
room_type = pd.unique(listings["room_type"].dropna())
# Guest capacity is converted to float after de-duplication.
accommodates = pd.unique(listings["accommodates"].dropna()).astype(float)
bathrooms = pd.unique(listings["bathrooms_text"].dropna())
bedrooms = pd.unique(listings["bedrooms"].dropna())
beds = pd.unique(listings["beds"].dropna())

In [71]:
# Drop the value 0 from the distinct guest capacities.
accommodates = accommodates[accommodates != 0]

In [72]:
# Register every entity family as nodes. Ids are allocated consecutively:
# each call starts one past the last id returned by the previous call.
amenities_dict, last_index = convertList2Dict(amenities, 0, ':Amenities')
City_dict, last_index = convertList2Dict(City, last_index + 1, ':City')
State_dict, last_index = convertList2Dict(State, last_index + 1, ':State')
Country_dict, last_index = convertList2Dict(Country, last_index + 1, ':Country')
price_dict, last_index = convertList2Dict(price, last_index + 1, ':price')
property_type_dict, last_index = convertList2Dict(
    property_type, last_index + 1, ':property_type')
room_type_dict, last_index = convertList2Dict(
    room_type, last_index + 1, ':room_type')
accommodates_dict, last_index = convertList2Dict(
    accommodates, last_index + 1, ':accommodates')
bathrooms_dict, last_index = convertList2Dict(
    bathrooms, last_index + 1, ':bathrooms')
bedrooms_dict, last_index = convertList2Dict(
    bedrooms, last_index + 1, ':bedrooms')
beds_dict, last_index = convertList2Dict(beds, last_index + 1, ':beds')
listingText_dict, last_index = convertList2Dict(
    Listing_Text, last_index + 1, ':Listing_Text')

In [73]:
import json

# Value -> node id lookup for every entity family, keyed by family name.
dictionary = {
    "amenities": amenities_dict,
    "City": City_dict,
    "State": State_dict,
    "Country": Country_dict,
    "price": price_dict,
    "property_type": property_type_dict,
    "room_type": room_type_dict,
    "accommodates": accommodates_dict,
    "bathrooms": bathrooms_dict,
    "bedrooms": bedrooms_dict,
    "beds": beds_dict,
    "Listing_Text": listingText_dict,
}
# Persist the lookup as UTF-8 JSON without escaping non-ASCII characters.
with open(processor+'Processed_Airbnb/listings_entities_filter.json', 'w', encoding='utf-8') as f:
    json.dump(dictionary, f, ensure_ascii=False)

In [74]:
# Re-read the entity lookup JSON written above and save the same data as CSV.
df = pd.read_json(processor+'Processed_Airbnb/listings_entities_filter.json')
df.to_csv(processor+'Processed_Airbnb/listings_entities_filter.csv')

In [75]:
# Dump the processed listings (one JSON object per listing) for later steps.
with open(processor+'Processed_Airbnb/listings_info_filter.json', 'w', encoding='utf-8') as f:
    f.write(listings.to_json(orient = 'records'))

In [76]:
# Load the review ratings file from the processing directory and show its
# column summary.
reviews0 = pd.read_csv(processor+'Processed_Airbnb/ratings_filter.csv')
reviews0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254721 entries, 0 to 254720
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   listing_id     254721 non-null  object 
 1   id             251546 non-null  float64
 2   date           249402 non-null  object 
 3   reviewer_id    249402 non-null  float64
 4   reviewer_name  249402 non-null  object 
 5   comments       249373 non-null  object 
 6   rating         247258 non-null  float64
dtypes: float64(3), object(4)
memory usage: 13.6+ MB


In [77]:
# Keep only fully populated reviews, then the columns needed for the edges.
review_columns = ['listing_id', 'id', 'reviewer_id', 'comments', 'rating']
reviews = reviews0.dropna()
reviews = reviews.loc[:, review_columns]

#### Add users

In [78]:
# Register one ':User' node per distinct reviewer id greater than 1,
# continuing the id sequence from `last_index`.
reviewer = reviews['reviewer_id'].unique()
for user in reviewer:
    if user>1:
        last_index = last_index+1
        _id.append(last_index)
        _labels.append(':User')
        name.append(int(user))
        # User nodes carry no edge or listing attributes.
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        for column in (_start, _end, _relationship, _rating, _listing_url,
                       _picture_url, _host_identity_verified, _accomodates,
                       _bedrooms, _bathrooms, _beds, _price,
                       _review_scores_rating):
            column.append(np.nan)

#### Add listings

In [79]:
# Register one ':Listing' node per listing row, continuing the id sequence
# from `last_index`. Rows are read once via itertuples instead of one chained
# scalar lookup per attribute.
listing_node_columns = ['id', 'picture_url', 'host_identity_verified',
                        'accommodates', 'bedrooms', 'bathrooms_text', 'beds',
                        'price', 'review_scores_rating']
for row in listings[listing_node_columns].itertuples(index=False):
    last_index = last_index+1
    _id.append(last_index)
    _labels.append(':Listing')
    name.append(row.id)
    # Edge-only columns stay empty on node rows.
    # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
    _start.append(np.nan)
    _end.append(np.nan)
    _relationship.append(np.nan)
    _rating.append(np.nan)

    # Listing attributes stored on the node itself.
    _listing_url.append('https://www.airbnb.com/rooms/'+str(row.id))
    _picture_url.append(row.picture_url)
    _host_identity_verified.append(row.host_identity_verified)
    _accomodates.append(row.accommodates)
    _bedrooms.append(row.bedrooms)
    _bathrooms.append(row.bathrooms_text)
    _beds.append(row.beds)
    _price.append(row.price)
    _review_scores_rating.append(row.review_scores_rating)
        

In [80]:
# Assemble the accumulator lists into one node table; the pairs give the
# output column name and the list that fills it, in column order.
data = dict([
    ('_id', _id),
    ('_labels', _labels),
    ('name', name),
    ('_start', _start),
    ('_end', _end),
    ('_type', _relationship),
    ('rating', _rating),
    ('url', _listing_url),
    ('picture_url', _picture_url),
    ('host_identity_verified', _host_identity_verified),
    ('accomodates', _accomodates),
    ('bedrooms', _bedrooms),
    ('bathrooms', _bathrooms),
    ('beds', _beds),
    ('price', _price),
    ('review_scores_rating', _review_scores_rating),
])

# Create the DataFrame and write it without the pandas index.
neo4J_format_df = pd.DataFrame(data)
neo4J_format_df.to_csv(processor+'Neo4j/neo4J_nodes.csv',index=False)

#### Create edge maps

In [81]:
# Reload the combined node table written above; the listing and user node ids
# used for the edge files are taken from it.
# NOTE(review): the `name` column mixes text and numeric values, so the types
# read_csv infers for it may vary - confirm before comparing against ids.

neoNodes = pd.read_csv(processor+'Neo4j/neo4J_nodes.csv')

In [82]:
# Split the combined node table into one frame per label.
node_label = neoNodes['_labels']
neoListings = neoNodes[node_label == ':Listing']
neoAmenities = neoNodes[node_label == ':Amenities']
neoCity = neoNodes[node_label == ':City']
neoState = neoNodes[node_label == ':State']
neoCountry = neoNodes[node_label == ':Country']
neoProperty_type = neoNodes[node_label == ':property_type']
neoRoom_type = neoNodes[node_label == ':room_type']
neoUser = neoNodes[node_label == ':User']
neoListingText = neoNodes[node_label == ':Listing_Text']

# Report how many nodes each label has.
print(f'Unique listings:{len(neoListings)}')
print(f'Unique Amenities:{len(neoAmenities)}')
print(f'Unique City:{len(neoCity)}')
print(f'Unique State:{len(neoState)}')
print(f'Unique Country:{len(neoCountry)}')
print(f'Unique property_type:{len(neoProperty_type)}')
print(f'Unique room_type:{len(neoRoom_type)}')
print(f'Unique User:{len(neoUser)}')
# NOTE(review): this line counts the Listing_Text nodes but reuses the
# "Unique listings:" label from the first print.
print(f'Unique listings:{len(neoListingText)}')

Unique listings:5402
Unique Amenities:889
Unique City:1
Unique State:72
Unique Country:1
Unique property_type:57
Unique room_type:4
Unique User:240950
Unique listings:5402


In [83]:
def _export_node_file(nodes, label, id_space, file_name):
    """Write the id/name/label columns of ``nodes`` as a node CSV.

    ``_labels`` is overwritten with ``label``; ``_id`` is renamed to
    ``id:ID(<id_space>)`` and ``_labels`` to ``:LABEL``. The written frame is
    returned. The selection is copied first, so the label assignment never
    writes into a slice of the shared node table.
    """
    out = nodes[['_id', 'name', '_labels']].copy()
    out['_labels'] = label
    out = out.rename(columns={"_id": "id:ID(" + id_space + ")", "_labels": ":LABEL"})
    out.to_csv(processor + "Neo4j/" + file_name, index=False)
    return out


# The original amenity export renamed `neoAmenities` instead of
# `neoAmenities1`, so Amenity.csv kept every node-table column and the old
# ':Amenities' label. Routing all families through one helper fixes that.
neoAmenities1 = _export_node_file(neoAmenities, 'Amenity; Amenity', 'Amenity', 'Amenity.csv')
neoCity1 = _export_node_file(neoCity, 'City; City', 'City', 'City.csv')
neoState1 = _export_node_file(neoState, 'State; State', 'State', 'State.csv')
neoCountry1 = _export_node_file(neoCountry, 'Country; Country', 'Country', 'Country.csv')
neoProperty_type1 = _export_node_file(
    neoProperty_type, 'Property_type; Property_type', 'property_type', 'Property_type.csv')
neoRoom_type1 = _export_node_file(
    neoRoom_type, 'Room_type; Room_type', 'room_type', 'Room_type.csv')
neoListingText1 = _export_node_file(
    neoListingText, 'Listing_Text; Listing_Text', 'Listing_Text', 'Listing_Text.csv')


In [84]:
# Export the user nodes: id, name and a fixed label, with the id and label
# columns renamed for the import file.
neoUser1 = (
    neoUser[['_id', 'name', '_labels']]
    .assign(_labels='User; User')
    .rename(columns={"_id": "id:ID(User)", "_labels": ":LABEL"})
)
neoUser1.to_csv(processor+"Neo4j/User.csv",index=False)

In [85]:
# Export the listing nodes together with their descriptive attributes, using
# a fixed label and the renamed id/label columns.
neoListings1 = (
    neoListings[['_id', 'name', '_labels', 'url', 'picture_url',
                 'host_identity_verified', 'accomodates', 'bedrooms',
                 'bathrooms', 'beds', 'price', 'review_scores_rating']]
    .assign(_labels='Listing; Listing')
    .rename(columns={"_id": "id:ID(Listing)", "_labels": ":LABEL"})
)
neoListings1.to_csv(processor+"Neo4j/Listings.csv",index=False)

### Define headers

In [86]:
def _write_header(file_name, columns):
    """Write a header-only CSV with ``columns`` and return the empty frame."""
    header = pd.DataFrame(columns=columns)
    header.to_csv(processor + "Neo4j/" + file_name, index=False)
    return header


# One header file per relationship file; the column order matches the edge
# files, which are written without a header row.
amenities_header = _write_header(
    "amenity_header.csv", [':END_ID(Amenity)', ':START_ID(Listing)'])
city_header = _write_header(
    "city_header.csv", [':END_ID(City)', ':START_ID(Listing)'])
state_header = _write_header(
    "state_header.csv", [':END_ID(State)', ':START_ID(Listing)'])
country_header = _write_header(
    "country_header.csv", [':END_ID(Country)', ':START_ID(Listing)'])
property_type_header = _write_header(
    "property_type_header.csv", [':END_ID(property_type)', ':START_ID(Listing)'])
room_type_header = _write_header(
    "room_type_header.csv", [':END_ID(room_type)', ':START_ID(Listing)'])
listing_text_header = _write_header(
    "listing_text_header.csv", [':END_ID(Listing_Text)', ':START_ID(Listing)'])
user_rating_header = _write_header(
    "user_rating_header.csv", [':END_ID(Listing)', 'RATED', ':START_ID(User)'])

### Construct edges

In [87]:
def UserListingRating(neoListings,neoUser,reviews):
    """Write the user -> listing rating edges to ``Neo4j/user_rating.csv``.

    Each review's ``listing_id`` and ``reviewer_id`` are translated to the
    node ids found in ``neoListings`` / ``neoUser`` (``name`` -> ``_id``). The
    file has no header row; its columns are listing node id, rating, user
    node id.

    Changes from the original implementation:
    * the input frames are no longer modified (helper columns used to be
      added to the caller's ``neoListings`` and ``neoUser``);
    * ids are translated with ``Series.map`` instead of ``DataFrame.replace``;
    * reviews whose listing or reviewer has no node are skipped. Previously
      their raw Airbnb ids were written out as if they were node ids.

    Args:
        neoListings: Listing nodes with ``name`` (listing id) and ``_id``.
        neoUser: User nodes with ``name`` (reviewer id) and ``_id``.
        reviews: Reviews with ``listing_id``, ``reviewer_id`` and ``rating``.
    """
    # Listing ids are compared as text, reviewer ids as integers (same
    # normalisation the original applied to the node names).
    listing_node_ids = dict(zip(neoListings['name'].astype('str').tolist(),
                                neoListings['_id'].tolist()))
    user_node_ids = dict(zip(neoUser['name'].astype('int').tolist(),
                             neoUser['_id'].tolist()))

    # Rows without a key cannot form an edge.
    usable = reviews.dropna(subset=['listing_id', 'reviewer_id'])
    end_ids = usable['listing_id'].astype('str').map(listing_node_ids)
    start_ids = usable['reviewer_id'].astype('int').map(user_node_ids)

    # Keep only reviews where both endpoints resolved to a node id.
    matched = end_ids.notna() & start_ids.notna()
    edges = pd.DataFrame({
        '_end': end_ids[matched].astype('int'),
        'rating': usable.loc[matched, 'rating'],
        '_start': start_ids[matched].astype('int'),
    })

    edges.to_csv(processor+'Neo4j/user_rating.csv',index=False,header=False)
    
# Write the rating edge file for the cleaned reviews.
UserListingRating(neoListings,neoUser,reviews)

In [88]:

def ListingEdges(neoListings,neoEdge,listings,edge,type,id_col):
    """Write the listing -> entity edges to ``Neo4j/<edge>_Listing.csv``.

    ``listings['id']`` is translated to the listing node id and
    ``listings[edge]`` to the entity node id (``name`` -> ``_id`` in
    ``neoListings`` / ``neoEdge``). The file has no header row; its columns
    are entity node id, listing node id.

    Changes from the original implementation:
    * the input frames are no longer modified (helper columns used to be
      added to the caller's ``neoListings`` and ``neoEdge``);
    * both sides of each lookup are compared as text, so a listing id matches
      its node whether the node name is held as a number or as a string;
    * rows where either endpoint has no node are skipped instead of being
      written with the untranslated raw value.

    Args:
        neoListings: Listing nodes with ``name`` (listing id) and ``_id``.
        neoEdge: Entity nodes with ``name`` (entity value) and ``_id``.
        listings: Frame with an ``id`` column and a column named ``edge``.
        edge: Name of the entity column; also the output file prefix.
        type: Relationship type. Not written to the file (the original set a
            ``_type`` column and then dropped it); kept for compatibility.
        id_col: Unused; kept for compatibility with existing callers.
    """
    listing_node_ids = dict(zip(neoListings['name'].astype(str).tolist(),
                                neoListings['_id'].tolist()))
    entity_node_ids = dict(zip(neoEdge['name'].astype(str).tolist(),
                               neoEdge['_id'].tolist()))

    start_ids = listings['id'].astype(str).map(listing_node_ids)
    end_ids = listings[edge].astype(str).map(entity_node_ids)

    # Keep only rows where both endpoints resolved to a node id.
    matched = start_ids.notna() & end_ids.notna()
    edges = pd.DataFrame({
        '_end': end_ids[matched].astype('int'),
        '_start': start_ids[matched].astype('int'),
    })
    edges.to_csv(processor+'Neo4j/'+ edge +'_Listing.csv',index=False,header=False)

# Write one edge file per entity family. Each call gets a two-column frame:
# the listing id plus the entity column.
col_name = 'City'
listingCity = listings.loc[:, ['id', col_name]]
ListingEdges(neoListings, neoCity, listingCity, col_name,
             type='IN_CITY', id_col='id:ID(City)')

col_name = 'State'
listingState = listings.loc[:, ['id', col_name]]
ListingEdges(neoListings, neoState, listingState, col_name,
             type='IN_STATE', id_col='id:ID(State)')

col_name = 'Country'
listingCountry = listings.loc[:, ['id', col_name]]
ListingEdges(neoListings, neoCountry, listingCountry, col_name,
             type='IN_COUNTRY', id_col='id:ID(Country)')

col_name = 'property_type'
listingPropType = listings.loc[:, ['id', col_name]]
ListingEdges(neoListings, neoProperty_type, listingPropType, col_name,
             type='HAS_PROPERTY_TYPE', id_col='id:ID(property_type)')

col_name = 'room_type'
listingRoomType = listings.loc[:, ['id', col_name]]
ListingEdges(neoListings, neoRoom_type, listingRoomType, col_name,
             type='HAS_ROOM_TYPE', id_col='id:ID(room_type)')

col_name = 'Listing_Text'
listingText = listings.loc[:, ['id', col_name]]
ListingEdges(neoListings, neoListingText, listingText, col_name,
             type='HAS_TEXT', id_col='id:ID(Listing_Text)')

In [None]:
def ListingAmenities(listings):
    """Expand each listing's amenity list into one (listing id, amenity) row.

    Rows whose ``id`` cannot be converted to an integer, or is below 1, are
    skipped; so are rows whose ``amenities`` value is not iterable. A message
    is printed for the unconvertible-id and non-iterable cases.

    Args:
        listings: Frame with an ``id`` column and an ``amenities`` column
            holding an iterable of amenity names per row.

    Returns:
        DataFrame with columns ``id`` and ``Amenity``.
    """
    ListingID = []
    AmenityID = []
    for raw_id, amenities in zip(listings['id'], listings['amenities']):
        # Only the failures int() can raise are handled; the original used a
        # bare `except:` that would also have hidden unrelated errors.
        try:
            listing_id = int(raw_id)
        except (TypeError, ValueError, OverflowError):
            print('listing id null')
            continue
        if listing_id < 1:
            continue

        try:
            amenity_names = list(amenities)
        except TypeError:
            # e.g. a missing value (NaN) instead of a list
            print('No amenties found')
            continue

        ListingID.extend([listing_id] * len(amenity_names))
        AmenityID.extend(amenity_names)

    data = {'id': ListingID, 'Amenity': AmenityID}
    listingsAmenity = pd.DataFrame(data)
    return listingsAmenity

In [None]:
# Expand the amenities to (listing id, amenity) pairs and write their edges.
listingsAmenity = ListingAmenities(listings)
col_name = 'Amenity'
listingAmenities = listingsAmenity.loc[:, ['id', col_name]]
ListingEdges(neoListings, neoAmenities, listingAmenities, col_name,
             type='HAS_AMENITY', id_col='aId:ID(Amenity)')