### Script
- **Input:** Airbnb - listings, reviews, single location.
- **Output:** Embedded vectors of listings, reviews for traditional recommendation and vector-based 
              airbnb listing search.
    
- 5.5k listings | 250k reviews

In [38]:
!pip install gensim
import pandas as pd
import random
from tqdm import tqdm
import gensim
from gensim.models import Word2Vec 
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

[0m

In [39]:
root = './RASA_realtime_recommendation/data/'
model_root='./RASA_realtime_recommendation/offline_models/'
processed = './Data/processing/Processed_Airbnb/'
raw = './Data/raw/'

def create_embeddings(text_corpus_listing, name):
    processed_text_corpus = preprocess_documents(text_corpus_listing)
    tagged_text_corpus = [TaggedDocument(d, [i]) for i, d in enumerate(processed_text_corpus)]
    text_corpus_model = Doc2Vec(tagged_text_corpus, dm=0, vector_size=200, window=2, min_count=1, epochs=100, hs=1)

    text_corpus_model.save(root+'embeddings/'+ name +'_embeddings')

    return text_corpus_model

In [40]:
listings = pd.read_csv(raw+'listings.csv.gz', sep=',')
reviews = pd.read_csv(processed+'ratings_filter.csv', sep=',')
listings=listings.rename(columns={"id": "listing_id"})
listings.to_csv(processed+'listings.csv.gz', sep=',')

In [42]:
listings.head()

Unnamed: 0,listing_id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2818,https://www.airbnb.com/rooms/2818,20211104024252,2021-11-04,Quiet Garden View Room & Super Fast WiFi,Quiet Garden View Room & Super Fast WiFi<br />...,"Indische Buurt (""Indies Neighborhood"") is a ne...",https://a0.muscache.com/pictures/10272854/8dcc...,3159,https://www.airbnb.com/users/show/3159,...,4.98,4.68,4.81,0363 5F3A 5684 6750 D14D,t,1,0,1,0,2.83
1,20168,https://www.airbnb.com/rooms/20168,20211104024252,2021-11-04,Studio with private bathroom in the centre 1,17th century Dutch townhouse in the heart of t...,Located just in between famous central canals....,https://a0.muscache.com/pictures/69979628/fd6a...,59484,https://www.airbnb.com/users/show/59484,...,4.62,4.87,4.49,0363 CBB3 2C10 0C2A 1E29,t,2,0,2,0,3.57
2,27886,https://www.airbnb.com/rooms/27886,20211104024252,2021-11-04,"Romantic, stylish B&B houseboat in canal district",Stylish and romantic houseboat on fantastic hi...,"Central, quiet, safe, clean and beautiful.",https://a0.muscache.com/pictures/02c2da9d-660e...,97647,https://www.airbnb.com/users/show/97647,...,4.92,4.9,4.8,0363 974D 4986 7411 88D8,t,1,0,1,0,2.13
3,28871,https://www.airbnb.com/rooms/28871,20211104024252,2021-11-04,Comfortable double room,<b>The space</b><br />In a monumental house ri...,"Flower market , Leidseplein , Rembrantsplein",https://a0.muscache.com/pictures/160889/362340...,124245,https://www.airbnb.com/users/show/124245,...,4.94,4.97,4.82,0363 607B EA74 0BD8 2F6F,f,2,0,2,0,4.62
4,29051,https://www.airbnb.com/rooms/29051,20211104024252,2021-11-04,Comfortable single room,This room can also accomodate 2 people. For a...,the street is quite lively especially on weeke...,https://a0.muscache.com/pictures/162009/bd6be2...,124245,https://www.airbnb.com/users/show/124245,...,4.9,4.86,4.75,0363 607B EA74 0BD8 2F6F,f,2,0,2,0,5.52


### User Review Embeddings

In [43]:
reviews = reviews.dropna()
print("No.of.reviews:"+str(len(reviews)))
users = reviews["reviewer_id"].unique().tolist()
print("No.of.users:"+str(len(users)))

No.of.reviews:247258
No.of.users:240950


In [44]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247258 entries, 0 to 254720
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   listing_id     247258 non-null  object 
 1   id             247258 non-null  float64
 2   date           247258 non-null  object 
 3   reviewer_id    247258 non-null  float64
 4   reviewer_name  247258 non-null  object 
 5   comments       247258 non-null  object 
 6   rating         247258 non-null  float64
dtypes: float64(3), object(4)
memory usage: 15.1+ MB


In [45]:
comment_corpus_model = create_embeddings(reviews['comments'].values,'review')

In [46]:
new_doc = gensim.parsing.preprocessing.preprocess_string("private room dishwasher")
test_doc_vector = comment_corpus_model.infer_vector(new_doc)
sims = comment_corpus_model.docvecs.most_similar(positive = [test_doc_vector])
topK = 5
for s in sims[:topK]:
    print(f"{(s[1])} | {reviews['listing_id'].iloc[s[0]]}")

0.6776687502861023 | 35632344
0.6480473875999451 | 23626417
0.6259909868240356 | 37927536
0.6101545691490173 | 25310429
0.6100313067436218 | 21978969


  sims = comment_corpus_model.docvecs.most_similar(positive = [test_doc_vector])


### Listing word embeddings

In [47]:
listings = pd.read_csv(raw+'listings.csv.gz', sep=',')
print("No.of.listings:"+str(len(listings)))
listings.info()

No.of.listings:5402
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5402 entries, 0 to 5401
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            5402 non-null   int64  
 1   listing_url                                   5402 non-null   object 
 2   scrape_id                                     5402 non-null   int64  
 3   last_scraped                                  5402 non-null   object 
 4   name                                          5402 non-null   object 
 5   description                                   5392 non-null   object 
 6   neighborhood_overview                         3908 non-null   object 
 7   picture_url                                   5402 non-null   object 
 8   host_id                                       5402 non-null   int64  
 9   host_url                                   

In [49]:
# choose columns to generate embeddings.
listings = listings[['id','listing_url','name','description','neighborhood_overview','picture_url', 
'property_type','room_type','accommodates','bathrooms','bathrooms_text',                               
'bedrooms','beds','amenities','price','minimum_nights','maximum_nights','review_scores_rating',                         
'review_scores_accuracy','review_scores_cleanliness','review_scores_checkin',
'review_scores_communication','review_scores_location']]

listings.fillna('0', inplace=True)

listings.reset_index(drop = True, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [50]:
import string
def remove_punc(sample_str):
    # Create translation table in which special charcters
    # are mapped to empty string
    translation_table = str.maketrans('', '', string.punctuation)
    # Remove special characters from the string using translation table
    sample_str = sample_str.translate(translation_table)
    return sample_str

In [51]:
listings['words_features'] = listings['amenities'].apply(remove_punc)

for ind in listings.index:
     listings['review_scores_rating'][ind] = (float(listings['review_scores_rating'][ind]) + float(listings['review_scores_accuracy'][ind]) + float(listings['review_scores_cleanliness'][ind]) + float(listings['review_scores_checkin'][ind]) + float(listings['review_scores_communication'][ind]) + float(listings['review_scores_location'][ind]))
     listings['review_scores_rating'][ind] = (listings['review_scores_rating'][ind])/6
     listings['words_features'][ind] = 'amenities:'+listings['words_features'][ind] +'description:'+  listings['description'][ind] +'neighborhood_overview:'+  listings['neighborhood_overview'][ind]+'property_type:'+  listings['property_type'][ind]+'room_type:'+  listings['room_type'][ind]+'accommodates:'+  str(listings['accommodates'][ind])+'bedrooms:'+  str(listings['bedrooms'][ind])+'beds:'+  str(listings['beds'][ind])+'price range:'+  listings['price'][ind]
listings = listings.rename(columns={"review_scores_rating": "overall_rating"})
listings["overall_rating"].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings['words_features'] = listings['amenities'].apply(remove_punc)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings['review_scores_rating'][ind] = (float(listings['review_scores_rating'][ind]) + float(listings['review_scores_accuracy'][ind]) + float(listings['review_scores_cleanliness'][ind]) + float(listings['review_scores_checkin'][ind]) + float(listings['review_scores_communication'][ind]) + float(listings['review_scores_location'][ind]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://panda

In [52]:
text_corpus_model = create_embeddings(listings['words_features'].values,'list')

new_doc = gensim.parsing.preprocessing.preprocess_string("private room dishwasher")
test_doc_vector = text_corpus_model.infer_vector(new_doc)
sims = text_corpus_model.docvecs.most_similar(positive = [test_doc_vector])
for s in sims:
    print(f"{(s[1])} | {listings['listing_url'].iloc[s[0]]}")

0.46886196732521057 | https://www.airbnb.com/rooms/25991654
0.44940751791000366 | https://www.airbnb.com/rooms/8641456
0.4420224726200104 | https://www.airbnb.com/rooms/49927889
0.42625418305397034 | https://www.airbnb.com/rooms/16685383
0.4252583384513855 | https://www.airbnb.com/rooms/40247038
0.42134037613868713 | https://www.airbnb.com/rooms/51961840
0.41300126910209656 | https://www.airbnb.com/rooms/32729549
0.4020358622074127 | https://www.airbnb.com/rooms/6356312
0.3997498154640198 | https://www.airbnb.com/rooms/26879765
0.39416182041168213 | https://www.airbnb.com/rooms/28220320


  sims = text_corpus_model.docvecs.most_similar(positive = [test_doc_vector])


### Content-based filtering
- https://github.com/SarangDeshmukh7/Recommendation-Engine/blob/master/Content_Based_Filtering.ipynb

In [53]:
users = reviews["reviewer_id"].unique().tolist()
random.shuffle(users)

In [54]:
reviews_listing = pd.merge(listings,reviews)

In [55]:

reviews_listing['listing_id']= reviews_listing['listing_id'].astype(str)

# extract 90% of user ID's
users_train = [users[i] for i in range(round(0.9*len(users)))]
#split data into train and validation set
train_df = reviews_listing[reviews_listing['reviewer_id'].isin(users_train)]
validation_df = reviews_listing[~reviews_listing['reviewer_id'].isin(users_train)]

TypeError: Invalid argument, not a string or column: 216855.0 of type <class 'float'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [None]:
#list to capture watch history of the users
watch_train = []

# populate the list with the movie ID
for i in tqdm(users_train):
    temp = train_df[train_df["reviewer_id"] == i]["listing_id"].tolist()
    watch_train.append(temp)

100%|██████████| 216855/216855 [02:01<00:00, 1778.99it/s]


In [None]:
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, 
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14,min_count=1)

model.build_vocab(watch_train, progress_per=200)


In [None]:

model.train(watch_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)
model.save(model_root+'ContentBasedFilter')

In [None]:
watch = train_df[["listing_id","listing_url","name"]]

# remove duplicates
watch.drop_duplicates(inplace=True, subset='listing_id', keep="last")

# create movie id and tittle dictionary
watch_dict = watch.groupby('listing_id')['listing_id'].apply(list).to_dict()

def similar_watch(v, n = 5):

    # extract most similar movies for the input vector
    ms = model.wv.similar_by_vector(v, topn= n+1)[1:]
    # extract name and similarity score of the similar movies
    new_ms = []
    for j in ms:
        pair = (watch_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  watch.drop_duplicates(inplace=True, subset='listing_id', keep="last")


In [None]:
watch_dict

{'570753': ['570753']}

In [None]:
print(watch_dict['570753'])

['570753']


In [None]:
similar_watch('570753',5)

[]

### Collaborative filtering

- Refer notebook ColaborativeFiltering.ipynb

In [None]:
!pip3 install pyspark

# importing all the libraries we’ll require to build the book recommender
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions  import *
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS,ALSModel

# define the configurations for this Spark program
conf = SparkConf().setMaster("local[*]").setAppName("airbnb")
conf.set("spark.executor.memory", "6G")
conf.set("spark.driver.memory", "2G")
conf.set("spark.executor.cores", "4")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.default.parallelism", "4")

# create a Spark Session instead of a Spark Context
spark = SparkSession.builder \
    .config(conf = conf) \
  .appName("spark session example") \
  .getOrCreate()

listings_df = spark.read.option("delimiter", ",").option("header", "true").csv(raw+'listings.csv.gz')
listings_df.show(2)

listings_df.select('id').distinct().show(2)

user_ratings_df = spark.read.option("delimiter", ",").option("header", "true").csv(processed+'ratings_filter.csv')
# Columns User-ID, ISBN and Book-Rating were in string format, which we convert to int
ratings_df = user_ratings_df.withColumn("reviewer_id",
                                        user_ratings_df['reviewer_id'].\
                                        cast(IntegerType())).\
										withColumn("listing_id", user_ratings_df['listing_id'].\
           								cast(IntegerType())).\
    									withColumn("rating",\
                                        user_ratings_df['rating'].\
                                  		cast(IntegerType())).\
        								na.drop()
ratings_df.show(2)

# define parameters
als = ALS(maxIter=5, regParam=0.01, userCol="reviewer_id", itemCol="listing_id", ratingCol="rating",coldStartStrategy="drop")
#fit the model to the ratings
model = als.fit(ratings_df)

model.save(model_root+"als_model")

user_ratings_df = spark.read.option("delimiter", ",").option("header", "true").csv(processed+'ratings_filter.csv')
# Columns User-ID, ISBN and Book-Rating were in string format, which we convert to int
ratings_df = user_ratings_df.withColumn("reviewer_id",
                                        user_ratings_df['reviewer_id'].\
                                        cast(IntegerType())).\
										withColumn("listing_id", user_ratings_df['listing_id'].\
           								cast(IntegerType())).\
    									withColumn("rating",\
                                        user_ratings_df['rating'].\
                                  		cast(IntegerType())).\
        								na.drop()
ratings_df.show(2)

[0m+--------------------+--------------------+--------------+------------+--------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+----------------+------------------+--------------------+-------------------------+------------------+--------------------+----------------------+-------------+----------------------+----------------------------+--------+---------+-------------+---------+------------+---------+--------------+--------+----+---------+-----+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+---------------------+

Py4JJavaError: An error occurred while calling o212.save.
: java.io.IOException: Path ./RASA_realtime_recommendation/offline_models/als_model already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)


### Help link Embeddings

In [None]:
import json
from typing import Any, Text, Dict, List
import torch
from bert_serving.client import BertClient
from rasa_sdk import Action, Tracker
from rasa_sdk.executor import CollectingDispatcher
import numpy as np
from sentence_transformers import SentenceTransformer

# sentence embedding selection
sentence_transformer_select=True
pretrained_model='stsb-roberta-large' # Refer: https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
score_threshold = 0.70  # This confidence scores can be adjusted based on your need!!

In [None]:

def encode_standard_question(sentence_transformer_select=True, pretrained_model='bert-base-nli-mean-tokens'):
    """
    This will encode all the questions available in question database into sentence embedding. The result will be stored into numpy array for comparision purpose.
    """
    if sentence_transformer_select:
        bc = SentenceTransformer(pretrained_model)
    else:
        bc = BertClient(check_version=False)
    data = json.load(open(raw+"/faq.json", "rt", encoding="utf-8"))
    standard_questions = [each['q'].replace('-',' ') for each in data]
    print("Standard question size", len(standard_questions))
    print("Start to calculate encoder....")
    if sentence_transformer_select:
        standard_questions_encoder = torch.tensor(bc.encode(standard_questions)).numpy()
    else:
        standard_questions_encoder = bc.encode(standard_questions)
    np.save(root+"embeddings/questions_embedding", standard_questions_encoder)
    standard_questions_encoder_len = np.sqrt(np.sum(standard_questions_encoder * standard_questions_encoder, axis=1))
    np.save(root+"embeddings/questions_embedding_len", standard_questions_encoder_len)


encode_standard_question(sentence_transformer_select,pretrained_model)

Standard question size 86
Start to calculate encoder....


In [None]:
import time
print(str(time.time()))