In [1]:
import pandas as pd
import gensim
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# References:
- https://rasa.com/docs/rasa/testing-your-assistant/


In [2]:
root = '/Users/sudhavijayakumar/Documents/299/299A-SMARTRec/RASA/data/'

def create_embeddings(text_corpus_listing, name):
    processed_text_corpus = preprocess_documents(text_corpus_listing)
    tagged_text_corpus = [TaggedDocument(d, [i]) for i, d in enumerate(processed_text_corpus)]
    text_corpus_model = Doc2Vec(tagged_text_corpus, dm=0, vector_size=200, window=2, min_count=1, epochs=100, hs=1)

    text_corpus_model.save(root+'embeddings/'+ name +'_embeddings')

    return text_corpus_model

### User Review Embeddings

In [3]:
reviews = pd.read_csv(root+'reviews.csv.gz', sep=',')


In [4]:
reviews = reviews.dropna()

In [5]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 396896 entries, 0 to 397184
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     396896 non-null  int64 
 1   id             396896 non-null  int64 
 2   date           396896 non-null  object
 3   reviewer_id    396896 non-null  int64 
 4   reviewer_name  396896 non-null  object
 5   comments       396896 non-null  object
dtypes: int64(3), object(3)
memory usage: 21.2+ MB


In [6]:
comment_corpus_model = create_embeddings(reviews['comments'].values,'review')

new_doc = gensim.parsing.preprocessing.preprocess_string("private room dishwasher")
test_doc_vector = comment_corpus_model.infer_vector(new_doc)
sims = comment_corpus_model.docvecs.most_similar(positive = [test_doc_vector])
for s in sims:
    print(f"{(s[1])} | {reviews['listing_id'].iloc[s[0]]}")

  sims = comment_corpus_model.docvecs.most_similar(positive = [test_doc_vector])


0.6321595907211304 | 20498694
0.631978452205658 | 27713691
0.6249212622642517 | 35632344
0.6225887537002563 | 21486402
0.6206820011138916 | 23649021
0.6123455762863159 | 31136615
0.612194299697876 | 30041745
0.6087973713874817 | 21979569
0.6086391806602478 | 21327054
0.6083998084068298 | 25765951


### Listing embeddings

In [29]:
listings = pd.read_csv(root+'listings.csv.gz', sep=',')
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16116 entries, 0 to 16115
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            16116 non-null  int64  
 1   listing_url                                   16116 non-null  object 
 2   scrape_id                                     16116 non-null  int64  
 3   last_scraped                                  16116 non-null  object 
 4   name                                          16086 non-null  object 
 5   description                                   15893 non-null  object 
 6   neighborhood_overview                         10405 non-null  object 
 7   picture_url                                   16116 non-null  object 
 8   host_id                                       16116 non-null  int64  
 9   host_url                                      16116 non-null 

In [30]:
# drop columns that have no value to our recommendation
listings = listings[['id','listing_url','name','description','neighborhood_overview','picture_url', 
'property_type','room_type','accommodates','bathrooms','bathrooms_text',                               
'bedrooms','beds','amenities','price','minimum_nights','maximum_nights','review_scores_rating',                         
'review_scores_accuracy','review_scores_cleanliness','review_scores_checkin',
'review_scores_communication','review_scores_location']]

listings.fillna('0', inplace=True)

listings.reset_index(drop = True, inplace = True)

In [31]:
import string
def remove_punc(sample_str):
    # Create translation table in which special charcters
    # are mapped to empty string
    translation_table = str.maketrans('', '', string.punctuation)
    # Remove special characters from the string using translation table
    sample_str = sample_str.translate(translation_table)
    return sample_str

In [33]:
listings['words_features'] = listings['amenities'].apply(remove_punc)

for ind in listings.index:
     listings['review_scores_rating'][ind] = (float(listings['review_scores_rating'][ind]) + float(listings['review_scores_accuracy'][ind]) + float(listings['review_scores_cleanliness'][ind]) + float(listings['review_scores_checkin'][ind]) + float(listings['review_scores_communication'][ind]) + float(listings['review_scores_location'][ind]))
     listings['review_scores_rating'][ind] = (listings['review_scores_rating'][ind])/6
     listings['words_features'][ind] = 'amenities:'+listings['words_features'][ind] +'description:'+  listings['description'][ind] +'neighborhood_overview:'+  listings['neighborhood_overview'][ind]+'property_type:'+  listings['property_type'][ind]+'room_type:'+  listings['room_type'][ind]+'accommodates:'+  str(listings['accommodates'][ind])+'bedrooms:'+  str(listings['bedrooms'][ind])+'beds:'+  str(listings['beds'][ind])+'price range:'+  listings['price'][ind]
listings = listings.rename(columns={"review_scores_rating": "overall_rating"})
listings["overall_rating"].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings['review_scores_rating'][ind] = (float(listings['review_scores_rating'][ind]) + float(listings['review_scores_accuracy'][ind]) + float(listings['review_scores_cleanliness'][ind]) + float(listings['review_scores_checkin'][ind]) + float(listings['review_scores_communication'][ind]) + float(listings['review_scores_location'][ind]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings['review_scores_rating'][ind] = (listings['review_scores_rating'][ind])/6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

In [38]:
text_corpus_model = create_embeddings(listings['words_features'].values,'list')

new_doc = gensim.parsing.preprocessing.preprocess_string("private room dishwasher")
test_doc_vector = text_corpus_model.infer_vector(new_doc)
sims = text_corpus_model.docvecs.most_similar(positive = [test_doc_vector])
for s in sims:
    print(f"{(s[1])} | {listings['listing_url'].iloc[s[0]]}")

0.5167169570922852 | https://www.airbnb.com/rooms/35342963
0.5036830902099609 | https://www.airbnb.com/rooms/45215495
0.4965446889400482 | https://www.airbnb.com/rooms/37290247
0.494057297706604 | https://www.airbnb.com/rooms/49927889
0.4756770133972168 | https://www.airbnb.com/rooms/26949958
0.4725114703178406 | https://www.airbnb.com/rooms/16190593
0.4705186188220978 | https://www.airbnb.com/rooms/21291610
0.47044721245765686 | https://www.airbnb.com/rooms/28379697
0.46984609961509705 | https://www.airbnb.com/rooms/29307841
0.46676674485206604 | https://www.airbnb.com/rooms/18922329


  sims = text_corpus_model.docvecs.most_similar(positive = [test_doc_vector])


### FAQ Embeddings

In [3]:
import json
from typing import Any, Text, Dict, List
import torch
from bert_serving.client import BertClient
from rasa_sdk import Action, Tracker
from rasa_sdk.executor import CollectingDispatcher
import numpy as np
from sentence_transformers import SentenceTransformer

# sentence embedding selection
sentence_transformer_select=True
pretrained_model='stsb-roberta-large' # Refer: https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
score_threshold = 0.70  # This confidence scores can be adjusted based on your need!!

In [4]:

def encode_standard_question(sentence_transformer_select=True, pretrained_model='bert-base-nli-mean-tokens'):
    """
    This will encode all the questions available in question database into sentence embedding. The result will be stored into numpy array for comparision purpose.
    """
    if sentence_transformer_select:
        bc = SentenceTransformer(pretrained_model)
    else:
        bc = BertClient(check_version=False)
    data = json.load(open("/Users/sudhavijayakumar/Documents/GitHub/CMPE252_StayRec/data/faq.json", "rt", encoding="utf-8"))
    standard_questions = [each['q'].replace('-',' ') for each in data]
    print("Standard question size", len(standard_questions))
    print("Start to calculate encoder....")
    if sentence_transformer_select:
        standard_questions_encoder = torch.tensor(bc.encode(standard_questions)).numpy()
    else:
        standard_questions_encoder = bc.encode(standard_questions)
    np.save("/Users/sudhavijayakumar/Documents/GitHub/CMPE252_StayRec/data/questions_embedding", standard_questions_encoder)
    standard_questions_encoder_len = np.sqrt(np.sum(standard_questions_encoder * standard_questions_encoder, axis=1))
    np.save("/Users/sudhavijayakumar/Documents/GitHub/CMPE252_StayRec/data/questions_embedding_len", standard_questions_encoder_len)


encode_standard_question(sentence_transformer_select,pretrained_model)

Standard question size 86
Start to calculate encoder....


#### Create nlu 

In [None]:
import pandas as pd

#FAQ.json file generated by: /Users/sudhavijayakumar/Documents/299/299A-SMARTRec/QA DataScrappers/Get_AirbnbData.ipynb
df = pd.read_json('/Users/sudhavijayakumar/Documents/GitHub/CMPE252_StayRec/data/faq.json',orient='records')

nlu_yml_file = open('/Users/sudhavijayakumar/Documents/GitHub/CMPE252_StayRec/data/nlu.yml', 'a')
nlu_yml_file.write('\n\n- intent: faq')
nlu_yml_file.write('\n  examples: |')

for ind in df.index:
    nlu_yml_file.write('\n    - Need help about '+(df['q'][ind]).replace('-',' '))

nlu_yml_file.close()