### Use Mistral to extract keywords from a passage

In [None]:
import ollama

# Sample review used to try out keyword extraction on a single passage.
comment = "Me and two friends stayed for four and a half months. It was a great place to stay!  The apartment was very comfortable and I really enjoyed having the park with running path across the street. The only downside was it was not within walking distance to restaurants."

# `stream` is an argument of ollama.chat() itself, not a field of a chat
# message, so it is passed as a keyword here instead of inside the message dict.
response = ollama.chat(
    model='mistral',
    messages=[
        {
            'role': 'user',
            'content': 'Can you extract keywords and phrases from this passage and put them into a comma separated list: ' + comment,
        },
    ],
    stream=False,
)

# With streaming off, the reply text is read from response['message']['content'].
print(response['message']['content'])

### Extract keywords from Airbnb user reviews using Mistral and output them to a file

In [None]:
import sys
import datetime
import json
import os
import re

import pandas as pd
import numpy as np

import ollama
import time

# Load the reviews export; lines=True makes pandas parse it as
# newline-delimited JSON (one record per line).
df_reviews = pd.read_json(
    '/Data/Airbnb/LA_Dec_2023/reviews-20240216.ndjson',
    lines=True,
)

# Quick look at what was loaded.
print(df_reviews.columns)  # column labels
print(df_reviews.shape)  # (number of rows, number of columns)
print(df_reviews['comments'].iloc[11])  # one sample review text

def extractKeyPhrases(comments):
    """Ask the Mistral model (through Ollama) for the key phrases in a review.

    Args:
        comments: Review text. Any value that is not a non-blank string
            (``None``, ``NaN``, ``""``, whitespace only) is treated as
            "no review".

    Returns:
        The model's raw text reply, which the prompt asks to be a
        comma-separated list of keywords and phrases. Returns ``""`` without
        calling the model when there is no review text.
    """
    # A missing review may reach us as None/NaN rather than str; concatenating
    # that to a str raises TypeError, and a blank passage is not worth a call.
    if not isinstance(comments, str) or not comments.strip():
        return ""

    print("comment is: " + comments)

    # Built from adjacent literals so the source indentation does not leak
    # into the prompt, and with a space separating the instructions from the
    # passage itself.
    prompt = (
        'Can you extract keywords and phrases from a passage and put them into '
        'a comma separated list only, without any additional header information. '
        'Please do not add sentences like here is a list. '
        'Also, please omit names of people like Hank or Chas or Maianna. '
        'Here is the passage. ' + comments
    )
    response = ollama.generate(model='mistral', prompt=prompt, stream=False)

    print("response: " + response['response'])
    return response['response']

# Extract key phrases for each review and write one line of phrases per review.
# The context manager closes (and flushes) the file even if a model call
# raises part-way through the loop.
with open("keyphrases.txt", "w") as file:
    for index, row in df_reviews.iterrows():
        # NOTE(review): this skips the row whose index label is 0, so the
        # first review is never processed; confirm that is intended.
        if index > 0:
            phrases = extractKeyPhrases(row['comments'])
            file.write(phrases + "\n")
            # Wait 5 seconds before sending the next review to the model.
            time.sleep(5)

### Connect to Elasticsearch

In [2]:
import getpass

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Prompt for the Elastic Cloud credentials so they are never stored in the notebook.
es_cloud_id = getpass.getpass(prompt="Enter Cloud ID")
es_pwd = getpass.getpass(prompt="Enter Cloud password")

# Create the Elasticsearch client, authenticating as the "elastic" user.
# The 600-second request timeout is shared by every call made through `es`.
es = Elasticsearch(
    cloud_id=es_cloud_id,
    request_timeout=600,
    basic_auth=("elastic", es_pwd),
)

# Display the cluster info as the cell output (also confirms the connection works).
es.info().body

{'name': 'instance-0000000005',
 'cluster_name': 'ae84f52516e54cba9da869c3846ca0c6',
 'cluster_uuid': 'vZednNOIQT-ZJpZKbFq2eA',
 'version': {'number': '8.12.2',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '48a287ab9497e852de30327444b0809e55d46466',
  'build_date': '2024-02-19T10:04:32.774273190Z',
  'build_snapshot': False,
  'lucene_version': '9.9.2',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

### Ingest extracted keywords into Elasticsearch for analysis

In [None]:
# Read every key-phrase line into memory. The context manager closes the file
# as soon as the lines have been read.
# NOTE(review): the extraction cell writes "keyphrases.txt" to the working
# directory, while this reads from /Data/Airbnb/LA_Dec_2023/ - confirm the file
# is moved there between the two steps.
with open('/Data/Airbnb/LA_Dec_2023/keyphrases.txt', 'r') as file:
  Lines = file.readlines()

def generator(lines=None):
  """Yield one Elasticsearch bulk action per extracted key phrase.

  Each input line is expected to hold a comma-separated list of phrases. If a
  line contains a colon, everything up to and including the first colon is
  discarded before splitting (this removes a "Here is the list:" style header,
  but also any phrase text that precedes a colon).

  Args:
    lines: Iterable of text lines to parse. Defaults to the module-level
      ``Lines`` read from the key-phrase file.

  Yields:
    Dicts of the form ``{"_index": "extracted_key_phrases", "phrase": <str>}``,
    one per non-empty phrase.
  """
  if lines is None:
    lines = Lines

  for line in lines:
    line = line.strip()
    if not line:
      continue

    if ':' in line:
      line = line.split(':', 1)[1]

    for phrase in line.split(','):
      phrase = phrase.strip()
      # Trailing or doubled commas produce empty fragments; indexing those
      # would create documents with an empty "phrase".
      if not phrase:
        continue
      yield {
        "_index": "extracted_key_phrases",
        "phrase": phrase
      }
        
# Call close(): the bare attribute reference `file.close` never closed the file.
file.close()

# Send every generated action to Elasticsearch through the bulk helper.
# Any failure is printed rather than raised so the notebook keeps running.
try:
  res = bulk(client=es, actions=generator())
  print("Response: ", res)
except Exception as bulk_error:
  print(bulk_error)

### Ingest Airbnb listings and tag them with the extracted keywords using a zero-shot classification model.

Ingest listings and tag them with extracted keywords using zero shot classification. The model we are using is `typeform__distilbert-base-uncased-mnli`.

The following section loads the model into Elasticsearch using [eland](https://github.com/elastic/eland)

In [None]:
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel

import logging
import tempfile

# Log line layout: timestamp, level name, then the message.
LOG_FORMAT = '%(asctime)s %(levelname)s : %(message)s'

logging.basicConfig(format=LOG_FORMAT)

# Logger for this notebook; INFO level so the model-loading progress
# messages are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Base URL of the Hugging Face model hub.
MODEL_HUB_URL = "https://huggingface.co"

def load_model(model_id, task_type):
  """Import a Hugging Face model into Elasticsearch and start it, unless it is already there.

  The model is wrapped with eland's ``TransformerModel`` and saved into a
  temporary directory. If Elasticsearch already reports a trained model under
  the corresponding id, nothing else happens; otherwise the saved model is
  imported and its deployment is started.

  Args:
    model_id: Hugging Face model identifier passed to ``TransformerModel``.
    task_type: Task type passed to ``TransformerModel``.

  Returns:
    None.
  """
  with tempfile.TemporaryDirectory() as work_dir:
    logger.info(f"Loading HuggingFace transformer tokenizer and model [{model_id}] for task [{task_type}]" )

    transformer = TransformerModel(model_id=model_id, task_type=task_type)
    model_path, config, vocab_path = transformer.save(work_dir)

    es_model = PyTorchModel(es, transformer.elasticsearch_model_id())

    # ignore_status=404 makes a "not found" answer a normal response rather
    # than an exception, so the HTTP status tells us whether the model exists.
    lookup = es.options(ignore_status=404).ml.get_trained_models(model_id=es_model.model_id)
    if lookup.meta.status == 200:
      logger.info("Model has already been imported")
      return

    logger.info("Importing model")
    es_model.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)
    logger.info("Starting model deployment")
    es_model.start()
    logger.info(f"Model successfully imported with id '{es_model.model_id}'")
# Import (or reuse) the zero-shot classification model.
load_model(
  model_id="typeform/distilbert-base-uncased-mnli",
  task_type="zero_shot_classification",
)

# Fetch the stored model so the cell output shows how it was loaded.
es.ml.get_trained_models(model_id="typeform__distilbert-base-uncased-mnli").body



Ingest listings into Elasticsearch using an ingest pipeline

In [None]:

# Directory holding the listings export. The default is the path this notebook
# was written against; set the AIRBNB_DATA_DIR environment variable to run it
# on another machine without editing the cell.
AIRBNB_DATA_DIR = os.environ.get(
    'AIRBNB_DATA_DIR',
    '/Users/sherryger/Documents/Data/Airbnb/LA_Dec_2023',
)

df_listings = pd.read_json(
    os.path.join(AIRBNB_DATA_DIR, 'listings-small-20240216.ndjson'),
    lines=True,
)
df_listings = df_listings.dropna()  # drop every row that has a null in any column

# Document fields in the order they are emitted. All are copied straight from
# the listing row except "price", "listing_location" and "host_response_rate",
# which _listing_field_value() derives.
_LISTING_DOC_FIELDS = (
    "review_scores_communication",
    "scrape_id",
    "picture_url",
    "number_of_reviews_l30d",
    "source",
    "host_since",
    "availability_365",
    "number_of_reviews_ltm",
    "first_review",
    "calculated_host_listings_count_shared_rooms",
    "description",
    "bathrooms",
    "bathrooms_text",
    "maximum_maximum_nights",
    "availability_90",
    "host_about",
    "room_type",
    "last_scraped",
    "property_type",
    "beds",
    "maximum_minimum_nights",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count",
    "name",
    "host_listings_count",
    "maximum_nights_avg_ntm",
    "license",
    "neighbourhood",
    "price",
    "has_availability",
    "review_scores_rating",
    "review_scores_location",
    "host_response_time",
    "host_is_superhost",
    "host_neighbourhood",
    "neighbourhood_cleansed",
    "listing_url",
    "host_has_profile_pic",
    "review_scores_accuracy",
    "host_location",
    "listing_location",
    "neighborhood_overview",
    "host_picture_url",
    "host_total_listings_count",
    "review_scores_checkin",
    "host_response_rate",
    "host_url",
    "neighbourhood_group_cleansed",
    "bedrooms",
    "maximum_nights",
    "availability_60",
    "calculated_host_listings_count_entire_homes",
    "reviews_per_month",
    "host_id",
    "host_name",
    "accommodates",
    "availability_30",
    "instant_bookable",
    "minimum_nights",
    "calendar_updated",
    "calendar_last_scraped",
    "number_of_reviews",
    "last_review",
    "review_scores_value",
    "host_acceptance_rate",
    "host_thumbnail_url",
    "host_identity_verified",
    "amenities",
    "minimum_nights_avg_ntm",
    "review_scores_cleanliness",
    "id",
    "host_verifications",
    "minimum_minimum_nights",
    "minimum_maximum_nights",
)


def _listing_field_value(row, field, price):
    """Return the document value for `field`, deriving the three non-copied fields."""
    if field == "price":
        return price
    if field == "listing_location":
        return {"lat": float(row["latitude"]), "lon": float(row["longitude"])}
    if field == "host_response_rate":
        return row["host_response_rate"].strip('%')
    return row[field]


def generator():
    """Yield one bulk action per listing in `df_listings`.

    Rows whose index label is not greater than 0, or whose
    "neighborhood_overview" is empty, are skipped. Every yielded dict carries
    "_index" ("airbnb_listings_zero_shot") and "pipeline"
    ("airbnb-reviews-zero-class") followed by the listing fields. The price has
    "$" and "," removed and is converted to float, latitude/longitude are
    combined into "listing_location", and "%" is stripped from the ends of
    "host_response_rate".
    """
    for index, row in df_listings.iterrows():
        if index > 0 and len(row["neighborhood_overview"]) > 0:
            new_price = float(row['price'].replace('$', '').replace(',', ''))

            action = {
                "_index": "airbnb_listings_zero_shot",
                "pipeline": "airbnb-reviews-zero-class",
            }
            # Fields are filled in their declared order, one row lookup at a time.
            action.update(
                (field, _listing_field_value(row, field, new_price))
                for field in _LISTING_DOC_FIELDS
            )
            yield action

# Index every listing action through the bulk helper. Any failure is printed
# rather than raised so the notebook keeps running.
try:
    res = bulk(client=es, actions=generator())
    print("Response: ", res)
except Exception as bulk_error:
    print(bulk_error)

### Querying using BM25 with the Mistral extracted keywords

In [None]:
# Keyword (BM25) query:
#   must   - cleanliness review score of at least 4.5
#   filter - each listed label field must be at least 0.7
#   should - optional match on "beach" in the neighborhood overview
query = {
    "bool": {
        "must": [
            {"range": {"review_scores_cleanliness": {"gte": 4.5}}},
        ],
        "filter": [
            {"range": {label: {"gte": 0.7}}}
            for label in ("comfortable", "entertainment", "walking distance")
        ],
        "should": [
            {"match": {"neighborhood_overview": "beach"}},
        ],
    }
}

# Run the query and request only the fields that are printed below.
resp = es.search(index="airbnb_listings_elser", query=query, size=5,
  _source=["review_scores_cleanliness", "comfortable", "walking distance", "entertainment", "neighborhood_overview"]
)

for hit in resp['hits']['hits']:
    source = hit['_source']
    # .get() is used because a hit whose document lacks one of the requested
    # fields has no such key in _source; it then prints as None rather than
    # raising KeyError.
    overview = source.get('neighborhood_overview')
    walking_distance = source.get('walking distance')
    cleanliness_score = source.get('review_scores_cleanliness')
    comfort_score = source.get('comfortable')
    entertainment_score = source.get('entertainment')
    print(f"Overview: {overview}\nWalking Distance: {walking_distance}\nCleanliness: {cleanliness_score}\nComfort: {comfort_score}\nEntertainment: {entertainment_score}\n")

### Semantic search with ELSER

In Elasticsearch, we converted the neighborhood description into a sparse vector so that we can perform semantic search. ELSER is a language model based on BERT that is designed to work with out-of-domain corpora. The following section demonstrates semantic search on the neighborhood description field.


In [None]:
# Free-text question typed by the user.
query_text = input("Enter a question :")
print('\n')

# text_expansion query: the question is expanded by the ELSER model and matched
# against the "neighborhood_overview_embedding" field.
query = {
    "text_expansion": {
        "neighborhood_overview_embedding": {
            "model_id": ".elser_model_2",
            "model_text": query_text,
        },
    },
}

# Top 5 hits, returning only the fields that are printed afterwards.
resp = es.search(
    index="airbnb_listings_elser",
    query=query,
    size=5,
    _source=["review_scores_cleanliness", "comfortable", "walking distance", "entertainment", "neighborhood_overview"],
)

for hit in resp['hits']['hits']:
    source = hit['_source']
    # Nothing in the semantic query requires these fields to exist, and a hit
    # whose document lacks a requested field has no such key in _source, so
    # .get() prints None for it rather than raising KeyError.
    overview = source.get('neighborhood_overview')
    walking_distance = source.get('walking distance')
    cleanliness_score = source.get('review_scores_cleanliness')
    comfort_score = source.get('comfortable')
    entertainment_score = source.get('entertainment')
    print(f"Overview: {overview}\nWalking Distance: {walking_distance}\nCleanliness: {cleanliness_score}\nComfort: {comfort_score}\nEntertainment:  {entertainment_score}\n")

### Combining BM25 and Semantic search for more relevant results

In [None]:
# Free-text question typed by the user.
query_text = input("Enter a question :")
print('\n')

# Hybrid query: two "should" clauses are combined in one bool query.
#   1. ELSER text_expansion on the neighborhood overview embedding.
#   2. A structured clause: cleanliness of at least 4.5, the three label fields
#      at least 0.7 each, and within 200m of the given coordinates.
query = {
    "bool": {
        "should": [
            {
                "text_expansion": {
                    "neighborhood_overview_embedding": {
                        "model_id": ".elser_model_2",
                        "model_text": query_text,
                    },
                },
            },
            {
                "bool": {
                    "must": [
                        {"range": {"review_scores_cleanliness": {"gte": 4.5}}},
                    ],
                    "filter": [
                        {"range": {label: {"gte": 0.7}}}
                        for label in ("comfortable", "entertainment", "walking distance")
                    ] + [
                        {
                            "geo_distance": {
                                "distance": "200m",
                                "listing_location": {"lat": 34.02, "lon": -118.52},
                            },
                        },
                    ],
                },
            },
        ],
    },
}

# Top 5 hits, returning only the fields that are printed afterwards.
resp = es.search(
    index="airbnb_listings_elser",
    query=query,
    size=5,
    _source=["review_scores_cleanliness", "comfortable", "walking distance", "entertainment", "neighborhood_overview", "listing_location"],
)

for hit in resp['hits']['hits']:
    source = hit['_source']
    # Both clauses of the hybrid query are optional ("should"), so a hit may
    # lack any of these fields; a missing field has no key in _source, and
    # .get() prints None for it rather than raising KeyError.
    overview = source.get('neighborhood_overview')
    walking_distance = source.get('walking distance')
    cleanliness_score = source.get('review_scores_cleanliness')
    comfort_score = source.get('comfortable')
    entertainment_score = source.get('entertainment')
    listing_location = source.get('listing_location')
    print(f"Overview: {overview}\nWalking Distance: {walking_distance}\nLocation: {listing_location}\nCleanliness: {cleanliness_score}\nComfort: {comfort_score}\nEntertainment: {entertainment_score}\n")