In [1]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
sys.path.append('helper_functions/')
# import util as util
import util

In [2]:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")

In [3]:
db = client.airbnb

print('The list of all databases currently in the MongoDB client is:')
print(client.list_database_names())

print('\nThe list of all collections in the airbnb database is:')
print(db.list_collection_names())

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'company', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['reviewsm', 'calendar', 'listings', 'listings_test', 'listings_with_calendar', 'listings3', 'listings_with_reviewsm', 'listings_with_reviews', 'reviews', 'listings_with_reviews_and_cal', 'listings_with_reviews_m']


In [4]:
superlative_words = [
    'astounding', 'amazing', 'awesome', 'excellent', 'exceptional',
    'extraordinary', 'fantastic', 'great', 'magnificent', 'splendid', 'wonderful'
]

super_negative_words = ['aweful', 'horrible', 'terrible']

In [5]:
def time_diff(time1, time2):
    return (time2-time1).total_seconds()

def create_query_condition(words):
    return {"reviews.comments": {"$regex": "|".join(words), "$options": "i"}}  

In [6]:
# 1. Query 5 pos
query_5_pos_condition = create_query_condition(superlative_words)
print(f'Query 5 pos condition: {query_5_pos_condition}')

Query 5 pos condition: {'reviews.comments': {'$regex': 'astounding|amazing|awesome|excellent|exceptional|extraordinary|fantastic|great|magnificent|splendid|wonderful', '$options': 'i'}}


In [7]:
time1 = datetime.now()
result_5_pos = db.listings_with_reviews_and_cal.find(query_5_pos_condition)
time2 = datetime.now()
print(f'The time taken for the selection was {time_diff(time1, time2)} seconds.')

time3 = datetime.now()
listings_5_pos = list(result_5_pos)
time4 = datetime.now()
print(f'The time taken to create the list was {time_diff(time3, time4)} seconds.')

print(f'Number of listings (Query 5 pos): {len(listings_5_pos)}')

The time taken for the selection was 4.4e-05 seconds.
The time taken to create the list was 21.360767 seconds.
Number of listings (Query 5 pos): 25196


In [8]:
query_5_pos_info = {
    "query": "Query 5 pos",
    "count": len(listings_5_pos),
    "selection_time": time_diff(time1, time2),
    "list_creation_time": time_diff(time3, time4),
}

print(query_5_pos_info)

{'query': 'Query 5 pos', 'count': 25196, 'selection_time': 4.4e-05, 'list_creation_time': 21.360767}


In [9]:
# 2. Query 5 neg
query_5_neg_condition = create_query_condition(super_negative_words)

time1 = datetime.now()
result_5_neg = db.listings_with_reviews_and_cal.find(query_5_neg_condition)
time2 = datetime.now()
print(f'The time taken for the selection was {time_diff(time1, time2)} seconds.')

time3 = datetime.now()
listings_5_neg = list(result_5_neg)
time4 = datetime.now()
print(f'The time taken to create the list was {time_diff(time3, time4)} seconds.')

print(f'Number of listings (Query 5 neg): {len(listings_5_neg)}')

The time taken for the selection was 0.0002 seconds.
The time taken to create the list was 9.986688 seconds.
Number of listings (Query 5 neg): 1672


In [10]:
# 3. Query 6 pos

def convert_date_str_to_datetime(dt):
    if dt is None:
        return None
    elif pd.isnull(dt):  
        return None
    elif dt != dt:
        return None        
    elif dt == '':
        return None
    else:
        year = int(dt[0:4])
        month = int(dt[5:7])
        day = int(dt[8:10])
        # print(year, month, day)
        temp = datetime(year, month, day)
        ts = temp.timestamp()
        new_dt = datetime.fromtimestamp(ts)
        return new_dt

availability = convert_date_str_to_datetime('2025-02-01')


# Query 6 pos: Extend Query 5 pos by adding two more conditions
query_6_pos_condition = {
    "$and": [
        query_5_pos_condition,
        {"average_price": {"$lte": 200}},
        {"last_available_date": {"$gte": availability}}
    ]
}

time1 = datetime.now()
result_6_pos = db.listings_with_reviews_and_cal.find(query_6_pos_condition)
time2 = datetime.now()
print(f'The time taken for the selection was {time_diff(time1, time2)} seconds.')

time3 = datetime.now()
listings_6_pos = list(result_6_pos)
time4 = datetime.now()
print(f'The time taken to create the list was {time_diff(time3, time4)} seconds.')

print(f'Number of listings (Query 6 pos): {len(listings_6_pos)}')

The time taken for the selection was 0.000156 seconds.
The time taken to create the list was 24.142911 seconds.
Number of listings (Query 6 pos): 19228


In [11]:
query_6_neg_condition = {
    "$and": [
        query_5_neg_condition,
        {"average_price": {"$lte": 200}},
        {"last_available_date": {"$gte": availability}}
    ]
}

time1 = datetime.now()
result_6_neg = db.listings_with_reviews_and_cal.find(query_6_neg_condition)
time2 = datetime.now()
print(f'The time taken for the selection was {time_diff(time1, time2)} seconds.')

time3 = datetime.now()
listings_6_neg = list(result_6_neg)
time4 = datetime.now()
print(f'The time taken to create the list was {time_diff(time3, time4)} seconds.')

print(f'Number of listings (Query 6 neg): {len(listings_6_neg)}')

The time taken for the selection was 0.00012 seconds.
The time taken to create the list was 3.424146 seconds.
Number of listings (Query 6 neg): 1281


In [12]:
# 5. Create index
time1 = datetime.now()
index_name = db.listings_with_reviews_and_cal.create_index({"reviews.comments": "text"})
time2 = datetime.now()
index_creation_time = time_diff(time1, time2)
print(f'Index created: {index_name}')
print(f'Time taken to create index: {index_creation_time} seconds')

Index created: reviews.comments_text
Time taken to create index: 19.758829 seconds


In [13]:
cursor = db.listings_with_reviews_and_cal.index_information()
cursor1 = db.listings_with_reviews_and_cal.list_indexes()

print("\nIndex information:")
for i in cursor:
    print(i)
    print()

print("List of indexes:")
for i in cursor1:
    print(i)
    print()



Index information:
_id_

reviews.comments_text

List of indexes:
SON([('v', 2), ('key', SON([('_id', 1)])), ('name', '_id_')])

SON([('v', 2), ('key', SON([('_fts', 'text'), ('_ftsx', 1)])), ('name', 'reviews.comments_text'), ('weights', SON([('reviews.comments', 1)])), ('default_language', 'english'), ('language_override', 'language'), ('textIndexVersion', 3)])



In [14]:
# 6. Query 7 pos
search_string = " ".join(superlative_words)
condition_ind = {'$text': {'$search': search_string}}

In [15]:
time1 = datetime.now()
result_7_pos = db.listings_with_reviews_and_cal.find(condition_ind)
time2 = datetime.now()
print(f'The time taken for the selection was {time_diff(time1, time2)} seconds.')

time3 = datetime.now()
listings_7_pos = list(result_7_pos)
time4 = datetime.now()
print(f'The time taken to create the list was {time_diff(time3, time4)} seconds.')
print(f'Number of listings (Query 7 pos): {len(listings_7_pos)}')

The time taken for the selection was 0.000105 seconds.
The time taken to create the list was 48.490824 seconds.
Number of listings (Query 7 pos): 25197


In [16]:
num_listings_7_pos = len(listings_7_pos)
print(f'Number of listings (Query 7 pos): {num_listings_7_pos}')

query_7_pos_info = {
    "query": "Query 7 pos",
    "count": num_listings_7_pos,
    "selection_time": time_diff(time1, time2),
    "list_creation_time": time_diff(time3, time4)
}

print(query_7_pos_info)

Number of listings (Query 7 pos): 25197
{'query': 'Query 7 pos', 'count': 25197, 'selection_time': 0.000105, 'list_creation_time': 48.490824}


In [17]:
# 7. Query 7 neg
search_string_neg = " ".join(super_negative_words)
condition_ind_neg = {'$text': {'$search': search_string_neg}}

In [18]:
time1 = datetime.now()
result_7_neg = db.listings_with_reviews_and_cal.find(condition_ind_neg)
time2 = datetime.now()
print(f'The time taken for the selection was {time_diff(time1, time2)} seconds.')

time3 = datetime.now()
listings_7_neg = list(result_7_neg)
time4 = datetime.now()
print(f'The time taken to create the list was {time_diff(time3, time4)} seconds.')
print(f'Number of listings (Query 7 neg): {len(listings_7_neg)}')

The time taken for the selection was 0.000123 seconds.
The time taken to create the list was 1.724286 seconds.
Number of listings (Query 7 neg): 1930


In [19]:
num_listings_7_neg = len(listings_7_neg)
print(f'Number of listings (Query 7 neg): {num_listings_7_neg}')

query_7_neg_info = {
    "query": "Query 7 neg",
    "count": num_listings_7_neg,
    "selection_time": time_diff(time1, time2),
    "list_creation_time": time_diff(time3, time4)
}

print(query_7_neg_info)

Number of listings (Query 7 neg): 1930
{'query': 'Query 7 neg', 'count': 1930, 'selection_time': 0.000123, 'list_creation_time': 1.724286}
