In [45]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# With the from-imports above, time, date and datetime are the classes themselves, so e.g. date(2023, 7, 1) works directly
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Add the local helper_functions/ folder to the import path so that its util.py module can be imported
sys.path.append('helper_functions/')
# import util as util
import util

In [46]:
from pymongo import MongoClient

# Connection string for the MongoDB server. Defaults to the local instance;
# set the MONGODB_URI environment variable to point the notebook at another
# server without editing this cell.
client = MongoClient(os.environ.get("MONGODB_URI", "mongodb://localhost:27017/"))

In [47]:
# Handle to the "airbnb" database on the connected client.
db = client["airbnb"]

# Print a heading followed by the corresponding list of names, first for the
# databases on the client and then for the collections in the airbnb database.
for heading, fetch_names in (
    ('The list of all databases currently in the MongoDB client is:',
     client.list_database_names),
    ('\nThe list of all collections in the airbnb database is:',
     db.list_collection_names),
):
    print(heading)
    print(fetch_names())

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'company', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['reviewsm', 'calendar', 'listings_with_reviews_and_cal', 'listings', 'listings_test', 'listings_with_calendar', 'listings3', 'listings_with_reviewsm', 'listings_with_reviews', 'reviews', 'listings_with_reviews_m']


In [48]:
# Print the document count of each of the two collections, one per line.
for collection_name in ('listings_with_reviews', 'listings_with_calendar'):
    print(db[collection_name].count_documents({}))


39202
39201


In [49]:
# Remove any previous result before rebuilding it.
db['listings_with_reviews_and_cal'].drop()

# Fields copied from the joined calendar document onto the listing itself.
calendar_fields = (
    'average_price',
    'first_available_date',
    'last_available_date',
    'dates_list',
)

pipeline = [
    # Attach the documents of listings_with_calendar whose _id equals this
    # listing's id, as the array field cal_docs.
    {
        '$lookup': {
            'from': 'listings_with_calendar',
            'localField': 'id',
            'foreignField': '_id',
            'as': 'cal_docs',
        }
    },
    # Replace the cal_docs array by its element(s); listings with no match
    # are kept rather than dropped.
    {
        '$unwind': {
            'path': '$cal_docs',
            'preserveNullAndEmptyArrays': True,
        }
    },
    # Promote the calendar fields to the top level of the listing.
    {
        '$addFields': {
            field: f'$$ROOT.cal_docs.{field}' for field in calendar_fields
        }
    },
    # The embedded calendar document is no longer needed.
    {'$unset': 'cal_docs'},
    # Output collection
    {'$out': 'listings_with_reviews_and_cal'},
]

# Run the aggregation pipeline
db['listings_with_reviews_m'].aggregate(pipeline)


<pymongo.command_cursor.CommandCursor at 0x14e783490>

In [50]:
# Pretty-print the first five documents of the merged collection.
preview_cursor = db['listings_with_reviews_and_cal'].find().limit(5)
for listing in preview_cursor:
    pprint.pp(listing)

{'_id': ObjectId('66665888dc3e94d5ace49c52'),
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': '40.59179',
 'longitude': '-73.94285',
 'room_type': 'Private room',
 'price': '30',
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': '2024-01-03',
 'reviews_per_month': '0.86',
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'reviews': [{'_id': ObjectId('6666588bdc3e94d5acf43282'),
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'date': '2024-01-03',
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you get what you pay for” '
                          'situation. The place is very dirty, especially the '
       

In [51]:
def _json_scalar(value):
    """Return None for a missing scalar (None/NaN/NaT), otherwise the value unchanged."""
    return None if pd.isna(value) else value


def _json_date(value):
    """Return a 'YYYY-MM-DD' string for a date/datetime value.

    Missing values (None/NaN/NaT) become None; any other value, such as a
    date that is already a string, is passed through unchanged.
    """
    if pd.isna(value):
        return None
    if isinstance(value, (date, datetime)):
        return value.strftime('%Y-%m-%d')
    return value


def convert_lwrc_to_json(doc):
    """Build a JSON-serializable copy of a listings_with_reviews_and_cal document.

    The input document is never modified. In the returned dictionary:

    * ``_id`` (of the listing and of every review) and the review ``id`` are
      converted with ``str``;
    * date/datetime values are formatted as ``'YYYY-MM-DD'`` strings, while
      dates that are already strings are kept as they are;
    * missing values (None/NaN/NaT) become ``None``;
    * keys that are absent from the document (for example the calendar
      fields of a listing that has no calendar entry) are emitted as
      ``None``, and absent ``reviews`` / ``dates_list`` as empty lists.

    Parameters
    ----------
    doc : dict
        A listing document with embedded ``reviews`` and ``dates_list``.

    Returns
    -------
    dict
        A new dictionary whose keys are always the same and in the same
        order, regardless of which values were missing.

    Raises
    ------
    KeyError
        If the listing has no ``_id`` or a review has no ``_id`` / ``id``.
    """
    doc_new = {'_id': str(doc['_id'])}

    for key in ('id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
                'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
                'minimum_nights', 'number_of_reviews'):
        doc_new[key] = _json_scalar(doc.get(key))

    # Always written to doc_new: a missing last_review must still appear in
    # the output (as None) and must not alter the caller's document.
    doc_new['last_review'] = _json_date(doc.get('last_review'))

    for key in ('reviews_per_month', 'calculated_host_listings_count',
                'availability_365', 'number_of_reviews_ltm', 'license',
                'average_price'):
        doc_new[key] = _json_scalar(doc.get(key))

    for key in ('first_available_date', 'last_available_date'):
        doc_new[key] = _json_date(doc.get(key))

    reviews = []
    for review in doc.get('reviews') or []:
        reviews.append({
            '_id': str(review['_id']),
            'date': _json_date(review.get('date')),
            'listing_id': _json_scalar(review.get('listing_id')),
            'id': str(review['id']),
            'reviewer_id': _json_scalar(review.get('reviewer_id')),
            'reviewer_name': _json_scalar(review.get('reviewer_name')),
            'comments': _json_scalar(review.get('comments')),
        })

    calendar = []
    for entry in doc.get('dates_list') or []:
        day = {'date': _json_date(entry.get('date'))}
        for key in ('price', 'minimum_nights', 'maximum_nights', 'available'):
            day[key] = _json_scalar(entry.get(key))
        calendar.append(day)

    doc_new['reviews'] = reviews
    doc_new['dates_list'] = calendar
    return doc_new

In [52]:
# Convert a single document and pretty-print the result.
doc = db['listings_with_reviews_and_cal'].find_one()
doc_as_json = convert_lwrc_to_json(doc)
pprint.pp(doc_as_json)

{'_id': '66665888dc3e94d5ace49c52',
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': '40.59179',
 'longitude': '-73.94285',
 'room_type': 'Private room',
 'price': '30',
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': '2024-01-03',
 'reviews_per_month': '0.86',
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'average_price': 30.0,
 'first_available_date': '2024-02-06',
 'last_available_date': '2025-02-04',
 'reviews': [{'_id': '6666588bdc3e94d5acf43282',
              'date': '2024-01-03',
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you get what you pay for” '
      

In [53]:
# Total number of documents in the merged collection.
merged_listings = db['listings_with_reviews_and_cal']
print(merged_listings.count_documents({}))

# Documents whose id matches ^1000.*$ (i.e. the id starts with "1000").
id_filter = {'id': {'$regex': '^1000.*$'}}
cursor = merged_listings.find(id_filter)

# Materialise the cursor to count the matches.
l = list(cursor)
print(len(l))

39202
43


In [54]:
# Re-run the query for ids matching ^1000.*$ and convert every hit to its
# JSON-serializable form.
subset_filter = {'id': {'$regex': '^1000.*$'}}
cursor = db['listings_with_reviews_and_cal'].find(subset_filter)

output = []

for doc in cursor:
    converted = convert_lwrc_to_json(doc)
    output.append(converted)

print(len(output))

43


In [55]:
def write_dict_to_dir_json(dict, dir, filename):
    """Serialize ``dict`` as JSON into the file ``filename`` inside ``dir``.

    The target path is ``dir``, a forward slash and ``filename`` joined
    together. The file is opened in write mode, so existing content at that
    path is replaced.
    """
    target_path = '/'.join((dir, filename))
    with open(target_path, 'w') as handle:
        json.dump(dict, handle)

# Destination folder for the export. Defaults to the original location; set
# the AIRBNB_EXPORT_DIR environment variable to write somewhere else without
# editing the notebook.
dir = os.environ.get('AIRBNB_EXPORT_DIR', '/Users/ruhiaggarwal/Downloads/ASSIGNMENT3')
filename = 'listings_with_reviews_and_cal_subset_1000.json'
write_dict_to_dir_json(output, dir, filename)