In [48]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
sys.path.append('helper_functions/')
# import util as util
import util

In [49]:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")

In [50]:
# I have (or will have) a database "airbnb"
db = client.airbnb


print('The list of all databases currently in the MongoDB client is:')
print(client.list_database_names())

print('\nThe list of all collections in the airbnb database is:')
print(db.list_collection_names())
# Note: calendar may not show up yet; it is created only when a first document is inserted into it

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'company', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['reviews', 'reviewsm', 'calendar', 'listings_with_reviews_and_cal', 'listings', 'listings_test', 'listings_with_calendar', 'listings3', 'listings_with_reviews_m', 'listings_with_reviewsm', 'listings_with_reviews']


In [51]:
filename_listings = '/Users/ruhiaggarwal/Downloads/listings.csv'
filename_reviews = '/Users/ruhiaggarwal/Downloads/reviews.csv'

dtype1 = {"id":str, "name" : str, "host_id" : str, "host_name" : str, "neighbourhood_group" : str, "neighbourhood" : str, "latitude" : str, "longitude" : str, "room_type" : str, "last_review" : str, "reviews_per_month" : str, "license" : str}
dtype2 = {"listing_id": str, "id" : str, "date": str, "reviewer_id" : str, "reviewer_name" : str, "comments" : str, "datetime" : str, "available": str, 
        "price": str, "adjusted_price": str}

df_listings = pd.read_csv(filename_listings, dtype=dtype1, keep_default_na=False)
df_reviews = pd.read_csv(filename_reviews, dtype=dtype2, keep_default_na=False)

In [52]:
print('The datatypes for the fields of df are:')
print(df_listings.dtypes)

print('\nThe first few rows of df are:')
print(df_listings.head())

The datatypes for the fields of df are:
id                                object
name                              object
host_id                           object
host_name                         object
neighbourhood_group               object
neighbourhood                     object
latitude                          object
longitude                         object
room_type                         object
price                             object
minimum_nights                     int64
number_of_reviews                  int64
last_review                       object
reviews_per_month                 object
calculated_host_listings_count     int64
availability_365                   int64
number_of_reviews_ltm              int64
license                           object
dtype: object

The first few rows of df are:
                   id                                               name  \
0  977395984065981849      Home in Brooklyn · 1 bedroom · 1 bed · 1 bath   
1  729947657876634696  Re

In [53]:
# def convert_date_str_to_datetime(dt):
#     if dt is None:
#         return None
#     elif pd.isnull(dt):  # tests whether dt is the pandas value NaT ("not a time")
#         # print('\nEntered the NaT case\n')
#         return None
#     elif dt != dt:
#         return None        # could also use math.nan, I think
#     elif dt == '':
#         return None
#     else:
#         year = int(dt[0:4])
#         month = int(dt[5:7])
#         day = int(dt[8:10])
#         # print(year, month, day)
#         temp = datetime(year, month, day)
#         ts = temp.timestamp()
#         new_dt = datetime.fromtimestamp(ts)
#         return new_dt

# print(convert_date_str_to_datetime('2024-05-23'))


In [54]:
# df_listings['last_review'] = df_listings['last_review'].apply(convert_date_str_to_datetime)
# df_reviewsm['date'] = df_reviewsm['date'].apply(convert_date_str_to_datetime)

print(df_listings.head())
print(df_reviews.head())

                   id                                               name  \
0  977395984065981849      Home in Brooklyn · 1 bedroom · 1 bed · 1 bath   
1  729947657876634696  Rental unit in The Bronx · 1 bedroom · 1 bed ·...   
2  648033676238017128  Rental unit in Bronx · ★4.89 · 1 bedroom · 1 b...   
3  623137142536549768  Rental unit in Brooklyn · 1 bedroom · 1 bed · ...   
4  966874541313648251  Rental unit in Brooklyn · 1 bedroom · 1 bed · ...   

     host_id        host_name neighbourhood_group       neighbourhood  \
0   95344065            Derek            Brooklyn      Sheepshead Bay   
1     566660           Markus               Bronx            Longwood   
2  421601513  J Carlos Retals               Bronx         Kingsbridge   
3  106442885              Ava            Brooklyn       East New York   
4  489124684           Briana            Brooklyn  Bedford-Stuyvesant   

    latitude   longitude        room_type price  minimum_nights  \
0   40.59179   -73.94285     Private 

In [55]:
time1 = datetime.now()
dict_listings = df_listings.to_dict('records')
time2 = datetime.now()
print(f'Time to perform this operation was {util.time_diff(time1,time2)} seconds.')

Time to perform this operation was 0.184095 seconds.


In [56]:
time1 = datetime.now()
dict_reviews = df_reviews.to_dict('records')
time2 = datetime.now()
print(f'Time to perform this operation was {util.time_diff(time1,time2)} seconds.')

Time to perform this operation was 2.097445 seconds.


In [57]:
len(dict_listings)


39202

In [58]:
def convert_date_to_datetime(dt):
    if pd.isnull(dt):           # tests whether dt is None, NaN, or DaT (not a date)
        return None
    elif type(dt) == pd._libs.tslibs.nattype.NaTType:  # including this, but see below
        return None
    else:
        temp = datetime(dt.year, dt.month, dt.day)
        ts = temp.timestamp()
        new_dt = datetime.fromtimestamp(ts)
        return new_dt
    
def convert_date_str_to_datetime(dt):
    if dt is None:
        return None
    elif pd.isnull(dt):  # tests whether dt is the pandas value NaT ("not a time")
        # print('\nEntered the NaT case\n')
        return None
    elif dt != dt:
        return None        # could also use math.nan, I think
    elif dt == '':
        return None
    else:
        year = int(dt[0:4])
        month = int(dt[5:7])
        day = int(dt[8:10])
        # print(year, month, day)
        temp = datetime(year, month, day)
        ts = temp.timestamp()
        new_dt = datetime.fromtimestamp(ts)
        return new_dt

In [59]:
# for i in range(len(dict_listings)):
#     dict_rn = dict_listings[i]
#     dict_rn['last_review'] = convert_date_str_to_datetime(dict_rn['last_review'])

In [60]:
df_listings['last_review'] = df_listings['last_review'].apply(convert_date_str_to_datetime)
df_reviews['date'] = df_reviews['date'].apply(convert_date_str_to_datetime)

In [61]:
for doc in dict_listings:
    if pd.isnull(doc['last_review']): 
        doc['last_review'] = None

for doc in dict_reviews:
    if pd.isnull(doc['date']): 
        doc['date'] = None

In [62]:
# The following empties out listings_with_cal; useful if making a fresh start
db.listings3.drop()

print(len(dict_listings))

time1 = datetime.now()
result = db.listings3.insert_many(dict_listings)
time2 = datetime.now()
print(f'\nTime to perform this operation was {util.time_diff(time1,time2)} seconds.')
# between about 2 and 4 minutes

print(f'\nNumber of docs in db.listings3 is {db.listings3.count_documents({})}')

print()
outdocs = []
for o in result.inserted_ids[-5:]:
    outdocs.append(db.listings3.find_one({ '_id': o}))
pprint.pp(outdocs)

39202

Time to perform this operation was 0.599385 seconds.

Number of docs in db.listings3 is 39202

[{'_id': ObjectId('66665888dc3e94d5ace5356f'),
  'id': '1081977194515794377',
  'name': 'Rental unit in New York · ★New · 1 bedroom · 1 bath',
  'host_id': '235205106',
  'host_name': 'Desara',
  'neighbourhood_group': 'Manhattan',
  'neighbourhood': 'Gramercy',
  'latitude': '40.737854',
  'longitude': '-73.984215',
  'room_type': 'Entire home/apt',
  'price': '150',
  'minimum_nights': 30,
  'number_of_reviews': 0,
  'last_review': '',
  'reviews_per_month': '',
  'calculated_host_listings_count': 1,
  'availability_365': 164,
  'number_of_reviews_ltm': 0,
  'license': ''},
 {'_id': ObjectId('66665888dc3e94d5ace53570'),
  'id': '1084121726692781496',
  'name': 'Condo in Brooklyn · ★New · 1 bedroom · 1 bed · 1 bath',
  'host_id': '692628',
  'host_name': 'Max',
  'neighbourhood_group': 'Brooklyn',
  'neighbourhood': 'Bedford-Stuyvesant',
  'latitude': '40.69564',
  'longitude': '-73.9

In [63]:
# The following empties out listings_with_cal; useful if making a fresh start
db.reviews.drop()

print(len(dict_reviews))

time1 = datetime.now()
result = db.reviews.insert_many(dict_reviews)
time2 = datetime.now()
print(f'\nTime to perform this operation was {util.time_diff(time1,time2)} seconds.')
# between about 2 and 4 minutes

print(f'\nNumber of docs in db.reviews is {db.reviews.count_documents({})}')

print()
outdocs = []
for o in result.inserted_ids[-5:]:
    outdocs.append(db.reviews.find_one({ '_id': o}))
pprint.pp(outdocs)

986810

Time to perform this operation was 8.217596 seconds.

Number of docs in db.reviews is 986810

[{'_id': ObjectId('6666588bdc3e94d5acf44429'),
  'listing_id': '1066905373347759013',
  'id': '1076164094756359504',
  'date': '2024-01-24',
  'reviewer_id': '84336263',
  'reviewer_name': 'T',
  'comments': 'We had a good stay. Place was clean and in a good, central '
              'location. It was a good value for the price and worked well for '
              'our group of 4 adults.'},
 {'_id': ObjectId('6666588bdc3e94d5acf4442a'),
  'listing_id': '1066905373347759013',
  'id': '1079811744368032164',
  'date': '2024-01-29',
  'reviewer_id': '280434951',
  'reviewer_name': 'Saufia',
  'comments': 'I recently had the pleasure of staying at this property and we '
              'had an amazing experience. The property has been recently '
              "refurbished and cleanliness was remarkable. It's perfect for a "
              "big group or family, as there's ample space for everyone

In [64]:
db.reviews.create_index("listing_id")

'listing_id_1'

In [65]:
db.listings_with_reviews_m.drop()
pipeline = [
    {
      '$lookup':
        {
          "from": "reviews",
          "localField": "id",
          "foreignField": "listing_id",
          "as": "reviews"
        }
   },
    
    {
        '$out': 'listings_with_reviews_m'
    }
]

t1 = datetime.now()
db.listings3.aggregate(pipeline)
time2 = datetime.now()
diff = util.time_diff(time1, time2)

print('\nTime it took was:', format(diff, '.4f'), '.')


Time it took was: 13.9151 .


In [66]:
doc = pprint.pp(db.listings_with_reviews_m.find_one())

{'_id': ObjectId('66665888dc3e94d5ace49c52'),
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': '40.59179',
 'longitude': '-73.94285',
 'room_type': 'Private room',
 'price': '30',
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': '2024-01-03',
 'reviews_per_month': '0.86',
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'reviews': [{'_id': ObjectId('6666588bdc3e94d5acf43282'),
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'date': '2024-01-03',
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you get what you pay for” '
                          'situation. The place is very dirty, especially the '
       

In [67]:
def convert_lwr_to_json(doc):
    doc_new = {}
    for key in ['_id']:
        doc_new[key] = str(doc[key])
    for key in ['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 
                'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 
                'minimum_nights', 'number_of_reviews']:
        if pd.isna(doc[key]):
            doc_new[key] = None
        else:
            doc_new[key] = doc[key]
    for key in ['last_review']:
        if pd.isna(doc[key]):
            doc[key] = None
        else:
            if isinstance(doc[key], datetime):
                    doc_new[key] = doc[key].strftime('%Y-%m-%d')
            else:
                    doc_new[key] = doc[key]
            # doc_new[key] = doc[key].strftime('%Y-%m-%d')
    for key in ['reviews_per_month', 'calculated_host_listings_count', 
                'availability_365', 'number_of_reviews_ltm', 'license']:
        if pd.isna(doc[key]):
            doc_new[key] = None
        else:
            doc_new[key] = doc[key]

    dlist = []
    for d in doc['reviews']:
        d_new = {}
        for key in ['_id']:
            d_new[key] = str(d[key])
        for key in ['date']:
            if pd.isna(d[key]):
                d_new[key] = None
            else:
                if isinstance(d[key], datetime):
                    d_new[key] = d[key].strftime('%Y-%m-%d')
                else:
                    d_new[key] = d[key]
                # d_new['date'] = d['date'].strftime('%Y-%m-%d')
        for key in ['listing_id']:
            if pd.isna(d[key]): 
                d_new[key] = None
            else:
                d_new[key] = d[key]
        for key in ['id']:
            d_new[key] = str(d[key])
        for key in ['reviewer_id', 'reviewer_name', 'comments']:
            if pd.isna(d[key]): 
                d_new[key] = None
            else:
                d_new[key] = d[key]
        dlist.append(d_new)
    doc_new['reviews'] = dlist
    return doc_new
doc = db.listings_with_reviews_m.find_one()
pprint.pp(convert_lwr_to_json(doc))

{'_id': '66665888dc3e94d5ace49c52',
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': '40.59179',
 'longitude': '-73.94285',
 'room_type': 'Private room',
 'price': '30',
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': '2024-01-03',
 'reviews_per_month': '0.86',
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': '',
 'reviews': [{'_id': '6666588bdc3e94d5acf43282',
              'date': '2024-01-03',
              'listing_id': '977395984065981849',
              'id': '1060927930986644037',
              'reviewer_id': '56179331',
              'reviewer_name': 'Sean',
              'comments': 'I fear it’s kind of a “you get what you pay for” '
                          'situation. The place is very dirty, especially the '
                          '

In [68]:
print(db.listings_with_reviews_m.count_documents({}))

cursor = db.listings_with_reviews_m.find({'id' : {'$regex' : '^1000.*$'}})
    
l = list(cursor)
print(len(l))

39202
43


In [69]:
cursor = db.listings_with_reviews_m.find({'id' : {'$regex' : '^1000.*$'}})

output = []

for doc in cursor:
    output.append(convert_lwr_to_json(doc))

print(len(output))

43


In [70]:
# Writing dict to a json file into a json file in a subdirectory
# Also putting this function into my util.py
def write_dict_to_dir_json(dict, dir, filename):
    with open(dir + '/' + filename, 'w') as fp:
        json.dump(dict, fp)

dir = '/Users/ruhiaggarwal/Downloads/ASSIGNMENT3'
filename = 'listings_with_reviews_m_subset_1000.json'
write_dict_to_dir_json(output, dir, filename)