### <span style=color:blue> Loading Calendar data from csv into local MongoDB    </span>

In [1]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
sys.path.append('helper_functions/')
# import util as util
import util

<span style=color:blue>Getting mongodb connection set up</span>

In [2]:
from pymongo import MongoClient

# Connect to the MongoDB server on localhost, port 27017.
client = MongoClient("mongodb://localhost:27017/")
# An equivalent way to write this is: client = MongoClient("localhost", 27017)

<span style=color:blue>Getting access to the airbnb database, in which the collection "calendar" will hold the calendar data in MongoDB</span>

In [3]:
# Handle to the "airbnb" database (it either exists already or will exist later)
db = client['airbnb']

# Each entry pairs a heading with the call that produces the names to list.
# Note: calendar may not show up among the collections yet; it is created
# only when a first document is inserted into it.
inventory = (
    ('The list of all databases currently in the MongoDB client is:',
     client.list_database_names),
    ('\nThe list of all collections in the airbnb database is:',
     db.list_collection_names),
)

for heading, fetch_names in inventory:
    print(heading)
    print(fetch_names())

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'company', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['listings', 'listings_test']


<span style=color:blue>Loading contents of calendar csv file into a dataframe</span>

<span style=color:blue>The system will give a warning, but it appears safe to ignore it.</span>

In [4]:
# Location of the calendar csv on the local machine (adjust for your own setup)
filename = '/Users/ruhiaggarwal/Downloads/calendar.csv'

# Using a partial list of dtypes, so that the first several fields are read as strings.
# The date and available fields (intended as date and boolean types, respectively)
#    are imported as strings here and converted later in the data frame.
dtype = {"listing_id": str, "date": str, "available": str, 
        "price": str, "adjusted_price": str}
# Not including these, because the null values make trouble:  "minimum_nights": int, "maximum_nights": int
# NOTE(review): since minimum_nights / maximum_nights get no explicit dtype, pandas
#    infers their types itself; presumably this is what triggers the warning that is
#    shown when this cell runs -- TODO confirm.

# The csv has nulls in "adjusted_price", which is read as str, so keep_default_na=False
#    is passed to get empty strings instead of NaN; see
#    https://stackoverflow.com/questions/10867028/get-pandas-read-csv-to-read-empty-values-as-empty-string-instead-of-nan
         
df = pd.read_csv(filename, dtype=dtype, keep_default_na=False)

  df = pd.read_csv(filename, dtype=dtype, keep_default_na=False)


In [5]:
# Quick look at the freshly loaded frame: column dtypes first, then a short preview.
print('The datatypes for the fields of df are:', df.dtypes, sep='\n')
print('\nThe first few rows of df are:', df.head(), sep='\n')

The datatypes for the fields of df are:
listing_id        object
date              object
available         object
price             object
adjusted_price    object
minimum_nights    object
maximum_nights    object
dtype: object

The first few rows of df are:
  listing_id        date available    price adjusted_price minimum_nights  \
0     144087  2024-02-10         t  $259.00                            30   
1     144087  2024-02-11         t  $259.00                            30   
2     144087  2024-02-12         t  $259.00                            30   
3     144087  2024-02-13         t  $259.00                            30   
4     144087  2024-02-14         t  $259.00                            30   

  maximum_nights  
0            365  
1            365  
2            365  
3            365  
4            365  


<span style=color:blue>Function to convert dates into datetimes.  This is useful because MongoDB recognizes datetime types but not date types.  So, will convert all dates into datetimes for insertion into MongoDB (after which we can do date arithmetic).  It also has conditions that turn various kinds of null values into None.  (Note: curiously this works on small dataframes, but left some values of "NaT" when applied on very large dataframes.)</span>

<span style=color:blue>This function had been developed when working with the listings join reviews data, where there were some NULL values.  There are no NULL values in the calendar data.</span>

<span style=color:blue>A better sw engineering practice might be to include this function into the util.py file, but leaving it here to make the notebook more self-contained.</span>

In [6]:
# Also converts NaT (and other null-like values) to None, because MongoDB
# does not recognize NaT.
def convert_date_str_to_datetime(dt):
    """Turn a 'YYYY-MM-DD' date string into a naive ``datetime`` at midnight.

    Parameters
    ----------
    dt : str, datetime, or a null-like value
        A string whose characters 0-3, 5-6 and 8-9 hold year, month and day
        (anything after position 10, such as a time part, is ignored).

    Returns
    -------
    datetime or None
        ``None`` for ``None``, NaN, ``NaT``, ``pd.NA`` and the empty string;
        the value itself if it already is a datetime (this keeps the
        conversion safe to re-run on an already converted column);
        otherwise ``datetime(year, month, day)``.

    Raises
    ------
    ValueError
        If the sliced pieces are not integers or do not form a valid date.
    """
    # pd.isnull covers NaN, NaT and pd.NA, so no separate `dt != dt` test is needed.
    if dt is None or pd.isnull(dt) or dt == '':
        return None

    # Must come after the null check: NaT is itself an instance of datetime.
    if isinstance(dt, datetime):
        return dt

    # Build the datetime directly. The earlier timestamp()/fromtimestamp()
    # round trip went through the machine's local timezone, which can move
    # midnight in zones that have a DST gap at 00:00.
    return datetime(int(dt[0:4]), int(dt[5:7]), int(dt[8:10]))

print(convert_date_str_to_datetime('2024-05-23'))


2024-05-23 00:00:00


<span style=color:blue>Function to convert the values of field "available" to booleans </span>

In [7]:
def convert_tf_to_boolean(val):
    """Map the csv flags 't' / 'f' to True / False; anything else becomes None."""
    for flag, meaning in (('t', True), ('f', False)):
        if val == flag:
            return meaning
    return None

print(*map(convert_tf_to_boolean, ('t', 'f', 'foo')))


True False None


<span style=color:blue>Cleaning up the values in df, to be more compatible with MongoDB  </span>

In [8]:
# Column -> converter table; each column is rewritten in place with the
# element-wise result of its converter.
column_converters = (
    ('date', convert_date_str_to_datetime),
    ('available', convert_tf_to_boolean),
)
for column, converter in column_converters:
    df[column] = df[column].apply(converter)

print(df.head())

  listing_id       date  available    price adjusted_price minimum_nights  \
0     144087 2024-02-10       True  $259.00                            30   
1     144087 2024-02-11       True  $259.00                            30   
2     144087 2024-02-12       True  $259.00                            30   
3     144087 2024-02-13       True  $259.00                            30   
4     144087 2024-02-14       True  $259.00                            30   

  maximum_nights  
0            365  
1            365  
2            365  
3            365  
4            365  


<span style=color:blue>Cleaning and changing the data type of the price column.  (Not bothering with the adjusted_price column, which appears to be uniformly NULL.)</span>

In [9]:
def _strip_currency_marks(raw_price):
    """Remove the leading '$' and any thousands commas from a price string."""
    return raw_price.replace('$','').replace(',','')

# First tidy the text, then turn the cleaned strings into numbers.
df['price'] = df['price'].apply(_strip_currency_marks)
df['price'] = pd.to_numeric(df['price'])

print(type(df.loc[0,'price']))
print(df.head())

<class 'numpy.float64'>
  listing_id       date  available  price adjusted_price minimum_nights  \
0     144087 2024-02-10       True  259.0                            30   
1     144087 2024-02-11       True  259.0                            30   
2     144087 2024-02-12       True  259.0                            30   
3     144087 2024-02-13       True  259.0                            30   
4     144087 2024-02-14       True  259.0                            30   

  maximum_nights  
0            365  
1            365  
2            365  
3            365  
4            365  


### <span style=color:blue>Now working to fill the calendar collection with the dataframe df.  After that we will use an aggregation pipeline to produce the collection listings_with_calendar, which holds one document per listing and, for each listing, an array of its calendar dates together with their availability.</span>

<span style=color:blue>In this notebook we work directly with the full calendar data set.  You might want to work with a subset of the data, e.g., by using df_small = df.iloc[0:5000].</span>

<span style=color:blue>First step is to load the df into a dict</span>

In [10]:
# Turn the frame into a list with one dict per row, timing the conversion.
time1 = datetime.now()
dict_full = df.to_dict(orient='records')
time2 = datetime.now()

print(f'Time to perform this operation was {util.time_diff(time1,time2)} seconds.')

Time to perform this operation was 25.685484 seconds.


<span style=color:blue>Now loading the dictionary into MongoDB</span>

In [11]:
# Start from a clean slate: remove any earlier copy of the calendar collection.
db.calendar.drop()

print(len(dict_full))

# Bulk-insert every row dict and time the operation
# (the author has seen this take between about 2 and 4 minutes).
time1 = datetime.now()
result = db.calendar.insert_many(dict_full)
time2 = datetime.now()
print(f'\nTime to perform this operation was {util.time_diff(time1,time2)} seconds.')

print(f'\nNumber of docs in db.calendar is {db.calendar.count_documents({})}')

# Read back the five most recently inserted documents as a spot check.
print()
outdocs = [
    db.calendar.find_one({ '_id': inserted_id})
    for inserted_id in result.inserted_ids[-5:]
]
pprint.pp(outdocs)

14299870

Time to perform this operation was 78.598981 seconds.

Number of docs in db.calendar is 14299870

[{'_id': ObjectId('665f7120b7105ff572c8a7a9'),
  'listing_id': '1081185973600372871',
  'date': datetime.datetime(2025, 1, 31, 0, 0),
  'available': False,
  'price': 160.0,
  'adjusted_price': '',
  'minimum_nights': 30,
  'maximum_nights': 365},
 {'_id': ObjectId('665f7120b7105ff572c8a7aa'),
  'listing_id': '1081185973600372871',
  'date': datetime.datetime(2025, 2, 1, 0, 0),
  'available': False,
  'price': 160.0,
  'adjusted_price': '',
  'minimum_nights': 30,
  'maximum_nights': 365},
 {'_id': ObjectId('665f7120b7105ff572c8a7ab'),
  'listing_id': '1081185973600372871',
  'date': datetime.datetime(2025, 2, 2, 0, 0),
  'available': False,
  'price': 160.0,
  'adjusted_price': '',
  'minimum_nights': 30,
  'maximum_nights': 365},
 {'_id': ObjectId('665f7120b7105ff572c8a7ac'),
  'listing_id': '1081185973600372871',
  'date': datetime.datetime(2025, 2, 3, 0, 0),
  'available': Fa

In [12]:
# Sanity check: rows prepared for insertion vs. documents actually stored.
print(len(dict_full), db.calendar.count_documents({}), sep='\n')

14299870
14299870


In [17]:
# Make sure listings_with_calendar is empty before it is rebuilt.
db.listings_with_calendar.drop()

# Shape of one element of dates_list: a calendar row without its listing_id.
calendar_entry = {
    'date': '$date',
    'available': '$available',
    'price': '$price',
    'minimum_nights': '$minimum_nights',
    'maximum_nights': '$maximum_nights'
}

# One output document per listing_id.
# NOTE: $min / $max run over every calendar row of the listing; there is no
# filter on `available`, so the two *_available_date fields are simply the
# earliest and latest dates present for that listing.
group_by_listing = {
    '$group': {
        '_id': '$listing_id',
        'average_price': {'$avg': '$price'},
        'first_available_date': {'$min': '$date'},
        'last_available_date': {'$max': '$date'},
        'dates_list': {'$push': calendar_entry}
    }
}

# Store the grouped documents in their own collection.
write_to_collection = {'$out': 'listings_with_calendar'}

pipeline = [group_by_listing, write_to_collection]

time1 = datetime.now()
test1 = db.calendar.aggregate(pipeline)
time2 = datetime.now()
diff = util.time_diff(time1, time2)

print('\nTime it took was:', format(diff, '.4f'), '.')

print(db.list_collection_names())

print("test1:")

print(type(test1))




Time it took was: 24.2618 .
['calendar', 'listings', 'listings_test', 'listings_with_calendar']
test1:
<class 'pymongo.command_cursor.CommandCursor'>


In [18]:
# Number of documents (one per listing) produced by the aggregation.
count = db['listings_with_calendar'].count_documents({})
print(count)

39201


In [19]:
# Pretty-print one document to see the resulting structure.
sample_listing = db.listings_with_calendar.find_one()
pprint.pp(sample_listing)

{'_id': '10000070',
 'average_price': 85.0,
 'first_available_date': datetime.datetime(2024, 2, 6, 0, 0),
 'last_available_date': datetime.datetime(2025, 2, 4, 0, 0),
 'dates_list': [{'date': datetime.datetime(2024, 2, 6, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 7, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 8, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 9, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30

<span style=color:blue>One way to fetch the data types of fields of documents in a collection.  (This works for db.listings_with_calendar because all documents in the collection have the same structure.)</span>

In [20]:
# Fetch one document to serve as a representative of the collection
doc = db.listings_with_calendar.find_one()

# Report the Python type of every top-level field ...
for key, value in doc.items():
    print(key, type(value))

# ... and of every field inside the first element of dates_list
first_entry = doc['dates_list'][0]
for key, value in first_entry.items():
    print(key, type(value))

_id <class 'str'>
average_price <class 'float'>
first_available_date <class 'datetime.datetime'>
last_available_date <class 'datetime.datetime'>
dates_list <class 'list'>
date <class 'datetime.datetime'>
available <class 'bool'>
price <class 'float'>
minimum_nights <class 'int'>
maximum_nights <class 'int'>


<span style=color:blue>As you may recall from the notebook "Loading Local MongoDB with Listings & Reviews-v02e.ipynb", in general, you cannot fetch documents from MongoDB and write them directly into json files on your machine, because they can contain values (such as datetime objects) that json.dump cannot serialize.  In the next 2 cells we work to create a function that transforms documents in db.listings_with_calendar into dicts that can be written out to json files.</span>

In [21]:
# Grab one document to use as the working example for the conversion function.
doc = db['listings_with_calendar'].find_one()
pprint.pp(doc)

{'_id': '10000070',
 'average_price': 85.0,
 'first_available_date': datetime.datetime(2024, 2, 6, 0, 0),
 'last_available_date': datetime.datetime(2025, 2, 4, 0, 0),
 'dates_list': [{'date': datetime.datetime(2024, 2, 6, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 7, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 8, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30},
                {'date': datetime.datetime(2024, 2, 9, 0, 0),
                 'available': True,
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30

In [22]:
def convert_lwc_to_json(doc):
    """Build a JSON-serializable copy of a listings_with_calendar document.

    The two top-level date fields and each entry's 'date' are rendered as
    'YYYY-MM-DD' strings; '_id', 'average_price' and the remaining entry
    fields are copied unchanged.  Only the fields named here are carried over.
    """
    day_format = '%Y-%m-%d'
    copied_entry_fields = ('price', 'minimum_nights', 'maximum_nights', 'available')

    def _flatten_entry(entry):
        # 'date' goes first, followed by the copied fields in the listed order
        flat = {'date': entry['date'].strftime(day_format)}
        flat.update((field, entry[field]) for field in copied_entry_fields)
        return flat

    return {
        '_id': doc['_id'],
        'average_price': doc['average_price'],
        'first_available_date': doc['first_available_date'].strftime(day_format),
        'last_available_date': doc['last_available_date'].strftime(day_format),
        'dates_list': [_flatten_entry(entry) for entry in doc['dates_list']],
    }

pprint.pp(convert_lwc_to_json(doc))

{'_id': '10000070',
 'average_price': 85.0,
 'first_available_date': '2024-02-06',
 'last_available_date': '2025-02-04',
 'dates_list': [{'date': '2024-02-06',
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30,
                 'available': True},
                {'date': '2024-02-07',
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30,
                 'available': True},
                {'date': '2024-02-08',
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30,
                 'available': True},
                {'date': '2024-02-09',
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum_nights': 30,
                 'available': True},
                {'date': '2024-02-10',
                 'price': 85.0,
                 'minimum_nights': 30,
                 'maximum

<span style=color:blue>Will now fetch a small subset of db.listings_with_calendar, and write those documents into the json file "listings_with_calendar_subset.json".  This will be part of your submission for Programming Assignment 3.</span>

In [23]:
# Total number of listings, for comparison with the size of the subset below
print(db.listings_with_calendar.count_documents({}))

# Subset: listings whose _id starts with "1000"
id_prefix_filter = {'_id' : {'$regex' : '^1000.*$'}}
cursor = db.listings_with_calendar.find(id_prefix_filter)

l = [listing for listing in cursor]
print(len(l))

39201
43


In [24]:
# Re-run the "_id starts with 1000" query and convert every hit into its
# JSON-ready form.
cursor = db.listings_with_calendar.find({'_id' : {'$regex' : '^1000.*$'}})

output = [convert_lwc_to_json(listing) for listing in cursor]

print(len(output))

43


In [27]:
# Writing a dict (or any other JSON-serializable object) to a json file in a directory
# Also putting this function into my util.py
def write_dict_to_dir_json(dict, dir, filename):
    """Serialize ``dict`` as JSON into the file ``filename`` inside ``dir``.

    Parameters
    ----------
    dict : any JSON-serializable object
        The data to write (the notebook passes a list of dicts).
    dir : str
        Directory that will contain the file.  An empty string means the
        current working directory.
    filename : str
        Name of the file to create or overwrite.

    The parameter names shadow Python builtins; they are kept as they are so
    that existing calls (including keyword calls) continue to work.
    """
    # os.path.join inserts exactly one separator where needed, so a trailing
    # slash on `dir` is harmless and an empty `dir` no longer resolves to the
    # filesystem root, as `dir + '/' + filename` did.
    target_path = os.path.join(dir, filename)
    with open(target_path, 'w') as fp:
        json.dump(dict, fp)

# Destination folder and file name for the exported subset.
# The folder is held in `output_dir` rather than `dir`, so that the builtin
# dir() is not shadowed for the rest of the kernel session.
output_dir = '/Users/ruhiaggarwal/Downloads'
filename = 'listings_with_calendar_subset_1000.json'
write_dict_to_dir_json(output, output_dir, filename)

