In [1]:
# import necessary libraries
from pathlib import Path
import requests
import pandas as pd
from io import BytesIO
import gzip
import shutil

def download_one_file(year: int, month: int, day: int) -> pd.DataFrame:
    """
    Downloads calendar data for a given year and month from Inside Airbnb and saves it as a Parquet file.

    Args:
    - year (int): The year of the data to download.
    - month (int): The month of the data to download.
    - day (int): The day of the data to download.

    Returns:
    - df (pd.DataFrame): The DataFrame containing the downloaded data.
    """
    URL = f'http://data.insideairbnb.com/united-states/ny/new-york-city/{year}-{month:02d}-{day:02d}/data/calendar.csv.gz'
    response = requests.get(URL)

    if response.status_code == 200:
        path = f'../data/raw/new-york-city/calendar-{year}-{month:02d}-{day:02d}.parquet'

        with open(path, 'wb') as file:
            file.write(response.content)

        # Decompress the gzipped file and read it with Pandas
        with gzip.open(path, 'rb') as f:
            df = pd.read_csv(f, encoding='utf-8', delimiter=',')
        
        # Save the DataFrame as a Parquet file
        df.to_parquet(path.replace('.parquet', '.csv.parquet'), engine='pyarrow', compression='snappy')

        return df
    else:
        raise Exception(f'{URL} is not available.')


In [4]:
# explore and validate one single file
download_one_file(year=2023, month=8, day=1)

# Specify the path where you want to save the decompressed CSV
file_path = '../data/raw/new-york-city/calendar-2023-09-01.csv.parquet'


# Read the decompressed CSV with Pandas
df = pd.read_parquet(file_path)

df.head(20)

Exception: http://data.insideairbnb.com/united-states/ny/new-york-city/2023-08-01/data/calendar.csv.gz is not available.