In [23]:
# import necessary libraries
from pathlib import Path
import requests
import pandas as pd
from io import BytesIO
import gzip
import shutil

def download_one_file(year: int, month: int, day: int) -> pd.DataFrame:
    """
    Downloads one Inside Airbnb calendar snapshot for Dallas and saves it as a Parquet file.

    The gzipped CSV published for the given snapshot date is fetched over HTTP,
    decompressed in memory, parsed with Pandas and written to
    ``../data/raw/dallas-calendar-YYYY-MM-DD.csv.parquet``. The Parquet file is
    the only file written to disk.

    Args:
    - year (int): The year of the snapshot to download.
    - month (int): The month of the snapshot to download.
    - day (int): The day of the snapshot to download.

    Returns:
    - df (pd.DataFrame): The DataFrame containing the downloaded data.

    Raises:
    - Exception: If the server does not answer with HTTP status 200.
    - requests.exceptions.RequestException: If the request itself fails,
      e.g. the connection cannot be established or the timeout expires.
    """
    URL = f'http://data.insideairbnb.com/united-states/tx/dallas/{year}-{month:02d}-{day:02d}/data/calendar.csv.gz'
    # A timeout keeps an unresponsive server from hanging the kernel forever.
    response = requests.get(URL, timeout=60)

    if response.status_code != 200:
        raise Exception(f'{URL} is not available.')

    # Decompress the gzipped payload in memory and read it with Pandas, so that
    # only a genuine Parquet file ever lands in the output directory.
    with gzip.open(BytesIO(response.content), 'rb') as f:
        df = pd.read_csv(f, encoding='utf-8', delimiter=',')

    # Save the DataFrame as a Parquet file, creating the target directory on first use
    path = Path(f'../data/raw/dallas-calendar-{year}-{month:02d}-{day:02d}.csv.parquet')
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path, engine='pyarrow', compression='snappy')

    return df


In [33]:
# Explore and validate one single snapshot.
# The snapshot date is defined once so the download and the read-back below
# always refer to the same file.
snapshot = {'year': 2023, 'month': 6, 'day': 12}
download_one_file(**snapshot)

# Path of the Parquet file written by download_one_file for that snapshot
file_path = '../data/raw/dallas-calendar-{year}-{month:02d}-{day:02d}.csv.parquet'.format(**snapshot)

# Read the Parquet file back with Pandas to validate the round trip
df = pd.read_parquet(file_path)

df.head(20)

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,776810,2023-06-13,f,$75.00,$75.00,5.0,1125.0
1,776810,2023-06-14,f,$75.00,$75.00,5.0,1125.0
2,776810,2023-06-15,f,$75.00,$75.00,5.0,1125.0
3,776810,2023-06-16,f,$75.00,$75.00,5.0,1125.0
4,776810,2023-06-17,f,$75.00,$75.00,5.0,1125.0
5,776810,2023-06-18,f,$75.00,$75.00,5.0,1125.0
6,776810,2023-06-19,f,$75.00,$75.00,5.0,1125.0
7,776810,2023-06-20,f,$75.00,$75.00,5.0,1125.0
8,776810,2023-06-21,f,$75.00,$75.00,5.0,1125.0
9,776810,2023-06-22,f,$75.00,$75.00,5.0,1125.0
