In [8]:
# import necessary libraries
from pathlib import Path
import requests
import pandas as pd
from io import BytesIO
import gzip
import shutil

def download_one_file(year: int, month: int, day: int) -> pd.DataFrame:
    """
    Downloads calendar data for a given year and month from Inside Airbnb and saves it as a Parquet file.

    Args:
    - year (int): The year of the data to download.
    - month (int): The month of the data to download.
    - day (int): The day of the data to download.

    Returns:
    - df (pd.DataFrame): The DataFrame containing the downloaded data.
    """ 
    URL = f'http://data.insideairbnb.com/united-states/tx/dallas/{year}-{month:02d}-{day:02d}/data/listings.csv.gz'
    response = requests.get(URL)

    if response.status_code == 200:
        path = f'../data/raw/dallas-listings-{year}-{month:02d}-{day:02d}.parquet'

        with open(path, 'wb') as file:
            file.write(response.content)

        # Decompress the gzipped file and read it with Pandas
        with gzip.open(path, 'rb') as f:
            df = pd.read_csv(f, encoding='utf-8', delimiter=',')
        
        # Save the DataFrame as a Parquet file
        df.to_parquet(path.replace('.parquet', '.csv.parquet'), engine='pyarrow', compression='snappy')

        return df
    else:
        raise Exception(f'{URL} is not available.')


In [11]:
# explore and validate one single file
download_one_file(year=2023, month=9, day=12)

df = pd.read_csv('../data/raw/listings-2023-09-12.parquet')
df.head(20)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,61878,Condo in Dallas · ★4.75 · 1 bedroom · 2 beds ·...,300211,Rita,,District 2,32.8169,-96.82018,Entire home/apt,85,30,53,2023-05-29,0.34,1,204,7,
1,776810,Loft in Dallas · ★4.90 · 1 bedroom · 1 bed · 1...,4096626,Eric,,District 2,32.81462,-96.81586,Entire home/apt,75,5,29,2020-03-26,0.22,1,322,0,
2,795703,Condo in Dallas · ★4.84 · 1 bedroom · 1 bed · ...,4191322,Michelle,,District 14,32.80327,-96.80976,Entire home/apt,243,30,70,2022-09-25,0.54,1,249,1,
3,826118,Home in Dallas · ★4.83 · 1 bedroom · 1 bed · 1...,804559,Rod,,District 12,32.98825,-96.78926,Private room,62,2,24,2023-06-12,0.24,3,303,4,
4,826178,Home in Dallas · ★5.0 · 1 bedroom · 2 beds · 1...,804559,Rod,,District 12,32.98621,-96.78857,Private room,40,2,23,2020-01-31,0.19,3,312,0,
5,826201,Home in Dallas · ★4.77 · 1 bedroom · 1 bed · 1...,804559,Rod,,District 12,32.98853,-96.78903,Private room,54,30,31,2023-03-07,0.25,3,118,3,
6,860248,Home in Dallas · ★4.83 · 1 bedroom · 1 bed · 1...,4505460,Judy,,District 9,32.85509,-96.70625,Private room,43,7,71,2023-08-06,0.83,3,162,14,
7,1056850,Rental unit in Dallas · ★4.99 · 1 bedroom · 1 ...,5821283,Kathy,,District 14,32.82181,-96.80816,Entire home/apt,95,2,135,2023-04-11,1.28,3,277,6,
8,1154424,Rental unit in Dallas · ★4.61 · Studio · 1 bed...,6063232,Joan,,District 14,32.818684,-96.790154,Entire home/apt,92,3,92,2022-07-05,0.74,10,215,0,
9,1158488,Guest suite in Dallas · ★4.97 · 1 bedroom · 2 ...,5037857,Marie,,District 14,32.81723,-96.76775,Entire home/apt,92,3,201,2023-08-19,1.61,1,328,9,
