# Inside Airbnb Australia

In [1]:
import gzip
import os
import shutil
from datetime import datetime

import requests
from dateutil.relativedelta import relativedelta

## Define Date Range

Calculate all the dates since `2012-08-01` (Airbnb launched in Australia) till today in the format `YYYY-MM-DD`.

In [24]:
# set the start_date to the day when Airbnb launched in Australia
start_date = datetime(2012, 8, 1)
end_date = datetime.today()  # last time updated on 2024-08-17

# calculate all the dates between start_date and end_date
dates = []  
while start_date <= end_date:
    dates.append(start_date.strftime('%Y-%m-%d'))
    start_date += relativedelta(days=1)  # set interval to 1 day

print(dates)

['2012-08-01', '2012-08-02', '2012-08-03', '2012-08-04', '2012-08-05', '2012-08-06', '2012-08-07', '2012-08-08', '2012-08-09', '2012-08-10', '2012-08-11', '2012-08-12', '2012-08-13', '2012-08-14', '2012-08-15', '2012-08-16', '2012-08-17', '2012-08-18', '2012-08-19', '2012-08-20', '2012-08-21', '2012-08-22', '2012-08-23', '2012-08-24', '2012-08-25', '2012-08-26', '2012-08-27', '2012-08-28', '2012-08-29', '2012-08-30', '2012-08-31', '2012-09-01', '2012-09-02', '2012-09-03', '2012-09-04', '2012-09-05', '2012-09-06', '2012-09-07', '2012-09-08', '2012-09-09', '2012-09-10', '2012-09-11', '2012-09-12', '2012-09-13', '2012-09-14', '2012-09-15', '2012-09-16', '2012-09-17', '2012-09-18', '2012-09-19', '2012-09-20', '2012-09-21', '2012-09-22', '2012-09-23', '2012-09-24', '2012-09-25', '2012-09-26', '2012-09-27', '2012-09-28', '2012-09-29', '2012-09-30', '2012-10-01', '2012-10-02', '2012-10-03', '2012-10-04', '2012-10-05', '2012-10-06', '2012-10-07', '2012-10-08', '2012-10-09', '2012-10-10', '2012

## Define URLs

DO NOT CHANGE THE FOLLOWING CODE BLOCK.

In [2]:
# We want to download from the following example URLs:
# listings.csv.gz: https://data.insideairbnb.com/australia/sa/barossa-valley/2024-03-29/data/listings.csv.gz
# reviews.csv.gz: https://data.insideairbnb.com/australia/sa/barossa-valley/2024-03-29/data/reviews.csv.gz

prefix = 'https://data.insideairbnb.com/australia'

locations = ['sa/barossa-valley', 
             'vic/barwon-south-west-vic', 
             'qld/brisbane',
             'vic/melbourne',
             'nsw/mid-north-coast',
             'vic/mornington-peninsula',
             'nsw/northern-rivers',
             'qld/sunshine-coast',
             'nsw/sydney',
             'tas/tasmania',
             'wa/western-australia']

affix_listings = 'data/listings.csv.gz'
affix_reviews = 'data/reviews.csv.gz'

format = '.csv.gz'

## Download from All Locations from All Dates

In [29]:
save_root = '/Users/yifan/Documents/GitHub/datasets/Inside-Airbnb-Australia/Inside-Airbnb-Australia/all'
if not os.path.exists(save_root):
    os.makedirs(save_root, exist_ok=True)


for location in locations:
    for date in dates:
        for affix in [affix_listings, affix_reviews]:
            url = f'{prefix}/{location}/{date}/{affix}'
            res = requests.get(url)
            # download .csv.gz files
            name = f'{location.split('/')[-1]}_{date}_{affix.split('/')[-1]}'
            with open(os.path.join(save_root, name), 'wb') as f:
                f.write(res.content)
            print(f"Downloaded {name}")
            # try to unzip, delete invalid file if failed
            try: 
                with gzip.open(os.path.join(save_root, name), 'rb') as f_in, \
                    open(os.path.join(save_root, name[:-3]), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
                print(f"Unzipped to {name[:-3]}")
            except gzip.BadGzipFile:
                print(f"Removed invalid {name} and {name[:-3]}")
                os.remove(os.path.join(save_root, name))
                os.remove(os.path.join(save_root, name[:-3]))
            

Downloaded barossa-valley_2012-08-01_listings.csv.gz
Removed invalid barossa-valley_2012-08-01_listings.csv.gz and barossa-valley_2012-08-01_listings.csv
Downloaded barossa-valley_2012-08-01_reviews.csv.gz
Removed invalid barossa-valley_2012-08-01_reviews.csv.gz and barossa-valley_2012-08-01_reviews.csv
Downloaded barossa-valley_2012-08-02_listings.csv.gz
Removed invalid barossa-valley_2012-08-02_listings.csv.gz and barossa-valley_2012-08-02_listings.csv
Downloaded barossa-valley_2012-08-02_reviews.csv.gz
Removed invalid barossa-valley_2012-08-02_reviews.csv.gz and barossa-valley_2012-08-02_reviews.csv
Downloaded barossa-valley_2012-08-03_listings.csv.gz
Removed invalid barossa-valley_2012-08-03_listings.csv.gz and barossa-valley_2012-08-03_listings.csv
Downloaded barossa-valley_2012-08-03_reviews.csv.gz
Removed invalid barossa-valley_2012-08-03_reviews.csv.gz and barossa-valley_2012-08-03_reviews.csv
Downloaded barossa-valley_2012-08-04_listings.csv.gz
Removed invalid barossa-valley_2

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

## Download from One Location from a Date Range

In [5]:
# define your data range
start_date = datetime(2014, 5, 10)
end_date = datetime.today()
dates = []
while start_date <= end_date:
    dates.append(start_date.strftime('%Y-%m-%d'))
    start_date += relativedelta(days=1)  # interval


# define your directory to save the data
save_root = '/Users/yifan/Documents/GitHub/datasets/Inside-Airbnb-Australia/Inside-Airbnb-Australia/barwon-south-west'
# create the directory if it doesn't exist
if not os.path.exists(save_root):
    os.makedirs(save_root, exist_ok=True)


# define the location you want to download the data from
location = 'vic/barwon-south-west-vic'


# start to retrieve the data from the defined location across the defined date range
for date in dates:
    for affix in [affix_listings, affix_reviews]:
        url = f'{prefix}/{location}/{date}/{affix}'
        res = requests.get(url)
        # download .csv.gz files
        name = f'{location.split('/')[-1]}_{date}_{affix.split('/')[-1]}'
        with open(os.path.join(save_root, name), 'wb') as f:
            f.write(res.content)
        print(f"Downloaded {name}")
        # try to unzip, delete invalid file if failed
        try: 
            with gzip.open(os.path.join(save_root, name), 'rb') as f_in, \
                open(os.path.join(save_root, name[:-3]), 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            print(f"Unzipped to {name[:-3]}")
        except gzip.BadGzipFile:
            print(f"Removed invalid {name} and {name[:-3]}")
            os.remove(os.path.join(save_root, name))
            os.remove(os.path.join(save_root, name[:-3]))

Downloaded barwon-south-west_2014-05-10_listings.csv.gz
Removed invalid barwon-south-west_2014-05-10_listings.csv.gz and barwon-south-west_2014-05-10_listings.csv
Downloaded barwon-south-west_2014-05-10_reviews.csv.gz
Removed invalid barwon-south-west_2014-05-10_reviews.csv.gz and barwon-south-west_2014-05-10_reviews.csv
Downloaded barwon-south-west_2014-05-11_listings.csv.gz
Removed invalid barwon-south-west_2014-05-11_listings.csv.gz and barwon-south-west_2014-05-11_listings.csv
Downloaded barwon-south-west_2014-05-11_reviews.csv.gz
Removed invalid barwon-south-west_2014-05-11_reviews.csv.gz and barwon-south-west_2014-05-11_reviews.csv
Downloaded barwon-south-west_2014-05-12_listings.csv.gz
Removed invalid barwon-south-west_2014-05-12_listings.csv.gz and barwon-south-west_2014-05-12_listings.csv
Downloaded barwon-south-west_2014-05-12_reviews.csv.gz
Removed invalid barwon-south-west_2014-05-12_reviews.csv.gz and barwon-south-west_2014-05-12_reviews.csv
Downloaded barwon-south-west_20

KeyboardInterrupt: 