# 0. Importing Libraries

In [1]:
!pip install bs4
import time

import geopy
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim



# 1. Set up the scraping

In [2]:
# The Airbnb page that lists every city together with its neighborhoods
airbnb_neighborhood_url = 'https://www.airbnb.com/locations'

# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() surfaces HTTP errors (4xx/5xx)
# here instead of letting an error page be parsed as if it were the listing.
city_response = requests.get(airbnb_neighborhood_url, timeout=30)
city_response.raise_for_status()
city_page = city_response.text

# Parse the downloaded HTML into a BeautifulSoup object
soup = BeautifulSoup(city_page, 'html.parser')

# Uncomment to inspect the prettified HTML (handy for finding the current tag classes)
#print(soup.prettify())

In [3]:
# Scraper inputs. The class names on this site change from time to time
# (probably to discourage scraping), so these values may need refreshing.
#
# To find the current ones, print the prettified soup and look up a known
# city (e.g. "Austin") to get the h2 tag used for cities, and a known
# neighborhood (e.g. "Barton Hills") to get the div tag used for neighborhoods.

neighborhood_tag = '<div class="_17ajzb82">'
city_tag = '<h2 class="_1tz64lh">'

In [4]:
# Break the prettified HTML into fragments, one per tag opening ('<')
site_html = soup.prettify().split('<')

# Tag markers without their leading '<' (the split above consumed it)
city_marker = city_tag[1:]
neighborhood_marker = neighborhood_tag[1:]

# Collected rows of [city, neighborhood, "neighborhood, city"]
mined_data = []

# Most recent city heading seen. Reset here so a re-run of this cell cannot
# silently attach neighborhoods to a city left over from a previous run.
loop_city = None

# Walk the fragments in page order: a city heading sets the current city and
# every neighborhood that follows is recorded under it.
for each_row in site_html:
    if city_marker in each_row:
        # strip() instead of slicing off one fixed character: how many stray
        # spaces survive the replace() calls depends on the indentation that
        # prettify() emitted, so a positional slice can cut a real letter.
        loop_city = each_row.replace(city_marker, '').replace('  ', '').replace('\n', '').strip()
    if neighborhood_marker in each_row:
        loop_neighborhood = each_row.replace(neighborhood_marker, '').replace('  ', '').replace('\n', '').strip()
        if loop_city is None:
            # No city heading has been seen yet, so there is nothing to pair with
            print("Skipping neighborhood with no preceding city: {}".format(loop_neighborhood))
            continue
        mined_data.append(
            [loop_city, loop_neighborhood, loop_neighborhood + ", " + loop_city]
        )

In [5]:
# Look up latitude/longitude for every "neighborhood, city" string
geolocator = geopy.geocoders.Nominatim(user_agent="myGeocoder")

max_attempts = 5        # tries per location before giving up
request_delay_sec = 1   # pause after each request; the public Nominatim server asks for at most 1 request/second

mined_data_len = len(mined_data)

# Iterate through each neighborhood
for count, each_loc in enumerate(mined_data, start=1):
    print("On item {}/{}: {}".format(count, mined_data_len, each_loc[2]))

    # Default to missing coordinates so every row ends up with the same length
    each_loc_lat = None
    each_loc_long = None

    for attempt in range(1, max_attempts + 1):
        try:
            each_loc_coords = geolocator.geocode(each_loc[2])
        except geopy.exc.GeopyError as geocode_error:
            # Timeouts / service errors: report and retry instead of aborting the whole run
            print('\tAttempt {} failed: {}'.format(attempt, geocode_error))
            each_loc_coords = None
        else:
            if each_loc_coords is None:
                print('\tAttempt {}'.format(attempt))
        time.sleep(request_delay_sec)

        if each_loc_coords is not None:
            each_loc_lat = each_loc_coords.latitude
            each_loc_long = each_loc_coords.longitude
            break
    else:
        # The loop finished without a break: no attempt produced coordinates
        print('\tBreaking and moving on.')

    # Slice-assign instead of append so that re-running this cell overwrites
    # the coordinates rather than growing the row past five elements.
    each_loc[3:] = [each_loc_lat, each_loc_long]
    

On item 1/561: Barton Hills, Austin
On item 2/561: Bouldin Creek, Austin
On item 3/561: Clarksville, Austin
On item 4/561: Dawson, Austin
On item 5/561: Downtown, Austin
On item 6/561: East Downtown, Austin
On item 7/561: East Riverside, Austin
On item 8/561: Galindo, Austin
On item 9/561: Hancock, Austin
On item 10/561: Hyde Park, Austin
On item 11/561: North Loop, Austin
On item 12/561: Old West Austin, Austin
On item 13/561: Parker Lane, Austin
On item 14/561: South Congress, Austin
On item 15/561: South Lamar, Austin
On item 16/561: St. Edwards, Austin
On item 17/561: Travis Heights, Austin
On item 18/561: University of Texas, Austin
	Attempt 1
	Attempt 2
	Attempt 3
	Attempt 4
	Attempt 5
	Breaking and moving on.
On item 19/561: Upper Boggy Creek, Austin
	Attempt 1
	Attempt 2
	Attempt 3
	Attempt 4
	Attempt 5
	Breaking and moving on.
On item 20/561: Zilker, Austin
On item 21/561: Alamo Square, San Francisco
On item 22/561: Bayview, San Francisco
On item 23/561: Bernal Heights, San Fr

In [7]:
# Build the dataset from the mined rows
column_names = ['city', 'neighborhood', 'full_name', 'latitude', 'longitude']
neighborhoods_df = pd.DataFrame(data=mined_data, columns=column_names)

# Preview the first 5 rows
neighborhoods_df.head(5)

Unnamed: 0,city,neighborhood,full_name,latitude,longitude
0,Austin,Barton Hills,"Barton Hills, Austin",30.251571,-97.784106
1,Austin,Bouldin Creek,"Bouldin Creek, Austin",30.255667,-97.755481
2,Austin,Clarksville,"Clarksville, Austin",30.27768,-97.759807
3,Austin,Dawson,"Dawson, Austin",30.232926,-97.761418
4,Austin,Downtown,"Downtown, Austin",30.268054,-97.744764


In [8]:
# Write the dataset out as a CSV file in the working directory
neighborhoods_df.to_csv(path_or_buf='neighborhood_data.csv')