# 0. Importing Libraries

In [1]:
!pip install bs4
import requests
from bs4 import BeautifulSoup
import pandas as pd
from geopy.geocoders import Nominatim
import geopy
from IPython.display import clear_output



# 1. Set up the scraping

In [2]:
# The Airbnb page that lists every city together with its neighborhoods
airbnb_neighborhood_url = 'https://www.airbnb.com/locations'

# Download the page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed as if it were the neighborhood listing.
city_response = requests.get(airbnb_neighborhood_url, timeout=30)
city_response.raise_for_status()
city_page = city_response.text

# Parse the downloaded HTML into a BeautifulSoup object
soup = BeautifulSoup(city_page, 'html.parser')

# Uncomment to print the parsed HTML in indented form
#print(soup.prettify())

In [3]:
# Manual configuration. The class names on this site change over time
# (probably to discourage scraping), so these two values have to be refreshed by hand.
#
# How to refresh them: print the soup from the previous cell, then look up the
# opening h2 tag used for a city name and the opening div tag used for a
# neighborhood name. Searching for "Austin" (a city) and "Barton Hills"
# (a neighborhood) is the quickest way to find them.

# Opening tag used for each city name
city_tag = '<h2 class="_1tz64lh">'
# Opening tag used for each neighborhood name
neighborhood_tag = '<div class="_17ajzb82">'

In [4]:
# Split the prettified HTML at every tag opening. Each piece then begins with a
# tag's name and attributes, followed by whatever text came after that tag.
site_html = soup.prettify().split('<')

# The split removed the leading '<', so drop it from the markers as well
city_marker = city_tag[1:]
neighborhood_marker = neighborhood_tag[1:]

# Rows of [city, neighborhood, "neighborhood, city"]
mined_data = []

# No city heading has been seen yet. Resetting this on every run also stops a
# value left over from an earlier execution of this cell from being reused.
loop_city = None

# Walk the pieces in document order: a city piece sets the current city and
# every following neighborhood piece is recorded under it.
for each_row in site_html:
    if city_marker in each_row:
        # Remove the tag text, then collapse all whitespace. Unlike trimming a
        # fixed number of characters, this does not depend on how deeply
        # prettify() happened to indent the text.
        loop_city = ' '.join(each_row.replace(city_marker, '').split())
    if neighborhood_marker in each_row:
        if loop_city is None:
            # A neighborhood listed before any city heading cannot be attributed
            continue
        loop_neighborhood = ' '.join(each_row.replace(neighborhood_marker, '').split())
        mined_data.append(
            [loop_city, loop_neighborhood, loop_neighborhood + ", " + loop_city]
        )

In [5]:

# Geocode every "neighborhood, city" string and store the result at positions
# 3 (latitude) and 4 (longitude) of its mined_data row.
# NOTE(review): Nominatim's public service has a usage policy that limits the
# request rate; consider geopy's RateLimiter helper if requests get rejected.
geolocator = geopy.geocoders.Nominatim(user_agent="myGeocoder")

# Number of lookups to try per location before giving up on it
max_attempts = 5
mined_data_len = len(mined_data)

# Iterate through each neighborhood
for count, each_loc in enumerate(mined_data, start=1):
    clear_output(wait=True)
    print("On item {}/{}: {}".format(count, mined_data_len, each_loc[2]))

    each_loc_coords = None
    for attempt in range(1, max_attempts + 1):
        try:
            each_loc_coords = geolocator.geocode(each_loc[2])
        except Exception:
            # Best effort: treat any lookup error as "no result" and retry.
            # Catching Exception rather than using a bare except lets a
            # kernel interrupt (KeyboardInterrupt) stop the loop.
            each_loc_coords = None
        if each_loc_coords is not None:
            break
        print('\tAttempt {}'.format(attempt))
    else:
        # Every attempt failed (the loop never hit `break`)
        print('\tBreaking and moving on.')

    # Slice assignment replaces anything already stored after the first three
    # fields, so re-running this cell does not append a second pair of
    # coordinates. Failed lookups get explicit None placeholders so that every
    # row keeps the same length.
    if each_loc_coords is not None:
        each_loc[3:] = [each_loc_coords.latitude, each_loc_coords.longitude]
    else:
        each_loc[3:] = [None, None]
        
    

On item 561/561: Victory Monument, Bangkok


In [6]:
# Build the neighborhoods table from the mined rows and discard every row that
# contains a missing value
neighborhood_columns = ['city','neighborhood','full_name','latitude','longitude']
neighborhoods_df = pd.DataFrame(mined_data, columns=neighborhood_columns).dropna()

# Preview the first 5 rows
neighborhoods_df.head()

Unnamed: 0,city,neighborhood,full_name,latitude,longitude
0,Austin,Barton Hills,"Barton Hills, Austin",30.251571,-97.784106
1,Austin,Bouldin Creek,"Bouldin Creek, Austin",30.255667,-97.755481
2,Austin,Clarksville,"Clarksville, Austin",30.27768,-97.759807
3,Austin,Dawson,"Dawson, Austin",30.232926,-97.761418
4,Austin,Downtown,"Downtown, Austin",30.268054,-97.744764


In [7]:
# Ask the user which city and neighborhood they enjoyed, re-prompting until
# both answers match entries in neighborhoods_df
valid_city = False
valid_neighborhood = False

# Step 1: keep asking until the answer is one of the known cities
while not valid_city:
    clear_output(wait=True)
    city_prompt = 'Which city did you like travelling to? Valid options are: \n{}\n'.format(
        ', '.join(neighborhoods_df['city'].unique().tolist())
    )
    input_city = input(city_prompt)

    valid_city = input_city in neighborhoods_df['city'].tolist()
    if not valid_city:
        print("This isn't a valid city, please select another.\n\n")

clear_output(wait=True)

# Step 2: keep asking until the answer is a neighborhood of the chosen city
filtered_neighborhoods = (
    neighborhoods_df.loc[neighborhoods_df['city'] == input_city, 'neighborhood']
    .unique()
    .tolist()
)
while not valid_neighborhood:
    neighborhood_prompt = '\nWhich neighborhood did you like? Valid options are: \n{}\n'.format(
        ', '.join(filtered_neighborhoods)
    )
    input_neighborhood = input(neighborhood_prompt)

    valid_neighborhood = input_neighborhood in filtered_neighborhoods
    clear_output(wait=True)
    if valid_neighborhood:
        print("\n\nWe'll be using {} in {} as a baseline. Thank you!".format(input_neighborhood,input_city))
    else:
        print("This isn't a valid neighborood, please select another.\n\n")



We'll be using Downtown Brooklyn in New York as a baseline. Thank you!


In [8]:
# Ask for the destination city, re-prompting until the answer matches a city
# in neighborhoods_df
valid_target = False

while not valid_target:
    clear_output(wait=True)
    target_prompt = 'Which city are you travelling to? Valid options are: \n{}\n'.format(
        ', '.join(neighborhoods_df['city'].unique().tolist())
    )
    input_target_city = input(target_prompt)

    valid_target = input_target_city in neighborhoods_df['city'].tolist()

    # Both outcomes clear the prompt before printing their message
    clear_output(wait=True)
    if valid_target:
        print('I see, we liked {}, {} and we are going to {}. Exciting!'.format(input_neighborhood, input_city, input_target_city))
    else:
        print("Sorry, this isn't a valid city. Please select another.\n\n")

I see, we liked Downtown Brooklyn, New York and we are going to San Francisco. Exciting!


In [9]:
# This Cell was hidden by Watson.

In [10]:
# Create a function to return venues.
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=10000):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are zipped together, so each position
        describes one location (label, latitude, longitude).
    radius : int, optional
        Passed through as the request's ``radius`` parameter.
    limit : int, optional
        Passed through as the request's ``limit`` parameter.

    Returns
    -------
    (pandas.DataFrame, str or None)
        The DataFrame has one row per returned venue with the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        It is empty (but still has these columns) when no locations are given
        or no venues come back. The second item is the URL of the last
        request made, or None when no request was made. That URL embeds
        CLIENT_ID and CLIENT_SECRET, so do not print or share it.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    # Stays None when the inputs are empty, so the return statement is always valid
    url = None

    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request; fail fast on a hung connection or an HTTP error
        # rather than on a confusing KeyError while unpacking the payload
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories')
            if not categories:
                # A venue without any category has nothing to contribute to a
                # per-category table, and indexing [0] on it would raise.
                continue
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the column names to the constructor keeps the schema intact even
    # when there are no rows at all
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues, url)

In [11]:
# Every neighborhood row that belongs to the destination city
in_target_city = neighborhoods_df['city'] == input_target_city
target_df = neighborhoods_df.loc[in_target_city].copy()

# The row(s) for the neighborhood the user liked, matched on city and name together
is_baseline_row = (
    (neighborhoods_df['city'] == input_city)
    & (neighborhoods_df['neighborhood'] == input_neighborhood)
)
baseline_df = neighborhoods_df.loc[is_baseline_row].copy()

In [12]:
def _venues_around(frame):
    """Call getNearbyVenues with radius=500 for every row of a neighborhoods frame."""
    return getNearbyVenues(
        names=frame['neighborhood'],
        latitudes=frame['latitude'],
        longitudes=frame['longitude'],
        radius=500,
    )

# Venues for all the neighborhoods in the target city
target_venues, target_urls = _venues_around(target_df)

# Venues for the baseline neighborhood
baseline_venues, baseline_urls = _venues_around(baseline_df)

Alamo Square
Bayview
Bernal Heights
Chinatown
Civic Center
Cole Valley
Cow Hollow
Dogpatch
Downtown
Duboce Triangle
Excelsior
Financial District
Fisherman's Wharf
Glen Park
Haight-Ashbury
Hayes Valley
Inner Sunset
Japantown
Lower Haight
Marina
Mission Bay
Mission District
Mission Terrace
Nob Hill
Noe Valley
North Beach
Outer Sunset
Pacific Heights
Parkside
Portola
Potrero Hill
Presidio Heights
Richmond District
Russian Hill
SoMa
South Beach
Telegraph Hill
Tenderloin
The Castro
Twin Peaks
Visitacion Valley
Western Addition
Downtown Brooklyn


In [13]:
# Print (rows, columns) for the target venues first, then the baseline venues
for venues_frame in (target_venues, baseline_venues):
    print(venues_frame.shape)

(2347, 7)
(100, 7)


In [14]:
# Count the distinct venue categories found for the baseline and for the target
baseline_category_count = baseline_venues['Venue Category'].nunique(dropna=False)
target_category_count = target_venues['Venue Category'].nunique(dropna=False)

print('Unique Categories:\n\tBaseline:\t{}\n\tTarget:\t\t{}'.format(baseline_category_count, target_category_count))

Unique Categories:
	Baseline:	67
	Target:		305


In [15]:
# Find the venue categories that occur in both the baseline and the target,
# keeping the order in which they first appear in the baseline
target_categories = set(target_venues['Venue Category'].unique())
similar_venue = [
    category
    for category in baseline_venues['Venue Category'].unique()
    if category in target_categories
]
similar_count = len(similar_venue)

print('There are {} similar venues. They are:\n{}'.format(similar_count, ', '.join(similar_venue)))

There are 62 similar venues. They are:
Chinese Restaurant, Burger Joint, Juice Bar, Cuban Restaurant, Plaza, Movie Theater, Mediterranean Restaurant, Bubble Tea Shop, Hotel, History Museum, Cycle Studio, Thai Restaurant, Pizza Place, Food Court, Brewery, Gym, Japanese Restaurant, Farmers Market, Grocery Store, Bagel Shop, Ice Cream Shop, Big Box Store, Shopping Mall, Concert Hall, Sandwich Place, Yoga Studio, Cocktail Bar, Toy / Game Store, Coffee Shop, Seafood Restaurant, Peruvian Restaurant, Wine Shop, Cosmetics Shop, Park, Discount Store, Restaurant, Vietnamese Restaurant, Café, Residential Building (Apartment / Condo), Wine Bar, Creperie, French Restaurant, Gym / Fitness Center, Burrito Place, Health & Beauty Service, Bookstore, Beer Bar, Bank, Mexican Restaurant, Bakery, Middle Eastern Restaurant, Hawaiian Restaurant, Asian Restaurant, Kids Store, Southern / Soul Food Restaurant, BBQ Joint, South American Restaurant, Optical Shop, Dance Studio, Performing Arts Venue, Pakistani Res

In [16]:
# One-hot encode the venue category of each venue (one indicator column per
# category, no prefix), then attach the neighborhood label as a column.
# NOTE(review): a venue category literally named "Neighborhood" would be
# overwritten by the label column; confirm no such category occurs.
target_onehot, baseline_onehot = (
    pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
    .assign(Neighborhood=venues['Neighborhood'])
    for venues in (target_venues, baseline_venues)
)

In [17]:
# Average the one-hot columns per neighborhood, which gives each category's
# share of that neighborhood's venues, and turn the group label back into a column
target_grouped, baseline_grouped = (
    onehot.groupby('Neighborhood').mean().reset_index()
    for onehot in (target_onehot, baseline_onehot)
)

baseline_grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bank,Beer Bar,Big Box Store,Bistro,...,Shanghai Restaurant,Shopping Mall,South American Restaurant,Southern / Soul Food Restaurant,Thai Restaurant,Toy / Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Downtown Brooklyn,0.01,0.01,0.01,0.02,0.01,0.01,0.01,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.03,0.03


In [18]:
# Most common venue categories of the baseline neighborhood.
# This is printed explicitly: a bare expression in the middle of a cell is
# evaluated and discarded, because only a cell's last expression is displayed.
baseline_top_venues = (
    baseline_grouped
    .melt(id_vars='Neighborhood', var_name='Venue')
    .sort_values(by='value', ascending=False)
    .rename(columns={'value': 'Frequency'})
    .head()
)
print(baseline_top_venues)
print('\n')

# Get the most common venues for each neighborhood of the target city
num_top_venues = 5

for each_hood in target_grouped['Neighborhood']:
    print("----"+each_hood+"----")
    # Transpose the neighborhood's single row into (venue, freq) pairs
    temp = target_grouped[target_grouped['Neighborhood'] == each_hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Skip the first row, which holds the neighborhood label. The copy makes
    # the column assignment below operate on an independent frame instead of
    # a slice, which avoids pandas' chained-assignment warning.
    temp = temp.iloc[1:].copy()
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Alamo Square----
              venue  freq
0               Bar  0.06
1  Sushi Restaurant  0.03
2       Record Shop  0.03
3              Café  0.03
4             Hotel  0.03


----Bayview----
                venue  freq
0  Light Rail Station  0.25
1      Breakfast Spot  0.25
2            Mountain  0.25
3                Park  0.25
4   Accessories Store  0.00


----Bernal Heights----
                venue  freq
0         Coffee Shop  0.05
1  Italian Restaurant  0.04
2              Bakery  0.04
3  Mexican Restaurant  0.04
4          Playground  0.04


----Chinatown----
         venue  freq
0  Coffee Shop  0.12
1        Hotel  0.06
2         Café  0.04
3          Bar  0.04
4       Bakery  0.03


----Civic Center----
                           venue  freq
0                    Coffee Shop  0.07
1                          Hotel  0.06
2          Vietnamese Restaurant  0.06
3                           Café  0.06
4  Vegetarian / Vegan Restaurant  0.04


----Cole Valley----
                   

In [19]:
# Report (rows, columns) of the grouped baseline and target frames
for shape_template, grouped_frame in (
    ('Baseline Shape:\t{}', baseline_grouped),
    ('Target Shape:\t{}', target_grouped),
):
    print(shape_template.format(grouped_frame.shape))

Baseline Shape:	(1, 68)
Target Shape:	(42, 305)


In [20]:
# Long form of the baseline: one (Neighborhood, variable, value) record per category column
baseline_grouped_t = baseline_grouped.melt(id_vars='Neighborhood')

# Columns of the target that also exist in the baseline, in the target's own
# column order. Target-only categories are left out of the comparison.
baseline_columns = set(baseline_grouped.columns)
shared_columns = [col for col in target_grouped.columns if col in baseline_columns]

# Collects [neighborhood, distance] pairs
similarity_result = []

# Compare every neighborhood of the target city against the baseline
for each_target_hood in target_grouped['Neighborhood'].unique():
    # This neighborhood's row, restricted to the shared columns
    temp_hood = target_grouped.loc[
        target_grouped['Neighborhood'] == each_target_hood, shared_columns
    ]

    # Long form of the target row
    temp_hood_t = temp_hood.melt(id_vars='Neighborhood')

    # Line the two long tables up by category. The outer join keeps baseline
    # categories the target lacks, and their missing values become 0.
    hood_compare_df = temp_hood_t.merge(
        baseline_grouped_t, on='variable', how='outer'
    ).fillna(0)

    # Euclidean distance: square root of the summed squared differences
    hood_compare_df['euc_dis'] = (hood_compare_df['value_x'] - hood_compare_df['value_y']) ** 2
    euc_dist = hood_compare_df['euc_dis'].sum() ** 0.5

    similarity_result.append([each_target_hood, euc_dist])

similarity_df = pd.DataFrame(similarity_result, columns=['neighborhood','euclidean_distance'])

In [21]:
# Rank the target neighborhoods from smallest to largest distance to the baseline
closest_first = similarity_df.sort_values(by='euclidean_distance')

# The first row is the best match
recommended_neighborhood = closest_first['neighborhood'].values[0]
recommended_score = closest_first['euclidean_distance'].values[0]

# Inspect the 5 most similar neighborhoods
closest_first.head(5)

Unnamed: 0,neighborhood,euclidean_distance
34,SoMa,0.105086
2,Bernal Heights,0.110682
9,Duboce Triangle,0.115594
0,Alamo Square,0.115635
15,Hayes Valley,0.117898


In [22]:
# Rank the target neighborhoods from largest to smallest distance to the baseline
farthest_first = similarity_df.sort_values(by='euclidean_distance', ascending=False)

# The first row is the worst match
worst_neighborhood = farthest_first['neighborhood'].values[0]
worst_score = farthest_first['euclidean_distance'].values[0]

# Inspect the 5 least similar neighborhoods
farthest_first.head(5)

Unnamed: 0,neighborhood,euclidean_distance
19,Marina,0.510686
22,Mission Terrace,0.416293
40,Visitacion Valley,0.361663
28,Parkside,0.354683
1,Bayview,0.261343


In [23]:
# Summarise the recommendation (score rounded to 4 decimals) and what it was based on
recommendation_msg = 'Travelling to {}, your best bet is to stay in the "{}" neighborhood with a score of {}.\n'.format(
    input_target_city, recommended_neighborhood, round(recommended_score,4)
)
basis_msg = 'This is based on your experience in "{}", {}.'.format(input_neighborhood, input_city)

print(recommendation_msg, basis_msg, sep='\n')

Travelling to San Francisco, your best bet is to stay in the "SoMa" neighborhood with a score of 0.1051.

This is based on your experience in "Downtown Brooklyn", New York.


In [24]:
# Show the stored rows for exactly two places: the recommended neighborhood in
# the target city and the baseline neighborhood in the city the user liked.
# City and neighborhood are matched as a pair. Combining "either city" with
# "either neighborhood" would also match cross pairs, for example a
# neighborhood in the target city that shares its name with the baseline one.
is_recommended_row = (
    (neighborhoods_df['city'] == input_target_city)
    & (neighborhoods_df['neighborhood'] == recommended_neighborhood)
)
is_baseline_row = (
    (neighborhoods_df['city'] == input_city)
    & (neighborhoods_df['neighborhood'] == input_neighborhood)
)
neighborhoods_df[is_recommended_row | is_baseline_row]

Unnamed: 0,city,neighborhood,full_name,latitude,longitude
54,San Francisco,SoMa,"SoMa, San Francisco",37.780893,-122.400952
419,New York,Downtown Brooklyn,"Downtown Brooklyn, New York",40.692373,-73.986788


In [25]:
# Five highest-frequency venue categories of the baseline neighborhood
baseline_long = baseline_grouped.melt(id_vars='Neighborhood')
baseline_long.sort_values(by='value', ascending=False).head()

Unnamed: 0,Neighborhood,variable,value
12,Downtown Brooklyn,Burger Joint,0.05
49,Downtown Brooklyn,Pizza Place,0.05
17,Downtown Brooklyn,Coffee Shop,0.04
34,Downtown Brooklyn,Hotel,0.03
28,Downtown Brooklyn,Grocery Store,0.03
