# Imports and setup

In [2]:
# !pip install geopy

import numpy as np
import pandas as pd
from google.colab import drive

from geopy.geocoders import Nominatim

In [3]:
drive = drive.mount('/content/drive/')
data_path = '/content/drive/My Drive/Colab_Data/AirBnb/Italy/Sicily/'

Mounted at /content/drive/


## Load Real Estate Data

In [4]:
re_file_path = data_path + 'derived/re_municipalities.csv'
real_estate_df = pd.read_csv(re_file_path)
real_estate_df.shape

(390, 11)

In [5]:
real_estate_df.head()

Unnamed: 0.1,Unnamed: 0,region,province,city,sale_low,sale_mean,sale_high,rent_low,rent_mean,rent_high,url
0,0,Sicilia,Agrigento,Agrigento,806,945,1262,4.29,5.58,7.11,https://www.immobiliare.it/en/mercato-immobili...
1,1,Sicilia,Agrigento,Alessandria della Rocca,288,376,2407,2.58,4.02,35.53,https://www.immobiliare.it/en/mercato-immobili...
2,2,Sicilia,Agrigento,Aragona,288,634,2407,2.58,2.58,35.53,https://www.immobiliare.it/en/mercato-immobili...
3,3,Sicilia,Agrigento,Bivona,288,300,2407,2.58,4.4,35.53,https://www.immobiliare.it/en/mercato-immobili...
4,4,Sicilia,Agrigento,Burgio,288,288,2407,2.58,4.62,35.53,https://www.immobiliare.it/en/mercato-immobili...


## Load ABNB Listing Data

In [6]:
ld_file_path = data_path + 'detailed_listings.csv'
ld_df_clean = pd.read_csv(ld_file_path, nrows=10)

ld_df_clean.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

In [7]:
ld_df = ld_df_clean.copy()[['id', 'listing_url', 'latitude', 'longitude']]
ld_df.head()

Unnamed: 0,id,listing_url,latitude,longitude
0,207654,https://www.airbnb.com/rooms/207654,37.61484,15.01867
1,125569,https://www.airbnb.com/rooms/125569,37.61566,15.01782
2,702537,https://www.airbnb.com/rooms/702537,37.57494,12.7479
3,230912,https://www.airbnb.com/rooms/230912,38.17447,12.7514
4,137342,https://www.airbnb.com/rooms/137342,36.9294,14.62523


# Reverse Geo Lookups

In [8]:
#DBD TODO:  store this data frame so we don't have to keep looking this info up (it won't change)

geolocator = Nominatim(user_agent="specify_your_app_name_here")
abnb_geo_df = pd.DataFrame(columns=['geo_state', 'geo_country_code', 'geo_county', 'geo_postcode', 'geo_road', 'geo_town', 'geo_village', 'geo_city', 'geo_suburb', 'geo_hamlet', 'geo_neighbourhood', 'geo_quarter', 'geo_municipality'])

#what are all the different potential address components?
address_field_list = []

for ind in ld_df.index:

  location = geolocator.reverse(f"{ld_df['latitude'][ind]}, {ld_df['longitude'][ind]}")

  address_dict = location.raw.get('address', {})

  abnb_geo_df.loc[len(abnb_geo_df.index)] = [address_dict.get('state', ''),
                                             address_dict.get('country_code', ''),
                                             address_dict.get('county', ''),
                                             address_dict.get('postcode', ''),
                                             address_dict.get('road', ''),
                                             address_dict.get('town', ''),
                                             address_dict.get('village', ''),
                                             address_dict.get('city', ''),
                                             address_dict.get('suburb', ''),
                                             address_dict.get('hamlet', ''),
                                             address_dict.get('neighbourhood', ''),
                                             address_dict.get('quarter', ''),
                                             address_dict.get('municipality', '')]


  address_dict = location.raw.get('address', {})
  address_field_list.extend(list(address_dict.keys()))
  
if True:
  display(pd.DataFrame(address_field_list).value_counts())



ISO3166-2-lvl4       10
ISO3166-2-lvl6       10
country              10
country_code         10
county               10
postcode             10
state                10
road                  9
village               6
hamlet                3
town                  3
city                  2
house_number          1
isolated_dwelling     1
suburb                1
dtype: int64

## Concatinate DataFrames (Abnb + Reverse Geo)

In [9]:
ld_df2 = pd.concat([ld_df, abnb_geo_df.reindex(ld_df.index)], axis=1)
ld_df2

Unnamed: 0,id,listing_url,latitude,longitude,geo_state,geo_country_code,geo_county,geo_postcode,geo_road,geo_town,geo_village,geo_city,geo_suburb,geo_hamlet,geo_neighbourhood,geo_quarter,geo_municipality
0,207654,https://www.airbnb.com/rooms/207654,37.61484,15.01867,Sicilia,it,Catania,95030,Viale Aldo Moro,,Nicolosi,,,,,,
1,125569,https://www.airbnb.com/rooms/125569,37.61566,15.01782,Sicilia,it,Catania,95030,Viale Aldo Moro,,Nicolosi,,,,,,
2,702537,https://www.airbnb.com/rooms/702537,37.57494,12.7479,Sicilia,it,Trapani,91021,Viale Nicolò Gentile,Campobello di Mazara,,,,Tre Fontane,,,
3,230912,https://www.airbnb.com/rooms/230912,38.17447,12.7514,Sicilia,it,Trapani,91010,Via del Secco,,San Vito Lo Capo,,,,,,
4,137342,https://www.airbnb.com/rooms/137342,36.9294,14.62523,Sicilia,it,Ragusa,97013,Strada provinciale Castiglione-Tresauro,,,Ragusa,,,,,
5,1701136,https://www.airbnb.com/rooms/1701136,38.53584,14.86456,Sicilia,it,Messina,98050,Via Sicilia,,Lingua,,,,,,
6,3676823,https://www.airbnb.com/rooms/3676823,37.49102,14.63291,Sicilia,it,Catania,95040,Via Maugeri,,Carrubbo,,,,,,
7,300103,https://www.airbnb.com/rooms/300103,37.85356,12.47584,Sicilia,it,Trapani,91025,Lungo Mare dello Stagnone,Marsala,Spagnola,,,Ettore Infersa,,,
8,1760493,https://www.airbnb.com/rooms/1760493,38.11921,15.00677,Sicilia,it,Messina,98066,SP129/c,Patti,,,,Moreri Soprani,,,
9,307482,https://www.airbnb.com/rooms/307482,38.20894,13.32471,Sicilia,it,Palermo,90151,,,,Palermo,Mondello,,,,


## Experiment with finding cities from AirBnb -IN- Real Estate

In [10]:
abnb_df = ld_df2.copy()
re_df = real_estate_df.copy()

abnb_df.shape

(10, 17)

### There's many different types of municipalities in Italy.  Find the 'common' name in the geo data for comparing to RE data

In [29]:
abnb_df_path = data_path + 'derived/abnb_geo_df_500.csv'
abnb_df = pd.read_csv(abnb_df_path)
abnb_df.drop(columns=['geo_municipality'], inplace=True)
abnb_df.fillna('', inplace=True)
abnb_df

Unnamed: 0.1,Unnamed: 0,id,listing_url,latitude,longitude,geo_state,geo_country_code,geo_county,geo_postcode,geo_road,geo_town,geo_village,geo_city,geo_suburb,geo_hamlet,geo_neighbourhood,geo_quarter
0,0,207654,https://www.airbnb.com/rooms/207654,37.61484,15.018670,Sicilia,it,Catania,95030.0,Viale Aldo Moro,,Nicolosi,,,,,
1,1,125569,https://www.airbnb.com/rooms/125569,37.61566,15.017820,Sicilia,it,Catania,95030.0,Viale Aldo Moro,,Nicolosi,,,,,
2,2,702537,https://www.airbnb.com/rooms/702537,37.57494,12.747900,Sicilia,it,Trapani,91021.0,Viale Nicolò Gentile,Campobello di Mazara,,,,Tre Fontane,,
3,3,230912,https://www.airbnb.com/rooms/230912,38.17447,12.751400,Sicilia,it,Trapani,91010.0,Via del Secco,,San Vito Lo Capo,,,,,
4,4,137342,https://www.airbnb.com/rooms/137342,36.92940,14.625230,Sicilia,it,Ragusa,97013.0,Strada provinciale Castiglione-Tresauro,,,Ragusa,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,17133097,https://www.airbnb.com/rooms/17133097,37.96262,14.092386,Sicilia,it,Palermo,90013.0,,,Castelbuono,,,,,
496,496,17170553,https://www.airbnb.com/rooms/17170553,38.02537,12.596860,Sicilia,it,Trapani,91019.0,,Valderice,,,,,,Fico
497,497,17172542,https://www.airbnb.com/rooms/17172542,37.06055,15.297710,Sicilia,it,Siracusa,96100.0,Vicolo dell'Olivo,,,Siracusa,,,,Ortigia
498,498,24714346,https://www.airbnb.com/rooms/24714346,37.51382,15.077340,Sicilia,it,Catania,95121.0,Via Gian Battista Impallomeni,,,Catania,Borgo-Sanzio,,Consolazione,


In [30]:
abnb_df['geo_common_name'] = np.nan

for index, row in abnb_df.iterrows():
  print(f'{index}::{row["geo_town"]}::{row["geo_village"]}::{row["geo_city"]}::{row["geo_suburb"]}::{row["geo_hamlet"]}')

  location_name = ''
  if row["geo_city"] != '':
    location_name = row["geo_city"]
  elif row["geo_town"] != '':
    location_name = row["geo_town"]
  elif row["geo_village"] != '':
    location_name = row["geo_village"]
  elif row["geo_hamlet"] != '':
    location_name = row["geo_hamlet"]
  elif row["geo_suburb"] != '':
    location_name = row["geo_suburb"]

  #keep this for later
  abnb_df['geo_common_name'][index] = location_name

  if (len(location_name) > 0):
    city_df = re_df[re_df['city'] == location_name]
  else:
    print(f'location_name: >>{location_name}<<')
    print(f'----------No RE match for: {row}')

  # if index > 500:
  #   print(f'index: {index}')
  #   break

0::::Nicolosi::::::
1::::Nicolosi::::::
2::Campobello di Mazara::::::::Tre Fontane
3::::San Vito Lo Capo::::::
4::::::Ragusa::::
5::::Lingua::::::
6::::Carrubbo::::::
7::Marsala::Spagnola::::::Ettore Infersa
8::Patti::::::::Moreri Soprani
9::::::Palermo::Mondello::
10::Modica::::::Modica Alta::
11::::Poggiofelice::::::
12::::Custonaci::::::Visicari
13::::::Palermo::Molara::
14::::::Palermo::I Circoscrizione::
15::::Montevago::::::
16::Modica::::::Modica Alta::
17::::Bafia::::::Case Nuove
18::::::Catania::Picanello-Ognina-Barriera-Canalicchio::
19::::Gibellina::::::
20::::Chianchitta::::::
21::::Letojanni::::::
22::Modica::::::Modica Sorda::
23::::Petrosino::::::
24::Acireale::Santa Tecla::::::
25::::Torretta::::::
26::Modica::::::Modica Alta::
27::Modica::::::::
28::::San Leone::Agrigento::::
29::Avola::Gallina::::::
30::Noto::::::::
31::Castellammare del Golfo::::::::Villaggio degli Ulivi
32::Acireale::Santa Tecla::::::
33::Sciacca::::::::
34::Marsala::Terrenove Bambina::::::
35::Mars

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


location_name: >><<
----------No RE match for: Unnamed: 0                                            207
id                                                7793530
listing_url          https://www.airbnb.com/rooms/7793530
latitude                                         35.51141
longitude                                         12.5886
geo_state                                         Sicilia
geo_country_code                                       it
geo_county                                      Agrigento
geo_postcode                                      92031.0
geo_road                                 Via dei Depositi
geo_town                                                 
geo_village                                              
geo_city                                                 
geo_suburb                                               
geo_hamlet                                               
geo_neighbourhood                                        
geo_quarter              

In [32]:
abnb_df.head(40)

Unnamed: 0.1,Unnamed: 0,id,listing_url,latitude,longitude,geo_state,geo_country_code,geo_county,geo_postcode,geo_road,geo_town,geo_village,geo_city,geo_suburb,geo_hamlet,geo_neighbourhood,geo_quarter,geo_common_name
0,0,207654,https://www.airbnb.com/rooms/207654,37.61484,15.01867,Sicilia,it,Catania,95030.0,Viale Aldo Moro,,Nicolosi,,,,,,Nicolosi
1,1,125569,https://www.airbnb.com/rooms/125569,37.61566,15.01782,Sicilia,it,Catania,95030.0,Viale Aldo Moro,,Nicolosi,,,,,,Nicolosi
2,2,702537,https://www.airbnb.com/rooms/702537,37.57494,12.7479,Sicilia,it,Trapani,91021.0,Viale Nicolò Gentile,Campobello di Mazara,,,,Tre Fontane,,,Campobello di Mazara
3,3,230912,https://www.airbnb.com/rooms/230912,38.17447,12.7514,Sicilia,it,Trapani,91010.0,Via del Secco,,San Vito Lo Capo,,,,,,San Vito Lo Capo
4,4,137342,https://www.airbnb.com/rooms/137342,36.9294,14.62523,Sicilia,it,Ragusa,97013.0,Strada provinciale Castiglione-Tresauro,,,Ragusa,,,,,Ragusa
5,5,1701136,https://www.airbnb.com/rooms/1701136,38.53584,14.86456,Sicilia,it,Messina,98050.0,Via Sicilia,,Lingua,,,,,,Lingua
6,6,3676823,https://www.airbnb.com/rooms/3676823,37.49102,14.63291,Sicilia,it,Catania,95040.0,Via Maugeri,,Carrubbo,,,,,,Carrubbo
7,7,300103,https://www.airbnb.com/rooms/300103,37.85356,12.47584,Sicilia,it,Trapani,91025.0,Lungo Mare dello Stagnone,Marsala,Spagnola,,,Ettore Infersa,,,Marsala
8,8,1760493,https://www.airbnb.com/rooms/1760493,38.11921,15.00677,Sicilia,it,Messina,98066.0,SP129/c,Patti,,,,Moreri Soprani,,,Patti
9,9,307482,https://www.airbnb.com/rooms/307482,38.20894,13.32471,Sicilia,it,Palermo,90151.0,,,,Palermo,Mondello,,,,Palermo


In [33]:
re_df.head()

Unnamed: 0.1,Unnamed: 0,region,province,city,sale_low,sale_mean,sale_high,rent_low,rent_mean,rent_high,url
0,0,Sicilia,Agrigento,Agrigento,806,945,1262,4.29,5.58,7.11,https://www.immobiliare.it/en/mercato-immobili...
1,1,Sicilia,Agrigento,Alessandria della Rocca,288,376,2407,2.58,4.02,35.53,https://www.immobiliare.it/en/mercato-immobili...
2,2,Sicilia,Agrigento,Aragona,288,634,2407,2.58,2.58,35.53,https://www.immobiliare.it/en/mercato-immobili...
3,3,Sicilia,Agrigento,Bivona,288,300,2407,2.58,4.4,35.53,https://www.immobiliare.it/en/mercato-immobili...
4,4,Sicilia,Agrigento,Burgio,288,288,2407,2.58,4.62,35.53,https://www.immobiliare.it/en/mercato-immobili...


In [34]:
#save the DF so we don't have to keep doing geo lookups
file_path = data_path + f'derived/abnb_geo_df_{len(abnb_df)}.csv'
abnb_df.to_csv(file_path)

In [15]:
# field_list

In [16]:
ld_df

Unnamed: 0,id,listing_url,latitude,longitude
0,207654,https://www.airbnb.com/rooms/207654,37.61484,15.01867
1,125569,https://www.airbnb.com/rooms/125569,37.61566,15.01782
2,702537,https://www.airbnb.com/rooms/702537,37.57494,12.7479
3,230912,https://www.airbnb.com/rooms/230912,38.17447,12.7514
4,137342,https://www.airbnb.com/rooms/137342,36.9294,14.62523
5,1701136,https://www.airbnb.com/rooms/1701136,38.53584,14.86456
6,3676823,https://www.airbnb.com/rooms/3676823,37.49102,14.63291
7,300103,https://www.airbnb.com/rooms/300103,37.85356,12.47584
8,1760493,https://www.airbnb.com/rooms/1760493,38.11921,15.00677
9,307482,https://www.airbnb.com/rooms/307482,38.20894,13.32471
