# Final project: Step 1
## Data acquisition and data wrangling

In [244]:
#importar librerias
import pandas as pd
pd.set_option('display.max_columns', None)

import re

from tqdm import tqdm

import requests
import time
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


### 1. First source: a public database of cities in the world (.csv)
https://www.kaggle.com/dataset/f66386cd35268fd2ae9c7c03e6e4d93c9b1607265c1adef13f99a76e420be997/version/1

In [4]:
#extraer CSV con base de datos de ciudades
cities = pd.read_csv('../Data/worldcities.csv')

In [5]:
#imprimir una muestra de la tabla
cities.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.685,139.7514,Japan,JP,JPN,Tōkyō,primary,35676000.0,1392685764
1,New York,New York,40.6943,-73.9249,United States,US,USA,New York,,19354922.0,1840034016
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,Ciudad de México,primary,19028000.0,1484247881
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,Mahārāshtra,admin,18978000.0,1356226629
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,São Paulo,admin,18845000.0,1076532519


In [6]:
#eliminamos columnas que no nos van a servir
cities.drop(['admin_name', 'capital',"id"], axis=1, inplace=True)

In [7]:
#renombramos las columnas
cities.columns = ['Original name', 'City', 'Lat', 'Lon', 'Country', 'ISO2', 'ISO3', 'Population']

In [17]:
#hay varias ciudades que estan duplicadas, eliminamos los registros repetidos y solo dejamos el primer registro
cities.drop_duplicates(subset=['City', 'Country'], keep='first',inplace=True)

In [18]:
#imprimir una muestra de la tabla
cities.head()

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population
0,Tokyo,Tokyo,35.685,139.7514,Japan,JP,JPN,35676000.0
1,New York,New York,40.6943,-73.9249,United States,US,USA,19354922.0
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,19028000.0
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,18978000.0
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,18845000.0


### 2. Second source: a public database of prices by city (web scrapping)
https://www.numbeo.com/cost-of-living/prices_by_city.jsp?displayCurrency=USD&itemId=118&itemId=15&itemId=11&itemId=13&itemId=1

In [20]:
#hacemos el request de la segunda fuente de datos (vamos a  sacar los datos con web srapping)
url = 'https://www.numbeo.com/cost-of-living/prices_by_city.jsp?displayCurrency=USD&itemId=118&itemId=15&itemId=11&itemId=13&itemId=1'
resp = requests.get(url)
sopa = bs(resp.content, "html.parser")

In [21]:
#llamamos a la tabla de la página
table = sopa.find("table",{"id":"t2"})

In [22]:
#sacamos cada fila de la tabla
filas = table.findAll("tr")

In [23]:
#eliminamos la primera fila (son los titulos de columna)
filas.pop(0)

<tr>
<th><div style="font-size: 80%; vertical-align: middle;">Rank</div></th>
<th><div class="font_in_table_headers">City</div></th><th><div class="font_in_table_headers">Meal, Inexpensive Restaurant</div></th><th><div class="font_in_table_headers">Eggs <br/>(regular) <br/>(12)</div></th><th><div class="font_in_table_headers">Water <br/>(1.5 liter bottle)</div></th><th><div class="font_in_table_headers">Domestic Beer <br/>(0.5 liter bottle)</div></th><th><div class="font_in_table_headers">Banana <br/>(1kg)</div></th></tr>

In [29]:
#generamos el data frame con la información
numbeo = []

for i in filas:
    city = i.findAll("td")[1].text.split(", ")[0]
    city = re.sub("\(.*\)","",city).strip()
    country = i.findAll("td")[1].text.split(", ")[-1].strip()
    country = re.sub("\(.*\)","",country).strip()
    meal = i.findAll("td")[2].text.strip()
    eggs = i.findAll("td")[3].text.strip()
    water = i.findAll("td")[4].text.strip()
    beer = i.findAll("td")[5].text.strip()
    banana = i.findAll("td")[6].text.strip()
    
    row = {"City": city,"Country": country, "Meal (Inexpensive Restaurant) (USD)": meal,"Eggs (12) (USD)": eggs,"Water (1.5 liter bottle) (USD)": water,"Domestic Beer (USD)": beer,"Banana (1kg) (USD)": banana}
    numbeo.append(row)
    
numbeo_df = pd.DataFrame(numbeo)
numbeo_df.head()

Unnamed: 0,City,Country,Meal (Inexpensive Restaurant) (USD),Eggs (12) (USD),Water (1.5 liter bottle) (USD),Domestic Beer (USD),Banana (1kg) (USD)
0,Saint Petersburg,Russia,6.56,1.11,0.56,0.86,0.84
1,Samara,Russia,6.62,0.95,0.43,0.71,0.89
2,Algiers,Algeria,3.11,1.15,0.24,1.79,1.88
3,Saratov,Russia,5.29,0.99,0.36,0.82,0.8
4,Banja Luka,Bosnia And Herzegovina,4.85,1.82,0.61,0.64,1.31


### 3. Third source: a public list of daily budget by city for backpackers (web scrapping)
https://www.priceoftravel.com/world-cities-by-price-backpacker-index/

In [30]:
#hacemos el request de la tercera fuente de datos (vamos a  sacar los datos con web srapping)
url = 'https://www.priceoftravel.com/world-cities-by-price-backpacker-index/'
resp = requests.get(url)
sopa = bs(resp.content, "html.parser")

In [31]:
#llamamos a los elementos que nos van a servir del html
lst = sopa.find("div",{"class":"bpiidx_list"}).findAllNext("div",{"id":"bpi_row1"})

In [32]:
#generamos el dataframe con los datos extraidos
index = []

for i in lst:
    city = i.findAll("div",{"class":"bpidx"})[2].text.split(", ")[0]
    city = re.sub("\(.*\)","",city).strip()
    country = i.findAll("div",{"class":"bpidx"})[2].text.split(", ")[-1].strip()
    budget = i.findAll("div",{"class":"bpidx"})[1].text
    
    row = {"City": city,"Country": country, "Daily Budget": budget}
    index.append(row)
    
index_df = pd.DataFrame(index)
index_df.head()

Unnamed: 0,City,Country,Daily Budget
0,Hanoi,Viet Nam,$19.70
1,Saigon,Viet Nam,$20.54
2,Vientiane,Laos,$21.07
3,Hoi An,Viet Nam,$21.48
4,Pokhara,Nepal,$21.71


### 4. Fourth source: Information (a lot) about cities around the world (web scrapping with Selenium)
https://nomadlist.com/

In [34]:
#abrimos el navegador de Selenium
navegador = webdriver.Chrome()

In [35]:
#maximizamos el navegador
navegador.maximize_window()

In [179]:
#vamos al url
navegador.get('https://nomadlist.com/userApi.php?action=login_by_email&hash=91e7c90f5dcbfc27710077645e4f4841b8949523')

In [180]:
#abrimos el menu
menu = navegador.find_element_by_xpath('/html/body/div[3]/div/div[1]/span')
menu.click()

In [181]:
#buscamos el home
home = navegador.find_element_by_xpath('/html/body/div[5]/a[1]')
home.click()

In [51]:
#scroll al final de la página
n_ciudades = len(navegador.find_elements_by_tag_name('li'))

while n_ciudades<1343:
    navegador.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)
    n_ciudades = len(navegador.find_elements_by_tag_name('li'))
    print(n_ciudades)

In [274]:
%%time

#lista de ciudades
ciudades = []

for i in tqdm(range(1,3)):
    #seleccionamos una ciudad
    cada_ciudad = navegador.find_element_by_xpath(f'/html/body/div[7]/ul/li[{i}]')
    cada_ciudad.click()
    
    time.sleep(1)
    
    #diccionario de la ciudad
    data_ciudad = {}
    
    #nombre de la ciudad
    data_ciudad['City'] = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[1]/div[7]/h1').text
    data_ciudad['Country'] = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[1]/div[7]/h2/a').text
    data_ciudad['Photo'] = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[1]/img').get_attribute('src')
    
    #sacamos información de la ciudad

    #tab de scores
    nomad_guide = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[2]/div[4]/ul/h3[1]')
    nomad_guide.click()
    
    time.sleep(1)

    tabla = navegador.find_element_by_xpath('//*[@id="body"]/div[32]/div[3]/div[2]/div[5]/div/div[1]/table/tbody')
    data = tabla.find_elements_by_tag_name('tr')

    datos = []

    for i in data:
        try:
            titulo = i.find_element_by_class_name('key').text
            titulo = re.sub('[^a-zA-Z ]',"",titulo).strip()
        except:
            pass

        lst = ['Overall Score', 'Quality of life score', 'Family score', 'Cost', 'Internet',
               'Fun', 'Safety', 'Education level', 'English speaking', 'People density /km²', 
               'Walkability', 'Peace', 'Traffic safety', 'Hospitals', 'Happiness', 'Nightlife', 
               'Free WiFi in city', 'Places to work from', 'A/C or heating', 
               'Friendly to foreigners', 'Freedom of speech', 'Racial tolerance', 
               'Female friendly', 'LGBTQ friendly', 'Startup Score']
        if titulo in lst:
            contenido = i.find_element_by_class_name('value').text
            if titulo == 'Overall Score':
                contenido = contenido.split('/')[0]
            elif titulo == 'Cost':
                contenido = contenido.split(' ')[2]
                contenido = re.sub('[^0-9. ]',"",contenido)
                titulo = 'Cost/month (USD)'
            elif titulo == 'Internet':
                contenido = contenido.split(' ')[2]
                vel = re.sub('[^A-Za-z ]',"",contenido)
                contenido = re.sub('[^0-9. ]',"",contenido)
                titulo = f'{titulo} {vel}'
            elif i.get_attribute('data-value') != None:
                contenido = i.get_attribute('data-value')

            data_ciudad[titulo] = contenido

    #tab de Nomad
    nomad_guide = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[2]/div[4]/ul/h3[2]')
    nomad_guide.click()
    
    time.sleep(1)

    tabla = navegador.find_element_by_xpath('//*[@id="body"]/div[32]/div[3]/div[2]/div[5]/div/div[2]/table/tbody')
    data = tabla.find_elements_by_tag_name('tr')

    for i in data:
        try:
            titulo = i.find_element_by_class_name('key').text
            titulo = re.sub('[^a-zA-Z ]',"",titulo).strip()
        except:
            pass

        lst = ['Best taxi app in country','Best wireless carrier','Best wireless carrier','Tipping','Cashless society','Safe tap water','Population']
        if titulo in lst:
            contenido = i.find_element_by_class_name('value').text
            contenido = re.sub('[^a-zA-Z0-9 ]',"",contenido).strip().split(' ')[0]
            data_ciudad[titulo] = contenido
        
    #tab de Weather
    nomad_guide = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[2]/div[4]/ul/h3[7]')
    nomad_guide.click()
    
    time.sleep(1)

    tabla = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[2]/div[5]/div/div[7]/div/table/tbody')
    data = tabla.find_elements_by_tag_name('tr')

    meses = data[0].text.split(' ')
    temp = [i.split('°')[0] for i in data[2].text.split(' ')[1:] if '°' in i]

    for i,e in enumerate(meses):
        data_ciudad[f'Temperature (°C) {e}'] = temp[i]
    
    #agregar la información a la lista principal de ciudades
    ciudades.append(data_ciudad)

    #cerrar ciudad
    close = navegador.find_element_by_xpath('//*[@id="body"]/div[33]/div')
    close.click()
    
    time.sleep(1)

100%|██████████| 2/2 [00:19<00:00,  9.76s/it]

CPU times: user 336 ms, sys: 20.3 ms, total: 356 ms
Wall time: 19.5 s





In [275]:
pd.DataFrame(ciudades)

Unnamed: 0,City,Country,Photo,Overall Score,Quality of life score,Family score,Cost/month (USD),Internet Mbps,Fun,Safety,Education level,English speaking,Walkability,Peace,Traffic safety,Hospitals,Happiness,Nightlife,Free WiFi in city,Places to work from,Friendly to foreigners,Freedom of speech,Racial tolerance,Female friendly,LGBTQ friendly,Startup Score,Best taxi app in country,Best wireless carrier,Tipping,Cashless society,Safe tap water,Population,Temperature (°C) Jan,Temperature (°C) Feb,Temperature (°C) Mar,Temperature (°C) Apr,Temperature (°C) May,Temperature (°C) Jun,Temperature (°C) Jul,Temperature (°C) Aug,Temperature (°C) Sep,Temperature (°C) Oct,Temperature (°C) Nov,Temperature (°C) Dec
0,Lisbon,Portugal,https://nomadlist.com/assets/img/places/lisbon...,4.69,4,4,2011,27,4,Great,3,4,5,4,3,2.0,3,3,3,5,4,5,3,4,5,3,Uber,Vodafone,No,Yes,Yes,550000,14,16,19,18,22,21,23,25,24,21,17,16
1,"Canggu, Bali",Indonesia,https://nomadlist.com/assets/img/places/canggu...,4.59,4,4,1245,19,4,Great,3,4,3,3,2,,3,4,4,5,5,3,4,4,4,3,GoJEK,XL,No,No,No,30000,30,30,30,30,30,28,27,27,27,29,30,31


In [182]:
#seleccionamos una ciudad
cada_ciudad = navegador.find_element_by_xpath('//*[@id="body"]/div[7]/ul/li[2]')
cada_ciudad.click()

In [221]:
#lista de ciudades
ciudades = []

#diccionario de la ciudad
data_ciudad = {}

#nombre de la ciudad
data_ciudad['City'] = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[1]/div[7]/h1').text
data_ciudad['Country'] = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[1]/div[7]/h2/a').text
data_ciudad['Photo'] = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[1]/img').get_attribute('src')


In [222]:
#sacamos información de la ciudad

#tab de scores
nomad_guide = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[2]/div[4]/ul/h3[1]')
nomad_guide.click()

tabla = navegador.find_element_by_xpath('//*[@id="body"]/div[32]/div[3]/div[2]/div[5]/div/div[1]/table/tbody')
data = tabla.find_elements_by_tag_name('tr')

datos = []

for i in data:
    titulo = i.find_element_by_class_name('key').text
    titulo = re.sub('[^a-zA-Z ]',"",titulo).strip()
    
    lst = ['Overall Score', 'Quality of life score', 'Family score', 'Cost', 'Internet',
           'Fun', 'Safety', 'Education level', 'English speaking', 'People density /km²', 
           'Walkability', 'Peace', 'Traffic safety', 'Hospitals', 'Happiness', 'Nightlife', 
           'Free WiFi in city', 'Places to work from', 'A/C or heating', 
           'Friendly to foreigners', 'Freedom of speech', 'Racial tolerance', 
           'Female friendly', 'LGBTQ friendly', 'Startup Score']
    if titulo in lst:
        contenido = i.find_element_by_class_name('value').text
        if titulo == 'Overall Score':
            contenido = contenido.split('/')[0]
        elif titulo == 'Cost':
            contenido = contenido.split(' ')[2]
            contenido = re.sub('[^0-9. ]',"",contenido)
            titulo = 'Cost/month (USD)'
        elif titulo == 'Internet':
            contenido = contenido.split(' ')[2]
            vel = re.sub('[^A-Za-z ]',"",contenido)
            contenido = re.sub('[^0-9. ]',"",contenido)
            titulo = f'{titulo} {vel}'
        elif i.get_attribute('data-value') != None:
            contenido = i.get_attribute('data-value')
            
        data_ciudad[titulo] = contenido

In [224]:
#tab de Nomad
nomad_guide = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[2]/div[4]/ul/h3[2]')
nomad_guide.click()

tabla = navegador.find_element_by_xpath('//*[@id="body"]/div[32]/div[3]/div[2]/div[5]/div/div[2]/table/tbody')
data = tabla.find_elements_by_tag_name('tr')

for i in data:
    titulo = i.find_element_by_class_name('key').text
    titulo = re.sub('[^a-zA-Z ]',"",titulo).strip()
    
    lst = ['Best taxi app in country','Best wireless carrier','Best wireless carrier','Tipping','Cashless society','Safe tap water','Population']
    if titulo in lst:
        contenido = i.find_element_by_class_name('value').text
        contenido = re.sub('[^a-zA-Z0-9 ]',"",contenido).strip().split(' ')[0]
        data_ciudad[titulo] = contenido

In [266]:
#tab de Weather
nomad_guide = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[2]/div[4]/ul/h3[7]')
nomad_guide.click()

tabla = navegador.find_element_by_xpath('/html/body/div[32]/div[3]/div[2]/div[5]/div/div[7]/div/table/tbody')
data = tabla.find_elements_by_tag_name('tr')

meses = data[0].text.split(' ')
temp = [i.split('°')[0] for i in data[2].text.split(' ')[1:] if '°' in i]

'''for i,e in enumerate(meses):
    data_ciudad[f'Temperature (°C) {e}'] = temp[i]'''

"for i,e in enumerate(meses):\n    data_ciudad[f'Temperature (°C) {e}'] = temp[i]"

In [267]:
temp

['30', '30', '30', '30', '30', '28', '27', '27', '27', '29', '30', '31']

In [239]:
#cerrar ciudad
close = navegador.find_element_by_xpath('//*[@id="body"]/div[33]/div')
close.click()

In [238]:
data_ciudad

{'City': 'Lisbon',
 'Country': 'Portugal',
 'Photo': 'https://nomadlist.com/assets/img/places/lisbon-portugal-500px.jpg',
 'Overall Score': '4.69',
 'Quality of life score': '4',
 'Family score': '4',
 'Cost/month (USD)': '2011',
 'Internet Mbps': '27',
 'Fun': '4',
 'Safety': 'Great',
 'Education level': '3',
 'English speaking': '4',
 'Walkability': '5',
 'Peace': '4',
 'Traffic safety': '3',
 'Hospitals': '2',
 'Happiness': '3',
 'Nightlife': '3',
 'Free WiFi in city': '3',
 'Places to work from': '5',
 'Friendly to foreigners': '4',
 'Freedom of speech': '5',
 'Racial tolerance': '3',
 'Female friendly': '4',
 'LGBTQ friendly': '5',
 'Startup Score': '3',
 'Best taxi app in country': 'Uber',
 'Best wireless carrier': 'Vodafone',
 'Tipping': 'No',
 'Cashless society': 'Yes',
 'Safe tap water': 'Yes',
 'Population': '550000',
 'Temperature (°C) Jan': '14',
 'Temperature (°C) Feb': '16',
 'Temperature (°C) Mar': '19',
 'Temperature (°C) Apr': '18',
 'Temperature (°C) May': '22',
 'Tem