# Web Project
## The goal of this project is to build a database of cities with usefull information, to plan your next trip (especially for backpackers)

### 1. First source: a public database of cities in the world (.csv)
https://www.kaggle.com/dataset/f66386cd35268fd2ae9c7c03e6e4d93c9b1607265c1adef13f99a76e420be997/version/1

In [1]:
#Importar librerias necesarias
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as bs
import warnings
from fuzzywuzzy import process
import math
import json
from pandas.io.json import json_normalize
import time
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [2]:
#extraer CSV con base de datos de ciudades
cities = pd.read_csv('Data/worldcities.csv')

In [3]:
#imprimir una muestra de la tabla
cities.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.685,139.7514,Japan,JP,JPN,Tōkyō,primary,35676000.0,1392685764
1,New York,New York,40.6943,-73.9249,United States,US,USA,New York,,19354922.0,1840034016
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,Ciudad de México,primary,19028000.0,1484247881
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,Mahārāshtra,admin,18978000.0,1356226629
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,São Paulo,admin,18845000.0,1076532519


In [4]:
#eliminamos columnas que no nos van a servir
cities.drop(['admin_name', 'capital',"id"], axis=1, inplace=True)

In [5]:
#renombramos las columnas
cities.columns = ['Original name', 'City', 'Lat', 'Lon', 'Country', 'ISO2', 'ISO3', 'Population']

In [6]:
#revisando la tabla, hay varias ciudades que estan repetidas, eliminamos los registros repetidos y solo dejamos el primer registro
cities.drop_duplicates(subset=['City', 'Country'], keep='first',inplace=True)

In [7]:
#Limpiamos los nombres de las columna "City" y "Country"
cities.replace("Czechia", "Czech Republic", inplace=True)
cities.replace("Goteborg", "Gothenburg", inplace=True)
cities.replace("Irakleio", "Heraklion", inplace=True)
cities.replace("Perm'", "Perm", inplace=True)
cities.replace("Kyiv", "Kiev", inplace=True)
cities.replace("Caerdydd", "Cardiff", inplace=True)
cities.replace("Hannover", "Hanover", inplace=True)
cities.replace("'", "", inplace=True)

In [8]:
#imprimir una muestra de la tabla
cities.head()

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population
0,Tokyo,Tokyo,35.685,139.7514,Japan,JP,JPN,35676000.0
1,New York,New York,40.6943,-73.9249,United States,US,USA,19354922.0
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,19028000.0
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,18978000.0
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,18845000.0


### 2. Second source: a public database of prices by city (web scrapping)
https://www.numbeo.com/cost-of-living/prices_by_city.jsp?displayCurrency=USD&itemId=118&itemId=15&itemId=11&itemId=13&itemId=1

In [9]:
#hacemos el request de la segunda fuente de datos (vamos a  sacar los datos con web srapping)
url = 'https://www.numbeo.com/cost-of-living/prices_by_city.jsp?displayCurrency=USD&itemId=118&itemId=15&itemId=11&itemId=13&itemId=1'
resp = requests.get(url)
sopa = bs(resp.content, "html.parser")

In [10]:
#llamamos a la tabla de la página
table = sopa.find("table",{"id":"t2"})

In [11]:
#sacamos cada fila de la tabla
filas = table.findAll("tr")

In [12]:
#eliminamos la primera fila (son los titulos de columna)
filas.pop(0)

<tr>
<th><div style="font-size: 80%; vertical-align: middle;">Rank</div></th>
<th><div class="font_in_table_headers">City</div></th><th><div class="font_in_table_headers">Meal, Inexpensive Restaurant</div></th><th><div class="font_in_table_headers">Eggs <br/>(regular) <br/>(12)</div></th><th><div class="font_in_table_headers">Water <br/>(1.5 liter bottle)</div></th><th><div class="font_in_table_headers">Domestic Beer <br/>(0.5 liter bottle)</div></th><th><div class="font_in_table_headers">Banana <br/>(1kg)</div></th></tr>

In [13]:
#generamos el data frame con la información
ciudades = []

for i in filas:
    city = i.findAll("td")[1].text.split(", ")[0].replace("Astana","Nur-Sultan").replace("Penang","George Town").replace("Bali","Denpansar")
    city = re.sub("\(.*\)","",city).strip()
    country = i.findAll("td")[1].text.split(", ")[-1].strip()
    country = re.sub("\(.*\)","",country).strip()
    meal = i.findAll("td")[2].text.strip()
    eggs = i.findAll("td")[3].text.strip()
    water = i.findAll("td")[4].text.strip()
    beer = i.findAll("td")[5].text.strip()
    banana = i.findAll("td")[6].text.strip()
    
    row = {"City": city,"Country": country, "Meal (Inexpensive Restaurant) (USD)": meal,"Eggs (12) (USD)": eggs,"Water (1.5 liter bottle) (USD)": water,"Domestic Beer (USD)": beer,"Banana (1kg) (USD)": banana}
    ciudades.append(row)
    
ciudades_df = pd.DataFrame(ciudades)
ciudades_df

Unnamed: 0,City,Country,Meal (Inexpensive Restaurant) (USD),Eggs (12) (USD),Water (1.5 liter bottle) (USD),Domestic Beer (USD),Banana (1kg) (USD)
0,Saint Petersburg,Russia,6.82,1.14,0.58,0.90,0.89
1,Samara,Russia,6.82,0.96,0.44,0.73,0.92
2,Algiers,Algeria,3.12,1.14,0.24,1.72,1.91
3,Saratov,Russia,5.46,1.02,0.37,0.85,0.82
4,Banja Luka,Bosnia And Herzegovina,4.83,1.79,0.58,0.60,1.32
...,...,...,...,...,...,...,...
419,Lviv,Ukraine,4.37,1.17,0.49,0.77,1.12
420,Novosibirsk,Russia,5.46,0.97,0.48,0.80,1.06
421,Brussels,Belgium,18.28,3.23,1.26,1.65,2.07
422,Surabaya,Indonesia,2.03,1.72,0.38,2.36,1.31


In [14]:
#funciones para determinar las ciudades más parecidas en las dos tablas iniciales y poder hacer  el merge (usando libreria fuzzywuzzy)

def find_best_match(misspelled, correct_names):
    closest, ratio = process.extractOne(misspelled, correct_names)
    return closest 

def closest_name(x):
    no_change = ["Enschede","Gothenburg","Pattaya","Espoo","Mississauga","Aachen","Saint Louis","Breda","Delft","Leiden","Noida","Goa","Ahmedabad","Bangalore","Thane","Vijayawada"]
    if x["City"] in no_change:
        return x["City"]
    else:
        closest = find_best_match(x["City"], list(cities["City"]))
        return closest


In [15]:
#llamar a función de nombres de ciudades parecidos
ciudades_df["Common name"] = ciudades_df.apply(lambda x: closest_name(x), axis = 1)

In [16]:
#Verificar ciudades a las que se le cambio el nombre
ciudades_df["Comp_nombre"] = ciudades_df["City"] == ciudades_df["Common name"]
ciudades_df[ciudades_df["Comp_nombre"] == False]

Unnamed: 0,City,Country,Meal (Inexpensive Restaurant) (USD),Eggs (12) (USD),Water (1.5 liter bottle) (USD),Domestic Beer (USD),Banana (1kg) (USD),Common name,Comp_nombre
58,Palma de Mallorca,Spain,14.81,2.5,0.65,0.87,1.87,Palma,False
62,Navi Mumbai,India,2.67,0.87,0.42,2.16,0.77,Mumbai,False
122,Mykolaiv,Ukraine,7.31,1.0,0.5,0.56,0.98,Mykolayiv,False
137,Vinnytsia,Ukraine,5.48,1.07,0.53,0.71,1.11,Vinnytsya,False
217,Patras,Greece,8.29,3.99,0.55,1.6,1.64,Patra,False
227,Quebec City,Canada,11.38,2.47,1.64,2.59,1.57,Quebec,False
240,Cebu,Philippines,3.7,1.83,0.72,1.19,1.67,Cebu City,False
281,Tyumen,Russia,8.19,1.0,0.65,0.89,1.08,Tyumen',False
288,Denpansar,Indonesia,1.69,1.7,0.43,1.9,1.7,Denpasar,False
316,Kazan,Russia,5.46,1.05,0.43,0.76,0.87,Kazan',False


In [17]:
#eliminamos las columnas extras y renombramos la que tiene el nombre correcto
ciudades_df.drop(['City', 'Comp_nombre'], axis=1, inplace=True)
ciudades_df.rename(columns={"Common name": "City"}, inplace=True)

In [18]:
#hacemos el merge de la tabla 1 con la tabla 2. Los valores comunes son "Country" y "City"
result = pd.merge(cities, ciudades_df, on=["Country","City"],how="outer")

In [19]:
result

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population,Meal (Inexpensive Restaurant) (USD),Eggs (12) (USD),Water (1.5 liter bottle) (USD),Domestic Beer (USD),Banana (1kg) (USD)
0,Tokyo,Tokyo,35.6850,139.7514,Japan,JP,JPN,35676000.0,8.03,2.37,1.25,3.02,3.65
1,New York,New York,40.6943,-73.9249,United States,US,USA,19354922.0,21.00,3.46,2.08,3.23,2.86
2,Mexico City,Mexico City,19.4424,-99.1310,Mexico,MX,MEX,19028000.0,6.37,1.60,0.67,1.09,1.02
3,Mumbai,Mumbai,19.0170,72.8570,India,IN,IND,18978000.0,2.67,0.87,0.42,2.16,0.77
4,Mumbai,Mumbai,19.0170,72.8570,India,IN,IND,18978000.0,4.00,0.92,0.38,2.07,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13894,,Goa,,,India,,,,4.00,0.88,0.33,0.75,0.85
13895,,Ahmedabad,,,India,,,,2.00,0.95,0.40,1.92,0.57
13896,,Bangalore,,,India,,,,2.67,0.88,0.40,1.83,0.67
13897,,Thane,,,India,,,,4.00,0.84,0.32,1.80,0.73


In [20]:
#eliminamos en caso que haya alguna ciudad repetidos
result.drop_duplicates(subset=['City', 'Country'], keep='first',inplace=True)

### 3. Third source: a public list of daily budget by city for backpackers (web scrapping)
https://www.priceoftravel.com/world-cities-by-price-backpacker-index/

In [21]:
#hacemos el request de la tercera fuente de datos (vamos a  sacar los datos con web srapping)
url = 'https://www.priceoftravel.com/world-cities-by-price-backpacker-index/'
resp = requests.get(url)
sopa = bs(resp.content, "html.parser")

In [22]:
#llamamos a los elementos que nos van a servir del html
lst = sopa.find("div",{"class":"bpiidx_list"}).findAllNext("div",{"id":"bpi_row1"})

In [23]:
#generamos el dataframe con los datos extraidos
cities_budget = []

for i in lst:
    city = i.findAll("div",{"class":"bpidx"})[2].text.split(", ")[0]
    city = re.sub("\(.*\)","",city).replace("Saigon","Ho Chi Minh City").replace("Kuta","Denpasar").replace("Xian","Xianyang").replace("Roatán Island","Roatan").strip()
    country = i.findAll("div",{"class":"bpidx"})[2].text.split(", ")[-1]
    country = country.replace("Viet Nam","Vietnam").replace("Czechia","Czech Republic").strip()
    country = re.sub(" and "," And ",country)
    budget = i.findAll("div",{"class":"bpidx"})[1].text
    
    row = {"City": city,"Country": country, "Daily Budget": budget}
    cities_budget.append(row)
    
cities_budget_df = pd.DataFrame(cities_budget)
cities_budget_df
    

Unnamed: 0,City,Country,Daily Budget
0,Hanoi,Vietnam,$19.70
1,Ho Chi Minh City,Vietnam,$20.54
2,Vientiane,Laos,$21.15
3,Pokhara,Nepal,$21.32
4,Hoi An,Vietnam,$21.48
...,...,...,...
132,Boston,United States,$110.52
133,Amsterdam,Netherlands,$115.84
134,Venice,Italy,$120.47
135,New York City,United States,$123.58


In [24]:
#cambiamos algunos valores en la tabla "Result"
result.replace("Burma", "Myanmar", inplace=True)
result.replace("Rangoon", "Yangon", inplace=True)
result.replace("Louangphabang", "Luang Prabang", inplace=True)
result.replace("Fes", "Fez", inplace=True)
result.replace("Cesky Krumlov", "Ceske Budejovice", inplace=True)



In [25]:
#funciones para determinar las ciudades más parecidas en la tabla que juntamos previamente y está 3era tabla (usando libreria fuzzywuzzy)

def find_best_match(misspelled, correct_names):
    closest, ratio = process.extractOne(misspelled, correct_names)
    return closest 

def closest_name(x):
    no_change = ["Hoi An","Boracay Island","Santorini","San Pedro","Ibiza"]
    if x["City"] in no_change:
        return x["City"]
    else:
        closest = find_best_match(x["City"], list(result["City"]))
        return closest

In [26]:
#llamar a función de nombres de ciudades parecidos
cities_budget_df["Common name"] = cities_budget_df.apply(lambda x: closest_name(x), axis = 1)

In [27]:
#Verificar ciudades a las que se le cambio el nombre
cities_budget_df["Comp_nombre"] = cities_budget_df["City"] == cities_budget_df["Common name"]
cities_budget_df[cities_budget_df["Comp_nombre"] == False]

Unnamed: 0,City,Country,Daily Budget,Common name,Comp_nombre
16,Zanzibar City,Tanzania,$25.41,Zanzibar,False
54,Antigua,Guatemala,$38.47,Antigua Guatemala,False
61,Cesky Krumlov,Czech Republic,$42.08,Krum,False
72,Tenerife,Spain,$51.79,Santa Cruz de Tenerife,False
102,Luxembourg City,Luxembourg,$78.59,Luxembourg,False
122,Washington D.C.,United States,$89.30,Washington,False
124,Tel Aviv,Israel,$91.06,Tel Aviv-Yafo,False
135,New York City,United States,$123.58,New York,False


In [28]:
#eliminamos las columnas extras y renombramos la que tiene el nombre correcto
cities_budget_df.drop(['City', 'Comp_nombre'], axis=1, inplace=True)
cities_budget_df.rename(columns={"Common name": "City"}, inplace=True)

In [29]:
#juntamos la tabla Result con la nueva tabla con el "Daily Budget"
result = pd.merge(result, cities_budget_df,  on=["City","Country"], how="outer")

In [31]:
#guardamos en una nueva tabla las ciudades de las que tenemos los datos de precios y/o daily budget
travel_cities = result[result["Banana (1kg) (USD)"].notna() | result["Daily Budget"].notna()]

In [32]:
#rellenamos las celdas nulas con "-"
travel_cities.fillna("-", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [33]:
#reseteamos los indices de travel_cities
travel_cities.reset_index(inplace=True,drop=True)

In [34]:
#muestra de nueva tabla travel_cities
travel_cities.head()

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population,Meal (Inexpensive Restaurant) (USD),Eggs (12) (USD),Water (1.5 liter bottle) (USD),Domestic Beer (USD),Banana (1kg) (USD),Daily Budget
0,Tokyo,Tokyo,35.685,139.751,Japan,JP,JPN,35676000.0,8.03,2.37,1.25,3.02,3.65,$73.13
1,New York,New York,40.6943,-73.9249,United States,US,USA,19354900.0,21.0,3.46,2.08,3.23,2.86,$123.58
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,19028000.0,6.37,1.6,0.67,1.09,1.02,$29.98
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,18978000.0,2.67,0.87,0.42,2.16,0.77,$31.06
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,18845000.0,4.98,1.51,0.48,0.98,0.97,-


### New column with the closest 5 cities in the list to each city

In [35]:
#función para generar una lista con las ciudades más cercanas a cada una según su latitud y longitud

def closest_cities(row,table):
    rad = 6373.0
    closest_d = []
    closest_c = []
    for i in range(len(table)):
        try:
            lat1 = math.radians(row["Lat"])
            lon1 = math.radians(row["Lon"])
            lat2 = math.radians(table.loc[i,"Lat"])
            lon2 = math.radians(table.loc[i,"Lon"])
            dlon = lon2 - lon1
            dlat = lat2 - lat1
            a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
            c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
            distance = int(rad * c)
            closest_d.append(distance)
            closest_c.append(table.loc[i,"City"])
        except:
            pass  
    zipped = zip(closest_c,closest_d)
    sorted_zip = sorted(zipped, key=lambda tup: tup[1])
    cerc = sorted_zip[1:6]
    return dict(cerc)

In [36]:
#generamos la columna de las 5 ciudades más cercanas a cada ciudad
travel_cities["Closest Cities (km)"] = travel_cities.apply(lambda x: closest_cities(x,travel_cities), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  travel_cities["Closest Cities (km)"] = travel_cities.apply(lambda x: closest_cities(x,travel_cities), axis = 1)


In [37]:
#muestra de la tabla travel_cities
travel_cities.head()

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population,Meal (Inexpensive Restaurant) (USD),Eggs (12) (USD),Water (1.5 liter bottle) (USD),Domestic Beer (USD),Banana (1kg) (USD),Daily Budget,Closest Cities (km)
0,Tokyo,Tokyo,35.685,139.751,Japan,JP,JPN,35676000.0,8.03,2.37,1.25,3.02,3.65,$73.13,"{'Osaka': 403, 'Vladivostok': 1066, 'Qingdao':..."
1,New York,New York,40.6943,-73.9249,United States,US,USA,19354900.0,21.0,3.46,2.08,3.23,2.86,$123.58,"{'Brooklyn': 5, 'Philadelphia': 127, 'Baltimor..."
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,19028000.0,6.37,1.6,0.67,1.09,1.02,$29.98,"{'Puebla': 107, 'Queretaro': 185, 'Guadalajara..."
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,18978000.0,2.67,0.87,0.42,2.16,0.77,$31.06,"{'Pune': 117, 'Vadodara': 367, 'Indore': 516, ..."
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,18845000.0,4.98,1.51,0.48,0.98,0.97,-,"{'Campinas': 87, 'Curitiba': 342, 'Rio de Jane..."


### 4. Fourth source: REST COUNTRIES API for getting countries information (currency name, language, region and subregion)
https://restcountries.eu/

In [38]:
#generamos una lista de paises unicos en la lista
iso_3_countries = list(travel_cities["ISO3"].unique())
print(iso_3_countries)

['JPN', 'USA', 'MEX', 'IND', 'BRA', 'CHN', 'ARG', 'EGY', 'PHL', 'RUS', 'TUR', 'FRA', 'NGA', 'IDN', 'GBR', 'PER', 'COL', 'HKG', 'TWN', 'THA', 'CHL', 'ESP', 'VNM', 'CAN', 'SGP', 'IRQ', 'AUS', 'MMR', 'ZAF', 'DEU', 'DZA', 'ITA', 'GRC', 'MAR', 'ISR', 'KEN', 'PRT', 'POL', 'UKR', 'SEN', 'ECU', 'MYS', 'SYR', 'TUN', 'AUT', 'UZB', 'CUB', 'DOM', 'AZE', 'GHA', 'ROU', 'PRY', 'LBN', 'BLR', 'BEL', 'HUN', 'BOL', 'ZWE', 'URY', 'KHM', 'QAT', 'MOZ', 'UGA', 'NLD', 'ARE', 'NZL', 'CRI', 'PAN', 'SWE', 'CHE', 'KAZ', 'BGR', 'CZE', 'FIN', 'ARM', 'GEO', 'SRB', 'DNK', 'JOR', 'IRL', 'GTM', 'ALB', 'NPL', 'MNG', 'RWA', 'KGZ', 'NOR', 'LAO', 'LVA', 'OMN', 'HRV', 'BIH', 'MDA', 'BHR', 'LTU', 'SVK', 'EST', 'MLT', 'SVN', 'TTO', 'SUR', 'CYP', 'LKA', 'BWA', 'ISL', 'MNE', 'LUX', 'XKS', 'TZA', 'PRI', 'SLV', 'NIC', 'HND', '-']


In [39]:
#para evitar que nos bloquee el api por hacer muchas llamadas, vamos a generar headers diferentes para cada petición
def get_random_ua():
    random_ua = ''
    ua_file = 'agents.txt'
    try:
        with open(ua_file) as f:
            lines = f.readlines()
        if len(lines) > 0:
            prng = np.random.RandomState()
            index = prng.permutation(len(lines) - 1)
            idx = np.asarray(index, dtype=np.integer)[0]
            random_ua = lines[int(idx)].replace("\n","")
    except Exception as ex:
        print('Exception in random_ua')
        print(str(ex))
    finally:
        return random_ua

In [40]:
#creamos la nueva lista de información de paises
info_countries = []

for i in iso_3_countries:
    headers = {}
    try:
        url = f"https://restcountries.eu/rest/v2/alpha/{i}"
        user_agent = get_random_ua()
        headers = {'user-agent': user_agent}
        response = requests.get(url, headers = headers)

        region = response.json()["region"]
        subregion = response.json()["subregion"]
        languages = [i["name"] for i in response.json()["languages"]]
        currency = response.json()["currencies"][0]["code"]

        info = {"ISO3":i,"Region":region,"Subregion":subregion,"Languages":languages,"Currency Code":currency}

    except:
        info = {"ISO3":i ,"Region":"-","Subregion":"-","Languages":"-","Currency Code":"-"}
        print(i,"-----",headers)
        
    info_countries.append(info)
    time.sleep(1) #evitar hacer muchas peticiones por segundo        

XKS ----- {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.102 Safari/537.36'}
- ----- {'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.2.2; en-US; SM-T111 Build/JDQ39) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/10.10.5.809 U3/0.8.0 Mobile Safari/534.30'}


In [41]:
#creamos un DataFrame de la lista anterior
info_countries_pd = pd.DataFrame(info_countries)

In [42]:
#muestra de la nueva tabla
info_countries_pd.head()

Unnamed: 0,ISO3,Region,Subregion,Languages,Currency Code
0,JPN,Asia,Eastern Asia,[Japanese],JPY
1,USA,Americas,Northern America,[English],USD
2,MEX,Americas,Central America,[Spanish],MXN
3,IND,Asia,Southern Asia,"[Hindi, English]",INR
4,BRA,Americas,South America,[Portuguese],BRL


In [43]:
#hacemos el merge de la nueva tabla con la tabla de travel_cities
travel_cities = pd.merge(travel_cities, info_countries_pd,  on="ISO3", how="outer")

In [44]:
#muestra de la tabla completa con el merge
travel_cities.head()

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population,Meal (Inexpensive Restaurant) (USD),Eggs (12) (USD),Water (1.5 liter bottle) (USD),Domestic Beer (USD),Banana (1kg) (USD),Daily Budget,Closest Cities (km),Region,Subregion,Languages,Currency Code
0,Tokyo,Tokyo,35.685,139.751,Japan,JP,JPN,35676000.0,8.03,2.37,1.25,3.02,3.65,$73.13,"{'Osaka': 403, 'Vladivostok': 1066, 'Qingdao':...",Asia,Eastern Asia,[Japanese],JPY
1,Ōsaka,Osaka,34.75,135.46,Japan,JP,JPN,11294000.0,7.55,2.34,1.17,2.23,2.22,-,"{'Tokyo': 403, 'Vladivostok': 981, 'Shanghai':...",Asia,Eastern Asia,[Japanese],JPY
2,New York,New York,40.6943,-73.9249,United States,US,USA,19354900.0,21.0,3.46,2.08,3.23,2.86,$123.58,"{'Brooklyn': 5, 'Philadelphia': 127, 'Baltimor...",Americas,Northern America,[English],USD
3,Los Angeles,Los Angeles,34.1139,-118.407,United States,US,USA,12815500.0,19.0,3.54,2.18,2.33,2.1,$83.72,"{'San Diego': 185, 'Tijuana': 217, 'Las Vegas'...",Americas,Northern America,[English],USD
4,Chicago,Chicago,41.8373,-87.6862,United States,US,USA,8675980.0,15.0,2.29,2.19,2.25,1.54,$99.90,"{'Madison': 196, 'Indianapolis': 263, 'Detroit...",Americas,Northern America,[English],USD


### 5. Fifth source: 2 free APIs for exchange rate (fixer.io and exchangeratesapi.io)
https://fixer.io/documentation

https://exchangeratesapi.io/

In [45]:
#generamos una lista de monedas unicos en la lista
currency_codes = list(travel_cities["Currency Code"].unique())
print(currency_codes)

['JPY', 'USD', 'MXN', 'INR', 'BRL', 'CNY', 'ARS', 'EGP', 'PHP', 'RUB', 'TRY', 'EUR', 'NGN', 'IDR', 'GBP', 'PEN', 'COP', 'HKD', 'TWD', 'THB', 'CLP', 'VND', 'CAD', 'BND', 'IQD', 'AUD', 'MMK', 'ZAR', 'DZD', 'MAD', 'ILS', 'KES', 'PLN', 'UAH', 'XOF', 'MYR', 'SYP', 'TND', 'UZS', 'CUC', 'DOP', 'AZN', 'GHS', 'RON', 'PYG', 'LBP', 'BYN', 'HUF', 'BOB', 'BWP', 'UYU', 'KHR', 'QAR', 'MZN', 'UGX', 'AED', 'NZD', 'CRC', 'PAB', 'SEK', 'CHF', 'KZT', 'BGN', 'CZK', 'AMD', 'GEL', 'RSD', 'DKK', 'JOD', 'GTQ', 'ALL', 'NPR', 'MNT', 'RWF', 'KGS', 'NOK', 'LAK', 'OMR', 'HRK', 'BAM', 'MDL', 'BHD', 'TTD', 'SRD', 'LKR', 'ISK', '-', 'TZS', 'NIO', 'HNL']


In [46]:
#generamos un token en la página de Fixer el cual está en otro archivo oculto
from tokens import fixer_token

In [47]:
#la bse de Fixer es el EURO entonces llamamos al tipo de cambio a EURO - USD, para transformar todos los datos a USD
url = f"http://data.fixer.io/api/latest?access_key={fixer_token}&symbols=USD"
response = requests.get(url)
results = response.json()

In [48]:
usd_rate = results["rates"]["USD"]
print("1 EURO es igual a:",usd_rate,"USD")

1 EURO es igual a: 1.179809 USD


In [49]:
'''vamos a hacer el request primero a Fixer (viene en euros y lo transformamos a USD),
si no encuentra o da algún error, vamos a solicitar la información exchangeratesapi
(viene el tipo de cambio en dolares)'''

ex_rate = []

for i in currency_codes:
    try:
        url = f"http://data.fixer.io/api/latest?access_key={fixer_token}&symbols={i}"
        #user_agent = get_random_ua()
        #headers = {'user-agent': user_agent}
        response = requests.get(url)#, headers = headers)
        rate_eu = response.json()["rates"][i]
        rate_us = rate_eu/usd_rate
        data = {"Currency Code":i,"Exchange rate (2020-08-24), 1 USD =": rate_us}
    except:
        try:
            url = f"https://api.exchangeratesapi.io/latest?base=USD&symbols={i}"
            user_agent = get_random_ua()
            headers = {'user-agent': user_agent}
            response = requests.get(url, headers = headers)
            rate = response.json()["rates"][i]
            data = {"Currency Code":i,"Exchange rate (2020-08-24), 1 USD =": rate}
        except:
            data = {"Currency Code":i,"Exchange rate (2020-08-24), 1 USD =": "-"}
            print(i,"----",headers)
    ex_rate.append(data)
    time.sleep(1) #evitar hacer muchas peticiones por segundo

- ---- {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586'}


In [50]:
#creamos un DataFrame de la lista anterior
ex_rate_pd = pd.DataFrame(ex_rate)

In [51]:
#muestra de la tabla de exchange rates
ex_rate_pd.head()

Unnamed: 0,Currency Code,"Exchange rate (2020-08-24), 1 USD ="
0,JPY,105.874
1,USD,1.0
2,MXN,21.9888
3,INR,74.865
4,BRL,5.6234


In [52]:
#hacemos el merge de la nueva tabla con la tabla de travel_cities
travel_cities = pd.merge(travel_cities, ex_rate_pd,  on="Currency Code", how="outer")

In [53]:
#muestra de la tabla completa con el merge
travel_cities.tail()

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population,Meal (Inexpensive Restaurant) (USD),Eggs (12) (USD),Water (1.5 liter bottle) (USD),Domestic Beer (USD),Banana (1kg) (USD),Daily Budget,Closest Cities (km),Region,Subregion,Languages,Currency Code,"Exchange rate (2020-08-24), 1 USD ="
451,-,Interlaken,-,-,Switzerland,-,-,-,-,-,-,-,-,$101.65,{},-,-,-,-,-
452,Zanzibar,Zanzibar,-6.16,39.2,Tanzania,TZ,TZA,403658,-,-,-,-,-,$25.41,"{'Arusha': 419, 'Nairobi': 603, 'Kampala': 102...",Africa,Eastern Africa,"[Swahili, English]",TZS,2320
453,Arusha,Arusha,-3.36,36.67,Tanzania,TZ,TZA,341136,-,-,-,-,-,$30.81,"{'Nairobi': 231, 'Zanzibar': 419, 'Kampala': 6...",Africa,Eastern Africa,"[Swahili, English]",TZS,2320
454,Granada,Granada,11.9337,-85.95,Nicaragua,NI,NIC,105219,-,-,-,-,-,$23.71,"{'San Jose': 301, 'Santa Ana': 453, 'Roatan': ...",Americas,Central America,[Spanish],NIO,34.847
455,Roatán,Roatan,16.33,-86.519,Honduras,HN,HND,7514,-,-,-,-,-,$47.24,"{'Santa Ana': 417, 'Guatemala City': 469, 'Gra...",Americas,Central America,[Spanish],HNL,24.6677


### Dataframe export to json and csv

In [56]:
#se exporta a JSON
travel_cities.to_json("Output/Travel_cities.json",orient='records')

In [57]:
#se exporta a CSV
travel_cities.to_csv("Output/Travel_cities.csv",index = False)

### 6. Sixth source: Tripadvisor (web scrapping with selenium) / Not implemented due to technical problems
### (Here you can check the attemp)
https://www.tripadvisor.com/

In [58]:
#creamos una lista de ciudades y paises para buscar en Tripadvisor
cities = list(travel_cities["City"])
countries = list(travel_cities["Country"])
cities_countries = list(zip(cities,countries))
print(cities_countries[:10])

[('Tokyo', 'Japan'), ('Osaka', 'Japan'), ('New York', 'United States'), ('Los Angeles', 'United States'), ('Chicago', 'United States'), ('Miami', 'United States'), ('Dallas', 'United States'), ('Philadelphia', 'United States'), ('Houston', 'United States'), ('Washington', 'United States')]


In [59]:
#abrimos el navegador de Selenium
navegador = webdriver.Chrome()

In [None]:
places_tripadvisor = []

for i in cities_countries[:1]:
    try:
        navegador.get('https://www.tripadvisor.com/')
        try:
            things_to_do = WebDriverWait(navegador, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="lithium-root"]/main/div[1]/div[1]/div/div/div[3]/a'))
            )
            time.sleep(2)
        finally:
            things_to_do.click()
        try:
            buscador = WebDriverWait(navegador, 10).until(
                EC.presence_of_element_located((By.XPATH, '/html/body/div[3]/div/form/input[1]'))
            )
            time.sleep(2)
        finally:
            buscador.send_keys(f"{i[0]} {i[1]}")
        try:
            opcion = WebDriverWait(navegador, 10).until(
                EC.presence_of_element_located((By.XPATH, '/html/body/div[3]/div/form/div/a[1]'))
            )
            time.sleep(2)
        finally:
            opcion.click()
        try:
            best_things = WebDriverWait(navegador, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, '_1h6gevVw'))
            )
            time.sleep(2)
        finally:
            places = best_things.find_elements_by_tag_name("li")   
        top_places =[]
        for x in places:
            name = x.text.split("\n")[0].split(". ")[1]
            stars = x.find_element_by_class_name("ui_bubble_rating").get_attribute("class").split("_")[-1]
            stars = int(stars)/10
            reviews = x.text.split("\n")[1]
            link = x.find_element_by_tag_name("a").get_attribute("href")
            places_dict = {"Name":name,"Rating":stars,"Reviews":reviews,"Link":link}
            top_places.append(places_dict)
        places_tripadvisor.append({"City":i[0],"Country":i[1],"Top 10 attractions (by Tripadvisor)": top_places})
    except:
        places_tripadvisor.append({"City":i[0],"Country":i[1],"Top 10 attractions (by Tripadvisor)": []})
        print("Error:",i[0],i[1])
    time.sleep(2)

### Step-by-step test

In [60]:
navegador.get('https://www.tripadvisor.com/')

buscador = navegador.find_element_by_xpath('//*[@id="lithium-root"]/main/div[2]/div/div/div[2]/div[2]/div/form/input[1]')

In [61]:
buscador.send_keys("Quito Ecuador")

In [62]:
opcion = navegador.find_element_by_xpath('//*[@id="lithium-root"]/main/div[2]/div/div/div[2]/div[2]/div[2]/form/div/a[1]')


In [63]:
opcion.click()

In [64]:
things_to_do = navegador.find_element_by_xpath('//*[@id="lithium-root"]/main/div[2]/div/div/div[3]/a')

In [65]:
things_to_do.click()

In [66]:
best_things = navegador.find_element_by_xpath('//*[@id="FILTERED_LIST"]/div[2]')

In [67]:
places = best_things.find_elements_by_tag_name("li")

In [68]:
top_places =[]
for i in places:
    name = i.text.split("\n")[0].split(". ")[1]
    stars = i.find_element_by_class_name("ui_bubble_rating").get_attribute("class").split("_")[-1]
    stars = int(stars)/10
    reviews = i.text.split("\n")[1]
    link = i.find_element_by_tag_name("a").get_attribute("href")
    
    places_dict = {"Name":name,"Rating":stars,"Reviews":reviews,"Link":link}
    top_places.append(places_dict)

print(top_places)

[{'Name': 'Museo Templo del Sol Pintor Ortega Maila', 'Rating': 5.0, 'Reviews': '7,031 reviews', 'Link': 'https://www.tripadvisor.com/Attraction_Review-g294308-d6965734-Reviews-Museo_Templo_del_Sol_Pintor_Ortega_Maila-Quito_Pichincha_Province.html'}, {'Name': 'TeleferiQo Teleferico Quito', 'Rating': 4.5, 'Reviews': '4,536 reviews', 'Link': 'https://www.tripadvisor.com/Attraction_Review-g294308-d602301-Reviews-TeleferiQo_Teleferico_Quito-Quito_Pichincha_Province.html'}, {'Name': 'Fundacion Iglesia de la Compania', 'Rating': 4.5, 'Reviews': '4,961 reviews', 'Link': 'https://www.tripadvisor.com/Attraction_Review-g294308-d314181-Reviews-Fundacion_Iglesia_de_la_Compania-Quito_Pichincha_Province.html'}, {'Name': 'Quito Old Town', 'Rating': 4.5, 'Reviews': '4,770 reviews', 'Link': 'https://www.tripadvisor.com/Attraction_Review-g294308-d315636-Reviews-Quito_Old_Town-Quito_Pichincha_Province.html'}, {'Name': 'Basilica del Voto Nacional', 'Rating': 4.5, 'Reviews': '3,845 reviews', 'Link': 'https