# Web Project
## The goal of this project is to build a database of cities with usefull information, to plan your next trip (especially for backpackers)

### 1. First source: a public database of cities in the world (.csv)
https://www.kaggle.com/dataset/f66386cd35268fd2ae9c7c03e6e4d93c9b1607265c1adef13f99a76e420be997/version/1

In [180]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as bs
import warnings

In [2]:
cities = pd.read_csv('Data/worldcities.csv')

In [3]:
cities.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.685,139.7514,Japan,JP,JPN,Tōkyō,primary,35676000.0,1392685764
1,New York,New York,40.6943,-73.9249,United States,US,USA,New York,,19354922.0,1840034016
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,Ciudad de México,primary,19028000.0,1484247881
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,Mahārāshtra,admin,18978000.0,1356226629
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,São Paulo,admin,18845000.0,1076532519


In [4]:
cities.drop(['admin_name', 'capital',"id"], axis=1, inplace=True)

In [5]:
cities.columns = ['Original name', 'City', 'Lat', 'Lon', 'Country', 'ISO2', 'ISO3', 'Population']

In [6]:
cities.drop_duplicates(subset=['City', 'Country'], keep='first',inplace=True)

In [7]:
cities

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population
0,Tokyo,Tokyo,35.6850,139.7514,Japan,JP,JPN,35676000.0
1,New York,New York,40.6943,-73.9249,United States,US,USA,19354922.0
2,Mexico City,Mexico City,19.4424,-99.1310,Mexico,MX,MEX,19028000.0
3,Mumbai,Mumbai,19.0170,72.8570,India,IN,IND,18978000.0
4,São Paulo,Sao Paulo,-23.5587,-46.6250,Brazil,BR,BRA,18845000.0
...,...,...,...,...,...,...,...,...
15488,Timmiarmiut,Timmiarmiut,62.5333,-42.2167,Greenland,GL,GRL,10.0
15489,Cheremoshna,Cheremoshna,51.3894,30.0989,Ukraine,UA,UKR,0.0
15490,Ambarchik,Ambarchik,69.6510,162.3336,Russia,RU,RUS,0.0
15491,Nordvik,Nordvik,74.0165,111.5100,Russia,RU,RUS,0.0


### 2. Second source: a public database of prices by city (web scrapping)
https://www.numbeo.com/cost-of-living/prices_by_city.jsp?displayCurrency=USD&itemId=118&itemId=15&itemId=11&itemId=13&itemId=1

In [8]:
url = 'https://www.numbeo.com/cost-of-living/prices_by_city.jsp?displayCurrency=USD&itemId=118&itemId=15&itemId=11&itemId=13&itemId=1'
resp = requests.get(url)
sopa = bs(resp.content, "html.parser")

In [9]:
table = sopa.find("table",{"id":"t2"})

In [10]:
filas = table.findAll("tr")

In [11]:
filas.pop(0)

<tr>
<th><div style="font-size: 80%; vertical-align: middle;">Rank</div></th>
<th><div class="font_in_table_headers">City</div></th><th><div class="font_in_table_headers">Meal, Inexpensive Restaurant</div></th><th><div class="font_in_table_headers">Eggs <br/>(regular) <br/>(12)</div></th><th><div class="font_in_table_headers">Water <br/>(1.5 liter bottle)</div></th><th><div class="font_in_table_headers">Domestic Beer <br/>(0.5 liter bottle)</div></th><th><div class="font_in_table_headers">Banana <br/>(1kg)</div></th></tr>

In [12]:
ciudades = []

for i in filas:
    city = i.findAll("td")[1].text.split(", ")[0].replace("Tel Aviv-Yafo","Tel Aviv")
    city = re.sub("\(.*\)","",city).strip()
    country = i.findAll("td")[1].text.split(", ")[-1].strip()
    meal = i.findAll("td")[2].text.strip()
    eggs = i.findAll("td")[3].text.strip()
    water = i.findAll("td")[4].text.strip()
    beer = i.findAll("td")[5].text.strip()
    banana = i.findAll("td")[6].text.strip()
    
    row = {"City": city,"Country": country, "Meal (Inexpensive Restaurant)": meal,"Eggs (12)": eggs,"Water (1.5 liter bottle)": water,"Domestic Beer": beer,"Banana (1kg)": banana}
    ciudades.append(row)
    
ciudades_df = pd.DataFrame(ciudades)
ciudades_df

Unnamed: 0,City,Country,Meal (Inexpensive Restaurant),Eggs (12),Water (1.5 liter bottle),Domestic Beer,Banana (1kg)
0,Saint Petersburg,Russia,6.82,1.14,0.58,0.90,0.89
1,Samara,Russia,6.82,0.96,0.44,0.73,0.92
2,Algiers,Algeria,3.12,1.14,0.24,1.72,1.91
3,Saratov,Russia,5.46,1.02,0.37,0.85,0.82
4,Banja Luka,Bosnia And Herzegovina,4.54,1.80,0.59,0.60,1.32
...,...,...,...,...,...,...,...
418,Lviv,Ukraine,4.39,1.18,0.50,0.77,1.13
419,Novosibirsk,Russia,5.46,0.97,0.48,0.80,1.06
420,Brussels,Belgium,18.36,3.24,1.26,1.66,2.08
421,Surabaya,Indonesia,2.03,1.71,0.35,2.37,1.31


In [13]:
result = pd.merge(cities, ciudades_df, on=["City","Country"])

In [14]:
result

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population,Meal (Inexpensive Restaurant),Eggs (12),Water (1.5 liter bottle),Domestic Beer,Banana (1kg)
0,Tokyo,Tokyo,35.6850,139.7514,Japan,JP,JPN,35676000.0,8.02,2.36,1.24,3.01,3.65
1,New York,New York,40.6943,-73.9249,United States,US,USA,19354922.0,21.00,3.46,2.08,3.23,2.86
2,Mexico City,Mexico City,19.4424,-99.1310,Mexico,MX,MEX,19028000.0,6.31,1.57,0.66,1.08,1.01
3,Mumbai,Mumbai,19.0170,72.8570,India,IN,IND,18978000.0,4.00,0.92,0.38,2.07,0.70
4,São Paulo,Sao Paulo,-23.5587,-46.6250,Brazil,BR,BRA,18845000.0,5.04,1.55,0.49,1.00,0.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,Zadar,Zadar,44.1201,15.2623,Croatia,HR,HRV,71258.0,11.01,2.83,1.19,1.37,1.81
370,Dubrovnik,Dubrovnik,42.6609,18.0914,Croatia,HR,HRV,36994.0,16.52,6.81,3.33,2.71,1.97
371,Paphos,Paphos,34.7559,32.4225,Cyprus,CY,CYP,35961.0,9.48,3.06,0.85,1.32,1.62
372,Sliema,Sliema,35.9125,14.5019,Malta,MT,MLT,,17.77,3.56,0.96,2.23,2.30


### 3. Third source: a public list of backpacker daily budget by city (web scrapping)
https://www.priceoftravel.com/world-cities-by-price-backpacker-index/

In [15]:
url = 'https://www.priceoftravel.com/world-cities-by-price-backpacker-index/'
resp = requests.get(url)
sopa = bs(resp.content, "html.parser")

In [16]:
lst = sopa.find("div",{"class":"bpiidx_list"}).findAllNext("div",{"id":"bpi_row1"})

In [17]:
cities_budget = []

for i in lst:
    city = i.findAll("div",{"class":"bpidx"})[2].text.split(", ")[0]
    city = re.sub("\(.*\)","",city).replace("Saigon","Ho Chi Minh City").replace("Luxembourg City","Luxembourg").replace("Miami Beach","Miami").replace(" D.C.","").replace("New York City","New York").strip()
    country = i.findAll("div",{"class":"bpidx"})[2].text.split(", ")[-1]
    country = country.replace("Viet Nam","Vietnam").replace("Czechia","Czech Republic").strip()
    country = re.sub(" and "," And ",country)
    budget = i.findAll("div",{"class":"bpidx"})[1].text
    
    row = {"City": city,"Country": country, "Daily Budget": budget}
    cities_budget.append(row)
    
cities_budget_df = pd.DataFrame(cities_budget)
cities_budget_df
    

Unnamed: 0,City,Country,Daily Budget
0,Hanoi,Vietnam,$19.70
1,Ho Chi Minh City,Vietnam,$20.54
2,Vientiane,Laos,$21.15
3,Pokhara,Nepal,$21.32
4,Hoi An,Vietnam,$21.48
...,...,...,...
132,Boston,United States,$110.52
133,Amsterdam,Netherlands,$115.84
134,Venice,Italy,$120.47
135,New York,United States,$123.58


In [183]:
result = pd.merge(result, cities_budget_df,  on=["City","Country"], how="outer")

In [184]:
result.fillna("-", inplace=True)
result

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population,Meal (Inexpensive Restaurant),Eggs (12),Water (1.5 liter bottle),Domestic Beer,Banana (1kg),Daily Budget_x,Closest Cities (km),Daily Budget_y
0,Tokyo,Tokyo,35.685,139.751,Japan,JP,JPN,3.5676e+07,8.02,2.36,1.24,3.01,3.65,$73.06,-,$73.06
1,New York,New York,40.6943,-73.9249,United States,US,USA,1.93549e+07,21.00,3.46,2.08,3.23,2.86,$123.58,-,$123.58
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,1.9028e+07,6.31,1.57,0.66,1.08,1.01,$29.93,-,$29.93
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,1.8978e+07,4.00,0.92,0.38,2.07,0.70,$31.04,-,$31.04
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,1.8845e+07,5.04,1.55,0.49,1.00,0.99,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,-,San Francisco,-,-,United States,-,-,-,-,-,-,-,-,-,-,$107.69
134,-,Boston,-,-,United States,-,-,-,-,-,-,-,-,-,-,$110.52
135,-,Amsterdam,-,-,Netherlands,-,-,-,-,-,-,-,-,-,-,$115.84
136,-,Venice,-,-,Italy,-,-,-,-,-,-,-,-,-,-,$120.47


In [185]:
result = result.head()
result

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population,Meal (Inexpensive Restaurant),Eggs (12),Water (1.5 liter bottle),Domestic Beer,Banana (1kg),Daily Budget_x,Closest Cities (km),Daily Budget_y
0,Tokyo,Tokyo,35.685,139.751,Japan,JP,JPN,35676000.0,8.02,2.36,1.24,3.01,3.65,$73.06,-,$73.06
1,New York,New York,40.6943,-73.9249,United States,US,USA,19354900.0,21.0,3.46,2.08,3.23,2.86,$123.58,-,$123.58
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,19028000.0,6.31,1.57,0.66,1.08,1.01,$29.93,-,$29.93
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,18978000.0,4.0,0.92,0.38,2.07,0.7,$31.04,-,$31.04
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,18845000.0,5.04,1.55,0.49,1.0,0.99,-,-,-


In [207]:
import math

def closest_cities(row):
    try:
        rad = 6373.0
        lat1 = math.radians(row["Lat"])
        lon1 = math.radians(row["Lon"])
        closest_d = []
        closest_c = []
        for i in range(len(result)+1):
            lat2 = math.radians(result[i:i+1]["Lat"])
            lon2 = math.radians(result[i:i+1]["Lon"])
            dlon = lon2 - lon1
            dlat = lat2 - lat1
            a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
            c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
            distance = int(rad * c)
            closest_d.append(distance)
            ciu = str(result.loc[i,["City"]]).split("\n")[0].split("    ")[1]
            closest_c.append(ciu)
            print("termin",i,ciu,distance)
        try:
            print(closest_d)
        except:
            print("falla dist")
        zipped = zip(closest_c,closest_d)
        sorted_zip = sorted(zipped, key=lambda tup: tup[1])
        cerc = sorted_zip[1:6]
        print("Bien", cerc)
        #return cerc
    except:
        #return "-"
        print("-")

In [208]:
result["Closest Cities (km)"] = result.apply(lambda x: closest_cities(x), axis = 1)

termin 0 Tokyo 0
termin 1 New York 10855
termin 2 Mexico City 11304
termin 3 Mumbai 6736
termin 4 Sao Paulo 18539
-
termin 0 Tokyo 10855
termin 1 New York 0
termin 2 Mexico City 3363
termin 3 Mumbai 12544
termin 4 Sao Paulo 7684
-
termin 0 Tokyo 11304
termin 1 New York 3363
termin 2 Mexico City 0
termin 3 Mumbai 15655
termin 4 Sao Paulo 7436
-
termin 0 Tokyo 6736
termin 1 New York 12544
termin 2 Mexico City 15655
termin 3 Mumbai 0
termin 4 Sao Paulo 13773
-
termin 0 Tokyo 18539
termin 1 New York 7684
termin 2 Mexico City 7436
termin 3 Mumbai 13773
termin 4 Sao Paulo 0
-


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result["Closest Cities (km)"] = result.apply(lambda x: closest_cities(x), axis = 1)


In [192]:
result

Unnamed: 0,Original name,City,Lat,Lon,Country,ISO2,ISO3,Population,Meal (Inexpensive Restaurant),Eggs (12),Water (1.5 liter bottle),Domestic Beer,Banana (1kg),Daily Budget_x,Closest Cities (km),Daily Budget_y
0,Tokyo,Tokyo,35.685,139.751,Japan,JP,JPN,35676000.0,8.02,2.36,1.24,3.01,3.65,$73.06,,$73.06
1,New York,New York,40.6943,-73.9249,United States,US,USA,19354900.0,21.0,3.46,2.08,3.23,2.86,$123.58,,$123.58
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,19028000.0,6.31,1.57,0.66,1.08,1.01,$29.93,,$29.93
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,18978000.0,4.0,0.92,0.38,2.07,0.7,$31.04,,$31.04
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,18845000.0,5.04,1.55,0.49,1.0,0.99,-,,-


In [None]:
result.drop