# Bonus Challenge - Approach

- Scrape names and population numbers of all 97 localities ("Ortsteile") in Berlin
    - From Wikipedia: https://en.wikipedia.org/wiki/Boroughs_and_neighborhoods_of_Berlin
    <br>
    <br>
- Scrape zip codes and corresponding localities from another site
    - https://www.berlinstadtservice.de/xinh/Postleitzahlen_Berlin_Alphabetisch.html
<br>
<br>
- Use the business/search endpoint from the Yelp API to collect results for Berlin with search terms "Italian" and "Pizza", results include name, rating and zip code of restaurant adress
<br>
<br>
- Merge all this info together, include only restaurants with rating 4 or higher and then look for the locality with the most people per (good) Italian restaurant. This region could be seen as "underserved" and a offer a potential opening for a new restaurant.

In [26]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Scrape Wiki Page

In [27]:
# Get the full html of the page
wiki_resp = requests.get("https://en.wikipedia.org/wiki/Boroughs_and_neighborhoods_of_Berlin")
wiki_soup = BeautifulSoup(wiki_resp.content, "html.parser")

In [28]:
# The <dl> tag is the table subheading for each of the 12 burroughs followed by a table of their localities
all_dls = wiki_soup.find_all("dl")

In [29]:
#loop through all <dl> tags i.e. burroughs and collect name and population from the current <tr> (cur_row)
# store both values as tuples and collect them in a list
lst_of_name_pop = []
for dl in all_dls:
    cur_table = dl.find_next("tbody").find_next("tr")
    
    cur_row = cur_table.find_next("tr")
    
    i = 0
    while i <= 15:
        pop = cur_row.find_next("td").find_next("td").find_next("td").text
        name = cur_row.find_next("a").find_next("a").text
        
        name_pop = (name, pop)
        lst_of_name_pop.append(name_pop)
        #names.append(cur_row.find_next("a").find_next("a").text)
        #populations.append(cur_row.find_next("td").find_next("td").find_next("td").text)
        i += 1
        cur_row = cur_row.find_next("tr")

In [30]:
# Because the tables are of different length but the while loop is calibrated for the longest one, there are duplicates
unique_name_pop = list(set(lst_of_name_pop))

In [31]:
# Through the mismatch of lengths there are also some pairings of names and other names. 
# Get rid of them and separate the real values into lists for names and pops
names = []
pops = []

# We try to convert the population number to float, if that fails, we discard it and if it works both values name and pop
# are added to the lists
for tup in unique_name_pop:
    
    try:
        float(tup[1].replace("\n", "").replace(",", ""))
    except:
        continue
    else:
        names.append(tup[0])
        pops.append(tup[1])

In [32]:
# Replace the new line character in pops and get rid of "Tausenderkomma"
pops = [float(pop.replace("\n", "").replace(",", "")) for pop in pops]

In [33]:
# Make it a Dataframe!
df_pops_by_locality = pd.DataFrame.from_dict({"locality": names, "population": pops})

In [34]:
df_pops_by_locality.head()

Unnamed: 0,locality,population
0,Lichtenrade,49451.0
1,Weißensee,45485.0
2,Niederschönhausen,26903.0
3,Rudow,41040.0
4,Wartenberg,2433.0


## Scrape zip codes and localities

In [35]:
url = "https://www.berlinstadtservice.de/xinh/Postleitzahlen_Berlin_Alphabetisch.html"
resp_scrape = requests.get(url)

soup = BeautifulSoup(resp_scrape.content, "html.parser")

In [36]:
# Conveniently all zip code entries share a unique tag
table_entries = soup.find_all("td")

zip_codes = []

# loop through entries to extract the full text, containing zip codes and name in one string
for ent in table_entries:
    zip_codes.append(ent.text)

#clean it up
clean_zip = []

for ent in zip_codes:
    ent = ent.replace("B-", "")
    clean_zip.append(ent)

#first part of the string split is the zip code and second part the name
plz = []
localities = []
for ent in clean_zip:
    plz.append(ent.split()[0])
    localities.append(" ".join(ent.split()[1:]))

In [37]:
### df from localities and postcodes
df_zip_local = pd.DataFrame.from_dict({"locality": localities, "zip_code": plz})

## Yelp API

In [38]:
# API key is in a non-tracked txt file, so the key is not on GitHub

with open("key.txt", "r") as file:
    key = file.read()

key = key.strip("\n")

In [39]:
# empty dict of businesses with key "name", "rating", "zip code"
restaurants = {"name": [], "rating":[], "zip_code": []}

# API calls -- 50 results per call, with offset increasing so we get new results. In total 1000 results
offset = 0
while offset <= 950:
    url = f"https://api.yelp.com/v3/businesses/search?location=berlin&term=food&categories=italian%2C%20pizza&limit=50&offset={offset}"

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {key}"
    }
    response = requests.get(url, headers=headers)
    print(response.status_code)
    
    resp_dict = response.json()
    
    for ent in resp_dict["businesses"]:
        restaurants["name"].append(ent["alias"])
        restaurants["rating"].append(ent["rating"])
        restaurants["zip_code"].append(ent["location"]["zip_code"])
    offset += 50


200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


In [40]:
# Make dataframe of the 1000 restaurants, including name, rating and zip_code

df_rest = pd.DataFrame.from_dict(restaurants)

## Bringing it all together..

In [41]:
# First merge locality names to restaurants based on zip code
df_restaurants = df_rest.merge(df_zip_local, on= "zip_code", how= "inner")

#drop restaurants with rating under 4
top_rated = df_restaurants.loc[df_restaurants.rating >= 4, ]

In [42]:
# Before we merge the population info we aggregate by locality and count numbers of restaurants per locality
agg_rest = top_rated.groupby(by="locality").count().reset_index()

#merge this with the population info on locality
per_capita = agg_rest.merge(df_pops_by_locality, how="inner", on = "locality")

# calculate the amount of people per restaurant
per_capita["per_cap"] = per_capita.population/per_capita.name

#sort by this new variable
per_capita = per_capita.sort_values(by="per_cap", ascending=False)

#e voila -- STRONG CAVEATS..
per_capita.head()

Unnamed: 0,locality,name,rating,zip_code,population,per_cap
39,Marzahn,1,1,1,102398.0,102398.0
22,Hellersdorf,1,1,1,72602.0,72602.0
2,Altglienicke,1,1,1,26101.0,26101.0
4,Biesdorf,1,1,1,24543.0,24543.0
29,Köpenick,3,3,3,59201.0,19733.666667
