In [75]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from IPython.display import display_html

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
# import folium
import folium # map rendering library

#!conda install -c conda-forge xlrd --yes
#!conda install -c conda-forge lxml --yes
#!conda install -c conda-forge beautifulsoup4 --yes
#!conda install -c conda-forge geocoder --yes
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab

print('Libraries imported.')

Libraries imported.


#### Preparing crime data

In [2]:
# read in the excel table
xls = pd.ExcelFile('crimeBerlin.xlsx')
df1 = pd.read_excel(xls, 'Titel')
df2 = pd.read_excel(xls, 'Inhaltsverzeichnis')

In [3]:
# read the single sheets into a data frame
# rename some colums and drop some rows
df3 = pd.read_excel(xls, 'Fallzahlen_2018') # up to 2018
df3.drop(df3.tail(2).index,inplace=True) # drop last 2 rows
df3.drop(df3.head(4).index,inplace=True) # drop first 4 rows
newcols=['Borough','Neighborhood', 'Total records', 'Robbery', 'Street robbery', \
      'Total physical injury', 'Serious bodily harm', 'Coercion', 'Total theft', \
      'Bicycle theft', 'Burglary', 'Property damage', 'Drug offenses']
df3 = df3.drop(df3.columns[[9,10,13,14,16,18]], axis=1)
df3.columns=newcols
df3.reset_index(drop=True, inplace=True)
df3.head()

Unnamed: 0,Borough,Neighborhood,Total records,Robbery,Street robbery,Total physical injury,Serious bodily harm,Coercion,Total theft,Bicycle theft,Burglary,Property damage,Drug offenses
0,10000,Mitte,85227,733,449,7450,1854,2068,37802,4173,775,5528,3792
1,10111,Tiergarten Süd,5171,70,58,416,125,103,2540,310,39,284,273
2,10112,Regierungsviertel,9146,34,16,519,110,114,3883,372,35,407,133
3,10113,Alexanderplatz,19275,130,80,1531,366,309,10144,880,133,1036,971
4,10114,Brunnenstraße Süd,4207,26,14,286,60,69,1817,314,56,442,69


In [4]:
# replace LOR-keys with the corresponding borough names
lorkeys=['010000','020000','030000','040000','050000','060000','070000','080000','090000','100000','110000','120000']
myIndex=[]
for tkey in lorkeys:
    df3ind=df3.index[df3['Borough'] == tkey].to_list()
    myIndex.append(df3ind[0])
myIndex = [ int(x) for x in myIndex ]
myBorough=['Mitte','Friedrichshain-Kreuzberg','Pankow', 'Charlottenburg-Wilmersdorf', 'Spandau', 'Steglitz-Zehlendorf', \
          'Tempelhof-Schöneberg', 'Neukölln', 'Treptow-Köpenick', 'Marzahn-Hellersdorf', 'Lichtenberg','Reinickendorf']
myb=df3['Borough'].astype(int).to_list()
start=10000
k=0
myBor=[]
for i in myb:
    if i < start+10000:
        myBor.append(myBorough[k])
    else:
        k=k+1
        myBor.append(myBorough[k])
        start=start+10000
        
df3['Borough']=myBor
df3.drop(myIndex,axis=0, inplace=True)
df3.head()

[0, 12, 22, 40, 59, 70, 80, 89, 101, 123, 134, 149]


Unnamed: 0,Borough,Neighborhood,Total records,Robbery,Street robbery,Total physical injury,Serious bodily harm,Coercion,Total theft,Bicycle theft,Burglary,Property damage,Drug offenses
1,Mitte,Tiergarten Süd,5171,70,58,416,125,103,2540,310,39,284,273
2,Mitte,Regierungsviertel,9146,34,16,519,110,114,3883,372,35,407,133
3,Mitte,Alexanderplatz,19275,130,80,1531,366,309,10144,880,133,1036,971
4,Mitte,Brunnenstraße Süd,4207,26,14,286,60,69,1817,314,56,442,69
5,Mitte,Moabit West,7257,71,38,653,164,215,2470,447,83,496,435


In [5]:
# writing crime data into csv file
df3.to_csv("berlincrime2018.csv")

#### Preparing school data

In [6]:
# load the web site
from bs4 import BeautifulSoup
import re
url='https://www.gymnasium-berlin.net/abiturdaten/2018'
content = requests.get(url)
soup = BeautifulSoup(content.text, 'html.parser')

myList=[]

for br in soup.find_all("br"):
        br.replace_with("\n\n")
        
rows = soup.find_all('tr') # Extract and return first occurrence of tr
for row in rows:
    text = str(row.get_text())
    b=text.split("\n\n")
    myList.append(b)

In [7]:
# create the data frame
df1=pd.DataFrame(myList)
df1.columns=['T','ID','School','Neighborhood','Grade']
df1.drop(df1.head(1).index,inplace=True)
df1.drop(columns=['T','ID'],inplace=True)
df1 = df1.replace('\n','', regex=True)
df1.head()

Unnamed: 0,School,Neighborhood,Grade
1,Französisches Gymnasium,Tiergarten,167
2,Heinrich-Hertz-Gymnasium,Friedrichshain,183
3,Georg-Friedrich-Händel-Gymnasium,Friedrichshain,189
4,Rosa-Luxemburg-Gymnasium,Pankow,190
5,Arndt-Gymnasium Dahlem,Dahlem,194


In [8]:
# writing school data into csv file
df1.to_csv("berlinschool2019.csv")

#### Preparing rental costs data

In [10]:
# load the web site
url='https://www.wohnungsboerse.net/mietspiegel-Berlin/2825'
content = requests.get(url)
soup = BeautifulSoup(content.text, 'html.parser')

In [11]:
#extract the corresponing table
names=[]
for tr in soup.find_all('tr'):
    td = [td for td in tr.stripped_strings]
    if len(td)  > 2:
        a=td[0]
        b=""
        b=b.join(a)    #b=str(a[0]).strip('[]')
        names.append(b)

['Adlershof (Treptow)', 'Alt-Hohenschönhausen (Hohenschönhausen)', 'Alt-Treptow', 'Altglienicke (Treptow)', 'Baumschulenweg (Treptow)', 'Biesdorf (Marzahn)', 'Blankenburg (Weißensee)', 'Bohnsdorf (Treptow)', 'Borsigwalde', 'Britz (Neukölln)', 'Buch (Pankow)', 'Buckow (Neukölln)', 'Charlottenburg (Charlottenburg)', 'Dahlem (Zehlendorf)', 'Französisch Buchholz (Pankow)', 'Friedenau (Schöneberg)', 'Friedrichsfelde (Lichtenberg)', 'Friedrichshagen (Köpenick)', 'Friedrichshain (Friedrichshain)', 'Frohnau (Reinickendorf)', 'Gesundbrunnen', 'Gropiusstadt', 'Grünau (Köpenick)', 'Grunewald (Wilmersdorf)', 'Hakenfelde', 'Halensee', 'Hansaviertel', 'Haselhorst (Spandau)', 'Heiligensee (Reinickendorf)', 'Heinersdorf (Weißensee)', 'Hellersdorf (Hellersdorf)', 'Hermsdorf (Reinickendorf)', 'Johannisthal (Treptow)', 'Karlshorst (Lichtenberg)', 'Karow (Weißensee)', 'Kaulsdorf (Hellersdorf)', 'Kladow (Spandau)', 'Konradshöhe (Reinickendorf)', 'Köpenick (Köpenick)', 'Kreuzberg (Kreuzberg)', 'Lankwitz (St

In [12]:
rents=[]
for tr in soup.find_all('tr'):
    td = [td for td in tr.stripped_strings]
    if len(td)  > 2:
        a=td[2]
        b=""
        b=b.join(a)
        rents.append(b)

['12,51\xa0€', '10,76\xa0€', '12,90\xa0€', '9,95\xa0€', '10,86\xa0€', '13,44\xa0€', '15,28\xa0€', '10,33\xa0€', '9,15\xa0€', '9,40\xa0€', '9,86\xa0€', '10,05\xa0€', '15,60\xa0€', '14,57\xa0€', '11,46\xa0€', '12,55\xa0€', '12,65\xa0€', '11,55\xa0€', '15,92\xa0€', '10,41\xa0€', '12,20\xa0€', '8,75\xa0€', '12,14\xa0€', '16,53\xa0€', '12,37\xa0€', '14,94\xa0€', '16,55\xa0€', '8,07\xa0€', '13,10\xa0€', '12,15\xa0€', '9,97\xa0€', '12,29\xa0€', '9,50\xa0€', '10,79\xa0€', '11,03\xa0€', '8,89\xa0€', '11,45\xa0€', '11,27\xa0€', '11,63\xa0€', '16,49\xa0€', '10,07\xa0€', '11,63\xa0€', '9,52\xa0€', '11,20\xa0€', '10,35\xa0€', '10,86\xa0€', '19,83\xa0€', '8,22\xa0€', '8,05\xa0€', '17,23\xa0€', '17,58\xa0€', '9,63\xa0€', '11,80\xa0€', '12,90\xa0€', '12,32\xa0€', '10,57\xa0€', '15,43\xa0€', '10,22\xa0€', '11,91\xa0€', '9,78\xa0€', '17,35\xa0€', '10,95\xa0€', '10,46\xa0€', '11,86\xa0€', '8,43\xa0€', '14,34\xa0€', '11,44\xa0€', '14,54\xa0€', '9,39\xa0€', '9,29\xa0€', '10,20\xa0€', '12,97\xa0€', '11,29\x

In [26]:
# create the data frame
df2 = pd.DataFrame(
    {'Neighborhood': names[29:115],
     'Rental costs': rents[29:115]
    })
df2.head()

Unnamed: 0,Neighborhood,Rental costs
0,Adlershof (Treptow),"12,51 €"
1,Alt-Hohenschönhausen (Hohenschönhausen),"10,76 €"
2,Alt-Treptow,"12,90 €"
3,Altglienicke (Treptow),"9,95 €"
4,Baumschulenweg (Treptow),"10,86 €"


In [27]:
df2 = df2.replace('\xa0€','', regex=True) # replace the euro sign
df2.head()

Unnamed: 0,Neighborhood,Rental costs
0,Adlershof (Treptow),1251
1,Alt-Hohenschönhausen (Hohenschönhausen),1076
2,Alt-Treptow,1290
3,Altglienicke (Treptow),995
4,Baumschulenweg (Treptow),1086


In [28]:
# writing crime data into csv file
df2.to_csv("berlinrenatlcosts2019.csv")

In [41]:
!conda install -c conda-forge postcodes_io_api --yes

Solving environment: failed

PackagesNotFoundError: The following packages are not available from current channels:

  - postcodes_io_api

Current channels:

  - https://conda.anaconda.org/conda-forge/linux-64
  - https://conda.anaconda.org/conda-forge/noarch
  - https://repo.anaconda.com/pkgs/main/linux-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/free/linux-64
  - https://repo.anaconda.com/pkgs/free/noarch
  - https://repo.anaconda.com/pkgs/r/linux-64
  - https://repo.anaconda.com/pkgs/r/noarch
  - https://repo.anaconda.com/pkgs/pro/linux-64
  - https://repo.anaconda.com/pkgs/pro/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.




#### Preparing venue data

In [40]:
# getting post codes and Neighborhoods by a direck loading into panda data frame
url='https://www.berlinstadtservice.de/xinh/Postleitzahlen_Berlin_Alphabetisch.html'
dfs = pd.read_html(url, header=0)
dfs[0].head()

Unnamed: 0,Berlin-Adlershof
0,12487 B-Adlershof
1,12489 B-Adlershof
2,Berlin-Altglienicke
3,12524 B-Altglienicke
4,Berlin-Alt-Treptow


In [37]:
dfl=dfs[0][dfs[0]['Berlin-Adlershof'].str.contains('B-')]
dfl.rename(columns={'Berlin-Adlershof' : 'Neighborhood'}, inplace=True)
dfl = dfl.replace('B-','', regex=True) # replace B-
dfl.head()

Unnamed: 0,Neighborhood
0,12487 Adlershof
1,12489 Adlershof
3,12524 Altglienicke
5,12435 Alt-Treptow
7,12437 Baumschulenweg


In [54]:
dfn = dfl["Neighborhood"].str.split(" ", n = 1, expand = True)
dfn.columns=['Postcode', 'Neighborhood']
dfn['Postcode']=dfn['Postcode'].astype(int)
dfn.head()

Unnamed: 0,Postcode,Neighborhood
0,12487,Adlershof
1,12489,Adlershof
3,12524,Altglienicke
5,12435,Alt-Treptow
7,12437,Baumschulenweg


In [41]:
# writing to a csv file
dfn.to_csv("berlinpostal.csv")

In [43]:
# reading an csv file with coordinates
df4= pd.read_csv('Locations.csv',sep=';')
df4.drop(columns=['Verwaltungszusammenschluss','Regierungsbezirk','ID',\
                  'Landkreis','Staat','Bundesland'],inplace=True)
df4=df4[df4['Ort']=='Berlin']
df4.head()

Unnamed: 0,Ort,Longitude,Latitude,Postcode
7748,Berlin,13.387224,52.533707,10115
7749,Berlin,13.390193,52.518746,10117
7750,Berlin,13.407149,52.532666,10119
7751,Berlin,13.412203,52.523474,10178
7752,Berlin,13.419699,52.514591,10179


In [44]:
# merge coordinates and Neighborhood frames on Postcode
df4.reset_index(drop=True, inplace=True)
df4.head()

Unnamed: 0,Ort,Longitude,Latitude,Postcode
0,Berlin,13.387224,52.533707,10115
1,Berlin,13.390193,52.518746,10117
2,Berlin,13.407149,52.532666,10119
3,Berlin,13.412203,52.523474,10178
4,Berlin,13.419699,52.514591,10179


In [67]:
df4.set_index('Postcode')
dfn.set_index('Postcode')
result = pd.merge(df4, dfn, on=['Postcode'])
result.drop(columns=['Ort'], inplace=True)
result.head(3)

Unnamed: 0,Longitude,Latitude,Postcode,Neighborhood
0,13.387224,52.533707,10115,Mitte
1,13.390193,52.518746,10117,Mitte
2,13.407149,52.532666,10119,Mitte


In [68]:
df = pd.read_csv('Postleit.csv',sep=';')
dft=df.T
dft.reset_index(inplace=True)
dft.head()

newcols=dft.iloc[1].to_list()
dft.columns=newcols
dft.drop(dft.head(2).index,inplace=True)
dft.reset_index(drop=True, inplace=True)
dft.fillna(0,inplace=True)
dft['Mitte'][26:]=0
dft['Mitte'] = pd.to_numeric(dft['Mitte'])

newlist=[]
dfv=dft.values
dfv.astype(int)

myL1=[]
myL2=[]
for tmp in newcols:
    l1=dft[tmp].to_list()
    for el in l1:
        myL1.append(tmp)
        myL2.append(el)
dfc=pd.DataFrame({'Borough': myL1, 'Postcode': myL2})
dd=dfc[(dfc['Postcode']) > 2]
dd['Postcode']=dd['Postcode'].astype(int)
dd.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Borough,Postcode
0,Mitte,10115
1,Mitte,10117
2,Mitte,10119
3,Mitte,10178
4,Mitte,10179


In [69]:
result.set_index('Postcode')
dd.set_index('Postcode')
dfr = pd.merge(result, dd, on=['Postcode'])
dfr.drop(columns=['Postcode'], inplace=True)

In [79]:
dfr.head(5)

Unnamed: 0,Longitude,Latitude,Neighborhood,Borough
0,13.387224,52.533707,Mitte,Mitte
1,13.390193,52.518746,Mitte,Mitte
2,13.407149,52.532666,Mitte,Mitte
3,13.407149,52.532666,Mitte,Pankow
4,13.412203,52.523474,Mitte,Mitte


In [72]:
# create map of Berlin-Mitte  using latitude and longitude values
latitude  = dfr.iloc[1,0]
longitude = dfr.iloc[1,1]
address = 'Berlin, DE'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Berlin are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Berlin are 52.5170365, 13.3888599.


In [80]:
# take a mitte borough
mitte_data = dfr[dfr['Borough'] == 'Mitte'].reset_index(drop=True)
mitte_data.head()

Unnamed: 0,Longitude,Latitude,Neighborhood,Borough
0,13.387224,52.533707,Mitte,Mitte
1,13.390193,52.518746,Mitte,Mitte
2,13.407149,52.532666,Mitte,Mitte
3,13.412203,52.523474,Mitte,Mitte
4,13.419699,52.514591,Mitte,Mitte


In [81]:
# create map of Berlin-Mitte using latitude and longitude values
map_berlin = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(mitte_data['Latitude'],mitte_data['Longitude'], mitte_data['Borough'],mitte_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_berlin)  
    
map_berlin

In [82]:
CLIENT_ID = 'QR03XDVIW1X5MRLXKYCNVVGBH5Q0PLQ4NZMLAPDH345MGEXY' # your Foursquare ID
CLIENT_SECRET = 'T4BALEMPTZJHYTSV2PCTPMZEUHSHTJUXBNZU1JPMNMGRKN5D' # your Foursquare Secret
VERSION = '20180604'
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

# coordinates of the Central Berlin 
neighborhood_latitude = 52.520008   # neighborhood latitude value
neighborhood_longitude = 13.404954 # neighborhood longitude valu

Your credentails:
CLIENT_ID: QR03XDVIW1X5MRLXKYCNVVGBH5Q0PLQ4NZMLAPDH345MGEXY
CLIENT_SECRET:T4BALEMPTZJHYTSV2PCTPMZEUHSHTJUXBNZU1JPMNMGRKN5D


In [83]:
# create the GET request URL. 
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=QR03XDVIW1X5MRLXKYCNVVGBH5Q0PLQ4NZMLAPDH345MGEXY&client_secret=T4BALEMPTZJHYTSV2PCTPMZEUHSHTJUXBNZU1JPMNMGRKN5D&v=20180604&ll=52.520008,13.404954&radius=500&limit=100'

In [84]:
results = requests.get(url).json()

In [85]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [86]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Radisson Blu,Hotel,52.51958,13.40273
1,Block House,Steakhouse,52.520469,13.405278
2,Waffel oder Becher,Ice Cream Shop,52.521007,13.403815
3,Neptunbrunnen,Fountain,52.519539,13.406925
4,Titus,Board Shop,52.52095,13.405921


In [87]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

100 venues were returned by Foursquare.
