# Zusammenbringen der Datensätze SimRa und osm_surface ohne service Straßen

Unser Vorhersage-Modell soll anhand von Features, wie z.B. Straßentyp oder Höchstgeschwindigkeit, Vorhersagen zum Gefahrenpotential einer Strecke machen.   
Dazu müssen wir den SimRa-Datensatz und unsere Features aus OSM vereinen.  
  
Da wir unser Training auch ohne den Straßentyp "service" durchführen möchten, joinen wir hier die entsprechenden Datensätze.

In [4]:
from collections import Counter

import geopandas as gpd
import pandas as pd


# Read both GeoJSON inputs: the OSM cycling network with surface categories
# (the "noservice" variant, per its file name) and the SimRa data within Berlin.
osm_surface_path = "../../data/processed_data/cycle_net_berlin_cleaned_surface_noservice.geojson"
simra_path = "../../data/processed_data/simra_within_berlin.geojson"

osm_surface = gpd.read_file(osm_surface_path)
simra_data = gpd.read_file(simra_path)

In [5]:
# Explicitly declare 'geometry' as the active geometry column of both layers
osm_surface = osm_surface.set_geometry('geometry')
simra_data = simra_data.set_geometry('geometry')

#### Überprüfung der Koordinatensysteme beider Datensätze

In [3]:
# Display the coordinate reference system of the SimRa layer
simra_data.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [6]:
# Display the coordinate reference system of the OSM surface layer for comparison with the SimRa CRS
osm_surface.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

#### Verknüpfen der Datensätze mit Join

Verwenden des left Joins:

Behalten aller Einträge aus dem ersten (linken) DataFrame (simra) und Hinzufügen passender Einträge aus dem zweiten (rechten) DataFrame (osm_surface). Einträge im linken DataFrame ohne passende Einträge im rechten DataFrame erhalten NaN-Werte für die Spalten aus dem rechten DataFrame.

In [9]:
# Spatial left join: keep every SimRa polygon and attach each OSM surface
# segment that intersects it (one output row per polygon/segment pair).
simra_surface = simra_data.sjoin(osm_surface, how='left', predicate='intersects')

In [10]:
# Preview the first rows of the join result
simra_surface.head()

Unnamed: 0,id,type,score,incidents,rides,markers,geometry,index_right,surface_category
0,[79310].0,Street,0.0,0,57,[ ],"POLYGON ((13.37410 52.53031, 13.37421 52.53020...",23564.0,asphalt
0,[79310].0,Street,0.0,0,57,[ ],"POLYGON ((13.37410 52.53031, 13.37421 52.53020...",68267.0,asphalt
0,[79310].0,Street,0.0,0,57,[ ],"POLYGON ((13.37410 52.53031, 13.37421 52.53020...",162.0,asphalt
1,"[196724641, 196725586, 866264912].0",Junction,0.000649,1,1541,"[ [ [ 13.417860660000001, 52.514469009999999 ]...","POLYGON ((13.41751 52.51461, 13.41779 52.51442...",11276.0,asphalt
1,"[196724641, 196725586, 866264912].0",Junction,0.000649,1,1541,"[ [ [ 13.417860660000001, 52.514469009999999 ]...","POLYGON ((13.41751 52.51461, 13.41779 52.51442...",11278.0,asphalt


### Überprüfen der NaN-Werte

In [11]:
# Rows of the join result whose 'surface_category' is NaN. In the rows displayed
# below 'index_right' is NaN as well, i.e. the left join found no OSM match.
# The mask must be built from simra_surface itself; the previously used name
# 'simra_maxspeed' is not defined in this notebook and raised a NameError.
nan_values = simra_surface[simra_surface['surface_category'].isna()]

print(f"Anzahl der Zeilen mit NaN in 'surface_category': {len(nan_values)}")

Anzahl der Zeilen mit NaN in 'surface_category': 572


In [12]:
# Preview the first rows that have no surface category
nan_values.head()

Unnamed: 0,id,type,score,incidents,rides,markers,geometry,index_right,surface_category
13,[295930].0,Street,0.0,0,140,[ ],"POLYGON ((13.35420 52.51651, 13.35440 52.51664...",,
45,[290746].0,Street,0.030769,1,143,"[ [ [ 13.368005419999999, 52.51545711 ], ""Datu...","POLYGON ((13.36851 52.51550, 13.36851 52.51552...",,
85,[247909].0,Street,0.006897,1,145,"[ [ [ 13.34922181, 52.541840720000003 ], ""Datu...","POLYGON ((13.34905 52.54175, 13.34909 52.54171...",,
89,[291472].0,Street,0.0,0,64,[ ],"POLYGON ((13.31914 52.45757, 13.31913 52.45756...",,
96,[260620].0,Street,0.008929,1,112,"[ [ [ 13.368579860000001, 52.525969879999998 ]...","POLYGON ((13.36877 52.52606, 13.36877 52.52608...",,


In [14]:
# Rows without a surface category that nevertheless have a non-zero score
has_no_surface = simra_surface['surface_category'].isna()
has_nonzero_score = simra_surface['score'] != 0
nan_values_score = simra_surface[has_no_surface & has_nonzero_score]

In [15]:
# Non-null count per column for the rows without surface category but with a non-zero score
nan_values_score.count()

id                  92
type                92
score               92
incidents           92
rides               92
markers             92
geometry            92
index_right          0
surface_category     0
dtype: int64

In [16]:
# Display the rows without surface category but with a non-zero score
nan_values_score

Unnamed: 0,id,type,score,incidents,rides,markers,geometry,index_right,surface_category
45,[290746].0,Street,0.030769,1,143,"[ [ [ 13.368005419999999, 52.51545711 ], ""Datu...","POLYGON ((13.36851 52.51550, 13.36851 52.51552...",,
85,[247909].0,Street,0.006897,1,145,"[ [ [ 13.34922181, 52.541840720000003 ], ""Datu...","POLYGON ((13.34905 52.54175, 13.34909 52.54171...",,
96,[260620].0,Street,0.008929,1,112,"[ [ [ 13.368579860000001, 52.525969879999998 ]...","POLYGON ((13.36877 52.52606, 13.36877 52.52608...",,
123,[254291].0,Street,0.007371,3,407,"[ [ [ 13.415521223372453, 52.52151084136414 ],...","POLYGON ((13.41514 52.52118, 13.41515 52.52116...",,
187,[297689].0,Street,0.006803,1,147,"[ [ [ 13.32194411, 52.511639639999999 ], ""Datu...","POLYGON ((13.32215 52.51151, 13.32217 52.51152...",,
...,...,...,...,...,...,...,...,...,...
15618,[307025].0,Street,0.069231,2,78,"[ [ [ 13.381872230000001, 52.513503790000001 ]...","POLYGON ((13.38215 52.51305, 13.38218 52.51305...",,
15890,[272200].0,Street,0.008560,1,514,"[ [ [ 13.40600347, 52.489642289999999 ], ""Datu...","POLYGON ((13.40628 52.48962, 13.40595 52.48962...",,
15960,[219732].0,Street,0.018868,1,53,"[ [ [ 13.325165739999999, 52.512279069999998 ]...","POLYGON ((13.32503 52.51210, 13.32532 52.51217...",,
16081,[293463].0,Street,0.011111,1,90,"[ [ [ 13.4168102, 52.510976550000002 ], ""Datum...","POLYGON ((13.41716 52.51089, 13.41717 52.51090...",,


## Entfernen aller NaN-Werte

In [17]:
# Keep only the rows that received a surface category from the join
has_surface = simra_surface['surface_category'].notna()
cleaned_simra_surface = simra_surface[has_surface]

print(f"Anzahl der verbleibenden Zeilen nach dem Entfernen der NaN-Werte: {len(cleaned_simra_surface)}")

Anzahl der verbleibenden Zeilen nach dem Entfernen der NaN-Werte: 72095


In [18]:
# Preview the rows that remain after removing the NaN surface categories
cleaned_simra_surface.head()

Unnamed: 0,id,type,score,incidents,rides,markers,geometry,index_right,surface_category
0,[79310].0,Street,0.0,0,57,[ ],"POLYGON ((13.37410 52.53031, 13.37421 52.53020...",23564.0,asphalt
0,[79310].0,Street,0.0,0,57,[ ],"POLYGON ((13.37410 52.53031, 13.37421 52.53020...",68267.0,asphalt
0,[79310].0,Street,0.0,0,57,[ ],"POLYGON ((13.37410 52.53031, 13.37421 52.53020...",162.0,asphalt
1,"[196724641, 196725586, 866264912].0",Junction,0.000649,1,1541,"[ [ [ 13.417860660000001, 52.514469009999999 ]...","POLYGON ((13.41751 52.51461, 13.41779 52.51442...",11276.0,asphalt
1,"[196724641, 196725586, 866264912].0",Junction,0.000649,1,1541,"[ [ [ 13.417860660000001, 52.514469009999999 ]...","POLYGON ((13.41751 52.51461, 13.41779 52.51442...",11278.0,asphalt


### Gruppieren der Polygone und Zusammenfassen der surface-Werte


* Der DataFrame wird nach id und geometry gruppiert. Für jede Gruppe werden die angegebenen Aggregationsfunktionen auf die Spalten angewendet:
  * Für type, score, incidents, rides, markers, und index_right wird der erste Wert in der Gruppe verwendet ('first').
  * Für die surface_category-Spalte wird eine Funktion zum Kombinieren der Werte angewendet.
* Das Ergebnis ist ein DataFrame grouped_data, bei dem die surface-Werte pro Gruppe als zusammenhängender String dargestellt sind, während alle anderen spezifischen Werte beibehalten werden.

In [19]:
# Re-wrap the cleaned rows as a GeoDataFrame with 'geometry' as the active geometry column
cleaned_simra_surface = gpd.GeoDataFrame(cleaned_simra_surface, geometry='geometry')

In [21]:
def combine_surfaces(x):
    """Join the surface values of one group into a single comma-separated string.

    Duplicate entries are kept on purpose, so the number of occurrences of each
    value is preserved in the joined string.

    Args:
        x: Iterable of strings (e.g. the 'surface_category' values of a group).

    Returns:
        The values concatenated with ', ' as separator.
    """
    separator = ', '
    return separator.join(value for value in x)

# Collapse the join result to one row per polygon (grouped by 'id' and 'geometry').
# These columns are taken from the first row of each group; the original author
# notes that the values are identical within a group.
first_value_columns = ['type', 'score', 'incidents', 'rides', 'markers', 'index_right']
aggregations = {column: 'first' for column in first_value_columns}
# All surface categories of a group are joined into one comma-separated string
aggregations['surface_category'] = combine_surfaces

grouped_data = (
    cleaned_simra_surface
    .groupby(['id', 'geometry'])
    .agg(aggregations)
    .reset_index()
)

In [22]:
# Convert the aggregation result back into a GeoDataFrame, reusing the CRS of the cleaned join result
grouped_data = gpd.GeoDataFrame(grouped_data, geometry='geometry', crs=cleaned_simra_surface.crs)

In [23]:
# Preview the first three aggregated rows
grouped_data.head(3)

Unnamed: 0,id,geometry,type,score,incidents,rides,markers,index_right,surface_category
0,[100049].0,"POLYGON ((13.45412 52.54035, 13.45320 52.53977...",Street,0.0,0,138,[ ],35281.0,"concrete, concrete, concrete, concrete"
1,[100069498].0,"POLYGON ((13.52273 52.50704, 13.52248 52.50690...",Junction,0.0,0,200,[ ],44754.0,"asphalt, asphalt, asphalt"
2,"[100078509, 288268004, 3888645535].0","POLYGON ((13.47754 52.51457, 13.47782 52.51438...",Junction,0.0,0,54,[ ],41983.0,"asphalt, asphalt, asphalt, asphalt, asphalt, a..."


In [24]:
# (rows, columns) of the aggregated frame
grouped_data.shape

(15722, 9)

### Umgang mit mehreren surface-Werten in einem Polygon
-------

Die surface-Werte können anteilig nach ihrem Vorkommen im Polygon ausgegeben werden.  
Diesen Ansatz wollen wir verfolgen und später testen, ob dieses Vorgehen sinnvoll ist.

Wir gehen folgendermaßen vor:  
* Zerlegen der surface_category-Spalte.
* Berechnung der Anteile für jeden surface-Typ.
* Erstellung der neuen Kategorien-Spalten.

In [25]:
# Split each joined 'surface_category' string back into a list of its individual values
surface_strings = grouped_data['surface_category']
grouped_data['surface_list'] = surface_strings.map(lambda joined: joined.split(', '))

In [26]:
# Preview the frame including the new 'surface_list' column
grouped_data.head()

Unnamed: 0,id,geometry,type,score,incidents,rides,markers,index_right,surface_category,surface_list
0,[100049].0,"POLYGON ((13.45412 52.54035, 13.45320 52.53977...",Street,0.0,0,138,[ ],35281.0,"concrete, concrete, concrete, concrete","[concrete, concrete, concrete, concrete]"
1,[100069498].0,"POLYGON ((13.52273 52.50704, 13.52248 52.50690...",Junction,0.0,0,200,[ ],44754.0,"asphalt, asphalt, asphalt","[asphalt, asphalt, asphalt]"
2,"[100078509, 288268004, 3888645535].0","POLYGON ((13.47754 52.51457, 13.47782 52.51438...",Junction,0.0,0,54,[ ],41983.0,"asphalt, asphalt, asphalt, asphalt, asphalt, a...","[asphalt, asphalt, asphalt, asphalt, asphalt, ..."
3,[100094].0,"POLYGON ((13.46855 52.61490, 13.46841 52.61475...",Street,0.0,0,98,[ ],31020.0,"unpaved, asphalt","[unpaved, asphalt]"
4,[1000].0,"POLYGON ((13.35533 52.51693, 13.35655 52.51683...",Street,0.0,0,130,[ ],308.0,"asphalt, asphalt, asphalt","[asphalt, asphalt, asphalt]"


In [27]:
# Collect every distinct surface type that occurs in any polygon, sorted alphabetically.
# The set comprehension visits each entry once; the former sum(lists, []) copied the
# growing result list for every row, which is quadratic in the number of rows.
unique_types = sorted({surface for surfaces in grouped_data['surface_list'] for surface in surfaces})

In [28]:
# Show the sorted list of distinct surface types
unique_types

['asphalt', 'concrete', 'paving_stone', 'sett', 'unpaved']

In [29]:
def calculate_surface_ratios(row, surface_types):
    """Compute the share of each surface type within one row's 'surface_list'.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a 'surface_list' entry
            holding the list of surface values of one polygon.
        surface_types: Iterable of surface type names to report.

    Returns:
        Dict mapping every name in ``surface_types`` to its relative frequency
        in 'surface_list' (0.0 for types that do not occur). An empty
        'surface_list' yields 0.0 for every type instead of dividing by zero.
    """
    surfaces = row['surface_list']
    total_count = len(surfaces)
    if total_count == 0:
        # No entries at all: ratios are undefined, report 0.0 rather than raising ZeroDivisionError
        return {surface: 0.0 for surface in surface_types}
    # Counter returns 0 for missing keys, so absent surface types need no special handling
    counts = Counter(surfaces)
    return {surface: counts[surface] / total_count for surface in surface_types}

In [30]:
import pandas as pd

# Compute the ratio dict for every row (axis=1 -> the function is applied row by row)
# and expand the resulting dicts into one column per surface type
surface_ratios = grouped_data.apply(
    calculate_surface_ratios,
    axis=1,
    surface_types=unique_types,
)
ratios_df = pd.DataFrame(surface_ratios.tolist())

In [31]:
# Append the ratio columns to the aggregated frame column-wise (rows are aligned by index)
gdf = pd.concat([grouped_data, ratios_df], axis=1)

In [32]:
# Drop the helper list column that was only needed to compute the ratios
gdf = gdf.drop(columns=['surface_list'])

In [33]:
# Drop the joined surface string (its information now lives in the ratio columns)
# and the 'markers' column
gdf = gdf.drop(columns=['surface_category', 'markers'])

In [34]:
# Preview the first ten rows with one ratio column per surface type
gdf.head(10)

Unnamed: 0,id,geometry,type,score,incidents,rides,index_right,asphalt,concrete,paving_stone,sett,unpaved
0,[100049].0,"POLYGON ((13.45412 52.54035, 13.45320 52.53977...",Street,0.0,0,138,35281.0,0.0,1.0,0.0,0.0,0.0
1,[100069498].0,"POLYGON ((13.52273 52.50704, 13.52248 52.50690...",Junction,0.0,0,200,44754.0,1.0,0.0,0.0,0.0,0.0
2,"[100078509, 288268004, 3888645535].0","POLYGON ((13.47754 52.51457, 13.47782 52.51438...",Junction,0.0,0,54,41983.0,0.846154,0.0,0.0,0.0,0.153846
3,[100094].0,"POLYGON ((13.46855 52.61490, 13.46841 52.61475...",Street,0.0,0,98,31020.0,0.5,0.0,0.0,0.0,0.5
4,[1000].0,"POLYGON ((13.35533 52.51693, 13.35655 52.51683...",Street,0.0,0,130,308.0,1.0,0.0,0.0,0.0,0.0
5,[100120].0,"POLYGON ((13.50803 52.45148, 13.50766 52.45048...",Street,0.0,0,54,31023.0,1.0,0.0,0.0,0.0,0.0
6,[100126].0,"POLYGON ((13.50823 52.45280, 13.50841 52.45267...",Street,0.0,0,169,31024.0,1.0,0.0,0.0,0.0,0.0
7,[100129].0,"POLYGON ((13.50832 52.45291, 13.50806 52.45301...",Street,0.0,0,102,31024.0,1.0,0.0,0.0,0.0,0.0
8,[100144].0,"POLYGON ((13.40858 52.51198, 13.40761 52.51167...",Street,0.001818,1,550,31033.0,1.0,0.0,0.0,0.0,0.0
9,[100154846].0,"POLYGON ((13.51004 52.45376, 13.50972 52.45359...",Junction,0.0,0,414,25302.0,0.833333,0.0,0.166667,0.0,0.0


In [35]:
 #Speichern des gdf

# Write the GeoDataFrame with the surface ratios to disk as GeoJSON
output_filename = "../../data/processed_data/osm_surface_ratios_noservice.geojson"
gdf.to_file(output_filename, driver='GeoJSON')

print("Datei erfolgreich gespeichert.")

Datei erfolgreich gespeichert.
