# k-nearest neighbor (k-NN)

### Steps:

- einlesen der GeoJson-Datei & vorbereiten und anpassen
- Daten aufteilen in 80% Training und 20% Test
- k-Wert festlegen
- Modell erstellen und trainieren


In [1]:
import geopandas as gpd
import pandas as pd

# Pfad zur GeoJSON-Datei angeben
file_path = "../../data/processed_data/simra_osm_no_service_all.geojson"

# GeoJSON-Datei einlesen
gdf = gpd.read_file(file_path)

# GeoJSON-Datei laden
# gdf = gpd.read_file('../../data/processed_data/simra_osm_no_service_all.geojson')

gdf.head()

Unnamed: 0,id,type,score,incidents,rides,index_right,maxspeed,asphalt,concrete,paving_stone,...,footway,highway_rare,living_street,path,primary,residential,secondary,tertiary,track,geometry
0,[100049].0,Street,0.0,0,138,35281.0,50,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,"POLYGON ((13.45412 52.54035, 13.45320 52.53977..."
1,[100069498].0,Junction,0.0,0,200,44754.0,30,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,"POLYGON ((13.52273 52.50704, 13.52248 52.50690..."
2,"[100078509, 288268004, 3888645535].0",Junction,0.0,0,54,41983.0,50,0.846154,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.153846,0.461538,0.0,0.0,"POLYGON ((13.47754 52.51457, 13.47782 52.51438..."
3,[100094].0,Street,0.0,0,98,31020.0,30,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,"POLYGON ((13.46855 52.61490, 13.46841 52.61475..."
4,[1000].0,Street,0.0,0,130,308.0,50,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,"POLYGON ((13.35533 52.51693, 13.35655 52.51683..."


In [3]:
gdf.columns

Index(['id', 'type', 'score', 'incidents', 'rides', 'index_right', 'maxspeed',
       'asphalt', 'concrete', 'paving_stone', 'sett', 'unpaved', 'markers',
       'highway', 'cycleway', 'footway', 'highway_rare', 'living_street',
       'path', 'primary', 'residential', 'secondary', 'tertiary', 'track',
       'geometry'],
      dtype='object')

#### Entfernen nicht benötigter Spalten (Columns)

In [11]:
df = gdf.drop(columns=['id', 'index_right', 'geometry', 'markers', 'highway', 'incidents', 'rides'])

df.head()

Unnamed: 0,type,score,maxspeed,asphalt,concrete,paving_stone,sett,unpaved,cycleway,footway,highway_rare,living_street,path,primary,residential,secondary,tertiary,track
0,Street,0.0,50,0.0,1.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0
1,Junction,0.0,30,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,Junction,0.0,50,0.846154,0.0,0.0,0.0,0.153846,0.384615,0.0,0.0,0.0,0.0,0.0,0.153846,0.461538,0.0,0.0
3,Street,0.0,30,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,Street,0.0,50,1.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0


#### 'type' (STREET und JUNCTION) in numerische Werte (boolsche Werte) umwandeln

In [12]:
df = pd.get_dummies(df, columns=['type'])

df.head()

Unnamed: 0,score,maxspeed,asphalt,concrete,paving_stone,sett,unpaved,cycleway,footway,highway_rare,living_street,path,primary,residential,secondary,tertiary,track,type_Junction,type_Street
0,0.0,50,0.0,1.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,False,True
1,0.0,30,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,True,False
2,0.0,50,0.846154,0.0,0.0,0.0,0.153846,0.384615,0.0,0.0,0.0,0.0,0.0,0.153846,0.461538,0.0,0.0,True,False
3,0.0,30,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,False,True
4,0.0,50,1.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,False,True


#### Features und Label auswählen - Zielvariable festlegen

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Features und Label auswählen
# X = df[['rides', 'maxspeed', 'asphalt', 'concrete',
#       'paving_stone', 'sett', 'unpaved', 'cycleway', 'footway',
#      'highway_rare', 'living_street', 'path', 'primary', 'residential',
#       'secondary', 'service', 'tertiary', 'track', 'type_Junction',
#      'type_Street']]

# Zielvariable 
y = df['score']

# Alle anderen Spalten als Features verwenden 
X = df.drop(columns=['score'])

#### Aufteilen der Daten in Trainings- und Testdaten - Ausgabe der Verteilung

In [14]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Anzahl der Trainingsdaten: {X_train.shape[0]}")
print(f"Anzahl der Testdaten: {X_test.shape[0]}")


Anzahl der Trainingsdaten: 12434
Anzahl der Testdaten: 3109


#### Preprocessing: (Kategorische Variablen encodieren und) numerische Variablen skalieren

In [25]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['maxspeed', 'service',  'asphalt', 'concrete', 
                                   'paving_stone', 'sett', 'unpaved', 
                                   'cycleway', 'footway', 'highway_rare', 
                                   'living_street', 'path', 'primary', 
                                   'residential', 'secondary', 'service', 
                                   'tertiary', 'track', 'type_Junction', 
                                   'type_Street'])
    ])

#### k-NN Modell und Pipeline erstellen

In [26]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('knn', KNeighborsRegressor(n_neighbors=5))])

#### Modell trainieren

In [27]:
pipeline.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe

#### Vorhersagen treffen

In [14]:
y_pred = pipeline.predict(X_test)

AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'

#### Modell evaluieren

In [63]:
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")

Mean Squared Error: 0.0025


In [72]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

r2 = r2_score(y_test, y_pred)

print(f"R² Score: {r2:.4f}")

R² Score: -0.1213


In [73]:
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.4f}")

Mean Absolute Error: 0.0158
