In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import hdbscan
import os

# Load and preprocess data
csv_dir = "data_chicago_hackathon_2024/probe_data/11"
csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')]
dataframes = [pd.read_csv(os.path.join(csv_dir, f)) for f in csv_files[:2]]
probe_data = pd.concat(dataframes, ignore_index=True)
roundabout_data = pd.read_csv('data_chicago_hackathon_2024/hamburg_extra_layers/hamburg_rounsabouts.csv')

print(f"Initial probe data shape: {probe_data.shape}")

# Filter out pedestrians (assuming speed > 0 indicates vehicles)
probe_data = probe_data[probe_data['speed'] > 0]

print(f"Probe data shape after filtering pedestrians: {probe_data.shape}")

# Feature engineering
def calculate_heading_change(group):
    return np.abs(np.diff(group['heading'])).mean() if len(group) > 1 else 0

def calculate_curvature(group):
    if len(group) < 3:
        return 0
    x = group['longitude'].values
    y = group['latitude'].values
    dx = np.gradient(x)
    dy = np.gradient(y)
    ds = np.sqrt(dx*dx + dy*dy)
    d2x = np.gradient(dx, ds, edge_order=1)
    d2y = np.gradient(dy, ds, edge_order=1)
    curvature = np.abs(dx * d2y - dy * d2x) / (dx * dx + dy * dy)**1.5
    return np.mean(curvature)

# Group by traceid and calculate features
grouped = probe_data.groupby('traceid')
features = grouped.agg({
    'latitude': 'mean',
    'longitude': 'mean',
    'speed': 'mean',
}).reset_index()

features['heading_change'] = grouped.apply(calculate_heading_change)
features['curvature'] = grouped.apply(calculate_curvature)

# Add additional features
features['point_count'] = grouped.size()
features['distance'] = grouped.apply(lambda g: np.sum(np.sqrt(np.diff(g['longitude'])**2 + np.diff(g['latitude'])**2)))

print(f"Features shape: {features.shape}")
print("Features description:")
print(features.describe())

# Remove any rows with NaN values
features = features.dropna()
print(f"Features shape after dropping NaN: {features.shape}")

# Normalize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features.drop(['traceid', 'latitude', 'longitude'], axis=1))

print(f"Scaled features shape: {scaled_features.shape}")

# Apply HDBSCAN only if we have data
if scaled_features.shape[0] > 0:
    clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=3)
    clusterer.fit(scaled_features)

    # Add cluster labels to features
    features['cluster'] = clusterer.labels_

    # Label data points as roundabouts or not
    def is_roundabout(lat, lon):
        return any((roundabout_data['latitude'] - lat)**2 + (roundabout_data['longitude'] - lon)**2 < 1e-6)

    features['is_roundabout'] = features.apply(lambda row: is_roundabout(row['latitude'], row['longitude']), axis=1)

    # Prepare data for classification
    X = features.drop(['traceid', 'latitude', 'longitude', 'is_roundabout'], axis=1)
    y = features['is_roundabout']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Evaluate model
    y_pred = rf_model.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Feature importance
    feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf_model.feature_importances_})
    print(feature_importance.sort_values('importance', ascending=False))
else:
    print("No data available after preprocessing. Please check your input data and preprocessing steps.")

Initial probe data shape: (244911, 7)
Probe data shape after filtering pedestrians: (242640, 7)


  features['heading_change'] = grouped.apply(calculate_heading_change)
  a = -(dx2)/(dx1 * (dx1 + dx2))
  a = -(dx2)/(dx1 * (dx1 + dx2))
  b = (dx2 - dx1) / (dx1 * dx2)
  b = (dx2 - dx1) / (dx1 * dx2)
  c = dx1 / (dx2 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  out[tuple(slice1)] = a * f[tuple(slice2)] + b * f[tuple(slice3)] + c * f[tuple(slice4)]
  curvature = np.abs(dx * d2y - dy * d2x) / (dx * dx + dy * dy)**1.5
  out[tuple(slice1)] = (f[tuple(slice4)] - f[tuple(slice2)]) / (2. * ax_dx)
  out[tuple(slice1)] = (f[tuple(slice2)] - f[tuple(slice3)]) / dx_0
  out[tuple(slice1)] = (f[tuple(slice2)] - f[tuple(slice3)]) / dx_n
  out[tuple(slice1)] = a * f[tuple(slice2)] + b * f[tuple(slice3)] + c * f[tuple(slice4)]
  curvature = np.abs(dx * d2y - dy * d2x) / (dx * dx + dy * dy)**1.5
  curvature = np.abs(dx * d2y - dy * d2x) / (dx * dx + dy * dy)**1.5
  features['curvature'] = grouped.apply(calculate_curvature)
  features['distance'] = grouped.apply(lambda g: np.sum(np.sqrt(np.diff(g['

Features shape: (2576, 8)
Features description:
          latitude    longitude        speed  heading_change  curvature  \
count  2576.000000  2576.000000  2576.000000             0.0        0.0   
mean     53.607635    10.187822    39.321676             NaN        NaN   
std       0.001807     0.003314    13.138890             NaN        NaN   
min      53.601769    10.180000     1.000000             NaN        NaN   
25%      53.607400    10.186763    33.327778             NaN        NaN   
50%      53.608333    10.188245    41.013393             NaN        NaN   
75%      53.608634    10.189425    47.161659             NaN        NaN   
max      53.612275    10.197922    92.000000             NaN        NaN   

       point_count  distance  
count          0.0       0.0  
mean           NaN       NaN  
std            NaN       NaN  
min            NaN       NaN  
25%            NaN       NaN  
50%            NaN       NaN  
75%            NaN       NaN  
max            NaN       NaN

ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required by StandardScaler.