In [6]:
import pandas as pd
import osmnx as ox
import networkx as nx
import requests
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import json
import folium

In [7]:
file_path = 'nyc_data.csv'
df = pd.read_csv(file_path)

In [8]:
df.fillna({
    'WindSpeed(mph)': 0,
    'Precipitation(in)': 0,
    'Visibility(mi)': df['Visibility(mi)'].median(),
    'Temperature(F)': df['Temperature(F)'].median(),
    'Humidity(%)': df['Humidity(%)'].median()
}, inplace=True)

In [9]:
common_conditions = df['Weather_Conditions'].value_counts().index[:10]

df['Weather_Conditions'] = df['Weather_Conditions'].apply(
    lambda x: x if x in common_conditions else 'Other')

df = pd.get_dummies(df, columns=['Weather_Conditions'], drop_first=True)

df = df.loc[:, ~df.columns.duplicated()]
df.columns.tolist()

['Severity',
 'Start_Lat',
 'Start_Lng',
 'Distance(mi)',
 'DelayFromTypicalTraffic(mins)',
 'DelayFromFreeFlowSpeed(mins)',
 'Temperature(F)',
 'Humidity(%)',
 'Visibility(mi)',
 'WindSpeed(mph)',
 'Precipitation(in)',
 'Weather_Conditions_Other',
 'Start_Hour',
 'Start_Minute',
 'Start_Day',
 'Start_Month',
 'Start_DayOfWeek',
 'Duration_min',
 'Weather_Conditions_Cloudy',
 'Weather_Conditions_Fair',
 'Weather_Conditions_Haze',
 'Weather_Conditions_Light Rain',
 'Weather_Conditions_Mostly Cloudy',
 'Weather_Conditions_Overcast',
 'Weather_Conditions_Partly Cloudy',
 'Weather_Conditions_Rain',
 'Weather_Conditions_Scattered Clouds']

In [10]:
X = df.drop(columns=['DelayFromTypicalTraffic(mins)'])
y = df['DelayFromTypicalTraffic(mins)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_set = lgb.Dataset(X_train, label=y_train)
valid_set = lgb.Dataset(X_test, label=y_test)

params = {
    'objective': 'regression',
    'metric': ['rmse', 'mae'],
    'learning_rate': 0.05,
    'num_leaves': 63,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbosity': -1,
    'seed': 42,
    'max_depth': -1
}

model = lgb.train(
    params,
    train_set=train_set,
    valid_sets=[train_set, valid_set],
    valid_names=['train', 'valid'],
    num_boost_round=1000
)

y_pred = model.predict(X_test)
print(f"RMSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")

RMSE: 0.70
R²: 0.8889


In [11]:
def fetch_current_weather_nyc():
    lat, lon = 40.7128, -74.0060

    points_data = requests.get(f'https://api.weather.gov/points/{lat},{lon}').json()
    forecast = requests.get(points_data['properties']['forecastHourly']).json()
    hour = forecast['properties']['periods'][0]

    stations = requests.get(points_data['properties']['observationStations']).json()
    obs = requests.get(f"{stations['features'][0]['id']}/observations/latest").json()['properties']

    visibility = obs['visibility']['value']
    precip_mm = obs.get('precipitationLastHour', {}).get('value', 0)

    return {
        'Temperature(F)': hour['temperature'],
        'WindSpeed(mph)': int(hour['windSpeed'].split()[0]) if hour['windSpeed'] else 0,
        'Weather_Conditions': hour['shortForecast'],
        'Humidity(%)': obs['relativeHumidity']['value'],
        'Visibility(mi)': round(visibility/1609.34, 2) if visibility else df['Visibility(mi)'].median(),
        'Precipitation(in)': round(precip_mm/25.4, 2) if precip_mm else 0
    }
current_weather = fetch_current_weather_nyc()
print("NYC Current Weather:", current_weather)

NYC Current Weather: {'Temperature(F)': 47, 'WindSpeed(mph)': 14, 'Weather_Conditions': 'Chance Rain Showers', 'Humidity(%)': 29.962613333049, 'Visibility(mi)': 10.0, 'Precipitation(in)': 0}


In [12]:
G = ox.graph_from_place("New York City, NY, USA", network_type='drive')

In [18]:
def calculate_edge_weights(G, model, current_weather, X_columns):
    now = datetime.now()
    current_condition = current_weather['Weather_Conditions'].replace(" ", "")
    edges_data = []
    for u, v, key, data in G.edges(keys=True, data=True):
        edges_data.append({
            'u': u,
            'v': v,
            'key': key,
            'y': data.get('y', 40.7128),
            'x': data.get('x', -74.0060),
            'length': data.get('length', 100)
        })

    edges_df = pd.DataFrame(edges_data)

    features_df = pd.DataFrame({
        'Severity': 2,
        'Start_Lat': edges_df['y'],
        'Start_Lng': edges_df['x'],
        'Distance(mi)': edges_df['length']/1609.34,
        'Temperature(F)': current_weather['Temperature(F)'],
        'Humidity(%)': current_weather['Humidity(%)'],
        'Visibility(mi)': current_weather['Visibility(mi)'],
        'WindSpeed(mph)': current_weather['WindSpeed(mph)'],
        'Precipitation(in)': current_weather['Precipitation(in)'],
        'Start_Hour': now.hour,
        'Start_Minute': now.minute,
        'Start_Day': now.day,
        'Start_Month': now.month,
        'Start_DayOfWeek': now.weekday(),
        'Duration_min': 10,
        'DelayFromFreeFlowSpeed(mins)': 0
    })
    weather_cols = [col for col in X_columns if col.startswith('Weather_Conditions_')]
    for col in weather_cols:
        features_df[col] = 0

    col_condition = next((col for col in weather_cols
                         if col.replace('Weather_Conditions_', '').replace(" ", "") in current_condition),
                        'Weather_Conditions_Other')
    features_df[col_condition] = 1
    features_df = features_df[X_columns]
    predicted_delays = model.predict(features_df)

    for (u, v, key), delay in zip(edges_df[['u', 'v', 'key']].itertuples(index=False), predicted_delays):
        G.edges[u, v, key]['weight'] = max(delay, 0.1)

calculate_edge_weights(G, model, current_weather, X.columns)

In [35]:
def find_optimal_route(start, end, G):
    start_coords = ox.geocode(start)
    end_coords = ox.geocode(end)
    start_node = ox.nearest_nodes(G, start_coords[1], start_coords[0])
    end_node = ox.nearest_nodes(G, end_coords[1], end_coords[0])

    for u, v, key, data in G.edges(keys=True, data=True):
        if 'weight' not in data or data['weight'] < 0:
            data['weight'] = max(data.get('weight', 0), 0.1)

    shortest = nx.shortest_path(G, start_node, end_node, weight='length')
    optimized = nx.shortest_path(G, start_node, end_node, weight='weight')

    def get_stats(path):
        edges = [G.edges[u, v, 0] for u, v in zip(path[:-1], path[1:])]
        miles = sum(e['length'] for e in edges) / 1609.34
        speeds = []
        for e in edges:
            if 'highway' in e:
                road_type = e['highway']
                if isinstance(road_type, list):
                    road_type = road_type[0]
                speed_map = {
                    'motorway': 55,
                    'trunk': 45,
                    'primary': 35,
                    'secondary': 30,
                    'tertiary': 25,
                    'residential': 20,
                    'unclassified': 20
                }
                speeds.append(speed_map.get(road_type, 25))
            else:
                speeds.append(30)

        avg_speed = sum(speeds) / len(speeds)
        delay = sum(e['weight'] for e in edges)
        base_time = (miles / avg_speed) * 60
        total_time = base_time + delay
        return round(miles, 2), round(delay, 2), round(total_time, 2), path

    s_miles, s_delay, s_time, _ = get_stats(shortest)
    o_miles, o_delay, o_time, _ = get_stats(optimized)

    return {
        'route': f"{start} to {end}",
        'shortest': {'miles': s_miles, 'delay': s_delay, 'time': s_time, 'path': shortest},
        'optimized': {'miles': o_miles, 'delay': o_delay, 'time': o_time, 'path': optimized},
        'time_saved': round(s_time - o_time, 2)
    }

test_routes = [
    ("Empire State Building, NYC", "Times Square, NYC"),
    ("Brooklyn Bridge, NYC", "Central Park, NYC"),
    ("JFK Airport, NYC", "Statue of Liberty, NYC")
]

all_results = []
for start, end in test_routes:
    result = find_optimal_route(start, end, G)
    if result:
        all_results.append(result)
        print(f"\n{result['route']}:")
        print(f"Shortest: {result['shortest']['miles']} miles, {result['shortest']['time']} mins")
        print(f"Optimized: {result['optimized']['miles']} miles, {result['optimized']['time']} mins")
        print(f"Time saved: {result['time_saved']} mins")

output = {
    'weather': fetch_current_weather_nyc(),
    'routes': all_results,
    'model_stats': {
        'RMSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    }
}

with open('route_results.json', 'w') as f:
    json.dump(output, f, indent=2)


Empire State Building, NYC to Times Square, NYC:
Shortest: 1.01 miles, 3.57 mins
Optimized: 1.01 miles, 3.57 mins
Time saved: 0.0 mins

Brooklyn Bridge, NYC to Central Park, NYC:
Shortest: 5.86 miles, 22.75 mins
Optimized: 8.45 miles, 18.84 mins
Time saved: 3.91 mins

JFK Airport, NYC to Statue of Liberty, NYC:
Shortest: 15.33 miles, 48.81 mins
Optimized: 21.72 miles, 45.43 mins
Time saved: 3.38 mins
