In [None]:

import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from datetime import datetime
%load_ext tensorboard

In [None]:
# Monthly cleaned run files that together make up the dataset.
# Add the path of every additional month here.
file_paths = [
    'new_cleaned/cleaned_november_without.parquet',
    'new_cleaned/cleaned_december_without.parquet',
    'new_cleaned/cleaned_January_without.parquet',
    'new_cleaned/cleaned_october_without.parquet',
    'new_cleaned/cleaned_february_without.parquet',
    'new_cleaned/cleaned_march_without.parquet',
    'new_cleaned/cleaned_april_without.parquet',
]

# Read every monthly file and stack them into a single GeoDataFrame.
runs_list = [gpd.read_parquet(file_path) for file_path in file_paths]
runs = pd.concat(runs_list, ignore_index=True)

# Stop lines and bus stops.
# NOTE(review): only stop_lines is reprojected below; bus_stops keeps the CRS it
# was stored with — reproject it as well before combining it with `runs`.
stop_lines = gpd.read_parquet('Data/stop_lines_cut.parquet')
bus_stops = gpd.read_parquet('Data/bus_stops.parquet')

print("Total number of rows:", len(runs))

# Bring the stop lines into the CRS of the runs so distances are comparable.
if stop_lines.crs != runs.crs:
    stop_lines = stop_lines.to_crs(runs.crs)

# Number of runs (in order of first appearance) whose points form the route line.
ROUTE_SAMPLE_RUNS = 5

# The variable names still say "first_two" for backward compatibility, but the
# selection holds ROUTE_SAMPLE_RUNS runs.
first_two_runs_ids = runs['run'].unique()[:ROUTE_SAMPLE_RUNS]
first_two_runs = runs[runs['run'].isin(first_two_runs_ids)].sort_values(by=['run', 'utcTime'])

# One LineString through all points of the selected runs, ordered by run and
# then by time.
# NOTE(review): because the runs are concatenated, the last point of one run is
# connected directly to the first point of the next run — confirm this is the
# intended route geometry.
route_points = first_two_runs.geometry.tolist()
route_line = LineString(route_points)

# Wrap the route in a GeoDataFrame that shares the CRS of the runs.
route_line_gdf = gpd.GeoDataFrame(
    {'geometry': [route_line]},
    crs=runs.crs
)

# Stop line used as the prediction target.
# NOTE(review): this literal looks like a mis-decoded 'Goethestraße' (UTF-8 bytes
# read as a single-byte encoding). It is left unchanged because it has to match
# the 'Stop Name' values stored in the parquet file — confirm which spelling the
# file contains.
stop_line_name = 'GoethestraÃŸe'  # Replace with the actual stop line name
matching_stop_lines = stop_lines[stop_lines['Stop Name'] == stop_line_name]
if matching_stop_lines.empty:
    # Fail with an actionable message instead of the bare IndexError that
    # `.iloc[0]` raises on an empty selection.
    known_names = stop_lines['Stop Name'].dropna().unique()[:10].tolist()
    raise ValueError(
        f"No stop line named {stop_line_name!r} in 'Stop Name'; "
        f"first available names: {known_names}"
    )
stop_line = matching_stop_lines.iloc[0]
stop_line_point = stop_line.geometry


In [None]:
# Distance between two points after both have been snapped onto the bus route
def calculate_route_distance(route_line, point1, point2):
    """Return the distance between two points once both are snapped to the route.

    Both points are projected onto ``route_line``; the resulting offsets are
    clamped to ``[0, route_line.length]`` and converted back into points lying
    on the route. The result is the length of the straight segment that joins
    those two snapped points.

    NOTE(review): the value is the straight-line length between the snapped
    points, not the length travelled along the route between them — confirm
    this is the intended measure.

    Returns ``None`` (after printing a diagnostic) when a projection is NaN,
    when a snapped location is not a ``Point``, or when any step raises.
    """
    try:
        offset_a = route_line.project(point1)
        offset_b = route_line.project(point2)

        if np.isnan(offset_a) or np.isnan(offset_b):
            print(f"Invalid projection values: projection1={offset_a}, projection2={offset_b}, point1={point1}, point2={point2}")
            return None

        def snap(offset):
            # Keep the offset on the line, then turn it back into a location.
            bounded = max(0, min(offset, route_line.length))
            return route_line.interpolate(bounded)

        snapped_a = snap(offset_a)
        snapped_b = snap(offset_b)

        if not (isinstance(snapped_a, Point) and isinstance(snapped_b, Point)):
            print(f"Invalid projected points: projected_point1={snapped_a}, projected_point2={snapped_b}, point1={point1}, point2={point2}")
            return None

        # Length of the two-point segment between the snapped locations.
        return LineString([snapped_a, snapped_b]).length
    except Exception as e:
        print(f"Error calculating route distance: {e}, point1={point1}, point2={point2}")
        return None

In [None]:
# Build the modelling dataset: one row per GPS fix recorded before the bus
# reaches the fix that lies closest to the stop line.

# Runs lasting this long or longer (1.86 min = 111.6 s) are discarded.
MAX_RUN_DURATION_SECONDS = 1.86 * 60

# The route geometry is the same for every row, so look it up once.
route_geometry = route_line_gdf.geometry.iloc[0]

data = []

for run_id, run in runs.groupby('run'):
    # Chronological order with a 0..n-1 index, so labels equal positions.
    run = run.sort_values(by='utcTime').reset_index(drop=True)

    # Skip runs whose total duration reaches the threshold.
    run_duration = (run['utcTime'].iloc[-1] - run['utcTime'].iloc[0]).total_seconds()
    if run_duration >= MAX_RUN_DURATION_SECONDS:
        continue

    # Fix closest to the stop line (first one in time if several tie). The time
    # of that fix is treated as the moment the bus reaches the stop line.
    nearest_stop_idx = int(run.geometry.distance(stop_line_point).idxmin())
    nearest_stop_time = run.loc[nearest_stop_idx, 'utcTime']

    # Only fixes strictly before the nearest one contribute rows.
    approach = run.iloc[:nearest_stop_idx]
    for current_point, current_time in zip(approach.geometry, approach['utcTime']):
        distance = calculate_route_distance(route_geometry, current_point, stop_line_point)
        if distance is None:
            continue

        # Target: seconds remaining until the bus is at the nearest fix.
        time_difference = (nearest_stop_time - current_time).total_seconds()
        if time_difference < 0:
            continue

        day_of_week = current_time.weekday()
        time_of_day = current_time.hour
        month_of_year = current_time.month
        # Year plus week-of-year label, used later to split train/test by week.
        year_week = current_time.strftime('%Y-%U')

        data.append([run_id, distance, day_of_week, time_of_day, month_of_year, time_difference, year_week])

print(len(data))

# Convert the collected rows to a DataFrame
df = pd.DataFrame(data, columns=['run', 'distance', 'day_of_week', 'time_of_day', 'month_of_year', 'time_difference', 'year_week'])

In [None]:
# Seed shared by the week split and by model initialisation / training.
RANDOM_SEED = 42

# Columns fed to the model, in the order the scaler and the network expect.
FEATURE_COLUMNS = ['distance', 'day_of_week', 'time_of_day', 'month_of_year']

# Split by week rather than by row: 80% of the distinct weeks go to training,
# 20% to testing, and every row follows its week.
unique_weeks = df['year_week'].unique()
train_weeks, test_weeks = train_test_split(unique_weeks, test_size=0.2, random_state=RANDOM_SEED)

train_df = df[df['year_week'].isin(train_weeks)]
test_df = df[df['year_week'].isin(test_weeks)]

# Features (X) and target (y) for both splits
X_train = train_df[FEATURE_COLUMNS]
y_train = train_df['time_difference']
X_test = test_df[FEATURE_COLUMNS]
y_test = test_df['time_difference']

# Standardise the features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape to (samples, time steps, features) with a single time step per sample.
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling repeat across "Restart & Run All" executions.
tf.keras.utils.set_random_seed(RANDOM_SEED)

# LSTM -> dropout -> small dense layer -> single regression output
model = Sequential()
model.add(LSTM(16, activation='tanh', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
model.add(Dropout(0.1))
model.add(Dense(8, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Each training run writes to its own timestamped TensorBoard directory.
log_dir = "logs/fit/LSTM_without_" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train the LSTM model.
# NOTE(review): validation_data is the test split, so the validation curves and
# the final metrics below are computed on the same rows.
history = model.fit(
    X_train_reshaped,
    y_train,
    epochs=20,
    batch_size=64,
    validation_data=(X_test_reshaped, y_test),
    verbose=1,
    callbacks=[tensorboard_callback],
)

# Evaluate on the test split
y_pred_lstm = model.predict(X_test_reshaped)
mse_lstm = mean_squared_error(y_test, y_pred_lstm)
rmse_lstm = np.sqrt(mse_lstm)
r2_lstm = r2_score(y_test, y_pred_lstm)

print(f'LSTM Model - RMSE: {rmse_lstm}, R-squared: {r2_lstm}')

In [None]:
# Open TensorBoard inline; logs/fit is the parent of the timestamped log_dir
# that the TensorBoard callback writes to during training.
%tensorboard --logdir logs/fit

In [None]:
import matplotlib.pyplot as plt


def _plot_training_curves(train_values, val_values, metric_name):
    """Plot a per-epoch training metric together with its validation counterpart.

    ``train_values`` and ``val_values`` are the per-epoch sequences taken from
    the Keras history; ``metric_name`` is used for the legend, the y-axis label
    and the title.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(train_values, label=f'Training {metric_name}')
    ax.plot(val_values, label=f'Validation {metric_name}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric_name)
    ax.set_title(f'Training and Validation {metric_name}')
    ax.legend()
    plt.show()


# The trained model is the LSTM, so label the summary accordingly.
print(f'LSTM Model - RMSE: {rmse_lstm}, R-squared: {r2_lstm}')

# Per-epoch training and validation curves recorded by model.fit
training_loss = history.history['loss']
validation_loss = history.history['val_loss']
training_mae = history.history['mae']
validation_mae = history.history['val_mae']

print(f'LSTM Model - Final Training Loss: {training_loss[-1]}')
print(f'LSTM Model - Final Validation Loss: {validation_loss[-1]}')

_plot_training_curves(training_loss, validation_loss, 'Loss')
_plot_training_curves(training_mae, validation_mae, 'MAE')

# Predicted vs actual values, with the diagonal as the perfect-prediction reference.
actual_min, actual_max = y_test.min(), y_test.max()
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(y_test, y_pred_lstm, alpha=0.01)
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Predicted vs Actual Values')
ax.plot([actual_min, actual_max], [actual_min, actual_max], color='red')
plt.show()

In [None]:
# Predict the time remaining until the stop line with the trained LSTM model
def predict_time_difference_lstm(current_location, current_time):
    """Predict the seconds remaining until the bus reaches the stop line.

    Parameters
    ----------
    current_location : tuple or shapely Point
        Current position; a coordinate tuple is converted to a ``Point``.
    current_time : datetime
        Timestamp from which weekday, hour and month are derived.

    Returns
    -------
    float
        The model output for this single sample.

    Raises
    ------
    ValueError
        If the distance to the stop line cannot be computed for the location.
    """
    if isinstance(current_location, tuple):
        current_location = Point(current_location)

    distance = calculate_route_distance(route_line_gdf.geometry.iloc[0], current_location, stop_line_point)
    if distance is None:
        # Without a distance the scaler and the model would be fed a missing
        # value; stop here with an explicit error instead.
        raise ValueError(f"Could not compute the route distance for location {current_location}")

    day_of_week = current_time.weekday()
    # Whole hour, exactly as the training rows were built; a fractional hour
    # would be a feature the scaler and the model have never seen.
    time_of_day = current_time.hour
    month_of_year = current_time.month

    # Same column names and order as the frame the scaler was fitted on.
    features = pd.DataFrame(
        [[distance, day_of_week, time_of_day, month_of_year]],
        columns=['distance', 'day_of_week', 'time_of_day', 'month_of_year'],
    )
    features_scaled = scaler.transform(features)
    # (samples, time steps, features) with one time step, as during training.
    features_reshaped = features_scaled.reshape((features_scaled.shape[0], 1, features_scaled.shape[1]))
    predicted_time_difference = model.predict(features_reshaped)

    # Single sample in, single value out, whatever the output's dimensionality.
    return float(np.ravel(predicted_time_difference)[0])

# Example query: a position given as UTM coordinates (meters) and a timestamp.
current_location = (679646.0070022508, 5405541.164896245)
current_time = datetime(year=2024, month=6, day=10, hour=10, minute=35, second=45)
print(current_time)

# Run the trained model for this single position/time pair and report the result.
predicted_time = predict_time_difference_lstm(current_location, current_time)
print(f'Predicted Time Difference in Seconds: {predicted_time}')