In [None]:

import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from datetime import datetime
import math

In [None]:
# Monthly cleaned run files (parquet). The order matters: it determines which
# run ids appear first in `runs` and therefore which runs define the route below.
file_paths = [
    'new_cleaned/cleaned_november_without.parquet',
    'new_cleaned/cleaned_december_without.parquet',
    'new_cleaned/cleaned_January_without.parquet',
    'new_cleaned/cleaned_october_without.parquet',
    'new_cleaned/cleaned_february_without.parquet',
    'new_cleaned/cleaned_march_without.parquet',
    'new_cleaned/cleaned_april_without.parquet',
    # Add paths for all the months you have
]

# Load every month and stack them into a single GeoDataFrame
runs_list = [gpd.read_parquet(file_path) for file_path in file_paths]
runs = pd.concat(runs_list, ignore_index=True)

# Load the stop lines data
stop_lines = gpd.read_parquet('Data/stop_lines_cut.parquet')
# Print the total number of rows
print("Total number of rows:", len(runs))

# Reproject the stop lines if their CRS differs from the runs' CRS
if stop_lines.crs != runs.crs:
    stop_lines = stop_lines.to_crs(runs.crs)

# Take the first 10 run ids (in order of appearance) to create the route.
# The variable names still say "first_two" so later cells that reference them
# keep working, but 10 runs are used.
first_two_runs_ids = runs['run'].unique()[:10]
first_two_runs = runs[runs['run'].isin(first_two_runs_ids)].sort_values(by=['run', 'utcTime'])

# Chain every point of the selected runs (ordered by run, then time) into one
# LineString. Because several runs are concatenated, the line passes over the
# same stretch once per run.
route_points = first_two_runs.geometry.tolist()
route_line = LineString(route_points)

# Wrap the LineString in a GeoDataFrame that carries the runs' CRS
route_line_gdf = gpd.GeoDataFrame(
    {'geometry': [route_line]},
    crs=runs.crs
)

# Define the stop line to work with
# NOTE(review): this literal looks like a mis-decoded 'Goethestraße' (UTF-8 read
# as cp1252). It is kept unchanged here; confirm against stop_lines['Stop Name'].
stop_line_name = 'GoethestraÃŸe'  # Replace with the actual stop line name
matching_stop_lines = stop_lines[stop_lines['Stop Name'] == stop_line_name]
if matching_stop_lines.empty:
    # Fail with a readable message instead of a bare IndexError from .iloc[0]
    available_names = stop_lines['Stop Name'].dropna().astype(str).unique().tolist()
    raise ValueError(
        f"No stop line named {stop_line_name!r} found in 'Stop Name'. "
        f"Available names: {available_names}"
    )
stop_line = matching_stop_lines.iloc[0]
stop_line_point = stop_line.geometry

# Two hard-coded points; not referenced in the cells visible here.
# NOTE(review): presumably in the same projected CRS as `runs` — confirm.
bus_stops = [
    Point(679586.1783256574, 5405375.354150176),
    Point(679448.5525315781, 5405140.401914258)
]

In [None]:
def calculate_route_distance(route_line, point1, point2):
    """Straight-line distance between two points after snapping both onto a route.

    Each point is projected onto ``route_line``, the resulting position is
    clamped to ``[0, route_line.length]`` and turned back into a point on the
    line. The value returned is the length of the straight two-point segment
    joining those snapped points, in the coordinate units of the geometries.
    It is NOT the distance travelled along the route between them.

    Args:
        route_line: line geometry providing ``project``, ``interpolate`` and
            ``length``.
        point1: first geometry to snap onto the route.
        point2: second geometry to snap onto the route.

    Returns:
        The segment length, or ``None`` when a projection is NaN, a snapped
        geometry is not a ``Point``, or any exception is raised. In each of
        those cases a diagnostic message is printed.
    """
    try:
        # Position of each input along the route
        along1, along2 = (route_line.project(pt) for pt in (point1, point2))

        if any(np.isnan(position) for position in (along1, along2)):
            print(f"Invalid projection values: projection1={along1}, projection2={along2}, point1={point1}, point2={point2}")
            return None

        # Keep both positions inside the extent of the line
        route_length = route_line.length
        along1 = max(0, min(along1, route_length))
        along2 = max(0, min(along2, route_length))

        # Turn the positions back into points lying on the route
        snapped1 = route_line.interpolate(along1)
        snapped2 = route_line.interpolate(along2)

        if not (isinstance(snapped1, Point) and isinstance(snapped2, Point)):
            print(f"Invalid projected points: projected_point1={snapped1}, projected_point2={snapped2}, point1={point1}, point2={point2}")
            return None

        # Length of the straight segment joining the two snapped points
        return LineString([snapped1, snapped2]).length
    except Exception as e:
        # Best effort: report the problem and let the caller skip this sample
        print(f"Error calculating route distance: {e}, point1={point1}, point2={point2}")
        return None



In [None]:
# Prepare the dataset for models: one record per recorded point that precedes
# the point of the run closest to the stop line. The target is the time (in
# seconds) from that record until the closest-point timestamp.
data = []

# Runs lasting this many seconds or longer are skipped.
# NOTE(review): earlier comments spoke of a 3-minute limit, but the threshold in
# the code is 1.1 minutes (66 s). The code's value is kept; confirm the intent.
MAX_RUN_DURATION_S = 1.1 * 60

# The route geometry is the same for every sample, so look it up once
route_geometry = route_line_gdf.geometry.iloc[0]

# Iterate through each run
for run_id, run in runs.groupby('run'):
    # Sort by time and reset the index so labels equal positions
    run = run.sort_values(by='utcTime').reset_index(drop=True)

    run_start_time = run.loc[0, 'utcTime']
    run_end_time = run.loc[len(run) - 1, 'utcTime']
    run_duration = (run_end_time - run_start_time).total_seconds()
    if run_duration >= MAX_RUN_DURATION_S:
        continue  # Skip runs at or above the duration limit

    # Find the recorded point closest to the stop line by distance. This avoids
    # the deprecated `unary_union` and the exact geometry-equality lookup.
    # On ties the earliest point wins.
    stop_distances = run.geometry.distance(stop_line_point)
    if stop_distances.isna().all():
        continue  # No usable geometry in this run
    nearest_stop_idx = stop_distances.idxmin()
    nearest_stop_time = run.loc[nearest_stop_idx, 'utcTime']

    # Only the points recorded before the closest point are used as samples
    approach = run.iloc[:nearest_stop_idx]
    for current_point, current_time in zip(approach.geometry, approach['utcTime']):
        distance = calculate_route_distance(route_geometry, current_point, stop_line_point)

        if distance is None:
            continue

        time_difference = (nearest_stop_time - current_time).total_seconds()
        if time_difference < 0:
            continue

        day_of_week = current_time.weekday()
        time_of_day = current_time.hour
        month_of_year = current_time.month
        # Year and week number, used later to split train/test by whole weeks
        year_week = current_time.strftime('%Y-%U')

        data.append([run_id, distance, day_of_week, time_of_day, month_of_year, time_difference, year_week])

print(len(data))

# Convert the data to a DataFrame
df = pd.DataFrame(data, columns=['run', 'distance', 'day_of_week', 'time_of_day', 'month_of_year', 'time_difference', 'year_week'])


In [None]:
# Split by whole calendar weeks, so all samples of a given week land on the
# same side of the split (80% of the weeks for training, 20% for testing).
unique_weeks = df['year_week'].unique()
train_weeks, test_weeks = train_test_split(unique_weeks, test_size=0.2, random_state=42)

# Row masks derived from the week assignment
in_train_weeks = df['year_week'].isin(train_weeks)
in_test_weeks = df['year_week'].isin(test_weeks)
train_df = df.loc[in_train_weeks]
test_df = df.loc[in_test_weeks]

# Features (X) and target (y) for both sets
feature_columns = ['distance', 'day_of_week', 'time_of_day', 'month_of_year']
target_column = 'time_difference'
X_train, y_train = train_df[feature_columns], train_df[target_column]
X_test, y_test = test_df[feature_columns], test_df[target_column]

# Standardise the features; the scaler is fitted on the training set only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the linear regression on the scaled training features
linear_model = LinearRegression().fit(X_train_scaled, y_train)

# Score the model on the held-out weeks
y_pred_linear = linear_model.predict(X_test_scaled)
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(y_test, y_pred_linear)

print(f'Linear Regression Model - RMSE: {rmse_linear}, R-squared: {r2_linear}')


In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# statsmodels does not add an intercept on its own, so add a constant column.
# The 'const' label is listed first below, which assumes add_constant puts the
# constant in the first column.
X_train_sm = sm.add_constant(X_train_scaled)

# Variance inflation factor for every column of the design matrix
feature_labels = ['const'] + X_train.columns.tolist()
vif_values = [
    variance_inflation_factor(X_train_sm, column_idx)
    for column_idx in range(X_train_sm.shape[1])
]
vif_data = pd.DataFrame({"Feature": feature_labels, "VIF": vif_values})

print(vif_data)

# Fitted parameters of the linear model (these apply to the scaled features)
intercept = linear_model.intercept_
coefficients = linear_model.coef_

print("Intercept:", intercept)
print("Coefficients:", coefficients)


In [None]:
import matplotlib.pyplot as plt

# Actual vs predicted values; the dashed red diagonal marks a perfect prediction
actual_min, actual_max = y_test.min(), y_test.max()

fig_fit, ax_fit = plt.subplots(figsize=(10, 6))
ax_fit.scatter(y_test, y_pred_linear, alpha=0.01)
ax_fit.plot([actual_min, actual_max], [actual_min, actual_max], color='red', linestyle='--')
ax_fit.set_title('Linear Regression Model: Actual vs Predicted')
ax_fit.set_xlabel('Actual Time Difference')
ax_fit.set_ylabel('Predicted Time Difference')
plt.show()

# Residuals (actual minus predicted) against the predicted value
residuals = y_test - y_pred_linear

fig_resid, ax_resid = plt.subplots(figsize=(10, 6))
ax_resid.scatter(y_pred_linear, residuals, alpha=0.01)
ax_resid.axhline(y=0, color='red', linestyle='--')
ax_resid.set_title('Linear Regression Model: Residuals')
ax_resid.set_xlabel('Predicted Time Difference')
ax_resid.set_ylabel('Residuals')
plt.show()

In [None]:
def predict_time_difference(current_location, current_time):
    """Predict the seconds until the stop line is reached, using the linear model.

    The features are built the same way as the training data: distance from
    ``calculate_route_distance``, weekday, integer hour and month. They are then
    passed through the fitted ``scaler`` before prediction, because
    ``linear_model`` was trained on scaled features.

    Args:
        current_location: a shapely ``Point`` or an ``(x, y)`` tuple, which is
            converted to a ``Point``.
        current_time: a ``datetime``-like object providing ``weekday()``,
            ``hour`` and ``month``.

    Returns:
        The predicted time difference (first element of the model output).

    Raises:
        ValueError: if the distance to the stop line cannot be computed.
    """
    if isinstance(current_location, tuple):
        current_location = Point(current_location)

    distance = calculate_route_distance(route_line_gdf.geometry.iloc[0], current_location, stop_line_point)
    if distance is None:
        # Fail early with a clear message instead of passing None to the model
        raise ValueError(
            f"Could not compute the route distance for location {current_location}"
        )

    day_of_week = current_time.weekday()
    # Integer hour, matching the feature used when the model was trained
    time_of_day = current_time.hour
    month_of_year = current_time.month

    feature_columns = ['distance', 'day_of_week', 'time_of_day', 'month_of_year']
    features = pd.DataFrame(
        [[distance, day_of_week, time_of_day, month_of_year]],
        columns=feature_columns,
    )

    # Apply the same standardisation the model saw during training
    features_scaled = scaler.transform(features)
    predicted_time_difference = linear_model.predict(features_scaled)

    return predicted_time_difference[0]


# Example query: a single location given as an (x, y) tuple and a timestamp.
# NOTE(review): June is not among the months loaded for training (October to
# April), so this prediction extrapolates on month_of_year — confirm intent.
current_location = (679646.0070022508, 5405541.164896245)
current_time = datetime(year=2024, month=6, day=10, hour=10, minute=35, second=45)
print(current_time)

predicted_time = predict_time_difference(current_location, current_time)
print(f'Predicted Time Difference in Seconds: {predicted_time}')