In [None]:
#Average duration and number of runs
import pandas as pd
import geopandas as gpd
import plotly.express as px
from pyproj import Transformer
from shapely import wkt
import plotly.graph_objects as go

# Monthly parquet exports to analyse.
# NOTE: the entries are not in chronological order (November is listed
# before October), so the concatenated rows are not globally time-ordered.
file_paths = [
    'new_cleaned/november_traffic.parquet',
    'new_cleaned/october_traffic.parquet',
    'new_cleaned/december_traffic.parquet',
    'new_cleaned/january_traffic.parquet',
    'new_cleaned/february_traffic.parquet',
    'new_cleaned/march_traffic.parquet',
    'new_cleaned/april_traffic.parquet',
    # Add paths for all the months you have
]

# Load all runs data into a single GeoDataFrame
runs_list = [gpd.read_parquet(file_path) for file_path in file_paths]
runs = pd.concat(runs_list, ignore_index=True)

# Ensure clean, sequential indices before processing. reset_index() returns
# a new frame unless inplace=True is passed, so the result must not be
# discarded; drop=True avoids adding the old index as an extra column.
runs.reset_index(drop=True, inplace=True)

# Calculate the total number of unique runs in the GeoDataFrame
total_runs = runs['run'].nunique()

print(f'Total number of unique runs: {total_runs}')

# Parse the timestamps so that datetime arithmetic works on them later
runs['utcTime'] = pd.to_datetime(runs['utcTime'])

# The first row is used as a probe: if geometries are stored as WKT text,
# parse the whole column into shapely objects
geometry_probe = runs['geometry'].iloc[0]
if isinstance(geometry_probe, str):
    runs['geometry'] = runs['geometry'].apply(wkt.loads)

# Transformer from UTM Zone 32N (EPSG:32632) to WGS 84 (EPSG:4326);
# always_xy=True keeps the axis order as (x, y) -> (longitude, latitude)
transformer = Transformer.from_crs("epsg:32632", "epsg:4326", always_xy=True)

# Convert the point coordinates and store them as longitude/latitude columns
point_x = runs['geometry'].x
point_y = runs['geometry'].y
runs['longitude'], runs['latitude'] = transformer.transform(point_x, point_y)

# 'first' and 'last' pick values by row position, not by timestamp. The
# monthly files are concatenated out of chronological order (November is
# loaded before October), so rows are sorted by time first; otherwise a run
# whose rows are not already time-ordered can get a wrong start/end and even
# a negative duration. Only the columns needed for the aggregation are
# copied, and a stable sort keeps the original relative order of rows that
# share a timestamp.
runs_by_time = runs[['run', 'utcTime', 'latitude', 'longitude']].sort_values(
    'utcTime', kind='stable'
)

# Group by 'run' and calculate start and end times for each run
run_durations = runs_by_time.groupby('run').agg(
    start_time=('utcTime', 'first'),  # Earliest timestamp of the run
    end_time=('utcTime', 'last'),     # Latest timestamp of the run
    start_latitude=('latitude', 'first'),  # Latitude at the start of the run
    start_longitude=('longitude', 'first')  # Longitude at the start of the run
)

# Calculate the duration of each run in seconds
run_durations['duration'] = (run_durations['end_time'] - run_durations['start_time']).dt.total_seconds()

# Reset the index to make 'run' a column
run_durations.reset_index(inplace=True)

def _summarize_by(frame, key):
    """Summarise run durations for every distinct value of ``key``.

    Args:
        frame: Per-run table containing a 'duration' column and ``key``.
        key: Name of the column to group by.

    Returns:
        A ``(mean_duration, run_count, merged)`` tuple of DataFrames: the
        mean 'duration' per group, the number of rows per group (column
        'num_runs'), and both merged on ``key``.
    """
    grouped = frame.groupby(key)
    mean_duration = grouped['duration'].mean().reset_index()
    run_count = grouped.size().reset_index(name='num_runs')
    return mean_duration, run_count, pd.merge(mean_duration, run_count, on=key)


# Extract the hour from the start_time for visualization
run_durations['start_hour'] = run_durations['start_time'].dt.hour

# Average duration and number of runs for each hour
avg_duration_per_hour, num_runs_per_hour, merged_hour_data = _summarize_by(
    run_durations, 'start_hour'
)

# Extract ISO week and year information (isocalendar() is computed once)
iso_calendar = run_durations['start_time'].dt.isocalendar()
run_durations['year'] = iso_calendar['year']
run_durations['week'] = iso_calendar['week']

# Create a column that uniquely identifies each week in each year. The week
# number is zero-padded so that the labels sort chronologically: groupby
# sorts its keys as strings, and without padding '2024-W10' would come
# before '2024-W2', scrambling the x axis of the weekly line plot.
run_durations['year_week'] = (
    run_durations['year'].astype(str)
    + '-W'
    + run_durations['week'].astype(str).str.zfill(2)
)

# Average duration and number of runs for each week
avg_duration_per_week, num_runs_per_week, merged_week_data = _summarize_by(
    run_durations, 'year_week'
)

# Extract month and year information
run_durations['year_month'] = run_durations['start_time'].dt.strftime('%Y-%m')

# Average duration and number of runs for each month
avg_duration_per_month, num_runs_per_month, merged_month_data = _summarize_by(
    run_durations, 'year_month'
)

# Extract day of week information (0 = Monday ... 6 = Sunday)
run_durations['day_of_week'] = run_durations['start_time'].dt.dayofweek

# Map day of week number to day name
day_name_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
run_durations['day_name'] = run_durations['day_of_week'].map(day_name_mapping)

# Average duration and number of runs for each day of the week
avg_duration_per_day, num_runs_per_day, merged_day_data = _summarize_by(
    run_durations, 'day_name'
)

# Grouping by name sorts the days alphabetically; restore the weekday number
# and sort on it so the plot shows Monday..Sunday
day_number_by_name = {name: number for number, name in day_name_mapping.items()}
merged_day_data['day_of_week'] = merged_day_data['day_name'].map(day_number_by_name)
merged_day_data.sort_values('day_of_week', inplace=True)

def _average_duration_figure(data, x_column, title, xaxis):
    """Build a line chart of average run duration labelled with run counts.

    Args:
        data: Aggregated table containing ``x_column``, 'duration' and
            'num_runs' columns.
        x_column: Name of the column plotted on the x axis.
        title: Title of the figure.
        xaxis: Dict of layout settings for the x axis.

    Returns:
        The configured ``go.Figure``.
    """
    fig = go.Figure()

    # Average duration line (red); the number of runs is drawn as a text
    # label above every point
    fig.add_trace(go.Scatter(
        x=data[x_column],
        y=data['duration'],
        mode='lines+markers+text',
        text=data['num_runs'],
        textposition='top center',
        name='Average Duration (seconds)',
        line=dict(color='red'),
    ))

    # Note in the corner explaining what the point labels mean
    fig.add_annotation(
        xref='paper', yref='paper',
        x=0.95, y=0.95,
        text='Number of runs',
        showarrow=False,
    )

    fig.update_layout(
        title=title,
        xaxis=xaxis,
        yaxis=dict(
            title='Average Duration (seconds)',
        ),
        legend=dict(x=0.1, y=1.1),
    )
    return fig


# Plot average duration and number of runs per hour
fig_hour = _average_duration_figure(
    merged_hour_data,
    'start_hour',
    'Average Run Duration and Number of Runs Compared to Time of Day',
    dict(title='Hour of Day'),
)

# Plot average duration and number of runs per week
fig_week = _average_duration_figure(
    merged_week_data,
    'year_week',
    'Average Run Duration and Number of Runs Per Week',
    dict(title='Week', tickmode='linear'),
)

# Plot average duration and number of runs per month
fig_month = _average_duration_figure(
    merged_month_data,
    'year_month',
    'Average Run Duration and Number of Runs Per Month',
    dict(title='Month'),
)

# Plot average duration and number of runs per day of the week
fig_day = _average_duration_figure(
    merged_day_data,
    'day_name',
    'Average Run Duration and Number of Runs Per Day of the Week',
    dict(title='Day of Week'),
)

# Show the plots
fig_hour.show()
fig_week.show()
fig_month.show()
fig_day.show()






In [None]:
# Work on the per-run durations (in seconds) computed earlier
duration_values = run_durations['duration']

# Mean duration over all runs
average_duration = duration_values.mean()

# Spread of the durations: longest run minus shortest run
longest_duration = duration_values.max()
shortest_duration = duration_values.min()
range_duration = longest_duration - shortest_duration

# Report both figures
print(f"Average Duration: {average_duration} seconds")
print(f"Range of Durations: {range_duration} seconds")

In [None]:
import pandas as pd
import geopandas as gpd
import plotly.express as px
from pyproj import Transformer
from shapely import wkt
import plotly.graph_objects as go

# Raw monthly parquet exports (file names carry their from/to dates)
file_paths = [
    'Data/gdf_23-11-01_23-12-01.parquet',
    'Data/gdf_23-10-01_23-11-01.parquet',
    'Data/gdf_23-12-01_24-01-01.parquet',
    'Data/gdf_24-01-01_24-02-01.parquet',
    'Data/gdf_24-02-01_24-03-01.parquet',
    'Data/gdf_24-03-01_24-04-01.parquet',
    'Data/gdf_24-04-01_24-04-22.parquet',
    # Add paths for all the months you have
]

# Read every file and stack the frames; the original index is kept because
# the run identifier is looked up as the index level 'run' below
runs_list = list(map(gpd.read_parquet, file_paths))
runs = pd.concat(runs_list)

# Count the distinct run identifiers found in the 'run' index level
run_ids = runs.index.get_level_values('run')
total_runs = run_ids.nunique()

print(f'Total number of unique runs: {total_runs}')







In [None]:
#Distribution plots
import pandas as pd
import geopandas as gpd
import plotly.express as px
from pyproj import Transformer
from shapely import wkt
import plotly.graph_objects as go

# Cleaned monthly parquet exports used for the distribution plots
file_paths = [
    'new_cleaned/cleaned_november.parquet',
    'new_cleaned/cleaned_october.parquet',
    'new_cleaned/cleaned_december.parquet',
    'new_cleaned/cleaned_january.parquet',
    'new_cleaned/cleaned_february.parquet',
    'new_cleaned/cleaned_march.parquet',
    'new_cleaned/cleaned_april.parquet',
    # Add paths for all the months you have
]

# Read each monthly file and stack them into one GeoDataFrame
runs_list = [gpd.read_parquet(path) for path in file_paths]
combined_runs = pd.concat(runs_list, ignore_index=True)

# Make sure the index is a clean 0..n-1 range before processing
runs = combined_runs.reset_index(drop=True)

# Number of distinct run identifiers in the combined data
total_runs = runs['run'].nunique()

print(f'Total number of unique runs: {total_runs}')

# Turn the utcTime column into pandas datetimes
runs['utcTime'] = pd.to_datetime(runs['utcTime'])

# If the geometry column holds WKT strings (checked on the first row),
# parse every entry into a shapely geometry
first_geometry = runs['geometry'].iloc[0]
if isinstance(first_geometry, str):
    runs['geometry'] = runs['geometry'].apply(wkt.loads)

# UTM Zone 32N (EPSG:32632) -> WGS 84 (EPSG:4326); with always_xy=True the
# transformer takes (x, y) and returns (longitude, latitude)
transformer = Transformer.from_crs("epsg:32632", "epsg:4326", always_xy=True)

# Convert the point coordinates and attach them as longitude/latitude
easting = runs['geometry'].x
northing = runs['geometry'].y
runs['longitude'], runs['latitude'] = transformer.transform(easting, northing)

# 'first' and 'last' pick values by row position, not by timestamp. The
# monthly files are concatenated out of chronological order (November is
# loaded before October), so rows are sorted by time first; otherwise a run
# whose rows are not already time-ordered can get a wrong start/end and even
# a negative duration. Only the columns needed for the aggregation are
# copied, and a stable sort keeps the original relative order of rows that
# share a timestamp.
runs_by_time = runs[['run', 'utcTime', 'latitude', 'longitude']].sort_values(
    'utcTime', kind='stable'
)

# Group by 'run' and calculate start and end times for each run
run_durations = runs_by_time.groupby('run').agg(
    start_time=('utcTime', 'first'),  # Earliest timestamp of the run
    end_time=('utcTime', 'last'),     # Latest timestamp of the run
    start_latitude=('latitude', 'first'),  # Latitude at the start of the run
    start_longitude=('longitude', 'first')  # Longitude at the start of the run
)

# Calculate the duration of each run in seconds
run_durations['duration'] = (run_durations['end_time'] - run_durations['start_time']).dt.total_seconds()

# Reset the index to make 'run' a column
run_durations.reset_index(inplace=True)

# Extract the hour from the start_time for visualization
run_durations['start_hour'] = run_durations['start_time'].dt.hour

# Extract the day of the week from the start_time (0 = Monday ... 6 = Sunday)
run_durations['start_day'] = run_durations['start_time'].dt.dayofweek

# Extract ISO week and year information (isocalendar() is computed once)
iso_calendar = run_durations['start_time'].dt.isocalendar()
run_durations['year'] = iso_calendar['year']
run_durations['week'] = iso_calendar['week']

# Create a column that uniquely identifies each week in each year. The week
# number is zero-padded so the labels sort chronologically as strings
# (unpadded, '2024-W10' would sort before '2024-W2').
run_durations['year_week'] = (
    run_durations['year'].astype(str)
    + '-W'
    + run_durations['week'].astype(str).str.zfill(2)
)

# Extract month and year information
run_durations['year_month'] = run_durations['start_time'].dt.strftime('%Y-%m')

# Day of week mapping
day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
run_durations['start_day_name'] = run_durations['start_day'].map(day_names)

def _duration_distribution_figure(data, x_column, title, xaxis):
    """Build a scatter plot of individual run durations against ``x_column``.

    Args:
        data: Per-run table containing ``x_column`` and a 'duration' column.
        x_column: Name of the column plotted on the x axis.
        title: Title of the figure.
        xaxis: Dict of layout settings for the x axis.

    Returns:
        The configured ``go.Figure``.
    """
    fig = go.Figure()

    # One semi-transparent blue marker per run
    fig.add_trace(go.Scatter(
        x=data[x_column],
        y=data['duration'],
        mode='markers',
        name='Duration (seconds)',
        marker=dict(color='blue', opacity=0.5),
    ))

    fig.update_layout(
        title=title,
        xaxis=xaxis,
        yaxis=dict(
            title='Duration (seconds)',
        ),
        legend=dict(x=0.1, y=1.1),
    )
    return fig


# Plot duration distribution per hour
fig_hour_dist = _duration_distribution_figure(
    run_durations,
    'start_hour',
    'Run Duration Distribution by Hour of Day',
    dict(title='Hour of Day'),
)

# Plot duration distribution per week
fig_week_dist = _duration_distribution_figure(
    run_durations,
    'year_week',
    'Run Duration Distribution by Week',
    dict(title='Week', tickmode='linear'),
)

# Plot duration distribution per month
fig_month_dist = _duration_distribution_figure(
    run_durations,
    'year_month',
    'Run Duration Distribution by Month',
    dict(title='Month'),
)

# Plot duration distribution per day of the week; the category array pins
# the axis order to Monday..Sunday
fig_day_dist = _duration_distribution_figure(
    run_durations,
    'start_day_name',
    'Run Duration Distribution by Day of Week',
    dict(title='Day of Week', tickmode='linear', categoryorder='array', categoryarray=list(day_names.values())),
)

# Show the plots
fig_hour_dist.show()
fig_week_dist.show()
fig_month_dist.show()
fig_day_dist.show()




