# Aggregating data for D3

# Import data and packages

In [13]:
import pandas as pd
from datetime import timedelta
df = pd.read_csv('data/ridership.csv')

display(df.head())

Unnamed: 0,id,closed_status,duration,start_id,start_date,start_station_name,end_date,end_id,end_station_name,rider_type,...,start_time_only,start_time_seconds,end_date_only,end_time_only,end_time_seconds,day_of_week,start_lat,start_lon,end_lat,end_lon
0,29681508,NORMAL,62,7948,2024-06-30 03:16:00,Bathurst St / Housey St,2024-06-30 03:17:00,7948,Bathurst St / Housey St,MEMBER,...,03:16:00,11760,2024-06-30,03:17:00,11822,Sunday,43.637819,-79.400132,43.637819,-79.400132
1,29682786,NORMAL,876,7948,2024-06-30 08:18:00,Bathurst St / Housey St,2024-06-30 08:32:00,7938,Portland St / Wellington St W,MEMBER,...,08:18:00,29880,2024-06-30,08:32:00,30756,Sunday,43.637819,-79.400132,43.642902,-79.39937
2,29644869,NORMAL,766,7948,2024-06-28 16:04:00,Bathurst St / Housey St,2024-06-28 16:16:00,7927,Strachan Ave / East Liberty St - SMART,MEMBER,...,16:04:00,57840,2024-06-28,16:16:00,58606,Friday,43.637819,-79.400132,43.639065,-79.41081
3,29672434,NORMAL,1333,7948,2024-06-29 18:39:00,Bathurst St / Housey St,2024-06-29 19:02:00,7927,Strachan Ave / East Liberty St - SMART,MEMBER,...,18:39:00,67140,2024-06-29,19:02:00,68473,Saturday,43.637819,-79.400132,43.639065,-79.41081
4,29691472,NORMAL,1002,7948,2024-06-30 14:07:00,Bathurst St / Housey St,2024-06-30 14:24:00,7802,King St W / Jameson Ave - SMART,MEMBER,...,14:07:00,50820,2024-06-30,14:24:00,51822,Sunday,43.637819,-79.400132,43.637358,-79.43618


# Duration

### Duration histogram data

In [30]:
# Filtering for trips less n seconds
max_duration = 3600 

df_filtered = df[df['duration'] < max_duration][['id', 'duration', 'bike_model']]

# Bin size (in seconds)
bins = range(0, df_filtered['duration'].max() + 60, 60) 

labels = [f"{i}-{i+9}" for i in bins[:-1]]

df_filtered['duration_bin'] = pd.cut(df_filtered['duration'], bins=bins, right=False)

duration_counts = df_filtered.groupby(['duration_bin', 'bike_model']).size().unstack(fill_value=0)

duration_counts_df = duration_counts.reset_index()

# Interval start
duration_counts_df['interval_start'] = duration_counts_df['duration_bin'].apply(lambda x: int(x.left))

# Normalize the "EFIT" column
total_efit = duration_counts_df['EFIT'].sum()
duration_counts_df['EFIT_normalized'] = duration_counts_df['EFIT'] / total_efit

# Normalize the "ICONIC" column
total_iconic = duration_counts_df['ICONIC'].sum()
duration_counts_df['ICONIC_normalized'] = duration_counts_df['ICONIC'] / total_iconic

duration_counts_df.to_csv('../../static/duration_counts.csv', index=False)

  duration_counts = df_filtered.groupby(['duration_bin', 'bike_model']).size().unstack(fill_value=0)


### Duration stat table

In [4]:
# Filter for specific bike models
filtered_models = ['ICONIC', 'EFIT']
filtered_df = df[df['bike_model'].isin(filtered_models)]

# Get unique bike models from the filtered DataFrame
bike_models = filtered_df['bike_model'].unique()

# Calculate statistics from filtered DataFrame
stats_filtered = filtered_df.groupby('bike_model')['duration'].agg(['mean', 'median', 'std']).reset_index()

# Calculate min and max from original DataFrame
stats_original = df[df['bike_model'].isin(filtered_models)].groupby('bike_model')['duration'].agg(['min', 'max']).reset_index()

# Merge the statistics
stats = pd.merge(stats_filtered, stats_original, on='bike_model')

# Round statistics to 2 decimal places
stats = stats.round(2)

# Sort statistics to show ICONIC first
stats = stats.set_index('bike_model').loc[['ICONIC', 'EFIT']].reset_index()

# Display the statistics
print(stats)

stats.to_csv('../../static/duration_stats.csv', index=False)

  bike_model     mean  median      std  min      max
0     ICONIC  1086.91   720.0  8078.08    0  2148980
1       EFIT  1091.16   747.0  4482.35    0   870452


# Distance

### Distance histogram data

In [25]:
# Filtering for trips less n meters
max_distance = 12000 

df_filtered = df[df['distance_average'] < max_distance][['id', 'distance_average', 'bike_model']]

# Bin size (in meters)
bins = range(0, int(df_filtered['distance_average'].max()) + 100, 100) 

labels = [f"{i}-{i+9}" for i in bins[:-1]]

df_filtered['distance_bin'] = pd.cut(df_filtered['distance_average'], bins=bins, right=False)

distance_counts = df_filtered.groupby(['distance_bin', 'bike_model']).size().unstack(fill_value=0)

distance_counts_df = distance_counts.reset_index()

# Interval start
distance_counts_df['interval_start'] = distance_counts_df['distance_bin'].apply(lambda x: int(x.left))

# Normalize the "EFIT" column
total_efit = distance_counts_df['EFIT'].sum()
distance_counts_df['EFIT_normalized'] = distance_counts_df['EFIT'] / total_efit

# Normalize the "ICONIC" column
total_iconic = distance_counts_df['ICONIC'].sum()
distance_counts_df['ICONIC_normalized'] = distance_counts_df['ICONIC'] / total_iconic

distance_counts_df.to_csv('../../static/distance_counts.csv', index=False)

distance_counts_df


  distance_counts = df_filtered.groupby(['distance_bin', 'bike_model']).size().unstack(fill_value=0)


bike_model,distance_bin,EFIT,ICONIC,interval_start,EFIT_normalized,ICONIC_normalized
0,"[0, 100)",126,498,0,0.000958,0.000792
1,"[100, 200)",238,1451,100,0.001809,0.002308
2,"[200, 300)",418,2803,200,0.003178,0.004458
3,"[300, 400)",682,4798,300,0.005185,0.007630
4,"[400, 500)",1009,7766,400,0.007671,0.012350
...,...,...,...,...,...,...
115,"[11500, 11600)",69,132,11500,0.000525,0.000210
116,"[11600, 11700)",64,132,11600,0.000487,0.000210
117,"[11700, 11800)",52,103,11700,0.000395,0.000164
118,"[11800, 11900)",61,112,11800,0.000464,0.000178


### Distance stat table

In [6]:
# Filter for specific bike models
filtered_models = ['ICONIC', 'EFIT']
filtered_df = df[df['bike_model'].isin(filtered_models)]

# Get unique bike models from the filtered DataFrame
bike_models = filtered_df['bike_model'].unique()

# Calculate statistics from filtered DataFrame
stats_filtered = filtered_df.groupby('bike_model')['distance_average'].agg(['mean', 'median', 'std']).reset_index()

# Calculate min and max from original DataFrame
stats_original = df[df['bike_model'].isin(filtered_models)].groupby('bike_model')['distance_average'].agg(['min', 'max']).reset_index()

# Merge the statistics
stats = pd.merge(stats_filtered, stats_original, on='bike_model')

# Round statistics to 2 decimal places
stats = stats.round(2)

# Sort statistics to show ICONIC first
stats = stats.set_index('bike_model').loc[['ICONIC', 'EFIT']].reset_index()

# Display the statistics
print(stats)

stats.to_csv('../../static/distance_stats.csv', index=False)

  bike_model     mean   median      std  min      max
0     ICONIC  2768.05  2303.09  1944.19  0.0  29671.7
1       EFIT  3390.23  2806.44  2391.33  0.0  27845.2


# Elevation

### Elevation histogram data

In [21]:
max_elevation = 110
min_elevation = -110 

# max_elevation = df['elevation_delta_average'].max()
# min_elevation = df['elevation_delta_average'].min()

df_filtered = df[(df['elevation_delta_average'] < max_elevation) & (df['elevation_delta_average'] > min_elevation)][['id', 'elevation_delta_average', 'bike_model']]

# Bin size (in meters)
bins = range(int(df_filtered['elevation_delta_average'].min()), int(df_filtered['elevation_delta_average'].max()) + 2, 2) 

labels = [f"{i}-{i+9}" for i in bins[:-1]]

df_filtered['elevation_bin'] = pd.cut(df_filtered['elevation_delta_average'], bins=bins, right=False)

elevation_counts = df_filtered.groupby(['elevation_bin', 'bike_model']).size().unstack(fill_value=0)

elevation_counts_df = elevation_counts.reset_index()

elevation_counts_df['interval_start'] = elevation_counts_df['elevation_bin'].apply(lambda x: int(x.left))

# Normalize the "EFIT" column
total_efit = elevation_counts_df['EFIT'].sum()
elevation_counts_df['EFIT_normalized'] = elevation_counts_df['EFIT'] / total_efit

# Normalize the "ICONIC" column
total_iconic = elevation_counts_df['ICONIC'].sum()
elevation_counts_df['ICONIC_normalized'] = elevation_counts_df['ICONIC'] / total_iconic

elevation_counts_df.to_csv('../../static/elevation_counts.csv', index=False)

elevation_counts_df

  elevation_counts = df_filtered.groupby(['elevation_bin', 'bike_model']).size().unstack(fill_value=0)


bike_model,elevation_bin,EFIT,ICONIC,interval_start,EFIT_normalized,ICONIC_normalized
0,"[-109, -107)",12,82,-109,0.000091,0.000130
1,"[-107, -105)",15,57,-107,0.000113,0.000091
2,"[-105, -103)",15,134,-105,0.000113,0.000213
3,"[-103, -101)",27,222,-103,0.000204,0.000353
4,"[-101, -99)",32,181,-101,0.000242,0.000288
...,...,...,...,...,...,...
104,"[99, 101)",23,142,99,0.000174,0.000226
105,"[101, 103)",17,164,101,0.000128,0.000261
106,"[103, 105)",18,77,103,0.000136,0.000122
107,"[105, 107)",7,34,105,0.000053,0.000054


### Elevation stat table

In [34]:
# Filter for specific bike models
filtered_models = ['ICONIC', 'EFIT']
filtered_df = df[df['bike_model'].isin(filtered_models)]

# Get unique bike models from the filtered DataFrame
bike_models = filtered_df['bike_model'].unique()

# Calculate statistics from filtered DataFrame
stats_filtered = filtered_df.groupby('bike_model')['elevation_delta_average'].agg(['mean', 'median', 'std']).reset_index()

# Calculate absolute mean and absolute median
stats_filtered['abs_mean'] = filtered_df.groupby('bike_model')['elevation_delta_average'].apply(lambda x: x.abs().mean()).values
stats_filtered['abs_median'] = filtered_df.groupby('bike_model')['elevation_delta_average'].apply(lambda x: x.abs().median()).values

# Calculate min and max from original DataFrame
stats_original = df[df['bike_model'].isin(filtered_models)].groupby('bike_model')['elevation_delta_average'].agg(['min', 'max']).reset_index()

# Merge the statistics
stats = pd.merge(stats_filtered, stats_original, on='bike_model')

# Round statistics to 2 decimal places
stats = stats.round(2)

# Sort statistics to show ICONIC first
stats = stats.set_index('bike_model').loc[['ICONIC', 'EFIT']].reset_index()

# Display the statistics
print(stats)

stats.to_csv('../../static/elevation_stats.csv', index=False)

  bike_model  mean  median    std  abs_mean  abs_median    min    max
0     ICONIC -1.35    -0.5  21.49     14.69        10.0 -179.5  182.0
1       EFIT -0.89     0.0  24.81     17.85        13.0 -173.0  179.5


## Radial data

### 24 hr

In [14]:
# Convert start time in seconds to half-hours
half_hours = df['start_time_seconds'] // 1800  # 1800 seconds = 30 minutes

# Add half_hours to the DataFrame
df['half_hours'] = half_hours

# Create half_hour_intervals based on half_hours
df['half_hour_intervals'] = df['half_hours'].apply(lambda x: (pd.Timestamp('1970-01-01') + timedelta(minutes=x*30)).strftime('%H:%M'))

# Group by 'bike_model' and 'half_hours', then count occurrences
grouped = df.groupby(['bike_model', 'half_hours']).size().unstack(fill_value=0)

# Normalize across each model separately
normalized_grouped = grouped.div(grouped.sum(axis=1), axis=0)  # Normalize across rows (bike models)

# Transpose to get half_hour_intervals as columns
freq_counts = grouped.transpose()
normalized_freq_counts = normalized_grouped.transpose()

# Fix: Convert half_hours to half_hour_intervals for the resulting DataFrame
freq_counts['half_hour_intervals'] = freq_counts.index.to_series().apply(lambda x: (pd.Timestamp('1970-01-01') + timedelta(minutes=x*30)).strftime('%H:%M'))
normalized_freq_counts['half_hour_intervals'] = normalized_freq_counts.index.to_series().apply(lambda x: (pd.Timestamp('1970-01-01') + timedelta(minutes=x*30)).strftime('%H:%M'))

# Merge the unnormalized and normalized data
merged_freq_counts = freq_counts.merge(normalized_freq_counts, on='half_hour_intervals', suffixes=('', '_normalized'))

# Rearranging columns: Move 'half_hour_intervals' to the front
merged_freq_counts = merged_freq_counts[['half_hour_intervals'] + [col for col in merged_freq_counts.columns if col != 'half_hour_intervals']]

# Output the DataFrame to check the result
print(merged_freq_counts)

# Save to CSV
# merged_freq_counts.to_csv('../../static/freq_counts.csv', index=True)

bike_model half_hour_intervals  EFIT  ICONIC  EFIT_normalized  \
0                        00:00  1228    5208         0.009250   
1                        00:30  1023    3878         0.007705   
2                        01:00   753    3083         0.005672   
3                        01:30   680    2596         0.005122   
4                        02:00   607    2358         0.004572   
5                        02:30   528    1984         0.003977   
6                        03:00   391    1329         0.002945   
7                        03:30   283     830         0.002132   
8                        04:00   200     730         0.001506   
9                        04:30   247     848         0.001860   
10                       05:00   349    1183         0.002629   
11                       05:30   600    2139         0.004519   
12                       06:00   763    3010         0.005747   
13                       06:30  1427    5947         0.010748   
14                       

### 24hr by day of week 

In [12]:
import pandas as pd
from datetime import timedelta

# List to store results for each day
results = []

# Iterate over each unique day_of_week
for day in df['day_of_week'].unique():
    # Filter the DataFrame for the current day_of_week
    filtered_df = df[df['day_of_week'] == day]
    
    # Convert start time in seconds to half-hours
    half_hours = filtered_df['start_time_seconds'] // 1800  # 1800 seconds = 30 minutes

    # Add half_hours to the filtered DataFrame
    filtered_df['half_hours'] = half_hours

    # Create half_hour_intervals based on half_hours
    filtered_df['half_hour_intervals'] = filtered_df['half_hours'].apply(
        lambda x: (pd.Timestamp('1970-01-01') + timedelta(minutes=x*30)).strftime('%H:%M')
    )

    # Group by 'bike_model' and 'half_hours', then count occurrences
    grouped = filtered_df.groupby(['bike_model', 'half_hours']).size().unstack(fill_value=0)

    # Normalize across each model separately
    normalized_grouped = grouped.div(grouped.sum(axis=1), axis=0)  # Normalize across rows (bike models)

    # Transpose to get half_hour_intervals as columns
    freq_counts = grouped.transpose()
    normalized_freq_counts = normalized_grouped.transpose()

    # Convert half_hours to half_hour_intervals for the resulting DataFrame
    freq_counts['half_hour_intervals'] = freq_counts.index.to_series().apply(
        lambda x: (pd.Timestamp('1970-01-01') + timedelta(minutes=x*30)).strftime('%H:%M')
    )
    normalized_freq_counts['half_hour_intervals'] = normalized_freq_counts.index.to_series().apply(
        lambda x: (pd.Timestamp('1970-01-01') + timedelta(minutes=x*30)).strftime('%H:%M')
    )

    # Merge the unnormalized and normalized data for this day
    merged_freq_counts = freq_counts.merge(
        normalized_freq_counts, on='half_hour_intervals', suffixes=('', '_normalized')
    )

    # Rearranging columns: Move 'half_hour_intervals' to the front
    merged_freq_counts = merged_freq_counts[
        ['half_hour_intervals'] + [col for col in merged_freq_counts.columns if col != 'half_hour_intervals']
    ]

    # Add a column to indicate the day of the week
    merged_freq_counts['day_of_week'] = day

    # Append the result for the current day to the list
    results.append(merged_freq_counts)

# Concatenate all results into a single DataFrame
csv_result = pd.concat(results, ignore_index=True)
print(csv_result)
# Save to CSV
# csv_result.to_csv('../../static/freq_counts_by_day.csv', index=False)

# import json

# json_result = csv_result.groupby('day_of_week').apply(lambda x: x.drop(columns='day_of_week').to_dict(orient='records')).to_dict()
# json_result

# with open('../../src/data/freq_counts_by_day.json', 'w') as json_file:
#     json.dump(json_result, json_file, indent=4)

# Output the DataFrame to check the result




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['half_hours'] = half_hours
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['half_hour_intervals'] = filtered_df['half_hours'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['half_hours'] = half_hours
A value is trying to be set on a copy of a slice from a D

bike_model half_hour_intervals  EFIT  ICONIC  EFIT_normalized  \
0                        00:00   242    1117         0.012521   
1                        00:30   207    1013         0.010710   
2                        01:00   210     859         0.010865   
3                        01:30   180     866         0.009313   
4                        02:00   170     762         0.008796   
..                         ...   ...     ...              ...   
331                      21:30   372    1633         0.019602   
332                      22:00   300    1371         0.015808   
333                      22:30   271    1185         0.014280   
334                      23:00   322    1316         0.016967   
335                      23:30   174     704         0.009169   

bike_model  ICONIC_normalized day_of_week  
0                    0.012603      Sunday  
1                    0.011429      Sunday  
2                    0.009692      Sunday  
3                    0.009771      Sunday  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['half_hour_intervals'] = filtered_df['half_hours'].apply(


### 24hr by Weekday / Weekend

In [15]:
import pandas as pd
from datetime import timedelta

# List to store results for each category
results = []

# Define a function to categorize days
def categorize_day(day):
    if day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']:
        return 'Weekday'
    else:
        return 'Weekend'

# Add a new column for category (Weekday or Weekend)
df['day_category'] = df['day_of_week'].apply(categorize_day)

# Iterate over each unique category
for category in df['day_category'].unique():
    # Filter the DataFrame for the current category
    filtered_df = df[df['day_category'] == category]
    
    # Convert start time in seconds to half-hours
    half_hours = filtered_df['start_time_seconds'] // 1800  # 1800 seconds = 30 minutes

    # Add half_hours to the filtered DataFrame
    filtered_df['half_hours'] = half_hours

    # Create half_hour_intervals based on half_hours
    filtered_df['half_hour_intervals'] = filtered_df['half_hours'].apply(
        lambda x: (pd.Timestamp('1970-01-01') + timedelta(minutes=x*30)).strftime('%H:%M')
    )

    # Group by 'bike_model' and 'half_hours', then count occurrences
    grouped = filtered_df.groupby(['bike_model', 'half_hours']).size().unstack(fill_value=0)

    # Normalize across each model separately
    normalized_grouped = grouped.div(grouped.sum(axis=1), axis=0)  # Normalize across rows (bike models)

    # Transpose to get half_hour_intervals as columns
    freq_counts = grouped.transpose()
    normalized_freq_counts = normalized_grouped.transpose()

    # Convert half_hours to half_hour_intervals for the resulting DataFrame
    freq_counts['half_hour_intervals'] = freq_counts.index.to_series().apply(
        lambda x: (pd.Timestamp('1970-01-01') + timedelta(minutes=x*30)).strftime('%H:%M')
    )
    normalized_freq_counts['half_hour_intervals'] = normalized_freq_counts.index.to_series().apply(
        lambda x: (pd.Timestamp('1970-01-01') + timedelta(minutes=x*30)).strftime('%H:%M')
    )

    # Merge the unnormalized and normalized data for this category
    merged_freq_counts = freq_counts.merge(
        normalized_freq_counts, on='half_hour_intervals', suffixes=('', '_normalized')
    )

    # Rearranging columns: Move 'half_hour_intervals' to the front
    merged_freq_counts = merged_freq_counts[
        ['half_hour_intervals'] + [col for col in merged_freq_counts.columns if col != 'half_hour_intervals']
    ]

    # Add a column to indicate the category (Weekday or Weekend)
    merged_freq_counts['day_category'] = category

    # Append the result for the current category to the list
    results.append(merged_freq_counts)

# Concatenate all results into a single DataFrame
csv_result = pd.concat(results, ignore_index=True)
print(csv_result)

# Save to CSV
csv_result.to_csv('../../static/freq_counts_by_day_category.csv', index=False)

# Optional: Convert to JSON format
# json_result = csv_result.groupby('day_category').apply(
#     lambda x: x.drop(columns='day_category').to_dict(orient='records')
# ).to_dict()

# Save the JSON result
# with open('../../src/data/freq_counts_by_day.json', 'w') as json_file:
#     json.dump(json_result, json_file, indent=4)

# Output the DataFrame to check the result


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['half_hours'] = half_hours
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['half_hour_intervals'] = filtered_df['half_hours'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['half_hours'] = half_hours


bike_model half_hour_intervals  EFIT  ICONIC  EFIT_normalized  \
0                        00:00   541    2484         0.013738   
1                        00:30   453    2041         0.011503   
2                        01:00   414    1846         0.010513   
3                        01:30   340    1670         0.008634   
4                        02:00   336    1597         0.008532   
..                         ...   ...     ...              ...   
91                       21:30  1800    9708         0.019276   
92                       22:00  1482    7974         0.015870   
93                       22:30  1433    6675         0.015346   
94                       23:00  1433    6644         0.015346   
95                       23:30  1019    4707         0.010912   

bike_model  ICONIC_normalized day_category  
0                    0.013017      Weekend  
1                    0.010695      Weekend  
2                    0.009673      Weekend  
3                    0.008751      Week

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['half_hour_intervals'] = filtered_df['half_hours'].apply(
