In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('datasets/dataset-2.csv')

In [3]:
def calculate_distance_matrix(df):

    unique_ids = pd.unique(df[['id_start', 'id_end']].values.ravel('K'))
    distance_matrix = pd.DataFrame(np.inf, index=unique_ids, columns=unique_ids)
    np.fill_diagonal(distance_matrix.values, 0)

    for _, row in df.iterrows():
        distance_matrix.loc[row['id_start'], row['id_end']] = row['distance']
        distance_matrix.loc[row['id_end'], row['id_start']] = row['distance']

    for k in unique_ids:
        for i in unique_ids:
            for j in unique_ids:
                if distance_matrix.loc[i, k] + distance_matrix.loc[k, j] < distance_matrix.loc[i, j]:
                    distance_matrix.loc[i, j] = distance_matrix.loc[i, k] + distance_matrix.loc[k, j]

    return distance_matrix

In [4]:
calculate_distance_matrix(df)

KeyboardInterrupt: 

In [None]:
def unroll_distance_matrix(df):
    distance_matrix = calculate_distance_matrix(df)
    unrolled_df = distance_matrix.reset_index().melt(id_vars='index', 
                                                       var_name='id_end', 
                                                       value_name='distance')
    
    unrolled_df = unrolled_df.rename(columns={'index': 'id_start'})
    
    unrolled_df = unrolled_df[unrolled_df['id_start'] != unrolled_df['id_end']]
    unrolled_df = unrolled_df[unrolled_df['distance'] != float('inf')]
    
    return unrolled_df

In [None]:
unroll_distance_matrix(df)

In [None]:
def find_ids_within_ten_percentage_threshold(df, reference_id)->pd.DataFrame():
    """
    Find all IDs whose average distance lies within 10% of the average distance of the reference ID.

    Args:
        df (pandas.DataFrame)
        reference_id (int)

    Returns:
        pandas.DataFrame: DataFrame with IDs whose average distance is within the specified percentage threshold
                          of the reference ID's average distance.
    """
    # Write your logic here

    ref_distances = df[df['id_start'] == reference_id]

    # Calculate the average distance for the reference_id
    if ref_distances.empty:
        return []  # Return empty if no distances found for the reference ID

    average_distance = ref_distances['distance'].mean()

    # Calculate the threshold values (10% above and below the average distance)
    lower_bound = average_distance * 0.9
    upper_bound = average_distance * 1.1

    # Find IDs within the threshold
    within_threshold = df[
        (df['id_start'] != reference_id) &  # Exclude the reference ID itself
        (df['distance'] >= lower_bound) &
        (df['distance'] <= upper_bound)
        ]

    # Get the unique id_start values within the threshold and sort them
    result_ids = sorted(within_threshold['id_start'].unique())

    return result_ids

In [None]:
find_ids_within_ten_percentage_threshold(df, 1001406)

In [None]:
def calculate_toll_rate(unrolled_df):
    # Define the rate coefficients for each vehicle type
    rate_coefficients = {
        'moto': 0.8,
        'car': 1.2,
        'rv': 1.5,
        'bus': 2.2,
        'truck': 3.6
    }
    
    # Calculate toll rates for each vehicle type
    for vehicle, coefficient in rate_coefficients.items():
        unrolled_df[vehicle] = unrolled_df['distance'] * coefficient
    
    return unrolled_df.drop(['distance'], axis=1)

# Example usage:
unrolled_df = unroll_distance_matrix(df)
toll_rates_df = calculate_toll_rate(unrolled_df)
print(toll_rates_df)

In [None]:
import pandas as pd
import numpy as np
from datetime import time, timedelta

def calculate_time_based_toll_rates(unrolled_df):
    # Define days of the week
    days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    
    # Define time intervals and discount factors
    time_intervals = [
        (time(0, 0), time(10, 0), 0.8),    # Weekdays morning discount
        (time(10, 0), time(18, 0), 1.2),   # Weekdays day rate
        (time(18, 0), time(23, 59, 59), 0.8)  # Weekdays evening discount
    ]
    weekend_discount = 0.7  # Constant for weekends

    # Prepare a list to hold the new rows
    new_rows = []

    # Iterate through each unique id_start and id_end pair
    for (id_start, id_end), group in unrolled_df.groupby(['id_start', 'id_end']):
        for day in days_of_week:
            for start_time, end_time, factor in time_intervals:
                # Add weekday entries
                new_rows.append({
                    'id_start': id_start,
                    'id_end': id_end,
                    'start_day': day,
                    'start_time': start_time,
                    'end_day': day,
                    'end_time': end_time,
                    'moto': group['moto'].values[0] * factor,
                    'car': group['car'].values[0] * factor,
                    'rv': group['rv'].values[0] * factor,
                    'bus': group['bus'].values[0] * factor,
                    'truck': group['truck'].values[0] * factor
                })
        
        # Add weekend entries
        for day in ['Saturday', 'Sunday']:
            new_rows.append({
                'id_start': id_start,
                'id_end': id_end,
                'start_day': day,
                'start_time': time(0, 0),
                'end_day': day,
                'end_time': time(23, 59, 59),
                'moto': group['moto'].values[0] * weekend_discount,
                'car': group['car'].values[0] * weekend_discount,
                'rv': group['rv'].values[0] * weekend_discount,
                'bus': group['bus'].values[0] * weekend_discount,
                'truck': group['truck'].values[0] * weekend_discount
            })

    # Create a new DataFrame from the new rows
    time_based_toll_df = pd.DataFrame(new_rows)
    
    return time_based_toll_df

# Example usage:
unrolled_df = unroll_distance_matrix(df)
toll_rates_time_based_df = calculate_toll_rate(unrolled_df)
toll_rates_time_based_df = calculate_time_based_toll_rates(toll_rates_df)
print(toll_rates_time_based_df)


In [None]:
import pandas as pd
import numpy as np
from datetime import time

def calculate_time_based_toll_rates(unrolled_df):
    # Define days of the week
    days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    
    # Define time intervals and discount factors
    time_intervals = [
        (time(0, 0), time(10, 0), 0.8),    # Weekdays morning discount
        (time(10, 0), time(18, 0), 1.2),   # Weekdays day rate
        (time(18, 0), time(23, 59, 59), 0.8)  # Weekdays evening discount
    ]
    weekend_discount = 0.7  # Constant for weekends

    # Prepare a list to hold the new rows
    new_rows = []

    # Iterate through each unique id_start and id_end pair
    for (id_start, id_end), group in unrolled_df.groupby(['id_start', 'id_end']):
        # Extract the distance and toll rates from the group
        distance = group['distance'].values[0]  # Assuming distance is the same for the pair
        moto_rate = group['moto'].values[0]
        car_rate = group['car'].values[0]
        rv_rate = group['rv'].values[0]
        bus_rate = group['bus'].values[0]
        truck_rate = group['truck'].values[0]

        # Add weekday entries
        for day in days_of_week:
            for start_time, end_time, factor in time_intervals:
                new_rows.append({
                    'id_start': id_start,
                    'id_end': id_end,
                    'distance': distance,
                    'start_day': day,
                    'start_time': start_time,
                    'end_day': day,
                    'end_time': end_time,
                    'moto': moto_rate * factor,
                    'car': car_rate * factor,
                    'rv': rv_rate * factor,
                    'bus': bus_rate * factor,
                    'truck': truck_rate * factor
                })
        
        # Add weekend entries
        for day in ['Saturday', 'Sunday']:
            new_rows.append({
                'id_start': id_start,
                'id_end': id_end,
                'distance': distance,
                'start_day': day,
                'start_time': time(0, 0),
                'end_day': day,
                'end_time': time(23, 59, 59),
                'moto': moto_rate * weekend_discount,
                'car': car_rate * weekend_discount,
                'rv': rv_rate * weekend_discount,
                'bus': bus_rate * weekend_discount,
                'truck': truck_rate * weekend_discount
            })

    # Create a new DataFrame from the new rows
    time_based_toll_df = pd.DataFrame(new_rows)
    
    return time_based_toll_df

unrolled_df = unroll_distance_matrix(df)
toll_rates_time_based_df = calculate_toll_rate(unrolled_df)
toll_rates_time_based_df = calculate_time_based_toll_rates(toll_rates_df)
print(toll_rates_time_based_df)


In [None]:
import pandas as pd
from datetime import time

def calculate_time_based_toll_rates(unrolled_df):
    # Define days of the week
    days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    
    # Define time intervals and discount factors
    time_intervals = [
        (time(0, 0), time(10, 0), 0.8),    # Weekdays morning discount
        (time(10, 0), time(18, 0), 1.2),   # Weekdays day rate
        (time(18, 0), time(23, 59, 59), 0.8)  # Weekdays evening discount
    ]
    weekend_discount = 0.7  # Constant for weekends

    # Prepare a list to hold the new rows
    new_rows = []

    # Iterate through each unique id_start and id_end pair
    for (id_start, id_end), group in unrolled_df.groupby(['id_start', 'id_end']):
        # Extract the distance and toll rates from the group
        distance = group['distance'].values[0]  # Assuming distance is the same for the pair
        moto_rate = group['moto'].values[0]
        car_rate = group['car'].values[0]
        rv_rate = group['rv'].values[0]
        bus_rate = group['bus'].values[0]
        truck_rate = group['truck'].values[0]

        # Add weekday entries
        i=0
        for day in days_of_week:
            for start_time, end_time, factor in time_intervals:
                new_rows.append({
                    'id_start': id_start,
                    'id_end': id_end,
                    'distance': distance,
                    'start_day': days_of_week,
                    'start_time': start_time,
                    'end_day': day,
                    'end_time': end_time,
                    'moto': moto_rate * factor,
                    'car': car_rate * factor,
                    'rv': rv_rate * factor,
                    'bus': bus_rate * factor,
                    'truck': truck_rate * factor
                })
        
        # Add weekend entries
        for day in ['Saturday', 'Sunday']:
            new_rows.append({
                'id_start': id_start,
                'id_end': id_end,
                'distance': distance,
                'start_day': day,
                'start_time': time(0, 0),
                'end_day': day,
                'end_time': time(23, 59, 59),
                'moto': moto_rate * weekend_discount,
                'car': car_rate * weekend_discount,
                'rv': rv_rate * weekend_discount,
                'bus': bus_rate * weekend_discount,
                'truck': truck_rate * weekend_discount
            })

    # Create a new DataFrame from the new rows
    time_based_toll_df = pd.DataFrame(new_rows)
    
    return time_based_toll_df

unrolled_df = unroll_distance_matrix(df)
toll_rates_time_based_df = calculate_toll_rate(unrolled_df)
toll_rates_time_based_df = calculate_time_based_toll_rates(toll_rates_df)
print(toll_rates_time_based_df)


In [None]:
import pandas as pd

def check_time_completeness(df):
    # Create a multi-index for the output
    index = pd.MultiIndex.from_frame(df[['id', 'id_2']].drop_duplicates())
    
    # Prepare a Series to hold the results
    results = pd.Series(False, index=index)

    # Iterate through each unique (id, id_2) pair
    for (id_val, id_2_val), group in df.groupby(['id', 'id_2']):
        # Initialize sets for days and time coverage
        days_covered = set()
        time_ranges = []

        for _, row in group.iterrows():
            # Collect days
            days_covered.add(row['startDay'])
            days_covered.add(row['endDay'])
            
            # Collect time ranges as tuples
            time_ranges.append((row['startTime'], row['endTime'], row['startDay'], row['endDay']))

        # Check if all days are covered
        all_days = {'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'}
        is_days_covered = days_covered >= all_days
        
        # Check if the time ranges cover a full 24-hour period
        is_full_24_hours = False
        
        # Normalize time strings to seconds for comparison
        time_slots = []
        for start_time, end_time, start_day, end_day in time_ranges:
            start_seconds = int(start_time.split(':')[0]) * 3600 + int(start_time.split(':')[1]) * 60 + int(start_time.split(':')[2])
            end_seconds = int(end_time.split(':')[0]) * 3600 + int(end_time.split(':')[1]) * 60 + int(end_time.split(':')[2])
            time_slots.append((start_seconds, end_seconds))

        # Check for 24-hour coverage
        # Here, we can consider all time slots in a single day
        total_time_covered = [0] * 86400  # seconds in a day

        for start_seconds, end_seconds in time_slots:
            if start_seconds < end_seconds:
                for sec in range(start_seconds, end_seconds):
                    total_time_covered[sec] = 1
            else:  # Handle overnight spans (e.g., 23:00 to 01:00)
                for sec in range(start_seconds, 86400):
                    total_time_covered[sec] = 1
                for sec in range(0, end_seconds):
                    total_time_covered[sec] = 1
        
        # If all 86400 seconds are covered
        is_full_24_hours = all(total_time_covered)

        # Store the result for the current (id, id_2) pair
        results[(id_val, id_2_val)] = not (is_days_covered and is_full_24_hours)

    return results

# Example usage
df = pd.read_csv('datasets/dataset-1.csv')
result_series = check_time_completeness(df)
print(result_series)


In [4]:
import polyline
def polyline_to_dataframe(polyline_str: str) -> pd.DataFrame:
    """
    Converts a polyline string into a DataFrame with latitude, longitude, and distance between consecutive points.
    
    Args:
        polyline_str (str): The encoded polyline string.

    Returns:
        pd.DataFrame: A DataFrame containing latitude, longitude, and distance in meters.
    """
    df = pd.DataFrame({'polyline': [polyline_str]})

    # Decode the polyline and extract latitude and longitude
    df['lat/long'] = df['polyline'].map(lambda x: polyline.decode(x))

    # Expand lat/long into separate columns
    df[['lat', 'long']] = pd.DataFrame(df['lat/long'].tolist(), index=df.index)

    # Calculate distances using the Haversine formula directly
    distances = [0.0]  # First distance is 0

    for i in range(1, len(df)):
        lat1, lon1 = df.iloc[i - 1][['lat', 'long']]
        lat2, lon2 = df.iloc[i][['lat', 'long']]

        # Haversine calculation directly in the loop
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        c = 2 * np.arcsin(np.sqrt(a))
        r = 6371000  # Earth radius in meters
        distance = c * r

        distances.append(distance)

    df['distance'] = distances

    return df[['lat', 'long', 'distance']]
print(polyline_to_dataframe('onl~Fj|cvOrsEg}@rHuvK'))

ModuleNotFoundError: No module named 'polyline'

In [5]:
!pip install polyline



In [6]:
import polyline

ModuleNotFoundError: No module named 'polyline'

In [8]:
import polyline
print(polyline.__version__)



ModuleNotFoundError: No module named 'polyline'

In [2]:
!pip uninstall polyline


^C


In [3]:
!pip install polyline


