### 1. Distance Matrix Calculation

In [10]:
import pandas as pd
import networkx as nx

def calculate_distance_matrix(df):
    # Create a graph
    G = nx.Graph()
    # Add edges and distances to the graph based on the dataset
    for index, row in df.iterrows():
        source = row['id_start']  # Update this line to match your column names
        target = row['id_end']    # Update this line to match your column names
        distance = row['distance']  # Update this line to match your column names

        # Add bidirectional edges with distances
        G.add_edge(source, target, weight=distance)
        G.add_edge(target, source, weight=distance)

    # Calculate shortest path lengths between nodes
    distances = dict(nx.all_pairs_dijkstra_path_length(G))

    # Create a matrix to store the distances
    nodes = sorted(G.nodes())
    distance_matrix = pd.DataFrame(index=nodes, columns=nodes)

    # Fill the distance matrix with cumulative distances
    for i in nodes:
        for j in nodes:
            if i == j:
                distance_matrix.loc[i, j] = 0  # Diagonal values set to 0
            else:
                # If a direct path exists between i and j, use that distance
                if j in distances[i]:
                    distance_matrix.loc[i, j] = distances[i][j]
                else:
                    # If no direct path exists, set the value as NaN
                    distance_matrix.loc[i, j] = float('nan')

    return distance_matrix

# Read the dataset-3.csv file into a DataFrame
file_path = 'C:\\Users\\HP\\Downloads\\dataset-3.csv'
data = pd.read_csv(file_path)

# Print the column names to see the actual names in your DataFrame
print(data.columns)

# Call the function with the DataFrame
result = calculate_distance_matrix(data)
print(result)


Index(['id_start', 'id_end', 'distance'], dtype='object')
          1001400.0 1001402.0 1001404.0 1001406.0 1001408.0 1001410.0  \
1001400.0         0       9.7      29.9      45.9      67.6      78.7   
1001402.0       9.7         0      20.2      36.2      57.9      69.0   
1001404.0      29.9      20.2         0      16.0      37.7      48.8   
1001406.0      45.9      36.2      16.0         0      21.7      32.8   
1001408.0      67.6      57.9      37.7      21.7         0      11.1   
1001410.0      78.7      69.0      48.8      32.8      11.1         0   
1001412.0      94.3      84.6      64.4      48.4      26.7      15.6   
1001414.0     112.5     102.8      82.6      66.6      44.9      33.8   
1001416.0     125.7     116.0      95.8      79.8      58.1      47.0   
1001418.0     139.3     129.6     109.4      93.4      71.7      60.6   
1001420.0     152.2     142.5     122.3     106.3      84.6      73.5   
1001422.0     161.8     152.1     131.9     115.9      94.2      8

### 2. Unroll Distance Matrix

In [11]:
import itertools

def unroll_distance_matrix(distance_matrix):
    # Get the indices of the distance matrix
    indices = distance_matrix.index.tolist()

    # Create combinations of indices (excluding same id_start to id_end)
    combinations = list(itertools.permutations(indices, 2))

    # Initialize lists to store unrolled data
    id_start = []
    id_end = []
    distance = []

    # Populate the lists with combinations and their distances
    for start, end in combinations:
        id_start.append(start)
        id_end.append(end)
        distance.append(distance_matrix.loc[start, end])

    # Create a DataFrame from the unrolled data
    unrolled_df = pd.DataFrame({
        'id_start': id_start,
        'id_end': id_end,
        'distance': distance
    })

    return unrolled_df

# Assuming 'result' contains the distance matrix DataFrame from Question 1
unrolled_result = unroll_distance_matrix(result)
print(unrolled_result)

       id_start     id_end  distance
0     1001400.0  1001402.0       9.7
1     1001400.0  1001404.0      29.9
2     1001400.0  1001406.0      45.9
3     1001400.0  1001408.0      67.6
4     1001400.0  1001410.0      78.7
...         ...        ...       ...
1801  1004356.0  1001470.0     159.8
1802  1004356.0  1001472.0     175.8
1803  1004356.0  1001488.0       4.0
1804  1004356.0  1004354.0       2.0
1805  1004356.0  1004355.0       4.0

[1806 rows x 3 columns]


### 3. Finding IDs within Percentage Threshold

In [12]:
def find_ids_within_ten_percentage_threshold(df, reference_value):
    # Filter the DataFrame for rows with the given reference_value in id_start
    reference_df = df[df['id_start'] == reference_value]
    
    # Calculate the average distance for the reference_value
    reference_avg_distance = reference_df['distance'].mean()
    
    # Calculate the threshold values (10% of the average distance)
    lower_threshold = reference_avg_distance - (0.1 * reference_avg_distance)
    upper_threshold = reference_avg_distance + (0.1 * reference_avg_distance)
    
    # Filter the DataFrame for id_start values within the threshold
    within_threshold = df[(df['distance'] >= lower_threshold) & (df['distance'] <= upper_threshold)]
    
    # Get unique id_start values within the threshold
    within_threshold_ids = sorted(within_threshold['id_start'].unique())
    
    return within_threshold_ids

# Assuming 'unrolled_result' contains the DataFrame created in the previous step
reference_value = 123  # Replace with the desired reference value
result_within_threshold = find_ids_within_ten_percentage_threshold(unrolled_result, reference_value)
print(result_within_threshold)

[]


### 4. Calculate Toll Rate

In [13]:
def calculate_toll_rate(df):
    # Define rate coefficients for each vehicle type
    rate_coefficients = {
        'moto': 0.8,
        'car': 1.2,
        'rv': 1.5,
        'bus': 2.2,
        'truck': 3.6
    }

    # Calculate toll rates for each vehicle type based on distance
    for vehicle, rate in rate_coefficients.items():
        df[vehicle] = df['distance'] * rate

    return df

# Assuming 'unrolled_result' contains the DataFrame created in the previous step
result_with_toll_rates = calculate_toll_rate(unrolled_result)
print(result_with_toll_rates)

       id_start     id_end  distance    moto     car      rv     bus   truck
0     1001400.0  1001402.0       9.7    7.76   11.64   14.55   21.34   34.92
1     1001400.0  1001404.0      29.9   23.92   35.88   44.85   65.78  107.64
2     1001400.0  1001406.0      45.9   36.72   55.08   68.85  100.98  165.24
3     1001400.0  1001408.0      67.6   54.08   81.12  101.40  148.72  243.36
4     1001400.0  1001410.0      78.7   62.96   94.44  118.05  173.14  283.32
...         ...        ...       ...     ...     ...     ...     ...     ...
1801  1004356.0  1001470.0     159.8  127.84  191.76  239.70  351.56  575.28
1802  1004356.0  1001472.0     175.8  140.64  210.96  263.70  386.76  632.88
1803  1004356.0  1001488.0       4.0    3.20    4.80    6.00    8.80   14.40
1804  1004356.0  1004354.0       2.0    1.60    2.40    3.00    4.40    7.20
1805  1004356.0  1004355.0       4.0    3.20    4.80    6.00    8.80   14.40

[1806 rows x 8 columns]


### 5. Calculate Time-Based Toll Rates

In [17]:
from datetime import time

def calculate_time_based_toll_rates(df):
    # Define time intervals and corresponding discount factors
    time_intervals = [
        ((time(0, 0, 0), time(10, 0, 0)), 0.8),   # 00:00:00 to 10:00:00
        ((time(10, 0, 0), time(18, 0, 0)), 1.2),  # 10:00:00 to 18:00:00
        ((time(18, 0, 0), time(23, 59, 59)), 0.8)  # 18:00:00 to 23:59:59
    ]
    
    # Define weekend discount factor
    weekend_discount_factor = 0.7
    
    # Define day names for proper case representation
    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    
    # Convert start and end timestamps to datetime objects
    df['id_start'] = pd.to_datetime(df['id_start'])
    df['id_end'] = pd.to_datetime(df['id_end'])
    
    # Extract day of the week from start_time and end_time
    df['start_day'] = df['id_start'].dt.day_name()
    df['end_day'] = df['id_end'].dt.day_name()
    
    # Calculate toll rates based on time intervals and day of the week
    for interval, discount_factor in time_intervals:
        mask = (df['id_start'].dt.time >= interval[0]) & (df['id_end'].dt.time <= interval[1])
        df.loc[mask & (df['start_day'].isin(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'])), ['moto', 'car', 'rv', 'bus', 'truck']] *= discount_factor
        
    # Apply constant discount factor for weekends
    weekend_mask = df['start_day'].isin(['Saturday', 'Sunday'])
    df.loc[weekend_mask, ['moto', 'car', 'rv', 'bus', 'truck']] *= weekend_discount_factor
    
    return df

# Assuming 'result_with_toll_rates' contains the DataFrame created in the previous step
result_with_time_based_toll_rates = calculate_time_based_toll_rates(result_with_toll_rates)
print(result_with_time_based_toll_rates)

                          id_start                        id_end  distance  \
0    1970-01-01 00:00:00.001001400 1970-01-01 00:00:00.001001402       9.7   
1    1970-01-01 00:00:00.001001400 1970-01-01 00:00:00.001001404      29.9   
2    1970-01-01 00:00:00.001001400 1970-01-01 00:00:00.001001406      45.9   
3    1970-01-01 00:00:00.001001400 1970-01-01 00:00:00.001001408      67.6   
4    1970-01-01 00:00:00.001001400 1970-01-01 00:00:00.001001410      78.7   
...                            ...                           ...       ...   
1801 1970-01-01 00:00:00.001004356 1970-01-01 00:00:00.001001470     159.8   
1802 1970-01-01 00:00:00.001004356 1970-01-01 00:00:00.001001472     175.8   
1803 1970-01-01 00:00:00.001004356 1970-01-01 00:00:00.001001488       4.0   
1804 1970-01-01 00:00:00.001004356 1970-01-01 00:00:00.001004354       2.0   
1805 1970-01-01 00:00:00.001004356 1970-01-01 00:00:00.001004355       4.0   

         moto      car      rv      bus    truck start_day   en