# 9

In [4]:
import pandas as pd
import numpy as np

def calculate_distance_matrix(file_path):
    """Build a symmetric all-pairs shortest-distance matrix from an edge-list CSV.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. The data must contain the
        columns ``id_start``, ``id_end`` and ``distance``.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are the unique ids (in order of
        first appearance across ``id_start`` then ``id_end``). Cell ``[i, j]``
        is the length of the shortest route between ``i`` and ``j``, ``0`` on
        the diagonal and ``inf`` when no route exists.

    Raises
    ------
    ValueError
        If any of the required columns is missing.

    Notes
    -----
    Every row is treated as an undirected edge. When the same pair of ids is
    listed more than once, the shortest of the listed distances is used.
    """
    data = pd.read_csv(file_path)

    if not {'id_start', 'id_end', 'distance'}.issubset(data.columns):
        raise ValueError("The dataset must have 'id_start', 'id_end', and 'distance' columns.")

    toll_ids = pd.concat([data['id_start'], data['id_end']]).unique()
    n_ids = len(toll_ids)

    # Translate ids into 0..n-1 positions so the heavy lifting runs on a plain array.
    id_lookup = pd.Index(toll_ids)
    start_pos = id_lookup.get_indexer(data['id_start'])
    end_pos = id_lookup.get_indexer(data['id_end'])
    edge_lengths = data['distance'].to_numpy(dtype=float)

    # Work on our own ndarray rather than on ``DataFrame.values``: the latter is
    # not guaranteed to be a writable view (it is read-only under Copy-on-Write).
    dist = np.full((n_ids, n_ids), np.inf)
    # ``minimum.at`` is unbuffered, so repeated (start, end) pairs keep the shortest edge.
    np.minimum.at(dist, (start_pos, end_pos), edge_lengths)
    np.minimum.at(dist, (end_pos, start_pos), edge_lengths)
    np.fill_diagonal(dist, 0.0)

    # Floyd-Warshall: for each intermediate node k, relax every (i, j) pair at once.
    for k in range(n_ids):
        via_k = dist[:, k, None] + dist[None, k, :]
        np.minimum(dist, via_k, out=dist)

    return pd.DataFrame(dist, index=toll_ids, columns=toll_ids)

# Build the toll distance matrix from the CSV and print it.
# NOTE(review): this is an absolute path on one specific machine; the cell only
# runs where the file exists at exactly this location.
df = calculate_distance_matrix('C:/Users/HP/Music/datasett.csv')
print(df)

         1001400  1001402  1001404  1001406  1001408  1001410  1001412  \
1001400      0.0      9.7     29.9     45.9     67.6     78.7     94.3   
1001402      9.7      0.0     20.2     36.2     57.9     69.0     84.6   
1001404     29.9     20.2      0.0     16.0     37.7     48.8     64.4   
1001406     45.9     36.2     16.0      0.0     21.7     32.8     48.4   
1001408     67.6     57.9     37.7     21.7      0.0     11.1     26.7   
1001410     78.7     69.0     48.8     32.8     11.1      0.0     15.6   
1001412     94.3     84.6     64.4     48.4     26.7     15.6      0.0   
1001414    112.5    102.8     82.6     66.6     44.9     33.8     18.2   
1001416    125.7    116.0     95.8     79.8     58.1     47.0     31.4   
1001418    139.3    129.6    109.4     93.4     71.7     60.6     45.0   
1001420    152.2    142.5    122.3    106.3     84.6     73.5     57.9   
1001422    161.8    152.1    131.9    115.9     94.2     83.1     67.5   
1001424    173.2    163.5    143.3    

# 

# 10

In [5]:
import pandas as pd

def unroll_distance_matrix(distance_matrix):
    """Flatten a square distance matrix into a long-format table.

    Parameters
    ----------
    distance_matrix : pandas.DataFrame
        Frame whose index labels are start ids and whose column labels are end ids.

    Returns
    -------
    pandas.DataFrame
        One row per (index label, column label) pair whose labels differ, with
        columns ``id_start``, ``id_end`` and ``distance``. Rows are ordered by
        index label first, then by column label.
    """
    # Pairs with identical labels (the diagonal) are left out.
    records = [
        [origin, destination, distance_matrix.at[origin, destination]]
        for origin in distance_matrix.index
        for destination in distance_matrix.columns
        if origin != destination
    ]
    return pd.DataFrame(records, columns=['id_start', 'id_end', 'distance'])

# Flatten the distance matrix held in `df` into (id_start, id_end, distance) rows and print it.
unrolled_df = unroll_distance_matrix(df)
print(unrolled_df)

      id_start   id_end  distance
0      1001400  1001402       9.7
1      1001400  1001404      29.9
2      1001400  1001406      45.9
3      1001400  1001408      67.6
4      1001400  1001410      78.7
...        ...      ...       ...
1801   1001472  1001464      45.8
1802   1001472  1001466      37.3
1803   1001472  1001468      26.6
1804   1001472  1001470      16.0
1805   1001472  1001437     202.2

[1806 rows x 3 columns]


# 11


In [6]:
def find_ids_within_ten_percentage_threshold(df, reference_id):
    """Find the ``id_start`` values whose mean distance is within 10% of the reference's.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``id_start`` and ``distance``.
    reference_id
        The ``id_start`` value whose mean distance defines the band.

    Returns
    -------
    list
        Sorted ``id_start`` values whose mean ``distance`` lies between 90% and
        110% (both ends included) of the reference mean. An empty list is
        returned when ``reference_id`` has no rows in ``df``.

    Notes
    -----
    The function prints the reference mean, the band, the per-id means and the
    matching ids as it goes.
    """
    reference_rows = df.loc[df['id_start'] == reference_id]

    # Guard clause: nothing to compare against if the reference id is absent.
    if reference_rows.empty:
        print(f"Reference id_start {reference_id} not found in the DataFrame.")
        return []

    avg_distance = reference_rows['distance'].mean()
    print(f"Average distance for id_start {reference_id}: {avg_distance}")

    # Band of +/-10% around the reference mean.
    lower_bound = avg_distance * 0.90
    upper_bound = avg_distance * 1.10
    print(f"Threshold: {lower_bound} - {upper_bound}")

    # Mean distance for every id_start, as a two-column frame.
    avg_distances = df.groupby('id_start')['distance'].mean().reset_index()
    print("Average distances for each id_start:\n", avg_distances)

    mean_per_id = avg_distances['distance']
    inside_band = mean_per_id.ge(lower_bound) & mean_per_id.le(upper_bound)
    ids_within_threshold = avg_distances.loc[inside_band, 'id_start'].tolist()

    print(f"IDs within threshold: {ids_within_threshold}")
    return sorted(ids_within_threshold)

# Show which id_start values exist so a valid reference can be picked.
print("Available id_start values:", unrolled_df['id_start'].unique())

# Reference id whose average distance defines the +/-10% band.
valid_reference_id = 1001402  
result_ids = find_ids_within_ten_percentage_threshold(unrolled_df,  reference_id=valid_reference_id)
print(result_ids)


Available id_start values: [1001400 1001402 1001404 1001406 1001408 1001410 1001412 1001414 1001416
 1001418 1001420 1001422 1001424 1001426 1001428 1001430 1001432 1001434
 1001436 1001438 1001440 1001442 1001488 1004356 1004354 1004355 1001444
 1001446 1001448 1001450 1001452 1001454 1001456 1001458 1001460 1001461
 1001462 1001464 1001466 1001468 1001470 1001437 1001472]
Average distance for id_start 1001402: 234.5261904761905
Threshold: 211.07357142857146 - 257.97880952380956
Average distances for each id_start:
     id_start    distance
0    1001400  243.995238
1    1001402  234.526190
2    1001404  215.769048
3    1001406  201.673810
4    1001408  183.590476
5    1001410  174.869048
6    1001412  163.354762
7    1001414  150.788095
8    1001416  142.302381
9    1001418  134.207143
10   1001420  127.142857
11   1001422  122.342857
12   1001424  117.185714
13   1001426  109.657143
14   1001428  104.014286
15   1001430  101.352381
16   1001432   98.995238
17   1001434   97.302381
18

# 12

In [None]:
import pandas as pd

def calculate_toll_rate(df):
    """Return a copy of ``df`` with one toll column per vehicle type.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a numeric ``distance`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns plus ``moto``, ``car``, ``rv``,
        ``bus`` and ``truck``, each equal to ``distance`` multiplied by that
        vehicle's rate coefficient. The frame passed in is left unchanged.
    """
    rate_coefficients = {
        'moto': 0.8,
        'car': 1.2,
        'rv': 1.5,
        'bus': 2.2,
        'truck': 3.6
    }

    toll_columns = {
        vehicle: df['distance'] * coefficient
        for vehicle, coefficient in rate_coefficients.items()
    }
    # ``assign`` builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(**toll_columns)

# Compute the per-vehicle toll columns for the unrolled distance table and print the result.
updated_df = calculate_toll_rate(unrolled_df)
print(updated_df)


# 13
