# DISTANCE MATRIX

## Libraries

In [41]:
import googlemaps
import numpy as np
import os
import pandas as pd
from scipy.spatial.distance import cdist
import re

## Constants

In [42]:
# Folder holding the per-city input CSVs (<city>_aeds.csv, <city>_cards_train.csv, <city>_new_aeds.csv).
# NOTE(review): this and output_folder_path are absolute paths on two different machines;
# adjust them to your own checkout before running.
input_folder_path = "/Users/sammcmanagan/Library/Mobile Documents/com~apple~CloudDocs/Documents/M.Sc Statistics & Data Science/Modern Data Analytics/MDA-Project/Data"
# Alias so that code referring to the input folder as `import_folder_path` keeps
# working on a fresh kernel instead of failing with a NameError.
import_folder_path = input_folder_path
output_folder_path = "/Users/Jovan/Desktop/MDA_Project/DistanceMatrices"

# Cities processed by the notebook; each name is also the prefix of that city's CSV files.
cities = ["Antwerpen", "Brugge", "Brussels", "Charleroi", "Gent", "Leuven", "Liege", "Oostende"]

# The API key is read from the environment so it never has to be pasted into (and
# committed with) the notebook. Falls back to the empty string when the variable is unset.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
gmaps = googlemaps.Client(key = GOOGLE_API_KEY)

## Functions

In [43]:
# Renders a coordinate pair as the "lat, lon" text passed to the directions() call
def format_coordinates(latitude, longitude):
    """Return the two values joined as ``"<latitude>, <longitude>"``.

    Both values are rendered with their default string formatting.
    """
    return "{}, {}".format(latitude, longitude)

# Parses a 'POINT (<lon> <lat>)' geometry string into a (latitude, longitude) pair
def extract_coordinates(geometry):
    """Return ``(latitude, longitude)`` as floats from a ``'POINT (<lon> <lat>)'`` string.

    The text stores longitude first; the returned tuple is latitude first.

    Raises:
        ValueError: if the string does not start with the expected
            ``POINT (x y)`` pattern, or if either captured token is not a
            valid float.
    """
    parsed = re.match(r'POINT \(([^ ]+) ([^ ]+)\)', geometry)
    if parsed is None:
        raise ValueError("Invalid geometry format")

    lon_text = parsed.group(1)
    lat_text = parsed.group(2)
    return float(lat_text), float(lon_text)

# Calculates distance matrix for given sets of origin and destination coordinates
def get_distance_matrix(origins, destinations, mode, coef = 1):
    """Build an origins x destinations matrix from Directions API route durations.

    For every (origin, destination) pair the module-level ``gmaps`` client is
    queried and the ``duration.value`` of the first leg of the first returned
    route is stored (seconds, per the Directions API documentation).

    Args:
        origins: table supporting ``len()`` and ``.iloc[i, 0]`` / ``.iloc[i, 1]``;
            column 0 is used as latitude and column 1 as longitude.
        destinations: same layout as ``origins``.
        mode: travel mode forwarded to ``gmaps.directions``.
        coef: factor the whole matrix is multiplied by before returning.

    Returns:
        numpy array of shape ``(len(origins), len(destinations))``, scaled by
        ``coef``. Pairs for which the API returns no route hold ``NaN``.
        If the user does not answer exactly ``"yes"`` at the prompt, no request
        is made and the matrix is all zeros.
    """
    num_origins = len(origins)
    num_destinations = len(destinations)
    result_matrix = np.zeros((num_origins, num_destinations))

    confirmation = input(f"This will initialize {num_origins*num_destinations} calculations. Are you sure? (yes/no): ")
    if confirmation == "yes":
        # Format every endpoint once instead of re-formatting each destination
        # for every origin inside the nested loop.
        origin_strings = [
            format_coordinates(origins.iloc[i, 0], origins.iloc[i, 1])
            for i in range(num_origins)
        ]
        destination_strings = [
            format_coordinates(destinations.iloc[j, 0], destinations.iloc[j, 1])
            for j in range(num_destinations)
        ]

        for i, orig in enumerate(origin_strings):
            for j, dest in enumerate(destination_strings):
                print("Calculating pair: [", i, ",", j, "]")
                routes = gmaps.directions(orig, dest, mode)
                if routes:
                    result_matrix[i][j] = routes[0]['legs'][0]['duration']['value']
                else:
                    # No route found: record NaN rather than raising IndexError
                    # and discarding every result already fetched.
                    result_matrix[i][j] = np.nan
                print("Time:", result_matrix[i][j], "\n")

    return result_matrix * coef

## Calculating Distance Matrix

In [52]:
# Number of closest AEDs to consider
closest_aeds = 10

# Maps city name -> DataFrame (rows = cards, columns = AEDs) holding 1 for the
# `closest_aeds` nearest AEDs of each card and 1000 everywhere else.
city_data = {}

for city in cities:
    # Import data on the city.
    # The working directory is switched to the input folder, so every relative
    # path below (reads and the mandatory-array CSV write) resolves against it.
    os.chdir(input_folder_path)
    aeds = pd.read_csv(city + "_aeds.csv")
    cards = pd.read_csv(city + "_cards_train.csv")
    new_aeds = pd.read_csv(city + "_new_aeds.csv")
    print("Importing " + city + " successful.")

    # Make 'mandatory' column for existing aeds: missing 'public' counts as 0,
    # and 'mandatory' is the logical negation of 'public'.
    aeds['public'] = aeds['public'].fillna(0)
    aeds['public'] = ~aeds['public'].astype(bool)
    aeds = aeds.rename(columns={'public': 'mandatory'})

    aeds_coordinates = aeds[['latitude', 'longitude']]

    # Extract latitude and longitude from geometry column in new_aeds
    new_aeds[['latitude', 'longitude']] = new_aeds['geometry'].apply(
        lambda x: pd.Series(extract_coordinates(x))
    )
    new_aeds_coordinates = new_aeds[['latitude', 'longitude']]

    cards_coordinates = cards[['latitude', 'longitude']]

    # Candidate (new) AEDs are never mandatory
    new_aeds['mandatory'] = 0
    print("Made both 'mandatory' columns.\n")

    # Combine aeds_coordinates and new_aeds_coordinates (existing first, then new)
    combined_aeds_coordinates = pd.concat([aeds_coordinates, new_aeds_coordinates], ignore_index=True)

    # Combine mandatory columns in the same order as the coordinates
    combined_mandatory = pd.concat([aeds['mandatory'], new_aeds['mandatory']], ignore_index=True)

    combined_mandatory.to_csv(f"{city}_mandatory_array.csv", index=False)

    # Calculate the Euclidean distance matrix: rows = cards, columns = combined AEDs.
    # NOTE(review): this is a straight-line distance on raw degree values and is only
    # used to pick which pairs are flagged below — confirm the approximation is acceptable.
    distance_matrix = cdist(cards_coordinates, combined_aeds_coordinates, metric='euclidean')

    # Process the distance matrix to mark the closest AEDs of every card with 1
    closest_matrix = np.zeros_like(distance_matrix)
    for row in range(distance_matrix.shape[0]):
        row_indices = np.argsort(distance_matrix[row, :])[:closest_aeds]
        closest_matrix[row, row_indices] = 1

    # Convert coordinates to string for use as row names
    cards_coordinates_str = cards_coordinates.apply(lambda x: f"{x['latitude']}, {x['longitude']}", axis=1)

    # Label rows with the cards' "lat, lon" strings and columns with the AEDs' (lat, lon) tuples
    closest_matrix_df = pd.DataFrame(closest_matrix, index=cards_coordinates_str, columns=combined_aeds_coordinates.apply(tuple, axis=1))

    # Change 0 values to 1000 for sake of distance matrix (placeholder for non-selected pairs)
    closest_matrix_df = closest_matrix_df.replace(0, 1000)

    # Store the processed data for the city in the dictionary
    city_data[city] = closest_matrix_df

Importing Antwerpen successful.
Made both 'mandatory' columns.



  combined_mandatory = pd.concat([aeds['mandatory'], new_aeds['mandatory']], ignore_index=True)


Importing Brugge successful.
Made both 'mandatory' columns.

Importing Brussels successful.


  combined_mandatory = pd.concat([aeds['mandatory'], new_aeds['mandatory']], ignore_index=True)


Made both 'mandatory' columns.



  combined_mandatory = pd.concat([aeds['mandatory'], new_aeds['mandatory']], ignore_index=True)


Importing Charleroi successful.
Made both 'mandatory' columns.

Importing Gent successful.
Made both 'mandatory' columns.



  combined_mandatory = pd.concat([aeds['mandatory'], new_aeds['mandatory']], ignore_index=True)
  combined_mandatory = pd.concat([aeds['mandatory'], new_aeds['mandatory']], ignore_index=True)


Importing Leuven successful.
Made both 'mandatory' columns.

Importing Liege successful.
Made both 'mandatory' columns.



  combined_mandatory = pd.concat([aeds['mandatory'], new_aeds['mandatory']], ignore_index=True)
  combined_mandatory = pd.concat([aeds['mandatory'], new_aeds['mandatory']], ignore_index=True)


Importing Oostende successful.
Made both 'mandatory' columns.



  combined_mandatory = pd.concat([aeds['mandatory'], new_aeds['mandatory']], ignore_index=True)


In [47]:
# Looks up the walking distance between one origin and one destination
def calculate_distance(origin, destination, gmaps):
    """Return the walking distance reported by the Distance Matrix API.

    Args:
        origin: origin location, forwarded as a one-element ``origins`` list.
        destination: destination location, forwarded as a one-element
            ``destinations`` list.
        gmaps: client object exposing ``distance_matrix(origins=..., destinations=..., mode=...)``.

    Returns:
        The ``distance.value`` field of the first element of the first row of
        the response (meters, per the Distance Matrix API documentation), or
        ``float('inf')`` when the response carries no such field or when any
        exception is raised (the exception is printed, not propagated).
    """
    unreachable = float('inf')  # or any other default value like -1 or 0
    try:
        response = gmaps.distance_matrix(origins=[origin], destinations=[destination], mode="walking")

        # Bail out as soon as the expected structure is missing
        if 'rows' not in response or len(response['rows']) == 0:
            return unreachable
        cells = response['rows'][0]['elements']
        if len(cells) == 0 or 'distance' not in cells[0]:
            return unreachable

        return cells[0]['distance']['value']
    except Exception as e:
        print(f"Error calculating distance between {origin} and {destination}: {e}")
        return unreachable

# Function to replace all 1 entries with calculated distances
def replace_1_with_distances(closest_matrix_df, gmaps):
    """Replace every cell equal to 1 with the walking distance for that pair.

    The row label of a flagged cell is used as the origin and its column label
    as the destination; both are passed unchanged to ``calculate_distance``.

    Args:
        closest_matrix_df: DataFrame whose cells equal to 1 mark the pairs to
            look up. It is modified in place.
        gmaps: client object forwarded to ``calculate_distance``.

    Returns:
        The same DataFrame object, after modification.
    """
    # Find the flagged cells in one vectorized pass instead of reading every cell
    # through .iat in Python. argwhere yields positions in row-major order, i.e.
    # the same visiting order as a nested row/column loop.
    flagged_cells = np.argwhere(closest_matrix_df.to_numpy() == 1).tolist()

    for row_idx, col_idx in flagged_cells:
        # Get the origin coordinates from the row index
        origin_coords = closest_matrix_df.index[row_idx]
        # Get the destination coordinates from the column names
        destination_coords = closest_matrix_df.columns[col_idx]

        try:
            # Calculate walking distance
            distance = calculate_distance(origin_coords, destination_coords, gmaps)

            # Replace 1 with calculated distance
            closest_matrix_df.iat[row_idx, col_idx] = distance
        except ValueError as e:
            print(f"Error parsing coordinates at row {row_idx}, col {col_idx}: {e}")

    return closest_matrix_df

In [48]:
# Report the (cards, AEDs) shape of every city's matrix
for city in city_data:
    df = city_data[city]
    print(f"Dimensions of {city} DataFrame: {df.shape}")

Dimensions of Antwerpen DataFrame: (1733, 2488)
Dimensions of Brugge DataFrame: (405, 1257)
Dimensions of Brussels DataFrame: (3968, 4866)
Dimensions of Charleroi DataFrame: (1067, 1396)
Dimensions of Gent DataFrame: (950, 1877)
Dimensions of Leuven DataFrame: (318, 827)
Dimensions of Liege DataFrame: (1528, 2548)
Dimensions of Oostende DataFrame: (297, 459)


In [56]:
### VERY IMPORTANT: DO NOT RUN THIS CELL UNTIL YOU ARE SURE THE SELECTED
### CITIES WILL NOT RESULT IN OVER 40,000 API REQUESTS.
#
# Example: from the dimensions printed above, Brugge has 405 rows (cards),
# so it will result in 405x10=4050 API requests.
# (40,000 is believed to be the limit, but that figure is unconfirmed.)

# Only Brussels and Liege are still needed
selected_cities = ['Brussels', 'Liege']

for city in selected_cities:
    # Skip cities that were never loaded into city_data
    if city not in city_data:
        print(f"Data not available for {city}")
        continue

    df = city_data[city]

    # Swap every 1 in the city's matrix for the looked-up walking distance
    closest_matrix_with_distances = replace_1_with_distances(df, gmaps)

    print(f"Closest matrix with distances for {city}:")
    print(closest_matrix_with_distances)

    # Persist the result as CSV (relative path, so it lands in the current working directory)
    output_filename = f"{city}_cost_matrix.csv"
    closest_matrix_with_distances.to_csv(output_filename)
    print(f"Saved the closest matrix with distances for {city} to {output_filename}")

Closest matrix with distances for Brussels:
                            (50.8428222, 4.3843075)  (50.8830175, 4.3331515)  \
50.85301, 4.37361                            1000.0                   1000.0   
50.86908, 4.39587                            1000.0                   1000.0   
50.84311, 4.35044                            1000.0                   1000.0   
50.797928222, 4.3977615838                   1000.0                   1000.0   
50.86992, 4.37161                            1000.0                   1000.0   
...                                             ...                      ...   
50.86423, 4.35747                            1000.0                   1000.0   
50.86043, 4.31955                            1000.0                   1000.0   
50.851767492, 4.3949560914                   1000.0                   1000.0   
50.84402162, 4.3621103327                    1000.0                   1000.0   
50.86583, 4.31208                            1000.0                   1000.0