In [None]:
# import libraries
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from geopy.distance import geodesic

In [None]:
# Load the mutations table from the output directory.
mutations = pd.read_csv(filepath_or_buffer="BDD_output/mutations.csv")

In [None]:
# Turn the textual "[a, b]" coordinates into lists of floats so they can be
# used for distance computations; non-string cells become None.
mutations['coor_parcelles'] = mutations['coor_parcelles'].apply(
    lambda raw: list(map(float, raw.strip('[]').split(','))) if isinstance(raw, str) else None
)

# Distance Function

In [None]:
def compute_min_dist(mutations, dataset, delta_lat, delta_long, nom_var, nom_position):
    """Add to ``mutations`` the distance to the closest point of ``dataset``.

    For every row of ``mutations``, the points of ``dataset`` lying inside a
    rectangular window around the row's ``coor_parcelles`` are selected, and the
    smallest geodesic distance (in meters) to one of them is stored in the
    column ``min_dist_<nom_var>``. This function is reused for every kind of
    point of interest handled in this notebook.

    Parameters
    ----------
    mutations : pandas.DataFrame
        Frame with a ``coor_parcelles`` column holding coordinate pairs.
        It is modified in place.
    dataset : pandas.DataFrame
        Points of interest.
    delta_lat, delta_long : float
        Half-height and half-width of the search window, expressed in the same
        unit as the coordinates. Index 0 of each coordinate is compared against
        ``delta_lat`` and index 1 against ``delta_long``.
    nom_var : str
        Suffix of the created column (``min_dist_<nom_var>``).
    nom_position : str
        Name of the ``dataset`` column holding the coordinate pairs.

    Returns
    -------
    None
        The result is written into ``mutations``. The value is NaN when no
        point of ``dataset`` falls inside the window, or when the row has no
        usable coordinates.
    """
    # A missing position (None/NaN) would make np.array() build a ragged or
    # object array and break the column slicing below, so skip those entries.
    valid_positions = [
        position for position in dataset[nom_position].tolist()
        if isinstance(position, (list, tuple, np.ndarray))
    ]
    if valid_positions:
        dataset_coords = np.array(valid_positions, dtype=float)
    else:
        # Keep a 2-D shape so that the slicing below also works on an empty dataset.
        dataset_coords = np.empty((0, 2), dtype=float)

    # Results are collected positionally and assigned once at the end, which
    # does not depend on the index labels of ``mutations`` being unique.
    min_distances = np.full(len(mutations), np.nan, dtype=float)

    parcel_coords = mutations['coor_parcelles']
    for position, mutation_coord in enumerate(tqdm(parcel_coords, total=len(mutations))):
        if not isinstance(mutation_coord, (list, tuple, np.ndarray)):
            # No usable coordinates for this row: leave the distance as NaN.
            continue

        # Cheap rectangular pre-filter, so the costly geodesic computation
        # only runs on nearby candidates.
        in_window = (
            (np.abs(dataset_coords[:, 0] - mutation_coord[0]) <= delta_lat)
            & (np.abs(dataset_coords[:, 1] - mutation_coord[1]) <= delta_long)
        )
        candidates = dataset_coords[in_window]

        if len(candidates) > 0:
            # One pass over the candidates; the minimum is not recomputed afterwards.
            min_distances[position] = min(
                geodesic(mutation_coord, candidate).meters for candidate in candidates
            )

    mutations[f'min_dist_{nom_var}'] = min_distances

# Schools

In [None]:
# Load the school establishments file (semicolon-separated).
ecole = pd.read_csv(
    "BDD_input/les_etablissements_d_enseignement_des_1er_et_2d_degres_en_idf.csv",
    sep=';',
)

In [None]:
# Keep only Paris and its neighbouring departments (75, 92, 93 and 94) and the
# columns used afterwards. Selecting rows and columns in a single .loc call and
# copying the result gives an independent frame, so the 'Position' assignment
# below does not write into a slice of the raw table (SettingWithCopyWarning).
columns_to_keep = ['Appellation officielle', 'Patronyme uai', 'Secteur Public/Privé', 'Adresse','Latitude', 'Longitude', 'Code commune', 'Position', 'Nature']
ecole = ecole.loc[ecole["Code département"].isin([75, 92, 93, 94]), columns_to_keep].copy()

# Convert the comma-separated position text into a tuple of floats, the format
# expected by compute_min_dist; non-string cells become None.
ecole['Position'] = ecole['Position'].apply(lambda x: tuple(map(float, x.split(','))) if isinstance(x, str) else None)

# To simplify, keep only lycee/college/maternelle/elementaire and drop the
# specialized schools (small proportion of the dataset).
ecole = ecole[ecole["Nature"].isin(['ECOLE DE NIVEAU ELEMENTAIRE', 'ECOLE MATERNELLE', 'COLLEGE', 'LYCEE D ENSEIGNEMENT GENERAL'])]

# A few schools have no position: drop those rows only. A bare dropna() would
# also discard every school with a missing value in any other kept column
# (e.g. no 'Patronyme uai'), silently shrinking the set of schools.
ecole = ecole.dropna(subset=['Position'])

### Nursery and elementary schools

We get the minimal distance to a school (in meters)

In [None]:
# Subset of schools whose 'Nature' is elementary or "maternelle".
ecole_mat_elem = ecole.loc[
    ecole['Nature'].isin(['ECOLE DE NIVEAU ELEMENTAIRE', 'ECOLE MATERNELLE'])
]

In [None]:
# Search window of +/- 0.01 around each parcel (a radius of about 1 km).
delta_lat = delta_long = 0.01
nom_var, nom_position = "mat_elem", "Position"
compute_min_dist(
    mutations,
    ecole_mat_elem,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 114363/114363 [15:30<00:00, 122.86it/s]


### High school and middle school

In [None]:
# Subset of schools whose 'Nature' is "college" or general "lycee".
ecole_coll_lycee = ecole.loc[
    ecole['Nature'].isin(['COLLEGE', 'LYCEE D ENSEIGNEMENT GENERAL'])
]

In [None]:
# Search window of +/- 0.01 around each parcel (a radius of about 1 km).
delta_lat = delta_long = 0.01
nom_var, nom_position = "coll_lycee", "Position"
compute_min_dist(
    mutations,
    ecole_coll_lycee,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 114363/114363 [05:51<00:00, 325.28it/s]


# Cinema

In [None]:
# Load the cinemas file (semicolon-separated).
cinema = pd.read_csv("BDD_input/les_salles_de_cinemas_en_ile-de-france.csv", sep = ';')

# Keep departments 75, 92, 93 and 94. The explicit copy makes the filtered
# frame independent, so adding/replacing a column below does not write into a
# slice of the raw table (SettingWithCopyWarning).
cinema = cinema[cinema["Département"].isin([75, 92, 93, 94])].copy()

# Convert the comma-separated 'geo' text into a tuple of floats; non-string cells become None.
cinema['geo'] = cinema['geo'].apply(lambda x: tuple(map(float, x.split(','))) if isinstance(x, str) else None)

In [None]:
# Search window of +/- 0.01 around each parcel (a radius of about 1 km).
delta_lat = delta_long = 0.01
nom_var, nom_position = "cine", "geo"
compute_min_dist(
    mutations,
    cinema,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 114363/114363 [02:12<00:00, 860.72it/s]


# Live performance venues

In [None]:
# Load the live-performance venues file (semicolon-separated).
theatre = pd.read_csv(
    "BDD_input/les-lieux-de-diffusion-reguliere-ou-occasionnelle-du-spectacle-vivant-a-paris.csv",
    sep=';',
)
# Convert the comma-separated 'WGS84' text into a tuple of floats; non-string
# cells are left unchanged.
theatre['WGS84'] = theatre['WGS84'].apply(
    lambda raw: tuple(float(part) for part in raw.split(',')) if isinstance(raw, str) else raw
)

In [None]:
# Search window of +/- 0.01 around each parcel (a radius of about 1 km).
delta_lat = delta_long = 0.01
nom_var, nom_position = "spectacle", "WGS84"
compute_min_dist(
    mutations,
    theatre,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 114363/114363 [08:09<00:00, 233.74it/s]


# Velibs

In [None]:
# Read the Velib station feed. The context manager closes the file handle even
# when JSON decoding fails (a bare open() left it open for the whole session).
with open("BDD_input/stations-velib.json", encoding="utf-8") as f:
    jdata = json.load(f)

# The station records are stored under data -> stations. Plain indexing raises
# a clear KeyError if the feed layout changes, instead of failing later on None.
velib = pd.DataFrame.from_dict(jdata['data']['stations'], orient='columns')
velib = velib.drop(columns='rental_methods')
# Identifiers are kept as text rather than numbers.
velib[['station_id','stationCode']] = velib[['station_id','stationCode']].astype(str)

In [None]:
# Build one (lat, lon) tuple per station directly from the two numeric columns.
# This replaces formatting the pair as text and eval()-ing it back, which was a
# needless round trip and executes arbitrary text.
velib['geolocalisation'] = list(zip(velib['lat'].tolist(), velib['lon'].tolist()))

In [None]:
# Search window of +/- 0.01 around each parcel (a radius of about 1 km).
delta_lat = delta_long = 0.01
nom_var, nom_position = "velib", "geolocalisation"
compute_min_dist(
    mutations,
    velib,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 114363/114363 [17:58<00:00, 106.02it/s]


# Metro and tram stations

In [None]:
# Load the stations file (semicolon-separated).
gares = pd.read_csv(
    "BDD_input/emplacement-des-gares-idf.csv",
    sep=';',
)

In [None]:
# Convert the comma-separated 'Geo Point' text into a tuple of floats;
# non-string cells become None.
gares['Geo Point'] = gares['Geo Point'].apply(
    lambda raw: tuple(float(part) for part in raw.split(',')) if isinstance(raw, str) else None
)

In [None]:
# Search window of +/- 0.01 around each parcel (a radius of about 1 km).
delta_lat = delta_long = 0.01
nom_var, nom_position = "station", "Geo Point"
compute_min_dist(
    mutations,
    gares,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 114363/114363 [08:19<00:00, 228.78it/s]


# Tourist sites

In [None]:
# Load the main tourist sites file (semicolon-separated).
site_tour = pd.read_csv(
    "BDD_input/principaux-sites-touristiques-en-ile-de-france0.csv",
    sep=";",
)

In [None]:
# Keep the sites whose INSEE code starts with 75, 92, 93 or 94. The explicit
# copy makes the filtered frame independent, so the 'Geo Point' column can be
# reassigned afterwards without pandas' SettingWithCopyWarning.
site_tour = site_tour[site_tour["insee"].astype(str).str.startswith(('75', '92', '93', '94'))].copy()

In [None]:
# Convert the comma-separated 'Geo Point' text into a tuple of floats;
# non-string cells become None.
site_tour['Geo Point'] = site_tour['Geo Point'].apply(
    lambda raw: tuple(float(part) for part in raw.split(',')) if isinstance(raw, str) else None
)

In [None]:
# Same +/- 0.01 search window as for the other points of interest.
delta_lat = delta_long = 0.01
nom_var, nom_position = "site_tour", "Geo Point"
compute_min_dist(
    mutations,
    site_tour,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 114363/114363 [04:44<00:00, 402.02it/s] 


# Proximity to the center of Paris

Central point of Paris (latitude, longitude): 48.85658439033624, 2.3426097576956035

In [None]:
# Reference point used as the centre of Paris, given as (latitude, longitude).
paris_center = (
    48.85658439033624,   # latitude
    2.3426097576956035,  # longitude
)

In [None]:
def compute_distance(row):
    """Return the geodesic distance, in meters, between ``row`` and ``paris_center``.

    ``row`` is a single coordinate pair taken from ``coor_parcelles``. The
    result is in meters (not kilometers), like the ``min_dist_*`` columns.
    """
    return geodesic(paris_center, row).meters


# Distance from every parcel to the centre of Paris, in meters.
mutations['distance_to_center'] = mutations['coor_parcelles'].apply(compute_distance)

# Green spaces and parks

In [None]:
# Load the green spaces file (semicolon-separated).
parcs = pd.read_csv(
    "BDD_input/espaces_verts.csv",
    sep=";",
)

In [None]:
# Exclude the green-space typologies listed below.
rows_to_delete = [
    "Décorations sur la voie publique",
    "Murs végétalisés",
    "Cimetières",
]
parcs = parcs.loc[~parcs["Typologie d'espace vert"].isin(rows_to_delete)]

In [None]:
# Keep only the green spaces that have a geometry.
parcs = parcs[parcs["Geo Shape"].notna()]

In [None]:
def extract_first_point(coordinates):
    """Return the first position of a GeoJSON geometry given as a JSON string.

    Parameters
    ----------
    coordinates : str or any
        JSON text of a GeoJSON geometry object (with "type" and "coordinates"
        members). Any non-string value (e.g. NaN) yields None.

    Returns
    -------
    list or None
        The first position of the geometry, exactly as stored in the JSON
        (for a Polygon: first point of the first ring; for a MultiPolygon:
        first point of the first ring of the first polygon). None when the
        geometry type is not supported or when it holds no position.

    Raises
    ------
    json.JSONDecodeError
        If ``coordinates`` is a string that is not valid JSON.
    """
    if not isinstance(coordinates, str):
        return None

    geometry = json.loads(coordinates)
    if not isinstance(geometry, dict):
        return None

    # Number of array levels wrapped around a single position, per geometry type.
    nesting_by_type = {
        "Point": 0,
        "MultiPoint": 1,
        "LineString": 1,
        "Polygon": 2,
        "MultiLineString": 2,
        "MultiPolygon": 3,
    }
    depth = nesting_by_type.get(geometry.get("type"))
    if depth is None:
        return None

    point = geometry.get("coordinates")
    try:
        for _ in range(depth):
            point = point[0]
    except (IndexError, TypeError, KeyError):
        # Empty or malformed coordinate arrays: there is no first position.
        return None

    # A position needs at least two numbers; anything else is unusable.
    if not isinstance(point, list) or len(point) < 2:
        return None
    return point

# Representative point of each green space: the first point of its geometry.
parcs["Geo point"] = parcs["Geo Shape"].apply(extract_first_point)

In [None]:
# The extracted points have their two coordinates in the opposite order from
# the one used by the other datasets, so swap them. The value is recomputed
# from "Geo Shape" instead of being swapped in place: re-running this cell then
# always gives the same result (an in-place swap flips the order back on every
# second run). Rows without a usable point get None instead of raising.
parcs["Geo point"] = parcs["Geo Shape"].apply(extract_first_point).apply(
    lambda point: [point[1], point[0]] if point is not None else None
)

In [None]:
# Same +/- 0.01 search window as for the other points of interest.
delta_lat = delta_long = 0.01
nom_var, nom_position = "park", "Geo point"
compute_min_dist(
    mutations,
    parcs,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 114363/114363 [16:03<00:00, 118.68it/s]


# Deal with missing values

Some distances are missing because no point of interest was found inside the search window. Based on the maximum distance observed for each variable, we set these missing distances to 1400 m.

In [None]:
# Count the missing values of every column (pandas is already imported in the
# first cell, so it is not imported again here).
missing_values = mutations.isnull().sum()

# Print the summary, restricted to the columns that have missing values.
print("Récapitulatif des valeurs manquantes par colonne :")
print(missing_values[missing_values > 0])


Récapitulatif des valeurs manquantes par colonne :
Series([], dtype: int64)


In [None]:
# Largest distance found for tourist sites (missing values are ignored).
mutations.loc[:, 'min_dist_site_tour'].max()

In [None]:
# A distance is missing when no point of interest was found inside the search
# window. Fill the distance columns only: a frame-wide fillna would also
# replace the missing values of every unrelated column with 1400.
dist_columns = [col for col in mutations.columns if str(col).startswith('min_dist_')]
mutations[dist_columns] = mutations[dist_columns].fillna(1400)

# Remaining missing values per column (distance columns should no longer appear).
missing_values_after_fillna = mutations.isnull().sum()

print("Récapitulatif des valeurs manquantes par colonne après le remplacement :")
print(missing_values_after_fillna[missing_values_after_fillna > 0])


Récapitulatif des valeurs manquantes par colonne après le remplacement :
Series([], dtype: int64)


# Save in a CSV

In [None]:
# Write the enriched table to disk, without the index column.
mutations.to_csv(path_or_buf="BDD_output/min_dist.csv", index=False)