In [None]:
# import libraries
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from geopy.distance import geodesic

In [None]:
# Load the mutations table from the BDD_output folder.
mutations = pd.read_csv(filepath_or_buffer="BDD_output/mutations.csv")

In [None]:
# Convert the textual coordinates (e.g. "[a, b]") into lists of floats, the
# format expected by the distance computations below.
def _parse_coor_parcelles(raw):
    # Non-string values (e.g. missing coordinates) become None.
    if not isinstance(raw, str):
        return None
    return [float(part) for part in raw.strip('[]').split(',')]


mutations['coor_parcelles'] = mutations['coor_parcelles'].apply(_parse_coor_parcelles)

In [None]:
# Load the minimum distances to decide the box size used for each variable:
# we look at the mean and the median of every "min_dist*" column.
min_dist = pd.read_csv("BDD_output/min_dist.csv")
colonnes_min_dist = min_dist.filter(regex='^min_dist')
moyenne_min_dist, median_min_dist = colonnes_min_dist.mean(), colonnes_min_dist.median()

for stat in (moyenne_min_dist, median_min_dist):
    print(stat)

min_dist_mat_elem      153.921041
min_dist_coll_lycee    267.558682
min_dist_cine          665.211886
min_dist_spectacle     292.546787
min_dist_velib         117.832220
min_dist_station       226.757181
min_dist_site_tour     446.863061
min_dist_park          185.516756
dtype: float64
min_dist_mat_elem      142.214213
min_dist_coll_lycee    254.168670
min_dist_cine          602.764395
min_dist_spectacle     243.298950
min_dist_velib         114.049274
min_dist_station       212.305858
min_dist_site_tour     410.193725
min_dist_park          168.890455
dtype: float64


# Distance Function

In [None]:
def compute_var_dist(dataset, delta_lat, delta_long, nom_var, nom_position):
    """Count, for every mutation, the rows of ``dataset`` inside a lat/long box.

    A row of ``dataset`` is counted for a mutation when the absolute difference
    between the first coordinates is <= ``delta_lat`` AND the absolute
    difference between the second coordinates is <= ``delta_long``.

    The result is written to the module-level ``mutations`` DataFrame as the
    integer column ``nb_<nom_var>`` (an existing column of that name is
    overwritten). Nothing is returned.

    Parameters
    ----------
    dataset : DataFrame holding one point per row.
    delta_lat, delta_long : half-sizes of the box, in the same unit as the
        coordinates.
    nom_var : suffix of the column created in ``mutations``.
    nom_position : name of the column of ``dataset`` holding the coordinates
        as a 2-item sequence, in the same order as ``mutations['coor_parcelles']``.
    """
    mutations_coords = np.array(mutations['coor_parcelles'].tolist())
    mutations_lat = mutations_coords[:, 0]
    mutations_long = mutations_coords[:, 1]

    # Accumulate in a plain array and write the column once at the end,
    # instead of updating the DataFrame at every iteration.
    counts = np.zeros(len(mutations), dtype=int)

    for position in tqdm(dataset[nom_position], total=len(dataset)):
        # The parsers used by the callers map missing coordinates to None
        # (or leave a NaN float); such rows cannot be located, so skip them
        # rather than crash.
        if position is None or isinstance(position, float):
            continue

        lat_condition = np.abs(mutations_lat - position[0]) <= delta_lat
        long_condition = np.abs(mutations_long - position[1]) <= delta_long

        counts += lat_condition & long_condition

    mutations[f'nb_{nom_var}'] = counts

# Schools

In [None]:
# Schools table (semicolon-separated file).
ecole = pd.read_csv(
    "BDD_input/les_etablissements_d_enseignement_des_1er_et_2d_degres_en_idf.csv",
    sep=';',
)

In [None]:
# Keep the schools of Paris and the neighbouring departments (75, 92, 93 and 94)
# and only the columns used later. .copy() makes the result an independent
# frame, so replacing a column below does not write to a slice of the raw table.
columns_to_keep = ['Appellation officielle', 'Patronyme uai', 'Secteur Public/Privé', 'Adresse','Latitude', 'Longitude', 'Code commune', 'Position', 'Nature']
ecole = ecole.loc[ecole["Code département"].isin([75, 92, 93, 94]), columns_to_keep].copy()

# Comma-separated text -> tuple of floats; non-string (missing) values become None.
ecole['Position'] = ecole['Position'].apply(lambda x: tuple(map(float, x.split(','))) if isinstance(x, str) else None)

ecole = ecole[ecole["Nature"].isin(['ECOLE DE NIVEAU ELEMENTAIRE', 'ECOLE MATERNELLE', 'COLLEGE', 'LYCEE D ENSEIGNEMENT GENERAL'])]

# Schools without a position cannot be located: delete these rows.
# dropna is restricted to 'Position' so that schools with an empty value in
# another kept column (e.g. 'Patronyme uai') are not discarded as well.
ecole = ecole.dropna(subset=['Position'])

### Nursery and elementary schools

In [None]:
# Nursery ('ECOLE MATERNELLE') and elementary schools only.
natures_mat_elem = ['ECOLE DE NIVEAU ELEMENTAIRE', 'ECOLE MATERNELLE']
ecole_mat_elem = ecole[ecole['Nature'].isin(natures_mat_elem)]

In [None]:
# Box half-sizes (presumably in metres) converted to degrees with ~111 km per degree.
# NOTE(review): the same factor is used for longitude, although one degree of
# longitude is shorter than 111 km at the latitude of Paris - confirm this is intended.
lat, long = 200, 300
delta_lat, delta_long = (size / 111000 for size in (lat, long))
print(delta_lat, delta_long)

0.0018018018018018018 0.002702702702702703


In [None]:
nom_position = 'Position'

# Nursery/elementary schools: all of them, then private and public ones separately.
ecole_mat_elem = ecole[ecole['Nature'].isin(['ECOLE DE NIVEAU ELEMENTAIRE', 'ECOLE MATERNELLE'])]
ecole_mat_elem_prive = ecole_mat_elem[ecole_mat_elem['Secteur Public/Privé'].isin(['Privé'])]
ecole_mat_elem_public = ecole_mat_elem[ecole_mat_elem['Secteur Public/Privé'].isin(['Public'])]

# One nb_<nom_var> column is added to mutations per subset.
for nom_var, subset in (
    ("mat_elem", ecole_mat_elem),
    ("mat_elem_prive", ecole_mat_elem_prive),
    ("mat_elem_public", ecole_mat_elem_public),
):
    compute_var_dist(subset, delta_lat, delta_long, nom_var, nom_position)

100%|██████████| 3050/3050 [00:05<00:00, 559.68it/s]
100%|██████████| 432/432 [00:00<00:00, 657.50it/s]
100%|██████████| 2618/2618 [00:04<00:00, 649.95it/s] 


### Middle school and high school

In [None]:
# Middle schools ('COLLEGE') and general high schools only.
natures_coll_lycee = ['COLLEGE', 'LYCEE D ENSEIGNEMENT GENERAL']
ecole_coll_lycee = ecole[ecole['Nature'].isin(natures_coll_lycee)]

In [None]:
# Box half-sizes (presumably in metres) converted to degrees with ~111 km per degree.
# NOTE(review): the same factor is used for longitude, although one degree of
# longitude is shorter than 111 km at the latitude of Paris - confirm this is intended.
lat, long = 300, 400
delta_lat, delta_long = (size / 111000 for size in (lat, long))
print(delta_lat, delta_long)

0.002702702702702703 0.0036036036036036037


In [None]:
# Middle/high schools: all of them, then private and public ones separately.
# nom_position is the value set in an earlier cell.
ecole_coll_lycee = ecole[ecole['Nature'].isin(['COLLEGE', 'LYCEE D ENSEIGNEMENT GENERAL'])]
ecole_coll_lycee_prive = ecole_coll_lycee[ecole_coll_lycee['Secteur Public/Privé'].isin(['Privé'])]
ecole_coll_lycee_public = ecole_coll_lycee[ecole_coll_lycee['Secteur Public/Privé'].isin(['Public'])]

# One nb_<nom_var> column is added to mutations per subset.
for nom_var, subset in (
    ("coll_lycee", ecole_coll_lycee),
    ("coll_lycee_prive", ecole_coll_lycee_prive),
    ("coll_lycee_public", ecole_coll_lycee_public),
):
    compute_var_dist(subset, delta_lat, delta_long, nom_var, nom_position)

100%|██████████| 798/798 [00:00<00:00, 1329.66it/s]
100%|██████████| 329/329 [00:00<00:00, 1372.59it/s]
100%|██████████| 469/469 [00:00<00:00, 1494.56it/s]


# Cinema

In [None]:
# Cinemas, restricted to departments 75, 92, 93 and 94.
cinema = pd.read_csv("BDD_input/les_salles_de_cinemas_en_ile-de-france.csv", sep = ';')
cinema_in_scope = cinema["Département"].isin([75, 92, 93, 94])
cinema = cinema[cinema_in_scope]


def _parse_cinema_geo(value):
    # Comma-separated text -> tuple of floats; non-string values become None.
    if not isinstance(value, str):
        return None
    return tuple(float(part) for part in value.split(','))


cinema['geo'] = cinema['geo'].apply(_parse_cinema_geo)

In [None]:
# Box half-sizes (presumably in metres) converted to degrees with ~111 km per degree.
# NOTE(review): the same factor is used for longitude, although one degree of
# longitude is shorter than 111 km at the latitude of Paris - confirm this is intended.
lat, long = 800, 900
delta_lat, delta_long = (size / 111000 for size in (lat, long))
print(delta_lat, delta_long)

0.007207207207207207 0.008108108108108109


In [None]:
# Median value of the 'fauteuils' column over the selected cinemas.
cinema.loc[:, "fauteuils"].median()

448.0

In [None]:
nom_position = "geo"

# All cinemas, then only those whose 'fauteuils' value is at least 200.
cinema_plus200 = cinema[cinema['fauteuils']>=200]

for nom_var, subset in (("cine", cinema), ("cine_plus200", cinema_plus200)):
    compute_var_dist(subset, delta_lat, delta_long, nom_var, nom_position)

100%|██████████| 187/187 [00:00<00:00, 1415.51it/s]
100%|██████████| 154/154 [00:00<00:00, 1239.66it/s]


# Live performance venues

In [None]:
# Live-performance venues; 'WGS84' holds the coordinates as comma-separated text.
theatre = pd.read_csv("BDD_input/les-lieux-de-diffusion-reguliere-ou-occasionnelle-du-spectacle-vivant-a-paris.csv", sep = ';')


def _parse_wgs84(value):
    # Text -> tuple of floats; non-string values are passed through unchanged.
    if isinstance(value, str):
        return tuple(float(part) for part in value.split(','))
    return value


theatre['WGS84'] = theatre['WGS84'].apply(_parse_wgs84)

In [None]:
# Box half-sizes (presumably in metres) converted to degrees with ~111 km per degree.
# NOTE(review): the same factor is used for longitude, although one degree of
# longitude is shorter than 111 km at the latitude of Paris - confirm this is intended.
lat, long = 300, 400
delta_lat, delta_long = (size / 111000 for size in (lat, long))
print(delta_lat, delta_long)

0.002702702702702703 0.0036036036036036037


In [None]:
# Adds the nb_spectacle column to mutations.
nom_position, nom_var = "WGS84", "spectacle"
compute_var_dist(
    dataset=theatre,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 386/386 [00:00<00:00, 1274.51it/s]


# Amenities/shops

In [None]:
# Shops and amenities table (semicolon-separated file).
commerce = pd.read_csv(filepath_or_buffer="BDD_input/commerces.csv", sep=';')

  commerce = pd.read_csv(chemin_general + "BDD_input/commerces.csv", sep=';')


In [None]:
# Keep the shops of Paris and the surrounding departments (INSEE code starting
# with 75, 92, 93 or 94). .copy() makes the filtered frame independent, so the
# 'last_update' assignment below no longer writes to a slice of the original
# table (this is what raised pandas' SettingWithCopyWarning).
in_scope = commerce['com_insee'].astype(str).str.startswith(('75', '92', '93', '94'))
commerce = commerce[in_scope].copy()
commerce['last_update'] = pd.to_datetime(commerce['last_update'])
# Keep entries last updated in 2023 or later (same date as the mutations).
commerce = commerce[commerce['last_update'] >= '2023-01-01']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  commerce['last_update'] = pd.to_datetime(commerce['last_update'])


In [None]:
# Keep only the shop types of interest, grouped in four families.
# Type names follow the OpenStreetMap documentation:
# https://wiki.openstreetmap.org/wiki/FR:Key:shop
# Note: 'ice_cream' is listed in both food_markets and catering.
food_markets = [
    'alcohol', 'bakery', 'beverages', 'brewing_supplies', 'butcher', 'cheese',
    'chocolate', 'coffee', 'confectionery', 'convenience', 'deli', 'dairy',
    'farm', 'frozen_food', 'greengrocer', 'health_food', 'ice_cream', 'pasta',
    'pastry', 'seafood', 'spices', 'tea', 'wine', 'water', 'food',
    'department_store', 'general', 'kiosk', 'mall', 'supermarket', 'wholesale',
]
fashion_shop = [
    'baby_goods', 'bag', 'boutique', 'clothes', 'fabric', 'fashion',
    'fashion_accessories', 'jewelry', 'leather', 'sewing', 'shoes',
    'shoe_repair', 'tailor', 'watches', 'wool',
]
culture_shop = [
    'art', 'camera', 'collector', 'craft', 'frame', 'games', 'model', 'music',
    'musical_instrument', 'photo', 'trophy', 'video', 'video_games',
]
catering = [
    'bar', 'cafe', 'fast_food', 'food_court', 'ice_cream', 'pub', 'restaurant',
]
all_types = [*food_markets, *fashion_shop, *culture_shop, *catering]

type_is_kept = commerce['type'].isin(all_types)
commerce = commerce[type_is_kept]

In [None]:
# (Y, X) tuple per shop, in that order. Zipping the two columns builds the same
# tuples as a row-wise apply(axis=1) without calling a Python function on each row.
commerce['coor'] = list(zip(commerce['Y'], commerce['X']))

In [None]:
# Box half-sizes (presumably in metres) converted to degrees with ~111 km per degree.
# NOTE(review): the same factor is used for longitude, although one degree of
# longitude is shorter than 111 km at the latitude of Paris - confirm this is intended.
lat, long = 200, 300
delta_lat, delta_long = (size / 111000 for size in (lat, long))
print(delta_lat, delta_long)

0.0018018018018018018 0.002702702702702703


In [None]:
nom_position = "coor"

# One subset of shops per family of types.
commerce_food_market = commerce[commerce['type'].isin(food_markets)]
commerce_fashion_shop = commerce[commerce['type'].isin(fashion_shop)]
commerce_culture_shop = commerce[commerce['type'].isin(culture_shop)]
commerce_catering = commerce[commerce['type'].isin(catering)]

# One nb_<nom_var> column is added to mutations per family.
for nom_var, subset in (
    ("food_market", commerce_food_market),
    ("fashion_shop", commerce_fashion_shop),
    ("culture_shop", commerce_culture_shop),
    ("catering", commerce_catering),
):
    compute_var_dist(subset, delta_lat, delta_long, nom_var, nom_position)

100%|██████████| 5467/5467 [00:04<00:00, 1104.61it/s]
100%|██████████| 2698/2698 [00:02<00:00, 1263.50it/s]
100%|██████████| 409/409 [00:00<00:00, 1317.17it/s]
100%|██████████| 9660/9660 [00:07<00:00, 1285.68it/s]


# Velibs

In [None]:
# Read the stations JSON. The context manager closes the file as soon as it is
# parsed (the handle was previously left open), and the encoding is explicit so
# the result does not depend on the platform's default.
with open("BDD_input/stations-velib.json", encoding="utf-8") as f:
    jdata = json.load(f)

# One row per station; 'rental_methods' is not used afterwards.
velib = pd.DataFrame.from_dict(jdata.get('data').get('stations'), orient='columns')
velib = velib.drop(columns='rental_methods')
# Identifiers are kept as text.
velib[['station_id','stationCode']] = velib[['station_id','stationCode']].astype(str)

In [None]:
# (lat, lon) tuple per station, built directly from the two columns.
# The previous version formatted the pair as text and passed it to eval(),
# which executes arbitrary text and raises on a missing value ("nan" is not a
# Python name).
velib['geolocalisation'] = list(zip(velib['lat'], velib['lon']))

In [None]:
# Box half-sizes (presumably in metres) converted to degrees with ~111 km per degree.
# NOTE(review): the same factor is used for longitude, although one degree of
# longitude is shorter than 111 km at the latitude of Paris - confirm this is intended.
lat, long = 150, 250
delta_lat, delta_long = (size / 111000 for size in (lat, long))
print(delta_lat, delta_long)

0.0013513513513513514 0.0022522522522522522


In [None]:
# For every mutation: number of stations inside the box (nb_station_velib)
# and sum of their 'capacity' values (nb_velib).
mutations['nb_station_velib'] = 0
mutations['nb_velib'] = 0

mutations_coords = np.array(mutations['coor_parcelles'].tolist())
mutations_lat, mutations_long = mutations_coords[:, 0], mutations_coords[:, 1]

for index_velib, row_velib in tqdm(velib.iterrows(), total=len(velib)):
    station = row_velib['geolocalisation']
    capacity = row_velib['capacity']

    # 1 where the mutation lies inside the box centred on the station, else 0
    in_box = (
        (np.abs(mutations_lat - station[0]) <= delta_lat)
        & (np.abs(mutations_long - station[1]) <= delta_long)
    ).astype(int)

    mutations['nb_station_velib'] += in_box
    mutations['nb_velib'] += in_box * capacity

100%|██████████| 1471/1471 [00:01<00:00, 856.88it/s]


# Metro and tram stations

In [None]:
# Stations table (semicolon-separated file).
gares = pd.read_csv(filepath_or_buffer="BDD_input/emplacement-des-gares-idf.csv", sep = ';')

In [None]:
def _parse_gare_point(value):
    # Comma-separated text -> tuple of floats; non-string values become None.
    if not isinstance(value, str):
        return None
    return tuple(float(part) for part in value.split(','))


gares['Geo Point'] = gares['Geo Point'].apply(_parse_gare_point)

In [None]:
# Box half-sizes (presumably in metres) converted to degrees with ~111 km per degree.
# NOTE(review): the same factor is used for longitude, although one degree of
# longitude is shorter than 111 km at the latitude of Paris - confirm this is intended.
lat, long = 300, 400
delta_lat, delta_long = (size / 111000 for size in (lat, long))
print(delta_lat, delta_long)

0.002702702702702703 0.0036036036036036037


In [None]:
# For every mutation: number of distinct station names ('nom_long') and of
# distinct 'res_com' values among the stations inside the box.
mutations_coords = np.array(mutations['coor_parcelles'].tolist())

# The station coordinates are the same for every mutation: build the arrays
# once, outside the loop, instead of once per mutation.
gare_coords = np.array(gares['Geo Point'].tolist())
gare_lat, gare_long = gare_coords[:, 0], gare_coords[:, 1]

nb_gare_unique = []
nb_lignes_unique = []

# The loop runs over the mutations this time (one station subset per mutation).
for parcel in tqdm(mutations_coords, total=len(mutations_coords)):
    lat_condition = np.abs(parcel[0] - gare_lat) <= delta_lat
    long_condition = np.abs(parcel[1] - gare_long) <= delta_long

    gares_filtrees = gares[lat_condition & long_condition]

    # Count the number of unique stations and lines
    nb_gare_unique.append(gares_filtrees['nom_long'].nunique())
    nb_lignes_unique.append(gares_filtrees['res_com'].nunique())

# Write both columns once, in row order.
mutations['nb_gare_unique'] = nb_gare_unique
mutations['nb_lignes_unique'] = nb_lignes_unique

100%|██████████| 114363/114363 [01:29<00:00, 1279.17it/s]


# Touristic sites

In [None]:
# Tourist sites table (semicolon-separated file).
site_tour = pd.read_csv(filepath_or_buffer="BDD_input/principaux-sites-touristiques-en-ile-de-france0.csv", sep = ";")

In [None]:
# Keep the sites whose 'insee' code starts with 75, 92, 93 or 94.
insee_as_text = site_tour["insee"].astype(str)
site_tour = site_tour[insee_as_text.str.startswith(('75', '92', '93', '94'))]

In [None]:
def _parse_site_point(value):
    # Comma-separated text -> tuple of floats; non-string values become None.
    if not isinstance(value, str):
        return None
    return tuple(float(part) for part in value.split(','))


site_tour['Geo Point'] = site_tour['Geo Point'].apply(_parse_site_point)

In [None]:
# Box half-sizes (presumably in metres) converted to degrees with ~111 km per degree.
# NOTE(review): the same factor is used for longitude, although one degree of
# longitude is shorter than 111 km at the latitude of Paris - confirm this is intended.
lat, long = 500, 600
delta_lat, delta_long = (size / 111000 for size in (lat, long))
print(delta_lat, delta_long)

0.0045045045045045045 0.005405405405405406


In [None]:
# Adds the nb_site_tour column to mutations.
nom_position, nom_var = "Geo Point", "site_tour"
compute_var_dist(
    dataset=site_tour,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 379/379 [00:00<00:00, 1317.33it/s]


# Proximity to center of Paris

Central point of Paris (latitude, longitude): 48.85658439033624, 2.3426097576956035

In [None]:
# Reference point for the centre of Paris.
# NOTE(review): presumably (latitude, longitude), the order expected alongside
# 'coor_parcelles' in the distance computation - confirm.
paris_center = (
    48.85658439033624,
    2.3426097576956035,
)

In [None]:
# Geodesic distance between each parcel and the centre of Paris, in metres
# (the value read is geodesic(...).meters).
def compute_distance(row):
    return geodesic(paris_center, row).meters


mutations['distance_to_center'] = mutations['coor_parcelles'].apply(compute_distance)

# Green spaces and Parks

In [None]:
# Green spaces table (semicolon-separated file).
parcs = pd.read_csv(filepath_or_buffer="BDD_input/espaces_verts.csv", sep = ";")

In [None]:
# Remove the rows whose typology is one of the values listed below.
rows_to_delete = ["Décorations sur la voie publique", "Murs végétalisés", "Cimetières"]
typology_excluded = parcs["Typologie d'espace vert"].isin(rows_to_delete)
parcs = parcs[~typology_excluded]

In [None]:
# Keep only the rows that have a "Geo Shape" value.
parcs = parcs[parcs["Geo Shape"].notna()]

In [None]:
def extract_first_point(coordinates):
    """Return the first point of a JSON-encoded Polygon or MultiPolygon.

    ``coordinates`` is expected to be a JSON string with a "type" key and a
    "coordinates" key. The first point is taken from the first ring of the
    (first) polygon. ``None`` is returned when the input is not a string or
    when the type is neither "Polygon" nor "MultiPolygon".
    """
    if not isinstance(coordinates, str):
        return None

    geometry = json.loads(coordinates)
    kind = geometry["type"]

    # Number of nesting levels between "coordinates" and a single point
    if kind == "Polygon":
        depth = 2
    elif kind == "MultiPolygon":
        depth = 3
    else:
        return None

    point = geometry["coordinates"]
    for _ in range(depth):
        point = point[0]
    return point

# One representative point per green space: the first point of its shape.
parcs["Geo point"] = parcs["Geo Shape"].apply(extract_first_point)

In [None]:
def _swap_first_two(point):
    # Swap the first two items of the point: [a, b, ...] -> [b, a]
    first, second = point[0], point[1]
    return [second, first]


parcs["Geo point"] = parcs["Geo point"].apply(_swap_first_two)

In [None]:
# Box half-sizes (presumably in metres) converted to degrees with ~111 km per degree.
# NOTE(review): the same factor is used for longitude, although one degree of
# longitude is shorter than 111 km at the latitude of Paris - confirm this is intended.
lat, long = 200, 300
delta_lat, delta_long = (size / 111000 for size in (lat, long))
print(delta_lat, delta_long)

0.0018018018018018018 0.002702702702702703


In [None]:
# Adds the nb_parc column to mutations.
nom_position, nom_var = "Geo point", "parc"
compute_var_dist(
    dataset=parcs,
    delta_lat=delta_lat,
    delta_long=delta_long,
    nom_var=nom_var,
    nom_position=nom_position,
)

100%|██████████| 1141/1141 [00:00<00:00, 1425.09it/s]


# Save in a CSV

In [None]:
# Write the enriched mutations table, without the index column.
mutations.to_csv(path_or_buf="BDD_output/nb_distance.csv", index=False)