## Library imports

In [None]:
import os
import sys

# Current working directory (the notebook is expected to be run from `maps`)
current_dir = os.getcwd()

# Absolute path of its parent directory (`src`)
parent_dir = os.path.abspath(
    os.path.join(current_dir, os.pardir)
)

# Register `src` on the module search path, unless it is already listed
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

In [None]:
import pandas as pd
import numpy as np

from useful_methods.ihm.plots import plot_graph
from useful_methods.neighbours_delaunay.simple_criteria import distance_criterion, quadrant_criterion, angle_criterion
from useful_methods.neighbours_delaunay.enhanced_criteria import distance_criterion_enhanced, quadrant_criterion_enhanced, angle_criterion_enhanced
from useful_methods.neighbours_delaunay.graphs import delaunay_graph
from city.city_utils import city_detection_enhanced
from useful_methods.data_processing import extract_data
from useful_methods.neighbours_delaunay.miscellaneous_for_neighbouring import mean_distance_to_NN

In [None]:
# Directory (relative to the notebook's working directory) in which the
# generated output files are written. Keep the trailing slash: file names are
# appended to this string as-is.
out_directory = "../../out/"

In [None]:
# Per mean-distance class: display colour plus the 'angle' and 'distance'
# settings associated with it.
# NOTE(review): units of 'angle' and 'distance' are not visible here —
# presumably degrees and km; confirm against the code that consumes this dict.
mean_distance_params = {
    label: {'colour': colour, 'angle': angle, 'distance': distance}
    for label, colour, angle, distance in (
        (']0, 1] km', '#030464', 40, 2),
        (']1, 2] km', '#069AF3', 30, 5),
        (']2, 4] km', '#02D4BB', 25, 10),
        (']4, inf] km', '#0DBF75', 15, 15),
    )
}

## Database import and data extraction

We will focus only on the Normandie region, using Orange 4G base stations.

In [None]:
# Load the database: a semicolon-separated CSV that uses a comma as decimal mark.
df = pd.read_csv("../../database/data.csv", sep=";", decimal=",")
# Preview the first rows.
df.head()

In [None]:
# Select the Orange / 4G entries with the project helper `extract_data`.
# NOTE(review): `min_info=True` presumably keeps only a reduced set of
# columns — confirm against `useful_methods.data_processing`.
df_extracted = extract_data(df, provider='Orange', techno='4g', min_info=True)
# Preview the first rows.
df_extracted.head()

In [None]:
# Per-station value computed by the project helper `mean_distance_to_NN` from
# the 'x'/'y' coordinates, using 3 neighbours.
# NOTE(review): presumably the mean distance to those neighbours in km — confirm.
mean_distances = mean_distance_to_NN(df_extracted[['x', 'y']], n_neighbours=3) # 3 to have more neighbours

In [None]:
# Store each station's mean nearest-neighbour distance in a 'mean_dist' column.
# Assigning the Series as a whole aligns it on the index labels, which is what
# the former row-by-row `.loc` loop did one label at a time (and much slower).
df_extracted['mean_dist'] = mean_distances

In [None]:
# Flag each station: 1 (countryside) when its mean nearest-neighbour distance
# is strictly above 2, 0 (city) otherwise. The comparison is vectorised and
# aligned on the index labels instead of being set one row at a time.
# NOTE(review): the threshold 2 is presumably expressed in km — confirm.
df_extracted['countryside'] = (mean_distances > 2).astype(int)

# Keep the column as integers; this also fails loudly if a station received
# no value during the alignment above.
df_extracted['countryside'] = df_extracted['countryside'].astype(int)

In [None]:
# Preview the first rows, now including the 'mean_dist' and 'countryside' columns.
df_extracted.head()

In [None]:
# Result table (one row per department) with the figures for city stations,
# countryside stations and all stations; it is filled in by a later cell.
df_meanDistance_dep = pd.DataFrame(columns=['nom_dep', 'city', 'countryside', 'total'])

In [None]:
from sklearn.neighbors import NearestNeighbors
def sum_distance_to_NN(coordsXY: "pd.DataFrame | np.ndarray | list", n_neighbours: int = 4) -> pd.Series:
    """Return, for every point, the summed distance to its nearest neighbours.

    Parameters
    ----------
    coordsXY
        Coordinates, one row per point. A DataFrame (or Series) keeps its
        index in the result; any other array-like gets a default integer index.
    n_neighbours
        Number of neighbours to sum over, the point itself excluded.
        NOTE(review): scikit-learn is expected to reject a request for more
        neighbours than there are points — keep ``n_neighbours`` below the
        number of rows.

    Returns
    -------
    pd.Series
        Sum of the Euclidean distances to the ``n_neighbours`` nearest
        neighbours, divided by 1000 (i.e. km if the coordinates are in
        metres — TODO confirm the input unit).
    """
    # One extra neighbour is requested because each point is returned as its
    # own nearest neighbour.
    # NOTE: a geodesic metric based on `distance.distance(...).km` was
    # considered as an alternative to the Euclidean one and left disabled.
    nbrs = NearestNeighbors(n_neighbors=n_neighbours + 1, metric='euclidean').fit(coordsXY)
    distances, _ = nbrs.kneighbors(coordsXY)

    # Drop the first column (distance 0 to the point itself), rescale, then
    # add up the remaining distances of each row.
    summed_distances = np.sum(distances[:, 1:] / 1000, axis=1)

    # Only pandas objects carry a usable index; on a plain list `.index` is
    # the `list.index` method, so it must not be forwarded to the Series.
    index = coordsXY.index if isinstance(coordsXY, (pd.DataFrame, pd.Series)) else None
    return pd.Series(data=summed_distances, index=index)

In [None]:
def _mean_of_summed_distances(df_group: pd.DataFrame) -> float:
    """Mean, over the stations of ``df_group``, of the summed distance to all its other stations."""
    summed = sum_distance_to_NN(df_group[['x', 'y']], n_neighbours=len(df_group) - 1)
    return np.round(summed.mean(), decimals=5)


# Value stored for a city/countryside group that holds fewer than two
# stations, i.e. for which no station-to-station distance exists.
NO_DISTANCE = -1.0

# Collect one row per department and build the table once at the end.
# Concatenating inside the loop was quadratic, relied on concatenating onto an
# empty frame, and duplicated rows whenever the cell was run twice.
dep_rows = []
for dep in df_extracted['nom_dep'].unique():
    df_tmp_tota = df_extracted.loc[df_extracted['nom_dep'] == dep]
    df_tmp_city = df_tmp_tota.loc[df_tmp_tota['countryside'] == 0]
    df_tmp_coun = df_tmp_tota.loc[df_tmp_tota['countryside'] == 1]

    dep_distance_city = _mean_of_summed_distances(df_tmp_city) if len(df_tmp_city) > 1 else NO_DISTANCE
    dep_distance_coun = _mean_of_summed_distances(df_tmp_coun) if len(df_tmp_coun) > 1 else NO_DISTANCE
    # A department always has at least one station here, so no guard is needed
    # (a single station yields a sum over zero neighbours, i.e. 0).
    dep_distance = _mean_of_summed_distances(df_tmp_tota)

    # Values stay numeric; no round trip through a string array.
    dep_rows.append([dep, dep_distance_city, dep_distance_coun, dep_distance])

df_meanDistance_dep = pd.DataFrame(dep_rows, columns=['nom_dep', 'city', 'countryside', 'total'])
# Guarantees float columns even when the table is empty.
df_meanDistance_dep[['city', 'countryside', 'total']] = df_meanDistance_dep[['city', 'countryside', 'total']].astype(float)

In [None]:
# for dep in df_extracted['nom_dep'].unique():
#     dep_distance_city = np.round(df_extracted.loc[((df_extracted['nom_dep']==dep) & (df_extracted['countryside']==0)), 'mean_dist'].mean(), decimals=5)
#     dep_distance_coun = np.round(df_extracted.loc[((df_extracted['nom_dep']==dep) & (df_extracted['countryside']==1)), 'mean_dist'].mean(), decimals=5)
#     dep_distance = np.round(df_extracted.loc[(df_extracted['nom_dep']==dep), 'mean_dist'].mean(), decimals=5)

#     df_new_row = pd.DataFrame(data=np.array([[dep,dep_distance_city,dep_distance_coun,dep_distance]]), columns=['nom_dep','city','countryside','total'])
#     df_meanDistance_dep = pd.concat([df_meanDistance_dep, df_new_row], ignore_index=True)
# df_meanDistance_dep[['city','countryside','total']] = df_meanDistance_dep[['city','countryside','total']].astype(float)

In [None]:
# Display the per-department distance table.
df_meanDistance_dep

### Retrieval of area and population data from the web

In [None]:
## Data import
from bs4 import BeautifulSoup
import requests

url = "https://france.ousuisje.com/departements/classement/superficie.php"

# The timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
reponse = requests.get(url, timeout=30)
reponse.raise_for_status()
html_doc = reponse.text
soup = BeautifulSoup(html_doc, 'html.parser')

# The ranking table is identified by its `summary` attribute.
table_informations = soup.find('table', summary="Classement des départements français par superficie")
if table_informations is None:
    # Fail with an explicit message rather than an AttributeError on None below.
    raise RuntimeError(f"Departments table not found at {url}; the page layout may have changed.")

# Table header
header = table_informations.find('thead')

# All the rows of the table body
rows = table_informations.find('tbody').find_all('tr')

# Column names, taken from the header cells
column_names = [th.text.strip() for th in header.find_all('th')]

In [None]:
## DataFrame creation

# One list of stripped cell texts per table row, i.e. per department
data = [
    [cell.text.strip() for cell in table_row.find_all('td')]
    for table_row in rows
]

# Assemble the DataFrame, using the header cells as column names
df_dep = pd.DataFrame(data, columns=column_names)

df_dep.head()

### DataFrame improvement

In [None]:
# Split the combined cell into department name, department code and region.
# Raw string: `\[` and `\]` are regex escapes, not valid Python string escapes.
df_dep[['nom_dep', 'insee_dep', 'nom_reg']] = df_dep['Département / No / Région'].str.split(r'\n\n|\n \[|\]\n', expand=True)

# Strip the parentheses and leading spaces around the code in 'insee_dep'.
# Both replacements are literal (regex=False): a bare '(' is not a valid regex.
df_dep['insee_dep'] = (
    df_dep['insee_dep']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .astype(str)
    .str.lstrip(' ')
)

# The combined column and the region are not needed any further.
df_dep = df_dep.drop(columns=['Département / No / Région', 'nom_reg'])

# Discard the departments with codes 971, 972, 973, 974 and 976.
excluded_codes = ['971', '972', '973', '974', '976']
df_dep = df_dep.loc[~df_dep['insee_dep'].isin(excluded_codes)].copy()

# Use the department names of the database `df` (first occurrence per code) so
# that both tables can be matched on 'nom_dep'. A code that is absent from the
# database keeps its scraped name instead of raising an IndexError.
dep_name_by_code = df.drop_duplicates(subset='insee_dep').set_index('insee_dep')['nom_dep']
df_dep['nom_dep'] = df_dep['insee_dep'].map(dep_name_by_code).fillna(df_dep['nom_dep'])
df_dep = df_dep.drop(columns='insee_dep')

display(df_dep.head())

### Numeric clean-up of the added columns

In [None]:
# Remove the non-breaking spaces ('\xa0') from the scraped figures so that the
# values can later be converted to numbers.
for column in ('Superficie [en km²]', 'Population', 'Densite [hab. /km²]'):
    df_dep[column] = df_dep[column].str.replace('\xa0', '')

In [None]:
# Preview df_dep after the non-breaking spaces have been removed.
display(df_dep.head())

In [None]:
# Sorted list of the departments present in the extracted data
departments = list(np.unique(df_extracted["nom_dep"]))
nb_dep = len(departments)

# Number of sites per department, counted in a single pass
site_counts = df_extracted["nom_dep"].value_counts()

# The 'Total' column is filled with the real counts straight away; it used to
# be initialised with the `int` type object and patched row by row afterwards.
df_proPerDep = pd.DataFrame({
    "nom_dep": departments,
    "Total": [int(site_counts[dep]) for dep in departments],
})

### ... Tinkering (ad hoc normalisation)

In [None]:
# Population of the departments present in the distance table, keyed by name.
# Only those rows are converted to int; the first row wins if a name repeats.
studied = df_dep['nom_dep'].isin(df_meanDistance_dep['nom_dep'])
population_by_dep = (
    df_dep.loc[studied]
    .drop_duplicates(subset='nom_dep')
    .set_index('nom_dep')['Population']
    .astype(int)
)

# 'normalized' = countryside figure / population * 1000, rounded to 5 decimals.
# A department without population data gets NaN instead of raising an IndexError.
# NOTE: normalising by (number of sites * area) instead of the population was
# tried previously and is not used.
population = df_meanDistance_dep['nom_dep'].map(population_by_dep).astype(float)
df_meanDistance_dep['normalized'] = (
    df_meanDistance_dep['countryside'].astype(float) / population * 1000
).round(5)

In [None]:
# Display the departments ordered by ascending 'total'; the sorted copy is
# only shown, df_meanDistance_dep itself is left unchanged.
df_meanDistance_dep.sort_values(by=['total'])

In [None]:
# Inspect the row of a single department.
df_meanDistance_dep.loc[df_meanDistance_dep['nom_dep']=='Charente-Maritime']

In [None]:
# Write the departments, ranked by ascending 'normalized' value, as a Markdown
# table. The context manager closes the file even if the export raises, and
# the explicit UTF-8 encoding keeps accented department names intact whatever
# the platform's default encoding is.
with open(out_directory + "meanDistDeptClas.md", "w", encoding="utf-8") as results:
    results.write(df_meanDistance_dep.sort_values(by=['normalized']).to_markdown(index=False) + "\n")