In [61]:
import pandas as pd
from tqdm.notebook import tqdm
from rapidfuzz import process

In [42]:
import os
import sys

# The project root is taken to be two directories above the notebook's
# working directory; it is put on sys.path so that the `src` package below
# can be imported.
src_dir = os.path.join(os.getcwd(), "..", "..")
from importlib import reload
sys.path.append(os.path.abspath(src_dir))

import src.helpers.city_to_train_station

# Re-execute the helper module so that edits made to it on disk are picked up
# without restarting the kernel. This must run before the `from ... import`
# below, otherwise the name would be bound to the stale class object.
reload(src.helpers.city_to_train_station)

from src.helpers.city_to_train_station import  CityToTrainHelper

In [43]:
# Raw inputs: the station list is ';'-separated, the city list is a regular CSV.
train_station_df = pd.read_csv('../data/liste-des-gares.csv', sep=';')
cities_df = pd.read_csv('../data/cities.csv')

# Cities: keep the name, coordinates and administrative columns, shorten the
# administrative column names and tag every row as a city.
city_names_df = cities_df[
    ["label", "latitude", "longitude", "region_name", "department_name"]
].rename(columns={"region_name": "region", "department_name": "department"})
city_names_df["class_name"] = "city"

# Train stations: the station name becomes `label`; the "lat,lon" text in
# C_GEO is split into two float columns; every row is tagged as a station.
train_station_names_df = train_station_df[["LIBELLE"]].rename(columns={"LIBELLE": "label"})
train_station_names_df[["latitude", "longitude"]] = (
    train_station_df["C_GEO"]
    .str.split(',', expand=True)
    .astype(float)
)
train_station_names_df["class_name"] = "train_station"

In [47]:
# Stack cities and stations into one table with a fresh 0..n-1 index.
all_names_df = pd.concat([city_names_df, train_station_names_df], ignore_index=True)

# Remove the rows that have no latitude. The surviving rows keep their index
# labels from the concatenation (the index is not renumbered).
all_names_df = all_names_df.drop(
    index=all_names_df.index[all_names_df["latitude"].isna()]
)

In [48]:
helper = CityToTrainHelper()

# The helper is queried by city name only, so every row sharing a label ends
# up with the same station. Query each distinct city label once instead of
# once per row, and skip missing labels.
# NOTE(review): assumes calculate_nearest_station is deterministic for a
# given name -- confirm against the helper implementation.
city_labels = (
    all_names_df.loc[all_names_df["class_name"] == "city", "label"]
    .dropna()
    .unique()
)

# label -> name (LIBELLE) of the first station returned by the helper.
# tqdm wraps the iterable directly, so the bar advances by itself.
nearest_station_by_label = {
    label: helper.calculate_nearest_station(label).LIBELLE.values[0]
    for label in tqdm(city_labels, unit="name")
}

# One vectorised lookup replaces a full-frame boolean mask per city row.
# Every row whose label equals a city label receives that city's station
# (as with the previous label-equality assignment); all other rows get NaN.
all_names_df["nearest_train_station"] = all_names_df["label"].map(nearest_station_by_label)

  0%|          | 0/38934 [00:00<?, ?name/s]

In [54]:
def sanitize_string(column):
    """Normalise a text column for name matching.

    Args:
        column: pandas Series of strings (anything exposing the ``.str``
            accessor).

    Returns:
        A new Series in which every value is lower-cased and every hyphen is
        replaced by a single space. The input Series is not modified.
    """
    return column.str.lower().str.replace("-", " ")

In [55]:
# Put the three searchable text columns into the same normalised form
# (lower case, hyphens replaced by spaces).
for text_column in ("label", "region", "department"):
    all_names_df[text_column] = sanitize_string(all_names_df[text_column])

# A station has no "nearest station" of its own: store the "-" placeholder.
all_names_df.loc[
    all_names_df["class_name"].eq("train_station"), "nearest_train_station"
] = "-"

In [58]:
def fuzzy_search(name):
    """Return the known label, region or department closest to ``name``.

    The query is lower-cased and its hyphens are replaced by spaces, then
    matched with ``rapidfuzz.process.extractOne`` against the distinct values
    of the ``label``, ``region`` and ``department`` columns of the
    module-level ``all_names_df``. The best-scoring of the (up to) three
    matches is returned.

    Args:
        name: Free-text place name to look up.

    Returns:
        The best matching string as stored in ``all_names_df``, or ``None``
        when none of the three columns yields a match (for example when they
        contain no usable values).

    Note:
        Presumably the three columns have already been normalised the same
        way as the query (lower case, spaces instead of hyphens); otherwise
        scores are computed against differently formatted text.
    """
    query = name.lower().replace("-", " ")

    matches = []
    for column in ("label", "region", "department"):
        # Distinct, non-missing values only: region/department repeat on
        # every row (and can be missing), so scanning the raw column does far
        # more work for the same answer. unique() keeps first-appearance
        # order, so the winning string is the same as on the full column.
        choices = all_names_df[column].dropna().unique().tolist()
        match = process.extractOne(query, choices)
        # extractOne returns None when there is nothing to match against.
        if match is not None:
            matches.append(match)

    if not matches:
        return None

    # Each match is (choice, score, index). max() keeps the first of equally
    # scored entries, so ties resolve label > region > department.
    best_match = max(matches, key=lambda match: match[1])
    return best_match[0]

In [59]:
# Smoke check: a query that is already a normalised station name is expected
# to come back unchanged.
fuzzy_search("paris gare de lyon")

'paris gare de lyon'

In [60]:
# Write the combined city / station table to disk; index=False leaves the
# DataFrame index out of the file.
all_names_df.to_csv("../data/cities_and_train_stations.csv", index=False)

In [82]:
# For the rows of one region, group by nearest station and count how many
# times each identical combination of the remaining columns occurs, then show
# the 20 largest counts as a table.
# NOTE(review): the saved output of this cell lists "bretagne" rows although
# the filter is "île de france" -- the output looks stale; re-run to refresh.
(
    all_names_df[all_names_df["region"] == "île de france"]
    .groupby("nearest_train_station")
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
    .to_frame()
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,count
nearest_train_station,label,latitude,longitude,region,department,class_name,Unnamed: 7_level_1
Lamballe,lamballe armor,48.485735,-2.47299,bretagne,côtes d'armor,city,9
Plénée-Jugon,le mene,48.282633,-2.53119,bretagne,côtes d'armor,city,7
St-Malo,st malo,48.640049,-1.980865,bretagne,ille et vilaine,city,5
Gérard,rives du couesnon,48.284928,-1.345743,bretagne,ille et vilaine,city,4
Pontrieux-Halte,la roche jaudy,48.731682,-3.233237,bretagne,côtes d'armor,city,4
Belle-Isle-Bégard,begard,48.634783,-3.291535,bretagne,côtes d'armor,city,4
Pleine-Fougères,val couesnon,48.440079,-1.462682,bretagne,ille et vilaine,city,4
Plénée-Jugon,jugon les lacs commune nouvelle,48.406111,-2.326902,bretagne,côtes d'armor,city,4
Pontivy,bon repos sur blavet,48.223995,-3.1236,bretagne,côtes d'armor,city,3
Dinan,lanvallay,48.451577,-2.01149,bretagne,côtes d'armor,city,3
