In [30]:
import pandas as pd
from tqdm.notebook import tqdm
from rapidfuzz import process

In [31]:
import os
import sys

# Directory two levels above the current working directory; it is added to
# sys.path below so that the `src` package can be imported from this notebook.
src_dir = os.path.join(os.getcwd(), "..", "..")
from importlib import reload
sys.path.append(os.path.abspath(src_dir))

import src.helpers.city_to_train_station

# Re-execute the already-imported module so that edits made to it on disk are
# picked up without restarting the kernel.
reload(src.helpers.city_to_train_station)

# Imported after the reload so the name is bound to the refreshed class.
from src.helpers.city_to_train_station import VisualizerHelper

In [27]:
# Raw inputs: the station list is semicolon-separated, the city list uses the
# default comma separator.
train_station_df = pd.read_csv('../data/liste-des-gares.csv', sep=';')
cities_df = pd.read_csv('../data/cities.csv')


# Cities: keep the naming/location columns, shorten the region/department
# column names and tag every row as a city.
city_names_df = (
    cities_df[["label", "latitude", "longitude", "region_name", "department_name"]]
    .rename(columns={"region_name": "region", "department_name": "department"})
    .assign(class_name="city")
)

# Stations: the station name becomes the label, and the "lat,lon" text stored
# in C_GEO is split into two float columns.
train_station_names_df = train_station_df[["LIBELLE"]].rename(columns={"LIBELLE": "label"})
train_station_names_df[["latitude", "longitude"]] = (
    train_station_df["C_GEO"].str.split(',', expand=True).astype(float)
)
train_station_names_df["class_name"] = "train_station"

In [28]:
# Stack cities and train stations into one frame with a fresh 0..n-1 index.
# Columns present in only one input (region/department) are NaN for the other.
all_names_df = pd.concat([city_names_df, train_station_names_df], ignore_index=True)

In [29]:
helper = VisualizerHelper()

# Distinct city labels only: several rows can share a label, and the helper is
# looked up by label, so one call per distinct label is enough.
city_labels = (
    all_names_df.loc[all_names_df["class_name"] == "city", "label"].dropna().unique()
)

# label -> name (LIBELLE) of the first station returned by the helper.
nearest_station_by_label = {
    label: helper.get_nearest_station(label).LIBELLE.values[0]
    for label in tqdm(city_labels, unit="name")
}

# A single vectorised assignment replaces the per-row boolean-mask write, which
# rescanned the whole frame for every city. As before, every row whose label
# matches a city label receives the value (station rows included); all other
# rows are left as NaN.
all_names_df["nearest_train_station"] = all_names_df["label"].map(nearest_station_by_label)

KeyError: 'class_name'

In [12]:
def sanitize_string(string):
    """Lower-case the input and replace every hyphen with a space.

    Parameters
    ----------
    string : str or pandas.Series
        A single string, or a Series of strings to normalise element-wise.

    Returns
    -------
    str or pandas.Series
        The normalised value, of the same kind as the input. Missing entries
        in a Series stay missing.
    """
    if isinstance(string, pd.Series):
        # A Series has no .lower()/.replace(old, new) with string semantics;
        # go through the vectorised .str accessor. regex=False keeps "-" a
        # literal character, matching str.replace.
        return string.str.lower().str.replace("-", " ", regex=False)
    return string.lower().replace("-", " ")

In [10]:
# Normalise the searchable text columns. The helper is applied element-wise:
# handing it a whole Series would fail on `.lower()`. na_action="ignore" keeps
# missing values (e.g. region/department on station rows) as NaN instead of
# passing them to the helper.
for column in ("label", "region", "department"):
    all_names_df[column] = all_names_df[column].map(sanitize_string, na_action="ignore")

# Remove cities for which no nearest station was recorded.
is_city = all_names_df["class_name"] == "city"
has_no_station = all_names_df["nearest_train_station"].isna()
all_names_df = all_names_df.drop(index=all_names_df.index[is_city & has_no_station])

# Station rows get a "-" placeholder in the nearest-station column.
all_names_df.loc[all_names_df["class_name"] == "train_station", "nearest_train_station"] = "-"


In [13]:
def fuzzy_search(name):
    """Return the known name that best fuzzy-matches ``name``.

    The query is normalised with ``sanitize_string`` and compared against the
    ``label``, ``region`` and ``department`` columns of ``all_names_df``. The
    best match of each column is collected and the one with the highest score
    wins; on equal scores the earlier column in that order is preferred.

    Parameters
    ----------
    name : str
        Free-text place name to look up.

    Returns
    -------
    str or None
        The best-matching value, or ``None`` when no column yields a match.
    """
    query = sanitize_string(name)

    best_per_column = []
    for column in ("label", "region", "department"):
        # Missing values cannot be matched and repeated values cannot change
        # the winner, so only distinct non-null choices are scored.
        choices = all_names_df[column].dropna().unique()
        match = process.extractOne(query, choices)
        # extractOne yields None when there is nothing to match against.
        if match is not None:
            best_per_column.append(match)

    if not best_per_column:
        return None

    # Each match is (choice, score, index); max() keeps the first of equal scores.
    return max(best_per_column, key=lambda match: match[1])[0]

In [26]:
# Quick manual check of the lookup with a train-station style query.
fuzzy_search("paris gare de lyon")

'paris gare de lyon'

In [11]:
# Persist the combined city/station table; index=False leaves the row index out of the file.
all_names_df.to_csv("../data/cities_and_train_stations.csv", index=False)