# Component One: Voice Recorder

In [1]:
import os
import wave
import json

import numpy as np
import sounddevice
from scipy.io.wavfile import write
from vosk import Model, KaldiRecognizer, SetLogLevel

## Step 1: Record Voice

This component will record an audio sample, and save it to an audio file.

### Process
1. Run the cell below
2. Input a number for the amount of time you want the recorder to run
3. (Maybe) grant access for the use of your devices microphone
4. Speak
5. The cell will automatically stop running after the time is completed

### Details
- The number of channels is specific for your device
- The function creates a new file or rewrites the existing one

In [2]:
fs= 44100
second =  int(input("Enter time duration in seconds: "))
print("Recording.....n")
record_voice = sounddevice.rec( int ( second * fs ) , samplerate = fs , channels = 1, dtype=np.int16 ) # might be different depending on machine
sounddevice.wait()
write("audio_file.wav",fs,record_voice)
print("Finished.....nPlease check your output file")

Recording.....n
Finished.....nPlease check your output file


## Step 2: Transcribe Audio

This component takes an audio file and writes its content to a text file

### Process
1. Download the vosk model. The linto french model is suggested
2. Run the cell below

### Details
- the model must be downloaded and have the appropriate name
- the file names are provided at the beginning of the cell

In [4]:
import os
from icecream import ic

model_name = "linto"
audio_file = './audio_file.wav'
text_file = 'transcription.txt'

model_path = "../../app/app/back/nlp/models/{}".format(model_name)
ic(os.getcwd())

if not os.path.exists(model_path):
    print("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
    exit(1)

wf = wave.open(audio_file, "rb")

if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print ("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model(model_path)
rec = KaldiRecognizer(model, wf.getframerate())
# rec.SetMaxAlternatives(10)
# rec.SetWords(True)

result = []
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        result.append(json.loads(rec.Result()))

# ret = [sentence["alternatives"][0]["text"] for sentence in result]
ret = result[0]["text"]

with open(text_file, 'w') as file:
    file.write(ret)
file.close()

ic| os.getcwd(): '/Users/ryanheadley/epitech/tor_2021_3/docs/jupyter-components'
LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 1 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 2 orphan components.
LOG (VoskAPI:Collapse():nnet-utils.cc:1488) Added 1 components, removed 2
LOG (VoskAPI:CompileLooped():nnet-compile-looped.cc:345) Spent 0.0880129 seconds in looped compilation.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from ../../app/app/back/nlp/models/linto/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:278) Loading HCLG from ../../app/app/back/nlp/models/linto/gr

In [7]:
with open(text_file, 'r') as f:
    print(f.readlines())
f.close()

['deux aller de paris à lyon en train']


# Component Two: Natural Language Processing

In [5]:
from langdetect import detect
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import spacy
from nltk import word_tokenize

This component takes a text file and extracts a travel request destination and departure

### Process
1. Run the cell below

### Details
- the model must be downloaded and have the appropriate name

In [8]:
text_file = 'transcription.txt'
model = SentenceTransformer('all-MiniLM-L6-v2')
nlp = spacy.load("fr_core_news_md")

words_before_departure = ['de', 'depuis', 'provenance']
words_before_destination = ['à', 'a', 'en', 'jusqu\'a', 'vers', 'par']
example_travel_sentence = ['Je veux prendre un train de paris à lyon']

def get_cities(sentence):
    """ Take a sentence and return all cities within

    Args:
        sentence (str): any sentence

    Returns:
        Array: A list of cities
    """
    cities = []
    doc = nlp(sentence)
    for entity in doc.ents:
        if entity.label_ == "LOC":
            cities.append(entity.text)
    
    return cities

def check_for_travel_request(sentences):
    """ Take a list of sentences and return the sentence
        containing a request to travel by train

    Args:
        sentences (Array<str>): List of sentences

    Returns:
        str: travel request sentence or SPAM
    """
    sentence_embeddings = model.encode(sentences)
    real_sentence_embedding = model.encode(example_travel_sentence)
    similarities = cosine_similarity(
        [real_sentence_embedding[0]],
        sentence_embeddings
    )
    biggest_number = max(similarities[0])
    if biggest_number < 0.75:
        return False
    best_sentence_ind = np.where(similarities[0] == biggest_number)
    return sentences[best_sentence_ind[0][0]]

def get_destination_and_departure():
    """Takes the pre-defined text file and determines 
    the destination and departure

    Returns:
        dict: of destination and departure 
        OR False if text not valid
    """
    # read the text file
    file = open(text_file, "r")
    sentences = file.read()
    file.close()
    if '.' in sentences:
        sentences.split('.')
    else:
        sentences = [sentences]

    if detect(sentences[0]) != 'fr':
        return False
    
    # check for travel request
    request = check_for_travel_request(sentences)
    if not request:
        return False
    
    # get destination and departure
    departure = []
    destination = []
    
    cities = get_cities(request)
    words = word_tokenize(request)
    for city in cities:
        index = words.index(city)
        if index == 0: continue
        if words[index-1] in words_before_departure: departure.append(city)
        elif words[index-1] in words_before_destination: destination.append(city)
    
    return {
        "departure": departure[0],
        "destination": destination[0]
    }
    
journey = get_destination_and_departure()
journey

{'departure': 'paris', 'destination': 'lyon'}

# Component Three: Pathfinder

This component takes a departure and destination, and calculates the fastest path between the nearest train stations to these locations.

### Process
1. Run the cell below
2. Wait until the calulation is done.
---

### Details
1. May take some time to generate the itinary depending on the number of nodes between the **departure** and the **arrival**.

In [9]:
import pandas as pd
import requests
from datetime import datetime
from datetime import timedelta
import pickle
from geopy.geocoders import Nominatim
import mpu

## Step 1: Data Generation:

Generate or load the **train stations** data.

In [10]:
def load_cities():
    cities = None
    f = open("cities.pkl", "rb")
    cities = pickle.load(f)
    f.close()
    if cities is None:
        stops = pd.read_csv('../jupyter-notebook/data/data_sncf/stops.txt', sep=",")
        stops = stops[stops['stop_id'].str.contains('StopPoint:OCETrain')]
        stops = stops.set_index('stop_id').T.to_dict()
        cities = {}

        for city in list(stops.items()):
            cities.update({
                city[0]: {
                    "stop_name": city[1]["stop_name"],
                    "coord": [city[1]["stop_lat"], city[1]["stop_lon"]],
            }})
        cities_file = open("cities.pkl", "wb")
        pickle.dump(cities, cities_file)
        cities_file.close()
    return cities

In [11]:
def load_trips():
    trips = None
    f = open("trips.pkl", "rb")
    trips_tmp = pickle.load(f)
    f.close()

    if trips_tmp is None:
        stop_times = pd.read_csv('./../data/data_sncf/stop_times.csv', sep=",")
        trips = pd.read_csv('./../data/data_sncf/trips.csv', sep=",")

        stop_times = stop_times[stop_times['stop_id'].str.contains('StopPoint:OCETrain')]

        trips = trips.drop(labels=["service_id", "block_id", "shape_id", "trip_headsign"], axis=1)

        trips = trips.set_index('trip_id').T.to_dict()

        trips_tmp = {}
        for trip in list(trips.items()):
            trips_tmp.update({trip[0]:{"nodes": []}})
            selected_stop_times = stop_times.loc[stop_times['trip_id'] == trip[0]]
            for trip_tmp in selected_stop_times.iterrows():
                trips_tmp[trip[0]]["nodes"].append({"trip_id": trip[0], "stop_id": trip_tmp[1]["stop_id"],"arrival_time": trip_tmp[1]["arrival_time"]})
    trips_file = open("trips.pkl", "wb")
    pickle.dump(trips_tmp, trips_file)
    trips_file.close()
    return trips_tmp

Generate or load the **graph**:

In [12]:
def load_graph():
    routes_graph = None
    f = open("graph.pkl", "rb")
    routes_graph = pickle.load(f)
    f.close()

    trips_tmp = load_trips()
    cities = load_cities()

    if routes_graph is None:
        routes_graph = {}
        for route in list(trips_tmp.items()):
            for i in range (0, len(route[1]["nodes"])):
                city_id = route[1]["nodes"][i]["stop_id"]
                city = cities[city_id]
                if city_id not in routes_graph:
                    routes_graph.update({city_id: []})
                    if i != 0:
                        routes_graph[city_id].append({route[1]["nodes"][i - 1]["stop_id"]: get_duration_node_to_node(route[1]["nodes"][i - 1]["arrival_time"], route[1]["nodes"][i]["arrival_time"])})
                    if i != len(route[1]["nodes"]) - 1:
                        routes_graph[city_id].append({route[1]["nodes"][i + 1]["stop_id"]: get_duration_node_to_node(route[1]["nodes"][i]["arrival_time"], route[1]["nodes"][i + 1]["arrival_time"])})
                elif route[1]["nodes"][i - 1]["stop_id"] not in routes_graph[city_id] and i != 0:
                    routes_graph[city_id].append({route[1]["nodes"][i - 1]["stop_id"]: get_duration_node_to_node(route[1]["nodes"][i - 1]["arrival_time"], route[1]["nodes"][i]["arrival_time"])})
                elif i < len(route[1]["nodes"]) - 1 and route[1]["nodes"][i + 1]["stop_id"] not in routes_graph[city_id]:
                    routes_graph[city_id].append({route[1]["nodes"][i + 1]["stop_id"]: get_duration_node_to_node(route[1]["nodes"][i]["arrival_time"], route[1]["nodes"][i + 1]["arrival_time"])})
    graph_file = open("graph.pkl", "wb")
    pickle.dump(routes_graph, graph_file)
    graph_file.close()
    return routes_graph

## Step 2: Using the Generated Data:

Function for **duration** of travel between **two nodes**:

In [13]:
def get_duration_node_to_node(origin, dest):
    start_minute = int(origin.split(":")[1])
    start_hour = int(origin.split(":")[0])

    arrival_minute = int(dest.split(":")[1])
    arrival_hour = int(dest.split(":")[0])

    start_delta = timedelta(
        seconds=0,
        minutes=start_minute,
        hours=start_hour,
     )

    arrival_delta = timedelta(
        seconds=0,
        minutes=arrival_minute,
        hours=arrival_hour,
     )

    return arrival_delta - start_delta

In [14]:
s1 = '23:05:00'
s2 = '25:10:00'

get_duration_node_to_node(s1,s2)

datetime.timedelta(seconds=7500)

Get **geolocation** of a **city**:

In [15]:
def get_geolocation(city):
    """Get the geolocation of a city

    Args:
        city (str): name of a city

    Returns:
        Array<geopy.location.Location>: Name, region, department, country, [lat, long, other]
    """
    geolocator = Nominatim(user_agent="travel_request")
    location = geolocator.geocode(city)
    return location

In [16]:
def get_geo_distance(lat1, lon1, lat2, lon2):
    """Calculate the distance between to geographical points

    Args:
        lat_1 (float): first latitude
        long_1 (float): first longitude
        lat_2 (float): second latitude
        long_2 (float): second longitude
    """
    return mpu.haversine_distance((lat1, lon1), (lat2, lon2))

Find the closest **train station** for the given **cities**:

In [17]:
def get_closest_stations(cities):
    """
    Get closest train station for each cities passed
    Args:
        cities (dict): departure and destination

    Returns:
        dict: departure station and destination station
    """
    stops = pd.read_csv('../jupyter-notebook/data/data_sncf/stops.txt') # TODO: fix path for application
    stops = stops[stops['stop_id'].str.contains('StopPoint:OCETrain')]
    stop_times = pd.read_csv('../jupyter-notebook/data/data_sncf/stop_times.txt')
    geo_departure = get_geolocation(cities["departure"])
    geo_destination = get_geolocation(cities["destination"])

    departure = {
        "current_lat": geo_departure.latitude,
        "current_lon": geo_departure.longitude,
        "stop": "",
        "distance": 99999,
        "arrival_time": 0
    }
    destination = {
        "current_lat": geo_destination.latitude,
        "current_lon": geo_destination.longitude,
        "stop": "",
        "distance": 99999,
        "arrival_time": 0
    }
    for index, row in stops.iterrows():
        distance_to_departure = get_geo_distance(
            departure["current_lat"],
            departure["current_lon"],
            row.stop_lat,
            row.stop_lon
        )
        if distance_to_departure < departure["distance"]:
            departure["stop"] = row.stop_id
            departure["distance"] = distance_to_departure

        distance_to_destination = get_geo_distance(
            destination["current_lat"],
            destination["current_lon"],
            row.stop_lat,
            row.stop_lon
        )
        if distance_to_destination < destination["distance"]:
            destination["stop"] = row.stop_id
            destination["distance"] = distance_to_destination
    return {departure["stop"]: timedelta(hours=0)}, {destination["stop"]: timedelta(hours=0)}

Convert a **route** to a *list* of **train station**:

In [18]:
def convert_route_to_cities(route):
    cities = load_cities()
    route_cities = []
    for city in route["route"]:
        route_cities.append(cities[list(city)[0]]["stop_name"])
    return route_cities


Convert a **train_station** to **stop point**:

In [19]:
def convert_city_to_stop_points(city):
    stops_tmp = pd.read_csv('../jupyter-notebook/data/data_sncf/stops.txt', sep=",")
    stops_tmp = stops_tmp[stops_tmp['stop_id'].str.contains('StopPoint:OCETrain')]
    stops_tmp = stops_tmp[stops_tmp['stop_name'].str.contains(city)]
    stop_points = []
    for stop in stops_tmp.iterrows():
        stop_points.append({stop[1]["stop_id"]: timedelta(hours=0)})
    return stop_points

**Graph Exploration Function:**

In [20]:
def graph_exploration(graph, start, goal):
    cities = load_cities()
    explored = []
    queue = [[start]]

    if start == goal:
        print("Same Node")
        return
    
    valide_routes = []
    while queue:
        path = queue.pop(0)
        node = path[-1]
        
        if node not in explored and node != goal:
            node_id = list(node.keys())[0]
            neighbours = graph[node_id]
            duration = timedelta(hours=0)
            for neighbour in neighbours:
                if list(neighbour)[0] in cities:
                    new_path = list(path)
                    new_path.append(neighbour)
                    queue.append(new_path)
                    duration = duration + neighbour[list(neighbour)[0]]
                    if list(neighbour)[0] == list(goal)[0]:
                        if len(valide_routes) > 25:
                            return valide_routes
                        valide_routes.append({"route": new_path, "duration": duration})
            explored.append(node)
    return valide_routes

Find the **shortest route**:

In [21]:
def get_shortest_route(routes):
    shortest_duration = routes[0]["duration"]
    shortest_route = routes[0]
    for route in routes:
        if route["duration"] < shortest_duration:
            shortest_duration = route["duration"]
            shortest_route = route
    return shortest_route

In [22]:
station1, station2 = get_closest_stations(journey)

if not station1 and not station2:
        result = "Trajet Impossible"
else:
    graph = load_graph()
    routes = graph_exploration(graph, station1, station2)
    route = get_shortest_route(routes)
    result = "{} en {}".format(" -> ".join(convert_route_to_cities(route)), route["duration"])
print(result)

Gare de Paris-Montparnasse 1-2 -> Gare de Versailles-Chantiers -> Gare de Chartres -> Gare de Voves -> Gare de Auneau -> Gare de Paris-Austerlitz -> Gare de Aubrais-(les) -> Gare de Vierzon -> Gare de Bourges -> Gare de Nevers -> Gare de Moulins-sur-Allier -> Gare de Paray-le-Monial -> Gare de Montchanin -> Gare de Dijon-Ville -> Gare de Lyon-Part-Dieu -> Gare de Lyon-Perrache en 0:08:00
