In [1]:
import os


# Working directory that holds the transcription JSON files read by this notebook.
# The default is the path the notebook was originally written against; set the
# TRANSCRIPTIONS_DIR environment variable to run it from a different checkout
# without editing this cell.
TRANSCRIPTIONS_DIR = os.environ.get(
    "TRANSCRIPTIONS_DIR",
    '/Users/carolinarutilidelima/Documents/GitHub/co-pilot-mind/data/transcriptions/temp/multi',
)
os.chdir(TRANSCRIPTIONS_DIR)

In [2]:
import json
import re
from collections import defaultdict
import spacy
from spacy.matcher import Matcher


In [3]:
# spaCy's multilingual ("xx") pipeline; the resulting `nlp` object is reused
# by the parsing and matching cells below
SPACY_MODEL_NAME = "xx_ent_wiki_sm"
nlp = spacy.load(SPACY_MODEL_NAME)


In [5]:
# Read the transcript JSON from the current working directory
# (point transcript_filename at another file to analyse a different recording)
transcript_filename = "sod-gru_mixed_transcription_with_turbo_model_multi.json"
with open(transcript_filename, mode="r", encoding="utf-8") as file:
    data = json.loads(file.read())


In [6]:
# Combine the text from all segments into one string and parse it.
#
# Whitespace is normalised before parsing: if a segment's text carries leading
# or trailing spaces (or line breaks), joining with " " leaves runs of
# whitespace in the result. spaCy keeps such runs as separate whitespace
# tokens, and a whitespace token sitting between two words prevents the
# multi-token Matcher patterns defined below from matching across it.
raw_text = " ".join(segment["text"] for segment in data.get("segments", []))
full_text = " ".join(raw_text.split())  # collapse every whitespace run to one space
doc = nlp(full_text)


In [7]:
# Create the rule-based Matcher on the vocabulary of the loaded pipeline
pipeline_vocab = nlp.vocab
matcher = Matcher(pipeline_vocab)

In [8]:
# --- Location patterns (ORIGIN / DESTINATION) ---
# Every pattern is one or two lower-cased anchor words followed by a run of
# title-cased tokens, which is taken to be the place name.

def _title_run():
    """Return a fresh token spec matching one or more title-cased tokens."""
    return {"IS_TITLE": True, "OP": "+"}


# ORIGIN — Portuguese: "(a) aeronave de [Origin]"
pattern_origin_pt = [{"LOWER": "aeronave"}, {"LOWER": "de"}, _title_run()]
# ORIGIN — English: "from [Origin]" / "origin [Origin]"
pattern_origin_en = [{"LOWER": {"IN": ["from", "origin"]}}, _title_run()]

# DESTINATION — Portuguese: "para [Destination]"
pattern_destination_pt = [{"LOWER": "para"}, _title_run()]
# DESTINATION — English: "to [Destination]" / "destination [Destination]"
pattern_destination_en = [{"LOWER": {"IN": ["to", "destination"]}}, _title_run()]

# Register both language variants under one label per rule
for rule_label, rule_patterns in (
    ("ORIGIN", [pattern_origin_pt, pattern_origin_en]),
    ("DESTINATION", [pattern_destination_pt, pattern_destination_en]),
):
    matcher.add(rule_label, rule_patterns)

In [11]:
# --- Numeric patterns ---
# Each rule is a fixed keyword phrase, an optional connector ("of" / "de"),
# and a number-like token. English and Portuguese variants are registered
# under the same label.

# AIRSPEED patterns (e.g., "Airspeed VF de 111")
pattern_airspeed_en = [
    {"LOWER": "airspeed"},
    {"LOWER": "vf"},
    {"LOWER": {"IN": ["of"]}, "OP": "?"},
    {"LIKE_NUM": True}
]
pattern_airspeed_pt = [
    {"LOWER": "airspeed"},
    {"LOWER": "vf"},
    {"LOWER": {"IN": ["de"]}, "OP": "?"},
    {"LIKE_NUM": True}
]
matcher.add("AIRSPEED", [pattern_airspeed_en, pattern_airspeed_pt])

# VERTICAL SPEED patterns (e.g., "vertical speed 300" or "velocidade vertical 300")
pattern_vertical_speed_en = [
    {"LOWER": "vertical"},
    {"LOWER": {"IN": ["speed"]}, "OP": "?"},
    {"LIKE_NUM": True}
]
# "vertical" is mandatory here: if it were optional, any "velocidade <number>"
# would be reported as a vertical speed. The connector "de" is optional, as in
# the other Portuguese patterns ("velocidade vertical de 300").
pattern_vertical_speed_pt = [
    {"LOWER": "velocidade"},
    {"LOWER": "vertical"},
    {"LOWER": {"IN": ["de"]}, "OP": "?"},
    {"LIKE_NUM": True}
]
matcher.add("VERTICAL_SPEED", [pattern_vertical_speed_en, pattern_vertical_speed_pt])

# CRUISE SPEED patterns (e.g., "cruise speed 450" or "velocidade de cruzeiro 450")
pattern_cruise_speed_en = [
    {"LOWER": "cruise"},
    {"LOWER": "speed"},
    {"LOWER": {"IN": ["of"]}, "OP": "?"},
    {"LIKE_NUM": True}
]
pattern_cruise_speed_pt = [
    {"LOWER": "velocidade"},
    {"LOWER": "de"},
    {"LOWER": "cruzeiro"},
    {"LOWER": {"IN": ["de"]}, "OP": "?"},
    {"LIKE_NUM": True}
]
matcher.add("CRUISE_SPEED", [pattern_cruise_speed_en, pattern_cruise_speed_pt])


# ALTITUDE patterns (e.g., "altitude 8000 pés" or "altitude 8000 feet")
pattern_altitude_en = [
    {"LOWER": "altitude"},
    {"LIKE_NUM": True},
    {"LOWER": {"IN": ["feet", "ft"]}, "OP": "?"}
]
pattern_altitude_pt = [
    {"LOWER": {"IN": ["altitude", "altura"]}},
    {"LOWER": {"IN": ["de"]}, "OP": "?"},
    {"LIKE_NUM": True},
    {"LOWER": {"IN": ["pés", "ft"]}, "OP": "?"}
]
matcher.add("ALTITUDE", [pattern_altitude_en, pattern_altitude_pt])

# Run the matcher on the document; yields (match_id, start, end) tuples
# covering every rule registered on `matcher` so far
matches = matcher(doc)


In [13]:
# Prepare a dictionary to hold the extracted data; a field stays None when
# no pattern for it matched
extracted_info = {
    "origin": None,
    "destination": None,
    "airspeed": None,
    "vertical_speed": None,
    "cruise_speed": None,
    "altitude": None,
}


# Process the matches.
# Patterns that use the "+" operator make the Matcher report every partial
# match too (e.g. both "para São" and "para São Paulo"). Sorting by
# (start, end) processes the longest match for a given start last, so it is
# the one that is kept; a mention that starts later in the transcript still
# overrides an earlier one.
for match_id, start, end in sorted(matches, key=lambda match: (match[1], match[2])):
    span = doc[start:end]
    label = nlp.vocab.strings[match_id]
    field = label.lower()

    if label in ("AIRSPEED", "VERTICAL_SPEED", "CRUISE_SPEED", "ALTITUDE"):
        # For numeric values, keep the first number-like token in the span
        number_text = next((token.text for token in span if token.like_num), None)
        if number_text is not None:
            extracted_info[field] = number_text
    elif label in ("ORIGIN", "DESTINATION"):
        # The location is the run of title-cased tokens that ends the span.
        # Walk backwards from the end, never including the first token of the
        # span (always an anchor word such as "para" or "from"), so multi-word
        # names like "São Paulo" are kept whole instead of only their last word.
        name_start = end
        while name_start - 1 > start and doc[name_start - 1].is_title:
            name_start -= 1
        if name_start < end:
            extracted_info[field] = doc[name_start:end].text

# Print the extracted flight information
print("Extracted Flight Information:")
for key, value in extracted_info.items():
    print(f"{key.capitalize()}: {value}")

Extracted Flight Information:
Origin: Sorocaba
Destination: Guarulhos
Airspeed: None
Vertical_speed: None
Cruise_speed: None
Altitude: None
