# Reduction Step: Locations

In [1]:
# Import packages
import spacy
from geopy.geocoders import Nominatim
import pandas as pd
import time
import re
import unicodedata
from geopy.exc import GeocoderServiceError, GeocoderTimedOut

## Pipeline 2

Because this pipeline is much simpler for this step, we handle it first.

In [None]:
# Load data
# One text per article: title and lead joined by a space. Missing values are
# replaced by empty strings first so the concatenation never produces NaN.
df = pd.read_excel("../databases/Titles_le_Temps.xlsx")
df['text'] = df['Title'].fillna('') + " " + df['Post_Lead'].fillna('')
# Named `texts` rather than `list`: binding the name `list` hides the built-in
# and makes any later `list(...)` call raise TypeError.
texts = df["text"].to_list()

In [None]:
# Keep the rows whose link starts with the letemps.ch "/suisse/" URL prefix and export them.
# na=False treats a missing Link as "no match"; without it the mask contains NaN
# and the boolean indexing below raises ValueError.
df_suisse = df[df['Link'].str.startswith('https://www.letemps.ch/suisse/', na=False)]
df_suisse.to_excel('../databases/Pipeline2_locations_CH.xlsx')

## Pipeline 1

Load the French language model and define a function to normalize strings.

In [None]:
# Load the spaCy French pipeline "fr_core_news_md"; extraire_lieu reads its named entities.
nlp = spacy.load("fr_core_news_md")

def normalize_str(s):
    """Return ``s`` lower-cased with all combining accents removed.

    Args:
        s (str): Text to normalize.

    Returns:
        str: The normalized text, e.g. ``"Genève"`` becomes ``"geneve"``.
    """
    # NFD splits every accented character into its base letter followed by
    # the combining marks that decorate it.
    decomposed = unicodedata.normalize('NFD', s)
    # Category 'Mn' (nonspacing mark) is exactly those combining accents.
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).lower()

Function to extract the locations from a text.

In [None]:
def extraire_lieu(texte):
    """Return the text of every entity labelled ``LOC`` found in ``texte``.

    Args:
        texte (str): Text to run through the module-level spaCy pipeline ``nlp``.

    Returns:
        list[str]: Entity texts in the order ``doc.ents`` yields them; empty
        when no ``LOC`` entity is found.
    """
    lieux = []
    for entite in nlp(texte).ents:
        if entite.label_ == "LOC":
            lieux.append(entite.text)
    return lieux

The functions below decide whether a place lies inside Switzerland (as an entity of at least village size) or outside Switzerland (as an entity of at least town size). These two lower bounds on the size of the geographical entity were chosen because otherwise the geocoder would almost always find some small geographical entity that carries the name we are looking for.

In [None]:
# Places for which the geocoder raised an error or returned no usable result.
# Filled by verifier_suisse so they can be inspected afterwards.
failed_locations = []

def verifier_suisse(lieu):
    """Return True when ``lieu`` names a Swiss entity of village size or larger.

    The place is geocoded with Nominatim. It is accepted only when the result
    has country code "ch" AND the normalized place name equals the normalized
    value of one of the address fields listed in ``valid_categories``. A mere
    hit somewhere in Switzerland (a street, a shop, ...) is not enough.

    Args:
        lieu (str): Candidate place name.

    Returns:
        bool: True if the place is a matching Swiss entity, False otherwise
        (including when the lookup fails).

    Side effects:
        Sleeps one second before each request and appends ``lieu`` to
        ``failed_locations`` when the geocoder raises or returns no result
        with an address.
    """
    # The bare token "de" is never accepted as a place name (presumably the
    # French preposition left over from splitting entity texts). Checking it
    # first avoids a pointless request.
    if lieu.strip().lower() == "de":
        return False

    # Throttle: the public Nominatim service asks for at most one request per second.
    time.sleep(1)
    geolocator = Nominatim(user_agent="geo_checker")

    try:
        location = geolocator.geocode(lieu, addressdetails=True, timeout=7)
    except (GeocoderServiceError, GeocoderTimedOut):
        failed_locations.append(lieu)
        return False

    if not location or "address" not in location.raw:
        failed_locations.append(lieu)
        return False

    address = location.raw["address"]

    # Check if the location is in Switzerland
    if address.get("country_code", "").lower() != "ch":
        return False

    # Address fields that count as "village or larger".
    valid_categories = (
        "village", "town", "municipality", "city", "county", "district", "state", "region", "country"
    )

    lieu_normalized = normalize_str(lieu)

    # Accept only if the name equals one of those fields. When nothing matches
    # the answer must be False: testing the loop variable for membership in
    # valid_categories is always True, since it is drawn from that collection.
    return any(
        bool(value) and lieu_normalized == normalize_str(value)
        for value in map(address.get, valid_categories)
    )


def verifier_entite_geographique(lieu):
    """Return True when ``lieu`` names a geographical entity of town size or larger.

    Generic words listed in ``blocked_words`` are rejected outright. Every
    other candidate is geocoded with Nominatim and accepted only when its
    normalized name equals the normalized value of one of the address fields
    listed in ``valid_categories``.

    Args:
        lieu (str): Candidate place name.

    Returns:
        bool: True if the name matches such an entity, False otherwise
        (including when the lookup fails for any reason).
    """
    lieu_normalized = normalize_str(lieu)

    # The block list is normalized the same way as the input; otherwise an
    # accented entry such as "planète" could never equal the accent-stripped
    # candidate. It is checked before the lookup so that blocked words cost
    # neither the one-second pause nor a request.
    blocked_words = {
        normalize_str(word)
        for word in (
            "terre", "monde", "earth", "planète", "univers", "place", "town", "municipality",
            "city", "county", "district", "state", "region", "country",
        )
    }
    if lieu_normalized in blocked_words:
        return False

    # Throttle: the public Nominatim service asks for at most one request per second.
    time.sleep(1)
    geolocator = Nominatim(user_agent="geo_entity_checker")

    try:
        location = geolocator.geocode(lieu, addressdetails=True, timeout=7)
    except Exception:
        # Best effort: any lookup failure counts as "not a geographical entity".
        return False

    if not location or "address" not in location.raw:
        return False

    address = location.raw["address"]

    # Address fields that count as "town or larger" (no "village" here).
    valid_categories = ("town", "municipality", "city", "county", "district", "state", "region", "country")

    return any(
        bool(value) and lieu_normalized == normalize_str(value)
        for value in map(address.get, valid_categories)
    )

The function below classifies the places of a text into the categories 0 (Not Switzerland), 1 (Switzerland) and 2 (Not defined). Before comparing, it normalizes the words so that different spellings of the same place are recognized as equal (e.g. Genève => geneve).

In [None]:
def classify_lieux(lst):
    """Classify the places extracted from one text.

    Args:
        lst (list[str]): Place strings extracted from a text.

    Returns:
        int: 1 if at least one word is a known Swiss place (``Suisse``),
        0 if every word is a known non-Swiss place (``Not_Suisse``),
        2 otherwise, including when there are no places or no words at all.
    """
    if not lst:
        return 2

    def get_words(text):
        # Normalize, then split on apostrophes and whitespace ("d'Italie" -> "d", "italie").
        clean = normalize_str(text)
        parts = re.split(r"[’']", clean)
        return [word for part in parts for word in part.split()]

    words = [word for item in lst for word in get_words(item)]

    # Without this guard all() over an empty sequence is True, and a text
    # without any usable word would be classified as 0 (Not Switzerland).
    if not words:
        return 2

    # Both sides of the comparison must be normalized identically: the words
    # above are accent-stripped, so the reference places have to be as well
    # (lower-casing alone leaves "genève", which never equals "geneve").
    # Sets are built once per call instead of one list per word.
    suisse_words = {normalize_str(s) for s in Suisse}
    # NOTE(review): Not_Suisse_final (the confirmed geographical entities) is
    # computed elsewhere in this notebook but never used; confirm whether it
    # should replace Not_Suisse here.
    not_suisse_words = {normalize_str(n) for n in Not_Suisse}

    if any(word in suisse_words for word in words):
        return 1
    if all(word in not_suisse_words for word in words):
        return 0
    return 2

To limit computation time, we first extract all places from the texts, build one list of the places inside Switzerland and one list of the places outside Switzerland, and then check for each title whether it contains places from one of the two lists.

In [None]:
# Load data
# The "text" column joins title and lead with a space; missing values are
# replaced by empty strings first.
test = pd.read_excel("../databases/Titles_le_Temps_LP.xlsx").assign(
    text=lambda frame: frame['Title'].fillna('') + " " + frame['Post_Lead'].fillna('')
)
# NOTE: this name hides Python's built-in `list` for the rest of the session;
# it is kept because other cells may read it.
list = test["text"].tolist()

In [None]:
# Extract places
# Read the texts straight from the DataFrame column instead of going through
# a global named `list`, which hides the built-in of the same name.
lieux_extraits = [extraire_lieu(str(text)) for text in test["text"]]

# Unique single words across all extracted places: every place is split on
# whitespace and then on apostrophes. Empty tokens (produced when a phrase
# starts or ends with an apostrophe) are dropped so they are never geocoded.
flattened_set = {
    word
    for sublist in lieux_extraits
    for item in sublist
    for phrase in item.split()
    for word in re.split(r"[’']", phrase)
    if word
}

In [None]:
# Fill the two lists with places of Switzerland and outside Switzerland
Suisse = []
Not_Suisse = []

# Iterate the set directly (sorted, so runs are reproducible). Calling
# `list(flattened_set)` fails with "TypeError: 'list' object is not callable"
# once the name `list` has been rebound to a Python list by a loading cell.
for place in sorted(flattened_set):
    if verifier_suisse(place):
        Suisse.append(place)
    else:
        Not_Suisse.append(place)

In [None]:
# Check whether the places that were not found in Switzerland are geographical entities
flattened_list_NCH = set(Not_Suisse)

# Confirmed geographical entities go to Not_Suisse_final, everything else to rest.
Not_Suisse_final = []
rest = []

for lieu in flattened_list_NCH:
    if not verifier_entite_geographique(lieu):
        rest.append(lieu)
        continue
    Not_Suisse_final.append(lieu)

### Classification of the places into the category 0 (Not Switzerland), 1 (Switzerland), 2 (Not defined)

In [None]:
# Extract the places of the texts and classify them
test['lieux'] = test['text'].map(lambda text: extraire_lieu(str(text)))

test['Classification'] = test['lieux'].map(classify_lieux)

# Keep everything that is not classified 0 (Not Switzerland), i.e. classes 1 and 2.
locations_CH = test.loc[test['Classification'].ne(0)]
# NOTE(review): every other path in this notebook lives under ../databases/;
# confirm that the 'shelfens/' directory is the intended destination.
locations_CH.to_excel('shelfens/Pipeline1_locations_CH.xlsx')