# Reduction Step : Keywords 

In [None]:
# Import packages
import pandas as pd
import re
import math
import spacy
from nltk.corpus import stopwords
from collections import Counter

import time
import requests
from bs4 import BeautifulSoup

In [32]:
# Define helper functions and load language models.
def clean_french_text(text, stopword_set=None):
    """Lowercase *text*, replace non-word characters by spaces and drop stopwords.

    Args:
        text: The raw string to clean.
        stopword_set: Optional container of lowercase words to remove. When
            omitted, the notebook-level ``french_stopwords`` set is used, which
            is the behaviour existing callers rely on.

    Returns:
        The remaining words joined by single spaces (an empty string when
        nothing is left).
    """
    if stopword_set is None:
        # Looked up at call time, so the global only has to exist once the
        # function is actually used.
        stopword_set = french_stopwords

    # \W+ collapses every run of punctuation/whitespace into one separator;
    # accented letters count as word characters for str patterns.
    tokens = re.sub(r'\W+', ' ', text.lower()).split()
    return ' '.join(token for token in tokens if token not in stopword_set)

# spaCy French pipeline; used below to lemmatise the corpus and for the
# .similarity() comparisons against the reference words.
nlp = spacy.load("fr_core_news_md")
# NLTK's French stopword list, stored as a set; read by clean_french_text().
french_stopwords = set(stopwords.words("french"))

## Creation of list of Keywords

The keywords were extracted from the list of deaths during police interventions that Border Forensics collected manually. This list is not public, so the Excel file is not available on GitHub. 

In [None]:
# Read the dataset of Border Forensics.
# The spreadsheet is not published with the repository, so this cell only runs
# for users who have the file locally.
df = pd.read_excel("../NotInDatabase/Border_Forensics_Database.xlsx")  
# Parse the 'date' column into pandas datetimes.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Build one text per entry (title + ". " + lead) and keep only the rows whose
# main language is French.
# NOTE(review): concatenating a string with a missing value yields NaN, so a
# row lacking either "Titre" or "Lead_posts" is removed entirely by the
# dropna() below — confirm this is intended.
df["text"] = df["Titre"]+". " + df["Lead_posts"]
df_fr = df[(df["Main language"] == "Fr")]

# Clean every French text and join all of them into one string, so that the
# spaCy pipeline is run a single time over the whole corpus.
text = ' '.join(df_fr['text'].dropna().apply(clean_french_text))
doc = nlp(text)

# Keep the lowercased lemma of every token that is neither punctuation nor
# whitespace.
words = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_space]

# Count how often each lemma occurs; the counts show how important each word
# is in the corpus.
word_freq = Counter(words)

The similarity thresholds of 0.4 for "police" and 0.5 for "mort" (death) were selected based on manual inspection of the resulting word lists at various cutoff values. These thresholds appeared to produce the most relevant list of words. 

In [27]:
# Similarity cut-offs for the two reference words.
POLICE_SIMILARITY_THRESHOLD = 0.4
MORT_SIMILARITY_THRESHOLD = 0.5

police_ref = nlp("police")
mort_ref = nlp("mort")

# Run the pipeline once per distinct word and reuse the parsed result for both
# reference words (previously every word was parsed twice). Words whose vector
# has zero norm are left out up front: Doc.similarity() returns 0.0 for them,
# which is below both thresholds, and emits a warning for each one.
vocabulary = list(word_freq)
word_docs = {
    word: word_doc
    for word, word_doc in zip(vocabulary, nlp.pipe(vocabulary))
    if word_doc.vector_norm
}

# Extract the words with similarity higher than 0.4 to the word "police"
police_related_words = {word: word_freq[word] for word, word_doc in word_docs.items()
                        if word_doc.similarity(police_ref) > POLICE_SIMILARITY_THRESHOLD}
print(police_related_words)

# Extract the words with similarity higher than 0.5 to the word "mort" (death)
mort_related_words = {word: word_freq[word] for word, word_doc in word_docs.items()
                      if word_doc.similarity(mort_ref) > MORT_SIMILARITY_THRESHOLD}
print(mort_related_words)

  if nlp(word).similarity(police_ref) > 0.4}


{'policier': 7, 'instruction': 1, 'police': 11, 'agent': 1, 'défense': 2, 'accusation': 1, 'victime': 1, 'procureur': 2, 'violence': 2, 'fonctionnaire': 1, 'réclusion': 1, 'intervention': 2, 'garde': 2, 'justice': 1, 'militaire': 1, 'prison': 1, 'administratif': 1, 'manifestation': 1, 'interpellation': 1, 'arrestation': 1, 'frontière': 1, 'ministère': 1, 'tribun': 1, 'commissariat': 1, 'affaire': 1, 'plainte': 2, 'enquête': 1, 'secour': 1}


  if nlp(word).similarity(mort_ref) > 0.5}


{'rafle': 1, 'mort': 10, 'décéder': 4, 'décès': 2, 'prison': 1, 'mourir': 2}


The handmade list was created to complement the lists extracted from the texts. It is based on words that came up frequently in the discussion with the expert and can be extended if needed.

In [30]:
# Manually chosen keywords. Some entries are truncated stems ('arrest',
# 'acquitt'), which works because policier_lethal() does substring matching.
# Entries containing an apostrophe use the typographic character ’ (U+2019),
# not the straight '.
lst_handmade = ['tuer', 'cellule', 'arrest', 'officier',  'garde à vue',  'balles', 'tir', 'coup de feu', 'acquitt', 'légitime défense', 'classement', 'plaquage ventral', 'contrôle d’identité', 'détention', 'forces de l’ordre']

In [31]:
# Merge both similarity-based vocabularies with the handmade list. Unpacking a
# dict yields its keys, and building a set removes duplicates.
list_keywords = {*police_related_words, *mort_related_words, *lst_handmade}
print(list_keywords)

{'classement', 'cellule', 'manifestation', 'accusation', 'police', 'garde', 'commissariat', 'affaire', 'garde à vue', 'arrestation', 'légitime défense', 'interpellation', 'forces de l’ordre', 'instruction', 'plaquage ventral', 'tribun', 'policier', 'balles', 'tir', 'réclusion', 'tuer', 'prison', 'mourir', 'décéder', 'justice', 'victime', 'fonctionnaire', 'militaire', 'ministère', 'violence', 'administratif', 'décès', 'coup de feu', 'détention', 'intervention', 'officier', 'agent', 'défense', 'frontière', 'acquitt', 'rafle', 'procureur', 'secour', 'enquête', 'arrest', 'plainte', 'mort', 'contrôle d’identité'}


## Reduction on those keywords

As the Border Forensics list is confidential, the set of keywords extracted with the code above has been copied directly into the code, so that it also works without the Border Forensics database that served as a reference. 

In [33]:
# Hard-coded keyword set, so that the cells below run without the confidential
# source spreadsheet. It is passed to policier_lethal() when filtering titles.
set_keywords = {'administratif', 'interpellation', 'procureur', 'classement', 'tir', 'agent', 'défense', 'mourir', 'décès', 'accusation', 'mort', 'tuer', 'garde à vue', 'enquête', 'intervention', 'militaire', 'détention', 'acquitt', 'justice', 'forces de l’ordre', 'réclusion', 'tribun', 'plainte', 'police', 'arrestation', 'commissariat', 'officier', 'balles', 'violence', 'policier', 'affaire', 'coup de feu', 'arrest', 'contrôle d’identité', 'fonctionnaire', 'légitime défense', 'instruction', 'manifestation', 'garde', 'secour', 'frontière', 'décéder', 'plaquage ventral', 'prison', 'rafle', 'victime', 'ministère', 'cellule'}

In [37]:
def _normalise_for_matching(text):
    """Lowercase *text*, unify apostrophes and collapse runs of whitespace."""
    # Titles and keywords may use either the straight (') or the typographic
    # (’) apostrophe; split() also treats non-breaking spaces as whitespace.
    return ' '.join(text.lower().replace('’', "'").split())


def policier_lethal(title, set_keywords):
    """Return True when *title* contains at least one of the keywords.

    Matching is a case-insensitive substring search over the whole title, so
    truncated stems such as 'arrest' hit 'arrestation' and multi-word keywords
    such as 'coup de feu' can match. The previous per-token search could never
    match a keyword containing a space, because each token was compared on its
    own.

    Args:
        title: Article title. Anything that is not a ``str`` (for example a
            missing value) is treated as containing no keyword.
        set_keywords: Iterable of keyword strings.

    Returns:
        bool: True if any keyword occurs in the title, otherwise False.
    """
    if not isinstance(title, str):
        return False

    haystack = _normalise_for_matching(title)
    return any(_normalise_for_matching(kw) in haystack for kw in set_keywords)

In [None]:
# Load the scraped titles and keep only the columns used below.
df = pd.read_excel("../databases/Titles_Le_Temps.xlsx")  
df = df[["Title", "Link", "Date"]]

In [38]:
# Store the titles with pandas' dedicated string dtype.
df = df.astype({'Title': 'string'})
# 'Date' is converted to text and parsed with the year+month format '%Y%m',
# so the resulting timestamps carry no day-level information.
df['Datetime'] = pd.to_datetime(df['Date'].astype(str), format='%Y%m')

In [None]:
# Flag every title that contains at least one keyword. The extra positional
# argument is forwarded to policier_lethal() by Series.apply; astype(bool)
# guarantees a boolean mask for the filter below.
df['is_police'] = df['Title'].apply(policier_lethal, args=(set_keywords,)).astype(bool)

# Keep only the flagged titles and store this intermediate result.
df_pl = df[df['is_police']]
df_pl.to_excel("../databases/Keyword_extracted.xlsx")

## Adding Lead Posts

The principle of the data scraping was the same as for the titles. To make the process more robust, a sleep delay was introduced, and when a request failed, the code retried several times to scrape the lead post. 

In [None]:
# NOTE: the original author was unsure whether this cell is the final version
# of the scraper.
links = df_pl['Link'].to_list()
data = []          # one dict per scraped lead paragraph
missed_data = []   # links that still failed after all retries

headers = {"User-Agent": "Mozilla/5.0"}
max_retries = 5        # prevent infinite loops
request_timeout = 30   # seconds; without a timeout a stalled connection blocks forever
base_delay = 1         # seconds; doubled after every failed attempt

# A single session lets the HTTP connection be reused across requests.
with requests.Session() as session:
    for url in links:
        success = False

        for attempt in range(max_retries):
            # Exponential back-off: 1 s, 2 s, 4 s, ... gives a rate-limiting
            # server progressively more room than a fixed one-second pause.
            delay = base_delay * 2 ** attempt

            try:
                response = session.get(url, headers=headers, timeout=request_timeout)

                if response.status_code == 429:
                    print(f"Rate limited (429) on {url}, retrying...")
                    time.sleep(delay)
                    continue

                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")

                post_lead = soup.find(class_="post__lead")
                if post_lead:
                    # One entry per <p> inside the lead section. The paragraph
                    # text is stored under the key "Title" because the cell
                    # that builds the dataframe expects that column name.
                    for paragraph in post_lead.select("p"):
                        data.append({"Title": paragraph.get_text(strip=True), "Link": url})
                else:
                    print(f"No 'lead_post' section found on {url}")

                success = True  # Success: exit loop
                break

            except requests.exceptions.RequestException as e:
                # Covers connection errors, timeouts and HTTP error statuses.
                print(f"Error fetching {url}: {e}, retrying...")
                time.sleep(delay)

        if not success:
            print(f"Failed after retries: {url}")
            missed_data.append(url)

Again, the dataframes can be exported to Excel files to keep track of the intermediate steps. This step had to be repeated several times, replacing 'links' with the links in 'missed_data' and merging the resulting dataframes afterwards, because the server couldn't handle that many requests. In the end, there were still 408 / 35553 titles to which no lead post could be attributed. However, some of the articles do not have a lead post at all, which is why none could be extracted for them. 

In [None]:
# Scraped paragraphs, one row per <p> of a lead section. The scraper stores the
# paragraph text under the key "Title", so it is renamed to what it contains.
lead_posts = pd.DataFrame(data=data, columns=["Title", "Link"]).rename(columns={"Title": "Post_Lead"})
missed_data = pd.DataFrame(data=missed_data, columns=["Link"])

# Re-attach the article title and date from the keyword-filtered titles.
# Selecting 'Datetime' and 'Post_Lead' straight from the scraped frame raised a
# KeyError, because that frame only holds the paragraph text and the link.
# The left join keeps articles without a scraped lead (Post_Lead is NaN).
df = df_pl[['Title', 'Link', 'Datetime']].merge(lead_posts, on='Link', how='left')
df = df[['Title', 'Link', 'Datetime', 'Post_Lead']]

# NOTE(review): every other path in this notebook starts with "../databases/";
# confirm whether the missing "../" here is intended.
df.to_excel("databases/LeadPosts_Le_Temps.xlsx")
df.head()