### Nous allons ici faire quelques applications d'augmentation des données textuelles en suivant un tutoriel sur medium 

In [None]:
!pip install nlpaug
!pip install sacremoses

Version de PyTorch qui sera compatible avec CUDA 12.2 que nous avons utilisé 

In [8]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121


Found existing installation: torch 2.6.0+cpu
Uninstalling torch-2.6.0+cpu:
  Successfully uninstalled torch-2.6.0+cpu
Found existing installation: torchvision 0.20.1+cu121
Uninstalling torchvision-0.20.1+cu121:
  Successfully uninstalled torchvision-0.20.1+cu121
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.5.1+cu121
  Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.4 MB)
Collecting torchvision==0.20.1+cu121
  Using cached https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp312-cp312-linux_x86_64.whl (7.3 MB)
Collecting torchaudio==2.5.1
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (3.4 MB)
Installing collected packages: torch, torchvision, torchaudio
Successfully installed torch-2.

In [9]:
import nlpaug.augmenter.word as naw 


In [10]:
## Un exemple de texte qu'on veut reformuler 
text = "The quick brown fox jumps over a lazy dog"

In [4]:
import os, nltk

# 1) Dossier local et inscriptible pour les données NLTK
NLTK_DIR = os.path.expanduser("~/nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DIR)

# 2) Téléchargements nécessaires
# - wordnet + omw-1.4 : pour les synonymes
# - averaged_perceptron_tagger_eng : POS tagger (NLTK 3.8+)
# - averaged_perceptron_tagger : par compatibilité descendante (certaines libs le demandent encore)
# - punkt : tokenisation de base (utile selon les tokenizers)
for pkg in ["wordnet", "omw-1.4", "averaged_perceptron_tagger_eng",
            "averaged_perceptron_tagger", "punkt"]:
    try:
        nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)
    except Exception as e:
        print(f"NLTK download failed for {pkg}: {e}")


#### Synonym Replacement 

In [5]:
syn_aug = naw.synonym.SynonymAug(aug_src="wordnet")
synonym_text = syn_aug.augment(text)
print("Synonym Text: ", synonym_text)

Synonym Text:  ['The quick robert brown fox bound over a lazy wienerwurst']


#### Random Substitution

In [6]:
sub_aug = naw.random.RandomWordAug(action='substitute')
substituted_text = sub_aug.augment(text)
print("Substituted Text: ", substituted_text)

Substituted Text:  ['_ quick brown _ jumps over a lazy _']


### Random Deletion

In [7]:
del_aug = naw.random.RandomWordAug(action='delete')
deletion_text = del_aug.augment(text)
print("Deletion Text: ", deletion_text)

Deletion Text:  ['The jumps over a lazy dog']


### Random Swap

In [8]:
swap_aug = naw.random.RandomWordAug(action='swap')
swap_text = swap_aug.augment(text)
print("Swap Text: ", swap_text)

Swap Text:  ['Quick the brown jumps fox over lazy a dog']


### Back Translation

Translate original text to other language (like french) and convert back to english language

In [3]:
# on utilise un modèle de traduction anglais-français et français-anglais
# pour reformuler le texte en anglais
import nlpaug.augmenter.word as naw
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

back_trans_aug = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-en-fr',
    to_model_name='Helsinki-NLP/opus-mt-fr-en',
    device=device,
    max_length=300
)

back_trans_text = back_trans_aug.augment(text)
print("Back Translated Text: ", back_trans_text)

Back Translated Text:  ['The fast brown fox jumps on a lazy dog']


In [1]:
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

src_text = ["Syndromic surveillance in plant health is essential."]
inputs = tokenizer(src_text, return_tensors="pt", padding=True)
translated = model.generate(**inputs)
print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])


  from .autonotebook import tqdm as notebook_tqdm


['La surveillance syndromique de la santé des plantes est essentielle.']


### Nous allons appliquer la Rétrotraduction pour former notre premier jeu de données augmenté 
Nous allons appliquer cela sur les données de texte brute ensuite on fera encore le nettoyage, nous allons appliquer l'augmentation uniquement pour les articles de type VS qui est sous représenté

In [23]:
import os, nltk

# Dossier local pour les données NLTK (avec droits d’écriture)
NLTK_DIR = os.path.expanduser("~/nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DIR)

# Paquets requis (NLTK 3.8+)
for pkg in [
    "punkt",                         # tokeniseur historique
    "punkt_tab",                     # depuis NLTK 3.8
    "stopwords",
    "wordnet", "omw-1.4",
    "averaged_perceptron_tagger_eng",
    "averaged_perceptron_tagger",    # compat descente
]:
    try:
        nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)
    except Exception as e:
        print(f"NLTK download failed for {pkg}: {e}")


#### Essayons ici de rattraper le nombre d'articles de NVS pour plus d'équilibre au travers de la rétro-traduction 

In [2]:
import pandas as pd
import nlpaug.augmenter.word as naw
from typing import Optional, List
import torch, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
import os
import math

os.environ["TOKENIZERS_PARALLELISM"] = "false"


# ==== Init NLTK stuff ====
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
custom_stopwords = {'et', 'al'}

# ==== Utils ====
def coerce_to_str(x) -> str:
    if isinstance(x, list):
        x = " ".join(map(str, x))
    elif pd.isna(x):
        x = ""
    return str(x).strip()

def safe_augment(aug, text: str, n: int = 1) -> List[str]:
    """Retourne une liste de n paraphrases (peut être < n si le modèle échoue)."""
    if not text:
        return []
    try:
        out = aug.augment(text, n=n)  # peut renvoyer str ou list[str]
        if isinstance(out, str):
            out = [out]
        return [t.strip() for t in out if isinstance(t, str) and t.strip()]
    except Exception:
        return []

def nettoyer_texte_tokens(texte: str) -> List[str]:
    tokens = word_tokenize(texte)
    tokens_nettoyes = []
    for token in tokens:
        token = token.lower()
        token = re.sub(r'\s+', '', token)
        token = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ]', '', token)
        if token and token not in stop_words and token not in punctuation and token not in custom_stopwords:
            token = lemmatizer.lemmatize(token)
            tokens_nettoyes.append(token)
    return tokens_nettoyes

def tokens_to_text(tokens: List[str]) -> str:
    return " ".join(tokens)

# ==== Device & models ====
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device choisi :", device)

from_model, to_model = (
    ("facebook/wmt19-en-fr", "facebook/wmt19-fr-en") if device == "cuda"
    else ("Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en")
)

back_trans_aug = naw.BackTranslationAug(
    from_model_name=from_model,
    to_model_name=to_model,
    device=device,
    batch_size=32 if device == "cuda" else 8,
    max_length=256
)

# ==== Load data ====
df = pd.read_csv("./data/data_final_phase2_private.csv")
df["text"] = df["text"].apply(coerce_to_str)

# (Re)crée text_clean / token_clean si manquants
if "token_clean" not in df.columns:
    df["token_clean"] = df["text"].apply(nettoyer_texte_tokens)
else:
    mask = df["token_clean"].isna()
    df.loc[mask, "token_clean"] = df.loc[mask, "text"].apply(nettoyer_texte_tokens)

if "text_clean" not in df.columns:
    df["text_clean"] = df["token_clean"].apply(tokens_to_text)
else:
    mask = df["text_clean"].isna()
    df.loc[mask, "text_clean"] = df.loc[mask, "token_clean"].apply(tokens_to_text)

# ==== Filtrer uniquement la classe VS ====

mask_vs = df["type_article"].astype(str).str.upper().eq("VS")
df_vs = df[mask_vs].copy()



# --- Comptage ---
count_vs  = (df["type_article"].astype(str).str.upper() == "VS").sum()
count_nvs = (df["type_article"].astype(str).str.upper() == "NVS").sum()
print("NVS:", count_nvs, " VS:", count_vs)

target = count_nvs
needed = max(0, target - count_vs)
if needed == 0:
    print("Pas besoin d'augmentation: VS est déjà ≥ NVS.")

# --- Génération ---
aug_rows = []
if needed > 0:
    factor = math.ceil(target / count_vs)  # nb total de versions par article (original compris)
    print(f"Chaque article VS doit produire à peu près {factor} versions (dont l'original).")

    for _, row in df_vs.iterrows():
        raw = row["text"]
        aug_texts = safe_augment(back_trans_aug, raw[:2000], n=max(1, factor-1))
        # construire les lignes
        for aug_text in aug_texts:
            tokens_bt = nettoyer_texte_tokens(aug_text)
            aug_rows.append({
                "text_src": raw,
                "text_final": aug_text,
                "source": "bt",
                "token_clean": tokens_bt,
                "text_clean": tokens_to_text(tokens_bt),
                "type_article": row["type_article"],
                "thematique": row.get("thematique", "")
            })

# DataFrame des augmentées
aug_df = pd.DataFrame(aug_rows)

# --- Déduplication robuste ---
# 1) enlever les lignes vides/NaN
aug_df = aug_df[aug_df["text_final"].astype(str).str.strip().ne("")].dropna(subset=["text_final"])

# 2) dédupliquer par paraphrase (et étiquette) pour éviter redites exactes
aug_df = aug_df.drop_duplicates(subset=["text_final", "type_article"], keep="first")

# 3) si trop de lignes, échantillonner pour viser exactement "needed"
if len(aug_df) > needed:
    # Utilisons random_sate pour controler l'aléa 
    aug_df = aug_df.sample(n=needed, random_state=42).reset_index(drop=True)
elif len(aug_df) < needed:
    print(f"Seulement {len(aug_df)} paraphrases uniques générées, < needed={needed}.")
  

print("Articles VS générés (uniques) :", len(aug_df))
print("Total VS (original + aug) :", count_vs + len(aug_df))
print("Total NVS :", count_nvs)

# --- Bloc original & concat finale ---
train_original = df.copy()
train_original["text_src"]   = train_original["text"]
train_original["text_final"] = train_original["text"]
train_original["source"]     = "orig"

cols = ["text_final", "text_clean", "token_clean", "source", "type_article", "thematique", "text_src"]

def ensure_list(x):
    if isinstance(x, list): return x
    if isinstance(x, str):  return x.split()
    return []

train_original = train_original.reindex(columns=cols, fill_value="")
train_original["token_clean"] = train_original["token_clean"].apply(ensure_list)
train_original["text_clean"]  = train_original["text_clean"].astype(str)

train_aug = aug_df.reindex(columns=cols, fill_value="")

train_data = pd.concat([train_original, train_aug], ignore_index=True)
print("Nb lignes original :", len(df))
print("Nb VS augmentées   :", len(aug_df))
print("Taille finale       :", train_data.shape)


  from .autonotebook import tqdm as notebook_tqdm


Device choisi : cuda


OSError: facebook/wmt19-en-fr is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [1]:
import torch
print("Torch:", torch.__version__)
print("CUDA dispo:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Nom du GPU:", torch.cuda.get_device_name(0))


Torch: 2.5.1+cu121
CUDA dispo: True
GPU count: 1
Nom du GPU: NVIDIA RTX 4500 Ada Generation


In [5]:
train_data

Unnamed: 0,text_final,text_clean,token_clean,source,type_article,thematique,text_src
0,Microbial Community Composition Associated wit...,microbial community composition associated pot...,"[microbial, community, composition, associated...",orig,VS,SV,Microbial Community Composition Associated wit...
1,Plant Pathogenic and Endophytic Colletotrichum...,plant pathogenic endophytic colletotrichum fru...,"[plant, pathogenic, endophytic, colletotrichum...",orig,VS,SV,Plant Pathogenic and Endophytic Colletotrichum...
2,Lethal Bronzing: What you should know about th...,lethal bronzing know disease turn palm tree br...,"[lethal, bronzing, know, disease, turn, palm, ...",orig,VS,SV,Lethal Bronzing: What you should know about th...
3,Leaffooted Bug Damage in Almond Orchards Leaff...,leaffooted bug damage almond orchard leaffoote...,"[leaffooted, bug, damage, almond, orchard, lea...",orig,VS,SV,Leaffooted Bug Damage in Almond Orchards Leaff...
4,Kebbi govt battles mysterious disease affectin...,kebbi govt battle mysterious disease affecting...,"[kebbi, govt, battle, mysterious, disease, aff...",orig,VS,SV,Kebbi govt battles mysterious disease affectin...
...,...,...,...,...,...,...,...
2734,"In the case of Alabama, mysterious seeds shoul...",case alabama mysterious seed represent agricul...,"[case, alabama, mysterious, seed, represent, a...",bt,VS,SV,Mystery Seed Packages Appearing Once Again in ...
2735,According to the Ministry of Agriculture and I...,according ministry agriculture industry alabam...,"[according, ministry, agriculture, industry, a...",bt,VS,SV,ACES: Mystery seed packages appearing once aga...
2736,"In the case of farmers, there is an unpreceden...",case farmer unprecedented increase price fresh...,"[case, farmer, unprecedented, increase, price,...",bt,VS,SV,Farmers Blame Unknown Pest As Pepper Hits ₦150...
2737,A sharp drop in yield as the mysterious fungal...,sharp drop yield mysterious fungal infection a...,"[sharp, drop, yield, mysterious, fungal, infec...",bt,VS,SV,Sharp decline in yield as mysterious fungal in...


In [36]:
train_data["text_final"].iloc[2735] == train_data["text_src"].iloc[2735]

False

In [47]:
train_data["type_article"].value_counts()

type_article
NVS    2241
VS      496
Name: count, dtype: int64

In [48]:
df["type_article"].value_counts()

type_article
NVS    2241
VS      249
Name: count, dtype: int64

#### Les textes augmentées par la rétro traduction ont une source bt 

In [6]:
## Sauvegardons les données augmentées dans un fichier csv 

train_data.to_csv("./data/data_augmented_back_translation.csv", index= False)

###  Nous allons reprendre la classification avec le fine-tuning de SBERT que nous avons fait à la phase 3 

voir le fichier notebook *fine_tuning_experiments_augmentated.ipynb*

In [2]:
import pandas as pd 
data = pd.read_csv("./data/data_augmented_back_translation.csv")

In [3]:
data

Unnamed: 0,text_final,text_clean,token_clean,source,type_article,thematique,text_src
0,Microbial Community Composition Associated wit...,microbial community composition associated pot...,"['microbial', 'community', 'composition', 'ass...",orig,VS,SV,Microbial Community Composition Associated wit...
1,Plant Pathogenic and Endophytic Colletotrichum...,plant pathogenic endophytic colletotrichum fru...,"['plant', 'pathogenic', 'endophytic', 'colleto...",orig,VS,SV,Plant Pathogenic and Endophytic Colletotrichum...
2,Lethal Bronzing: What you should know about th...,lethal bronzing know disease turn palm tree br...,"['lethal', 'bronzing', 'know', 'disease', 'tur...",orig,VS,SV,Lethal Bronzing: What you should know about th...
3,Leaffooted Bug Damage in Almond Orchards Leaff...,leaffooted bug damage almond orchard leaffoote...,"['leaffooted', 'bug', 'damage', 'almond', 'orc...",orig,VS,SV,Leaffooted Bug Damage in Almond Orchards Leaff...
4,Kebbi govt battles mysterious disease affectin...,kebbi govt battle mysterious disease affecting...,"['kebbi', 'govt', 'battle', 'mysterious', 'dis...",orig,VS,SV,Kebbi govt battles mysterious disease affectin...
...,...,...,...,...,...,...,...
2732,Mystery Seed Packages Appearing Once Again in ...,mystery seed package appearing alabama mystery...,"['mystery', 'seed', 'package', 'appearing', 'a...",bt,VS,SV,Mystery Seed Packages Appearing Once Again in ...
2733,ACES: Mystery seed packages appeared again in ...,ace mystery seed package appeared alabama ace ...,"['ace', 'mystery', 'seed', 'package', 'appeare...",bt,VS,SV,ACES: Mystery seed packages appearing once aga...
2734,"Farmers blame the plague: 150,000 per bag Farm...",farmer blame plague per bag farmer blame plagu...,"['farmer', 'blame', 'plague', 'per', 'bag', 'f...",bt,VS,SV,Farmers Blame Unknown Pest As Pepper Hits ₦150...
2735,Sharp drop in yield due to mysterious fungal i...,sharp drop yield due mysterious fungal infecti...,"['sharp', 'drop', 'yield', 'due', 'mysterious'...",bt,VS,SV,Sharp decline in yield as mysterious fungal in...


In [4]:
data["type_article"].value_counts()

type_article
NVS    2241
VS      496
Name: count, dtype: int64