### Nous allons ici tester quelques techniques d'augmentation de données textuelles en suivant un tutoriel publié sur Medium

In [None]:
# Install the nlpaug and sacremoses packages with pip.
# NOTE(review): no versions are pinned, so a later run may install different releases.
!pip install nlpaug
!pip install sacremoses

Installation d'une version de PyTorch compatible avec CUDA 12.2, la version de CUDA que nous utilisons (build `cu121`).

In [8]:
# Remove any installed torch / torchvision / torchaudio, then reinstall pinned versions
# from the PyTorch cu121 wheel index.
# torchaudio is pinned without a "+cu121" suffix; --index-url points pip at the cu121 index for it as well.
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121


Found existing installation: torch 2.6.0+cpu
Uninstalling torch-2.6.0+cpu:
  Successfully uninstalled torch-2.6.0+cpu
Found existing installation: torchvision 0.20.1+cu121
Uninstalling torchvision-0.20.1+cu121:
  Successfully uninstalled torchvision-0.20.1+cu121
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.5.1+cu121
  Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.4 MB)
Collecting torchvision==0.20.1+cu121
  Using cached https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp312-cp312-linux_x86_64.whl (7.3 MB)
Collecting torchaudio==2.5.1
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (3.4 MB)
Installing collected packages: torch, torchvision, torchaudio
Successfully installed torch-2.

In [9]:
import nlpaug.augmenter.word as naw 


In [10]:
## Example sentence that we want to rephrase
text = "The quick brown fox jumps over a lazy dog"

In [4]:
import os, nltk

# 1) Local, writable directory for the NLTK data
NLTK_DIR = os.path.expanduser("~/nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DIR)

# 2) Required downloads
# - wordnet + omw-1.4 : synonyms
# - averaged_perceptron_tagger_eng : POS tagger (resource name used by recent NLTK releases)
# - averaged_perceptron_tagger : kept for backward compatibility (some libraries still ask for it)
# - punkt : basic tokenization (needed by some tokenizers)
for pkg in ["wordnet", "omw-1.4", "averaged_perceptron_tagger_eng",
            "averaged_perceptron_tagger", "punkt"]:
    try:
        # nltk.download() signals most failures (unknown id, network error) by
        # returning False rather than raising, so the return value must be checked.
        ok = nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)
    except Exception as e:
        print(f"NLTK download failed for {pkg}: {e}")
    else:
        if not ok:
            print(f"NLTK download failed for {pkg}: nltk.download() returned False")


#### Synonym Replacement 

In [5]:
# Build a synonym augmenter that uses WordNet as its source and apply it to `text`.
# `text` is not defined in this cell; it must already exist in the kernel.
syn_aug = naw.synonym.SynonymAug(aug_src="wordnet")
synonym_text = syn_aug.augment(text)
print("Synonym Text: ", synonym_text)

Synonym Text:  ['The quick robert brown fox bound over a lazy wienerwurst']


#### Random Substitution

In [6]:
# Random word augmenter configured with the 'substitute' action, applied to `text`.
# No replacement vocabulary is passed to the constructor here.
sub_aug = naw.random.RandomWordAug(action='substitute')
substituted_text = sub_aug.augment(text)
print("Substituted Text: ", substituted_text)

Substituted Text:  ['_ quick brown _ jumps over a lazy _']


### Random Deletion

In [7]:
# Random word augmenter configured with the 'delete' action, applied to `text`.
del_aug = naw.random.RandomWordAug(action='delete')
deletion_text = del_aug.augment(text)
print("Deletion Text: ", deletion_text)

Deletion Text:  ['The jumps over a lazy dog']


### Random Swap

In [8]:
# Random word augmenter configured with the 'swap' action, applied to `text`.
swap_aug = naw.random.RandomWordAug(action='swap')
swap_text = swap_aug.augment(text)
print("Swap Text: ", swap_text)

Swap Text:  ['Quick the brown jumps fox over lazy a dog']


### Back Translation

Translate the original text into another language (e.g. French), then translate it back into English.

In [3]:
# Use an English->French and a French->English translation model
# to rephrase the English text (back-translation).
import nlpaug.augmenter.word as naw
import torch

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

back_trans_aug = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-en-fr',
    to_model_name='Helsinki-NLP/opus-mt-fr-en',
    device=device,
    max_length=300
)

# `text` is not defined in this cell; it must already exist in the kernel
back_trans_text = back_trans_aug.augment(text)
print("Back Translated Text: ", back_trans_text)

Back Translated Text:  ['The fast brown fox jumps on a lazy dog']


In [1]:
from transformers import MarianMTModel, MarianTokenizer

# Load the tokenizer and the model for the same checkpoint name
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Single-sentence batch to translate
src_text = ["Syndromic surveillance in plant health is essential."]
# Tokenize to PyTorch tensors ("pt") with padding enabled
inputs = tokenizer(src_text, return_tensors="pt", padding=True)
translated = model.generate(**inputs)
# Decode every generated sequence, dropping special tokens
print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])


  from .autonotebook import tqdm as notebook_tqdm


['La surveillance syndromique de la santé des plantes est essentielle.']


### Nous allons appliquer la rétrotraduction pour former notre premier jeu de données augmenté
Nous appliquons la rétrotraduction aux textes bruts, puis nous refaisons le nettoyage. L'augmentation est appliquée uniquement aux articles de type VS, classe qui est sous-représentée.

In [23]:
import os, nltk

# Local directory for the NLTK data (must be writable)
NLTK_DIR = os.path.expanduser("~/nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DIR)

# Required packages
for pkg in [
    "punkt",                         # historical tokenizer resource
    "punkt_tab",                     # resource name used by recent NLTK releases
    "stopwords",
    "wordnet", "omw-1.4",
    "averaged_perceptron_tagger_eng",
    "averaged_perceptron_tagger",    # backward compatibility
]:
    try:
        # nltk.download() signals most failures (unknown id, network error) by
        # returning False rather than raising, so the return value must be checked.
        ok = nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)
    except Exception as e:
        print(f"NLTK download failed for {pkg}: {e}")
    else:
        if not ok:
            print(f"NLTK download failed for {pkg}: nltk.download() returned False")


#### Essayons ici de rattraper le nombre d'articles NVS, pour un meilleur équilibre des classes, au moyen de la rétro-traduction

Étant donné que nous vérifions également que le texte produit est unique (différent du texte source), il peut arriver que le nombre d'articles produits pour la classe VS n'atteigne pas celui de la classe NVS, et que l'on reste au double du nombre d'articles VS dont nous disposons.

In [16]:
## Fonction de nettoyage de texte
import re
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

# Module-level resources used by the text-cleaning function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
custom_stopwords = {'et', 'al'} # Expressions we found in large numbers during our experiments and decided not to keep


def nettoyer_texte(texte):
    """
    Turn a raw text into a list of cleaned, lemmatized tokens.

    Steps:
    1. Tokenize the text with ``word_tokenize``.
    2. Lowercase each token, then strip whitespace and every character that is
       not a-z or one of the accented letters listed in the regex.
    3. Drop tokens that end up empty, or that are in ``stop_words`` or
       ``custom_stopwords``.
    4. Lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        texte: the raw text to clean (passed straight to ``word_tokenize``).

    Returns:
        The surviving tokens as a list of strings, in their original order.
    """
    cleaned = []
    for raw in word_tokenize(texte):
        word = re.sub(r'\s+', '', raw.lower())
        word = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ]', '', word)

        # Nothing left once the non-letter characters were removed.
        if not word:
            continue
        # `punctuation` is a string, so `in` is a substring test; since only
        # letters remain at this point it cannot match, but the check is kept.
        if word in stop_words or word in punctuation or word in custom_stopwords:
            continue

        cleaned.append(lemmatizer.lemmatize(word))

    return cleaned

In [31]:
# --- Back-translation simple: 1 VS -> 1 augmentation ---
import os, pandas as pd, torch
import nlpaug.augmenter.word as naw

# Performance / logging hints
# NOTE(review): these variables are set after nlpaug and torch were imported in this cell;
# if a library reads them at import time they may have no effect — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"

# ===== 0) Device & models (German pivot) =====
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device choisi :", device)

# Try German as the pivot language
from_model = "facebook/wmt19-en-de"  # EN -> DE
to_model   = "facebook/wmt19-de-en"  # DE -> EN


back_trans_aug = naw.BackTranslationAug(
    from_model_name=from_model,
    to_model_name=to_model,
    device=device,
    batch_size=32 if device == "cuda" else 8,
    max_length=512 # cap the length at 512 (presumably tokens — confirm against the nlpaug documentation)
)

def coerce_to_str(x):
    """Return `x` as a stripped string, mapping missing values to "".

    Args:
        x: a scalar cell value; anything `pd.isna` reports as missing
           (None, NaN, ...) is treated as empty.

    Returns:
        "" for a missing value, otherwise `str(x)` without leading or
        trailing whitespace.
    """
    if pd.isna(x):
        return ""
    return str(x).strip()

def safe_bt(text):
    """Back-translate `text` once, best-effort.

    Args:
        text: the source string; a falsy value (None, "") is not augmented.

    Returns:
        The stripped back-translated string, or None when the input is falsy,
        the augmenter raises, or its result is neither a string nor a
        non-empty list whose first item is a string.
    """
    if not text:
        return None
    try:
        out = back_trans_aug.augment(text, n=1)
    except Exception as exc:
        # Still best-effort (the caller just skips this row), but report the
        # failure so that skipped articles are not silently lost.
        print(f"Back-translation failed ({type(exc).__name__}): {exc}")
        return None
    # The augmenter may hand back a list of candidates; keep the first one.
    if isinstance(out, list):
        out = out[0] if out else None
    return out.strip() if isinstance(out, str) else None

# ===== 1) Load the data =====
df = pd.read_csv("./data/data_final_phase2_private.csv")
# Normalise the text column in place with coerce_to_str
df["text"] = df["text"].apply(coerce_to_str)

# ===== 2) Filter the VS articles and generate one augmentation per article =====
is_vs = df["type_article"].astype(str).str.upper().eq("VS")
df_vs = df.loc[is_vs].copy()

aug_rows = []
for _, row in df_vs.iterrows():
    src = coerce_to_str(row["text"])[:2000]  # keep only the first 2000 characters to avoid very long texts
    bt  = safe_bt(src)
    # Keep the result only if it exists and differs from its own (truncated) source;
    # no de-duplication across rows is done here.
    if bt and bt != src:
        # Clean once and reuse the tokens for both derived columns.
        bt_tokens = nettoyer_texte(bt)
        aug_rows.append({
            "text_src": src,
            "text_final": bt,
            "type_article": row["type_article"],
            "thematique": row.get("thematique", ""),
            "source": "bt",
            "tokens_final": bt_tokens,
            "text_cleaned_final": " ".join(bt_tokens)
        })

aug_df = pd.DataFrame(aug_rows)
print(f"Augmentations VS générées: {len(aug_df)}")

# ===== 3) Concatenate with the original data (all classes left unchanged) =====
cols = ["text_final","type_article","thematique","source","text_src", "tokens_final", "text_cleaned_final"]

# Original rows: source text and final text are the same, tagged "orig".
orig = df.copy()
orig["text_src"] = orig["text_final"] = orig["text"]
orig["source"] = "orig"
orig["tokens_final"] = orig["text_final"].apply(nettoyer_texte)
orig["text_cleaned_final"] = orig["tokens_final"].apply(" ".join)

# Align both frames on the same column list; columns missing from a frame are filled with "".
orig, aug_df = (frame.reindex(columns=cols, fill_value="") for frame in (orig, aug_df))

train_data = pd.concat([orig, aug_df], ignore_index=True)
print("Taille finale :", train_data.shape)


Device choisi : cuda
Augmentations VS générées: 249
Taille finale : (2739, 7)


In [8]:
import torch
# Environment check: installed torch version, CUDA availability and GPU count
print("Torch:", torch.__version__)
print("CUDA dispo:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
# Only query the device name when CUDA is available
if torch.cuda.is_available():
    print("Nom du GPU:", torch.cuda.get_device_name(0))


Torch: 2.5.1+cu121
CUDA dispo: True
GPU count: 1
Nom du GPU: NVIDIA RTX 4500 Ada Generation


In [13]:
df["type_article"].value_counts()

type_article
NVS    2241
VS      249
Name: count, dtype: int64

In [32]:
train_data

Unnamed: 0,text_final,type_article,thematique,source,text_src,tokens_final,text_cleaned_final
0,Microbial Community Composition Associated wit...,VS,SV,orig,Microbial Community Composition Associated wit...,"[microbial, community, composition, associated...",microbial community composition associated pot...
1,Plant Pathogenic and Endophytic Colletotrichum...,VS,SV,orig,Plant Pathogenic and Endophytic Colletotrichum...,"[plant, pathogenic, endophytic, colletotrichum...",plant pathogenic endophytic colletotrichum fru...
2,Lethal Bronzing: What you should know about th...,VS,SV,orig,Lethal Bronzing: What you should know about th...,"[lethal, bronzing, know, disease, turn, palm, ...",lethal bronzing know disease turn palm tree br...
3,Leaffooted Bug Damage in Almond Orchards Leaff...,VS,SV,orig,Leaffooted Bug Damage in Almond Orchards Leaff...,"[leaffooted, bug, damage, almond, orchard, lea...",leaffooted bug damage almond orchard leaffoote...
4,Kebbi govt battles mysterious disease affectin...,VS,SV,orig,Kebbi govt battles mysterious disease affectin...,"[kebbi, govt, battle, mysterious, disease, aff...",kebbi govt battle mysterious disease affecting...
...,...,...,...,...,...,...,...
2734,Mystery Seed Packages Appearing Once Again in ...,VS,SV,bt,Mystery Seed Packages Appearing Once Again in ...,"[mystery, seed, package, appearing, alabama, m...",mystery seed package appearing alabama mystery...
2735,ACES: Mystery Seed Packages Reappearing in Ala...,VS,SV,bt,ACES: Mystery seed packages appearing once aga...,"[ace, mystery, seed, package, reappearing, ala...",ace mystery seed package reappearing alabama a...
2736,Farmers blame unknown pests for the shortage o...,VS,SV,bt,Farmers Blame Unknown Pest As Pepper Hits ₦150...,"[farmer, blame, unknown, pest, shortage, peppe...",farmer blame unknown pest shortage pepper coun...
2737,Sharp drop in yield due to a mysterious fungal...,VS,SV,bt,Sharp decline in yield as mysterious fungal in...,"[sharp, drop, yield, due, mysterious, fungal, ...",sharp drop yield due mysterious fungal infecti...


In [3]:
train_data["text_final"].iloc[2735] == train_data["text_src"].iloc[2735]

False

In [4]:
train_data["type_article"].value_counts()

type_article
NVS    2241
VS      496
Name: count, dtype: int64

In [5]:
df["type_article"].value_counts()

type_article
NVS    2241
VS      249
Name: count, dtype: int64

#### Les textes augmentés par la rétro-traduction ont pour source `bt`

In [33]:
## Save the augmented data to a CSV file (German pivot)
# Writes whatever `train_data` currently holds in the kernel, without the index column.

train_data.to_csv("./data/data_augmented_back_translation_en_de.csv", index= False)

In [30]:
# For the French pivot
# NOTE(review): this writes whatever `train_data` currently holds. The pipeline cell in this
# notebook builds it with the facebook/wmt19 EN<->DE models, so on a top-to-bottom run this
# file would receive German-pivot rows — rebuild train_data with an EN<->FR pair first.
train_data.to_csv("./data/data_augmented_back_translation_en_fr.csv", index= False)

### Nous allons reprendre la classification avec le fine-tuning de SBERT réalisé à la phase 3

Voir le notebook *fine_tuning_experiments_augmentated.ipynb*.

In [2]:
import pandas as pd 
# NOTE(review): this path has no "_en_de" / "_en_fr" suffix, so it matches neither of the
# files written by the save cells in this notebook — confirm which file is intended.
data = pd.read_csv("./data/data_augmented_back_translation.csv")

In [3]:
data

Unnamed: 0,text_final,text_clean,token_clean,source,type_article,thematique,text_src
0,Microbial Community Composition Associated wit...,microbial community composition associated pot...,"['microbial', 'community', 'composition', 'ass...",orig,VS,SV,Microbial Community Composition Associated wit...
1,Plant Pathogenic and Endophytic Colletotrichum...,plant pathogenic endophytic colletotrichum fru...,"['plant', 'pathogenic', 'endophytic', 'colleto...",orig,VS,SV,Plant Pathogenic and Endophytic Colletotrichum...
2,Lethal Bronzing: What you should know about th...,lethal bronzing know disease turn palm tree br...,"['lethal', 'bronzing', 'know', 'disease', 'tur...",orig,VS,SV,Lethal Bronzing: What you should know about th...
3,Leaffooted Bug Damage in Almond Orchards Leaff...,leaffooted bug damage almond orchard leaffoote...,"['leaffooted', 'bug', 'damage', 'almond', 'orc...",orig,VS,SV,Leaffooted Bug Damage in Almond Orchards Leaff...
4,Kebbi govt battles mysterious disease affectin...,kebbi govt battle mysterious disease affecting...,"['kebbi', 'govt', 'battle', 'mysterious', 'dis...",orig,VS,SV,Kebbi govt battles mysterious disease affectin...
...,...,...,...,...,...,...,...
2732,Mystery Seed Packages Appearing Once Again in ...,mystery seed package appearing alabama mystery...,"['mystery', 'seed', 'package', 'appearing', 'a...",bt,VS,SV,Mystery Seed Packages Appearing Once Again in ...
2733,ACES: Mystery seed packages appeared again in ...,ace mystery seed package appeared alabama ace ...,"['ace', 'mystery', 'seed', 'package', 'appeare...",bt,VS,SV,ACES: Mystery seed packages appearing once aga...
2734,"Farmers blame the plague: 150,000 per bag Farm...",farmer blame plague per bag farmer blame plagu...,"['farmer', 'blame', 'plague', 'per', 'bag', 'f...",bt,VS,SV,Farmers Blame Unknown Pest As Pepper Hits ₦150...
2735,Sharp drop in yield due to mysterious fungal i...,sharp drop yield due mysterious fungal infecti...,"['sharp', 'drop', 'yield', 'due', 'mysterious'...",bt,VS,SV,Sharp decline in yield as mysterious fungal in...


In [4]:
data["type_article"].value_counts()

type_article
NVS    2241
VS      496
Name: count, dtype: int64