### Nous allons ici appliquer quelques techniques d'augmentation de données textuelles en suivant un tutoriel publié sur Medium

In [None]:
!pip install nlpaug
!pip install sacremoses

In [1]:
import nlpaug.augmenter.word as naw 


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample sentence that the augmenters in the following cells will rephrase.
text = "The quick brown fox jumps over a lazy dog"

In [4]:
import os, nltk

# 1) Local, writable directory for the NLTK data; registered on NLTK's search
#    path so the resources downloaded below can be found afterwards.
NLTK_DIR = os.path.expanduser("~/nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DIR)

# 2) Required downloads
# - wordnet + omw-1.4 : synonym lookups
# - averaged_perceptron_tagger_eng : POS tagger resource name used by recent NLTK releases
# - averaged_perceptron_tagger : older resource name, kept for backward compatibility
# - punkt : basic tokenisation (needed depending on the tokenizer in use)
for pkg in ["wordnet", "omw-1.4", "averaged_perceptron_tagger_eng",
            "averaged_perceptron_tagger", "punkt"]:
    try:
        ok = nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)
    except Exception as e:
        print(f"NLTK download failed for {pkg}: {e}")
    else:
        # nltk.download() signals a failed download by returning False rather
        # than raising, so the return value has to be checked explicitly.
        if not ok:
            print(f"NLTK download failed for {pkg}: downloader returned False")


#### Synonym Replacement 

In [5]:
# Synonym replacement: build an augmenter whose synonym source is WordNet
# (aug_src="wordnet") and apply it to the sample sentence.
syn_aug = naw.synonym.SynonymAug(aug_src="wordnet")
synonym_text = syn_aug.augment(text)
# In the recorded run, augment() returned a one-element list, which is why the
# printed output shows a list rather than a bare string.
print("Synonym Text: ", synonym_text)

Synonym Text:  ['The quick robert brown fox bound over a lazy wienerwurst']


#### Random Substitution

In [6]:
# Random substitution: RandomWordAug configured with action='substitute'.
# In the recorded output the substituted words appear as '_'.
sub_aug = naw.random.RandomWordAug(action='substitute')
substituted_text = sub_aug.augment(text)
print("Substituted Text: ", substituted_text)

Substituted Text:  ['_ quick brown _ jumps over a lazy _']


#### Random Deletion

In [7]:
# Random deletion: RandomWordAug configured with action='delete' drops some of
# the words of the sample sentence (see the recorded output below the cell).
del_aug = naw.random.RandomWordAug(action='delete')
deletion_text = del_aug.augment(text)
print("Deletion Text: ", deletion_text)

Deletion Text:  ['The jumps over a lazy dog']


#### Random Swap

In [8]:
# Random swap: RandomWordAug configured with action='swap' exchanges the
# positions of some words (see the recorded output below the cell).
swap_aug = naw.random.RandomWordAug(action='swap')
swap_text = swap_aug.augment(text)
print("Swap Text: ", swap_text)

Swap Text:  ['Quick the brown jumps fox over lazy a dog']


#### Back Translation

Translate the original text into another language (e.g. French), then translate it back into English.

In [9]:
# Back-translation demo: no model names are passed here, so the augmenter runs
# with whatever translation models the library uses by default.
back_trans_aug = naw.back_translation.BackTranslationAug()
back_trans_text = back_trans_aug.augment(text)
print("Back Translated Text: ", back_trans_text)

Back Translated Text:  ['The speedy brown fox jumps over a lazy dog']


### Nous allons appliquer la rétrotraduction pour constituer notre premier jeu de données augmenté
Nous l'appliquons aux textes bruts, puis nous refaisons le nettoyage sur les textes obtenus.

In [23]:
import os, nltk

# Local directory for the NLTK data (must be writable); registered on NLTK's
# search path so the resources downloaded below can be found afterwards.
NLTK_DIR = os.path.expanduser("~/nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DIR)

# Resources required by the cleaning pipeline and the augmenters
for pkg in [
    "punkt",                         # historical tokenizer resource
    "punkt_tab",                     # tokenizer resource name used by recent NLTK releases
    "stopwords",
    "wordnet", "omw-1.4",
    "averaged_perceptron_tagger_eng",
    "averaged_perceptron_tagger",    # older name, kept for backward compatibility
]:
    try:
        ok = nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)
    except Exception as e:
        print(f"NLTK download failed for {pkg}: {e}")
    else:
        # nltk.download() signals a failed download by returning False rather
        # than raising, so the return value has to be checked explicitly.
        if not ok:
            print(f"NLTK download failed for {pkg}: downloader returned False")


In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw
from typing import Optional, List
import torch
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation

# ==== NLTK resources shared by the cleaning helpers ====
# Extra tokens filtered out in addition to the English stop words.
custom_stopwords = {'et', 'al'}
lemmatizer = WordNetLemmatizer()
# Materialised as a set so membership tests in the cleaning loop are cheap.
stop_words = {*stopwords.words('english')}

# ==== Utils ====
def coerce_to_str(x) -> str:
    """Turn a raw cell value into a stripped string.

    Args:
        x: Any cell value. A list is flattened by joining the string form of
            its items with single spaces; a missing value (as detected by
            ``pd.isna``) becomes the empty string; anything else goes through
            ``str()``.

    Returns:
        The resulting text with leading and trailing whitespace removed.
    """
    if isinstance(x, list):
        return " ".join(str(item) for item in x).strip()
    if pd.isna(x):
        return ""
    return str(x).strip()

def safe_augment(aug, text: str) -> Optional[str]:
    """Run ``aug.augment(text)`` on a best-effort basis.

    Args:
        aug: Any object exposing an ``augment(text)`` method.
        text: Text to augment. An empty value short-circuits to ``None``.

    Returns:
        The augmented text with surrounding whitespace stripped, or ``None``
        when there is nothing usable: empty input, empty result, non-string
        result, whitespace-only result, or an exception raised by the
        augmenter.

    Warns:
        RuntimeWarning: when the augmenter raises. The error is reported
        instead of being dropped silently, but it never propagates, so a
        single failing document cannot abort a long batch run.
    """
    import warnings  # local import keeps this helper self-contained

    if not text:
        return None
    try:
        out = aug.augment(text)
    except Exception as exc:
        warnings.warn(
            f"Augmentation failed ({type(exc).__name__}): {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

    # The augmenter may hand back a list of candidates; only the first is kept.
    if isinstance(out, list):
        out = out[0] if out else None
    if not isinstance(out, str):
        return None
    # Normalise a whitespace-only result to None so callers get one "no result" value.
    return out.strip() or None

# Returns a LIST of cleaned tokens.
def nettoyer_texte_tokens(texte: str) -> List[str]:
    """Tokenise ``texte`` and return its cleaned, lemmatised tokens.

    Each token produced by ``word_tokenize`` is lower-cased, stripped of
    whitespace and of every character outside the allowed letter set, then
    dropped if it is empty, a stop word, punctuation or a custom stop word.
    Surviving tokens are lemmatised.

    Args:
        texte: Raw text to clean.

    Returns:
        The cleaned tokens, in their original order.
    """
    cleaned: List[str] = []
    for raw_token in word_tokenize(texte):
        word = re.sub(r'\s+', '', raw_token.lower())
        word = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ]', '', word)
        if not word:
            continue
        if word in stop_words or word in punctuation or word in custom_stopwords:
            continue
        cleaned.append(lemmatizer.lemmatize(word))
    return cleaned

def tokens_to_text(tokens: List[str]) -> str:
    """Join ``tokens`` into one string, separated by single spaces."""
    separator = " "
    return separator.join(tokens)

# ==== Device & translation models ====
# With CUDA available, the facebook/wmt19 model pair is used; otherwise the
# Helsinki-NLP/opus-mt pair is used on CPU.
if torch.cuda.is_available():
    device = "cuda"
    from_model, to_model = "facebook/wmt19-en-de", "facebook/wmt19-de-en"
else:
    device = "cpu"
    from_model, to_model = "Helsinki-NLP/opus-mt-en-de", "Helsinki-NLP/opus-mt-de-en"
print("Device choisi :", device)

# English -> German -> English back-translation augmenter; the batch size
# depends on the selected device.
back_trans_aug = naw.BackTranslationAug(
    from_model_name=from_model,
    to_model_name=to_model,
    device=device,
    batch_size=(32 if device == "cuda" else 8),
    max_length=256,
)

# ==== Load data ====
# Expected minimum columns: text, type_article, thematique
# Optional columns: text_clean, token_clean
# NOTE(review): the recorded `df` output in this notebook shows a column named
# `tokens_clean`, not `token_clean` — confirm which name the CSV really uses.
df = pd.read_csv("./data/data_final_phase2_private.csv")
df["text"] = df["text"].apply(coerce_to_str)

# token_clean: fill only the missing rows when the column exists, otherwise
# compute it for every row from the raw text.
if "token_clean" in df.columns:
    missing_tokens = df["token_clean"].isna()
    df.loc[missing_tokens, "token_clean"] = df.loc[missing_tokens, "text"].apply(nettoyer_texte_tokens)
else:
    df["token_clean"] = df["text"].apply(nettoyer_texte_tokens)

# text_clean: same policy, derived from token_clean.
if "text_clean" in df.columns:
    mask_nan = df["text_clean"].isna()
    df.loc[mask_nan, "text_clean"] = df.loc[mask_nan, "token_clean"].apply(tokens_to_text)
else:
    df["text_clean"] = df["token_clean"].apply(tokens_to_text)

# ==== Back-translation over the WHOLE dataset (one augmented row per source row) ====
subset = df  # or df.sample(n=..., random_state=42) to limit the run

aug_rows = []
for _, row in subset.iterrows():
    raw = row["text"]
    # Only the first 2000 characters are sent to the translator.
    aug_text = safe_augment(back_trans_aug, raw[:2000])
    if aug_text:
        # The back-translated text goes through the same cleaning as the originals.
        tokens_bt = nettoyer_texte_tokens(aug_text)
        aug_rows.append(dict(
            # provenance
            text_src=raw,                 # original source text
            text_final=aug_text,          # text used for training (back-translated version)
            source="bt",
            # "clean" columns
            token_clean=tokens_bt,        # list of tokens
            text_clean=tokens_to_text(tokens_bt),
            # labels / metadata
            type_article=row["type_article"],
            thematique=row.get("thematique", ""),
        ))

aug_df = pd.DataFrame(aug_rows)

# ==== Prepare the original rows for stacking ====
# The originals carry their own text both as source and as training text, and
# are tagged with source "orig".
train_original = df.assign(
    text_src=df["text"],       # keep the source text
    text_final=df["text"],     # training text is the original text
    source="orig",
)

# ==== Column layout shared by both blocks ====
cols = [
    "text_final",
    "text_clean",
    "token_clean",
    "source",
    "type_article",
    "thematique",
    "text_src",
]

# Caution: when `token_clean` comes from the CSV it is a string, not a list.
# It is converted to a list here so the column has one consistent type.
def ensure_list(x):
    """Coerce a ``token_clean`` cell into a list of tokens.

    Args:
        x: A list (returned unchanged), a string, or anything else.

    Returns:
        * ``x`` itself when it is already a list;
        * for a string that is the repr of a list (what a list column looks
          like after a CSV round trip, e.g. ``"['plant', 'pathogenic']"``),
          the parsed list with every item converted to ``str``;
        * for any other string, the whitespace-separated tokens;
        * an empty list for every other value (``None``, ``NaN``, ...).
    """
    import ast  # local import keeps this helper self-contained

    if isinstance(x, list):
        return x
    if isinstance(x, str):
        candidate = x.strip()
        # Splitting a list repr on whitespace would yield tokens polluted with
        # brackets, quotes and commas, so parse it as a literal instead.
        if candidate.startswith("[") and candidate.endswith("]"):
            try:
                parsed = ast.literal_eval(candidate)
            except (ValueError, SyntaxError):
                parsed = None  # not a valid literal: fall back to plain split
            if isinstance(parsed, list):
                return [str(tok) for tok in parsed]
        return x.split()
    return []

# Restrict the originals to the shared layout (absent columns are filled with
# ""), then make sure token_clean holds lists and text_clean holds strings.
train_original = train_original.reindex(columns=cols, fill_value="").assign(
    token_clean=lambda frame: frame["token_clean"].apply(ensure_list),
    text_clean=lambda frame: frame["text_clean"].astype(str),
)

# Same layout for the back-translated rows.
train_aug = aug_df.reindex(columns=cols, fill_value="")

# ==== Final stacking: originals first, augmented rows after ====
train_data = pd.concat((train_original, train_aug), ignore_index=True)
print("Taille finale du dataset :", train_data.shape)


Device choisi : cuda


In [16]:
# Display the stacked dataset (original rows followed by back-translated rows).
train_data

Unnamed: 0,text_final,text_clean,source,type_article
0,Microbial Community Composition Associated wit...,microbial community composition associated pot...,orig,VS
1,Plant Pathogenic and Endophytic Colletotrichum...,plant pathogenic endophytic colletotrichum fru...,orig,VS
2,Lethal Bronzing: What you should know about th...,lethal bronzing know disease turn palm tree br...,orig,VS
3,Leaffooted Bug Damage in Almond Orchards Leaff...,leaffooted bug damage almond orchard leaffoote...,orig,VS
4,Kebbi govt battles mysterious disease affectin...,kebbi govt battle mysterious disease affecting...,orig,VS
...,...,...,...,...
2685,SHANGHAI (Reuters) - China confirmed outbreaks...,SHANGHAI (Reuters) - China confirmed outbreaks...,bt,NVS
2686,The drama of the secular plants of -----------...,The drama of the secular plants of -----------...,bt,NVS
2687,Postponement letter answers parliamentary ques...,Postponement letter answers parliamentary ques...,bt,NVS
2688,Brazil: Fundecitrus annual study shows increas...,Brazil: Fundecitrus annual study shows increas...,bt,NVS


In [20]:
# Number of rows per provenance tag ("orig" vs "bt").
train_data["source"].value_counts()

source
orig    2490
bt       200
Name: count, dtype: int64

#### Les textes augmentés par rétrotraduction ont pour source `bt`

In [21]:
# Display the source DataFrame as loaded from the CSV.
df

Unnamed: 0,text,tokens_clean,text_clean,type_article,thematique
0,Microbial Community Composition Associated wit...,"['microbial', 'community', 'composition', 'ass...",microbial community composition associated pot...,VS,SV
1,Plant Pathogenic and Endophytic Colletotrichum...,"['plant', 'pathogenic', 'endophytic', 'colleto...",plant pathogenic endophytic colletotrichum fru...,VS,SV
2,Lethal Bronzing: What you should know about th...,"['lethal', 'bronzing', 'know', 'disease', 'tur...",lethal bronzing know disease turn palm tree br...,VS,SV
3,Leaffooted Bug Damage in Almond Orchards Leaff...,"['leaffooted', 'bug', 'damage', 'almond', 'orc...",leaffooted bug damage almond orchard leaffoote...,VS,SV
4,Kebbi govt battles mysterious disease affectin...,"['kebbi', 'govt', 'battle', 'mysterious', 'dis...",kebbi govt battle mysterious disease affecting...,VS,SV
...,...,...,...,...,...
2485,Ministry Asserts 59 Specimens Test Negative fo...,"['ministry', 'asserts', 'specimen', 'test', 'n...",ministry asserts specimen test negative minist...,NVS,SA
2486,China Ramps Up Imports of US Pork as Spreads ...,"['china', 'ramp', 'import', 'u', 'pork', 'spre...",china ramp import u pork spread china ramp imp...,NVS,SA
2487,ASF China: Large farm in Jiangsu hit; virus re...,"['asf', 'china', 'large', 'farm', 'jiangsu', '...",asf china large farm jiangsu hit virus reach g...,NVS,SA
2488,The CSIC leads an international project to mit...,"['csic', 'lead', 'international', 'project', '...",csic lead international project mitigate damag...,NVS,SV
