### Nous allons ici faire quelques applications d'augmentation des données textuelles en suivant un tutoriel sur medium 

In [None]:
!pip install nlpaug
!pip install sacremoses

In [2]:
import nlpaug.augmenter.word as naw 


  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,
W0825 09:27:33.543000 15484 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [3]:
## Un exemple de texte qu'on veut reformuler 
text = "The quick brown fox jumps over a lazy dog"

#### Synonym Replacement 

In [4]:
syn_aug = naw.synonym.SynonymAug(aug_src="wordnet")
synonym_text = syn_aug.augment(text)
print("Synonym Text: ", synonym_text)

Synonym Text:  ['The flying john brown fox jumps all over a lazy dog']


#### Random Substitution

In [6]:
sub_aug = naw.random.RandomWordAug(action='substitute')
substituted_text = sub_aug.augment(text)
print("Substituted Text: ", substituted_text)

Substituted Text:  ['The quick brown _ _ over _ lazy dog']


### Random Deletion

In [7]:
del_aug = naw.random.RandomWordAug(action='delete')
deletion_text = del_aug.augment(text)
print("Deletion Text: ", deletion_text)

Deletion Text:  ['The quick brown jumps over lazy']


### Random Swap

In [9]:
swap_aug = naw.random.RandomWordAug(action='swap')
swap_text = swap_aug.augment(text)
print("Swap Text: ", swap_text)

Swap Text:  ['Quick the brown fox over jumps a dog lazy']


### Back Translation

Translate original text to other language (like french) and convert back to english language

In [10]:
back_trans_aug = naw.back_translation.BackTranslationAug()
back_trans_text = back_trans_aug.augment(text)
print("Back Translated Text: ", back_trans_text)



config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

vocab-src.json: 0.00B [00:00, ?B/s]

vocab-tgt.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

vocab-src.json: 0.00B [00:00, ?B/s]

vocab-tgt.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Back Translated Text:  ['The speedy brown fox jumps over a lazy dog']


### Nous allons appliquer la Rétrotraduction pour former notre premier jeu de données augmenté 
Nous allons appliquer cela sur les données de texte brute ensuite on fera encore le nettoyage 

#### Fonction de nettoyage qu'on applique depuis à nos textes 

In [12]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation

# Initialisation
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
custom_stopwords = {'et', 'al'} # Les expressions que nous avons retrouvé en grande quantité lors de nos expérimentations et nous avons décidé de ne pas les conserver 


def nettoyer_texte(texte):
    """
    Nettoie un texte brut :
    2. Tokenisation
    3. Nettoyage caractères spéciaux
    4. Suppression stopwords, ponctuation, custom_stopwords
    5. Lemmatisation
    """
    
    # Étape 2 : tokenisation
    tokens = word_tokenize(texte)

    # Étape 3-6 : nettoyage
    tokens_nettoyes = []
    for token in tokens:
        token = token.lower()
        token = re.sub(r'\s+', '', token)
        token = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ]', '', token)

        if (
            token
            and token not in stop_words
            and token not in punctuation
            and token not in custom_stopwords
        ):
            token = lemmatizer.lemmatize(token)
            tokens_nettoyes.append(token)

    return ' '.join(tokens_nettoyes)


In [14]:
import pandas as pd
import nlpaug.augmenter.word as naw
import re
from typing import Optional

# --- 1) Utils sûrs ---
def coerce_to_str(x) -> str:
    """Force en string: join si liste, sinon cast, et strip espaces."""
    if isinstance(x, list):
        x = " ".join(map(str, x))
    elif pd.isna(x):
        x = ""
    return str(x).strip()

def simple_clean(text: str) -> str:
    """Nettoyage simple; remplace par ton pipeline habituel si tu en as un."""
    text = text.lower()
    text = re.sub(r"\s+", " ", text)  # espaces multiples -> un espace
    return text.strip()

def safe_augment(aug, text: str) -> Optional[str]:
    """Back-translation sûre: garantie de renvoyer une str ou None si échec."""
    if not text:
        return None
    try:
        out = aug.augment(text)  # peut renvoyer str ou list[str]
        if isinstance(out, list):
            # si n>1, on prend la première; sinon, c'était déjà une liste
            out = out[0] if out else None
        return out.strip() if out else None
    except Exception as e:
        # log optionnel
        # print(f"[AUG-ERR] {e} | text head: {text[:120]}")
        return None

# --- 2) Dataset ---
df = pd.read_csv("./data/data_final_phase2_private.csv")  # colonnes: "text", "text_clean",

# Normaliser la colonne text (important pour tes exemples qui sont des listes/paragraphes)
df["text"] = df["text"].apply(coerce_to_str)

# --- 3) Back-translation (sur le texte brut, puis on re-nettoie) ---
back_trans_aug = naw.BackTranslationAug(
    from_model_name="facebook/wmt19-en-de",
    to_model_name="facebook/wmt19-de-en",
    device="cuda"  # ou "cpu" si pas de GPU
)

# Tu peux choisir un sous-échantillon au début pour tester
subset = df.sample(min(200, len(df)), random_state=42).copy()

aug_rows = []
for idx, row in subset.iterrows():
    raw = row["text"]
    # (optionnel) tronquer si très long pour éviter OOM / timeouts
    # raw = raw[:2000]

    aug_text = safe_augment(back_trans_aug, raw)
    if not aug_text:
        continue  # skip proprement

    aug_rows.append({
        "text": raw,                  # original
        "text_aug": aug_text,         # version back-translate
        "text_clean": simple_clean(aug_text),
        "type_article": row["type_article"]
    })

aug_df = pd.DataFrame(aug_rows)

# --- 4) Fusion (option 1 : concat à part) ---
# On garde df tel quel, et on a un df augmenté à part :
# aug_df: colonnes -> text (original), text_aug (nouveau), text_clean (nettoyé), label (si présent)

# --- 5) Fusion (option 2 : empiler comme nouvelles lignes pour entraîner) ---
# Si tu veux entraîner sur une seule colonne "text" (mélange original + aug),
# on duplique la structure et on renomme :
train_original = df.rename(columns={"text": "text_src"}).copy()
train_original["text_final"] = train_original["text"]
train_original["source"] = "orig"

train_aug = aug_df.rename(columns={"text_aug": "text_final"}).copy()
train_aug["text_src"] = train_aug["text"]
train_aug["source"] = "bt"

# Aligner les colonnes utiles
cols = ["text_final", "text_clean", "source"] + ["type_article"]
train_data = pd.concat(
    [
        train_original.reindex(columns=cols, fill_value=""),
        train_aug.reindex(columns=cols, fill_value="")
    ],
    ignore_index=True
)

# Maintenant, tu peux vectoriser `train_data["text_clean"]` (ou `text_final` + refaire ton pipeline de nettoyage complet).




AssertionError: Torch not compiled with CUDA enabled