In [2]:
import pandas as pd
import random
import re

In [3]:
def contaminate_text(text):
    """Apply one randomly chosen contamination to a medical text.

    A single strategy is drawn uniformly from the five contamination
    functions defined in this file and its result is returned.

    Args:
        text: The report text to contaminate.

    Returns:
        Whatever the selected contamination function returns for ``text``.
    """
    strategies = (
        incoherence_diagnostic,
        contradiction_interne,
        terminologie_erronee,
        ajout_valeurs_anormales,
        suppression_information,
    )
    chosen = random.choice(strategies)
    return chosen(text)

In [4]:
def incoherence_diagnostic(text):
    """Append an obviously self-contradictory diagnostic statement.

    Args:
        text: The report text to contaminate.

    Returns:
        ``text`` followed by a space and one randomly chosen contradictory
        sentence.
    """
    statement = random.choice((
        "No acute findings despite visible abnormalities.",
        "Clear lungs, yet multiple infiltrates noted.",
        "No cardiomegaly, but heart appears enlarged.",
    ))
    return " ".join((text, statement))

In [5]:
def contradiction_interne(text):
    """Append a finding that contradicts a statement found in the text.

    The rules are tried in order; the first whose pattern occurs in ``text``
    (case-insensitively) decides which contradicting sentence is appended.

    Args:
        text: The report text to contaminate.

    Returns:
        ``text`` followed by a space and the contradicting sentence, or
        ``text`` unchanged when no rule matches.
    """
    rules = (
        (r"\bclear\b", "Multiple opacities are present."),
        (r"\bno effusion\b", "Bilateral pleural effusions detected."),
        (r"\bnormal heart\b", "Cardiomegaly observed."),
        (r"\bno consolidation\b", "Patchy consolidation evident in lower lobes."),
        (r"\blungs are unremarkable\b", "Diffuse ground-glass opacities detected."),
    )
    addition = next(
        (
            sentence
            for regex, sentence in rules
            if re.search(regex, text, re.IGNORECASE)
        ),
        None,
    )
    if addition is None:
        return text
    return text + " " + addition

In [6]:
# Correct medical term (lowercase) -> deliberately incorrect substitute.
_TERM_REPLACEMENTS = {
    "pneumothorax": "pneumoventis",
    "pleural effusion": "pleural inflation",
    "cardiomegaly": "cardiodilation",
    "consolidation": "densification",
    "aorta": "cardiac tube",
    "mediastinum": "mediocenter",
    "atelectasis": "pulmo-collapse",
    "granuloma": "granitoma",
    "vascularity": "blood-vessel-density",
    "pulmonary edema": "lung wetness",
    "bronchiectasis": "bronchidilation",
    "fibrosis": "fibrotic congestion",
    "calcification": "calcium overload",
    "nodule": "nodal swelling",
    "opacity": "shadow mass",
    "infiltration": "infiltrative process",
    "embolism": "embolic trapping",
    "hemothorax": "hemopleural collection",
    "tracheal deviation": "windpipe shift",
    "hyperinflation": "overexpansion",
    "interstitial lung disease": "interstitium degradation",
    "pulmonary hypertension": "lung blood pressure disorder",
}

# Compiled once at import time. Terms are escaped so they are always treated
# as literals, and tried longest-first so a multi-word term can never be
# shadowed by a shorter alternative.
_TERM_PATTERN = re.compile(
    r"\b("
    + "|".join(
        re.escape(term)
        for term in sorted(_TERM_REPLACEMENTS, key=len, reverse=True)
    )
    + r")\b",
    re.IGNORECASE,
)


def _match_case(original, replacement):
    """Return ``replacement`` re-cased to follow the casing of ``original``."""
    if original.isupper():
        return replacement.upper()
    if original[:1].isupper():
        return replacement[:1].upper() + replacement[1:]
    return replacement


def terminologie_erronee(text):
    """Replace correct medical terms with incorrect ones.

    Whole-word, case-insensitive occurrences of the known terms are swapped
    for their incorrect counterpart. The casing of the matched term is kept
    (all-caps stays all-caps, a leading capital stays capitalised) so that a
    substitution is not given away by a lowercase word at the start of a
    sentence.

    Args:
        text: The report text to contaminate.

    Returns:
        The text with every known term replaced; unchanged when none occurs.
    """
    def _swap(match):
        found = match.group(0)
        # Fall back to the matched text if lowercasing does not yield a key.
        substitute = _TERM_REPLACEMENTS.get(found.lower())
        if substitute is None:
            return found
        return _match_case(found, substitute)

    return _TERM_PATTERN.sub(_swap, text)

In [7]:
def ajout_valeurs_anormales(text):
    """Append a sentence reporting an abnormal medical measurement.

    Args:
        text: The report text to contaminate.

    Returns:
        ``text`` followed by a space and one randomly chosen abnormal-value
        sentence.
    """
    measurement = random.choice((
        "Ejection fraction of 12%.",
        "O2 saturation at 65%.",
        "Tracheal deviation of 3.5 cm.",
        "Blood pressure recorded at 220/130 mmHg.",
        "Heart rate at 35 bpm.",
        "Temperature of 42°C.",
        "Blood glucose level of 450 mg/dL.",
    ))
    return " ".join((text, measurement))

In [8]:
def suppression_information(text):
    """Randomly remove one sentence from the text.

    Sentences are delimited by periods, except that a period sitting between
    two digits is treated as a decimal point (so "3.5 cm" stays intact).
    Each sentence is stripped of surrounding whitespace; the remaining ones
    are re-joined with ". " and terminated by a final period.

    Args:
        text: The report text to contaminate.

    Returns:
        The text with one sentence removed when it holds at least two
        sentences; the single sentence normalised to end with a period when
        it holds exactly one; ``text`` unchanged when it holds none (empty
        or whitespace-only input).
    """
    # Split on a period unless it has a digit on both sides.
    fragments = re.split(r"(?<!\d)\.|\.(?!\d)", text)
    sentences = [part.strip() for part in fragments if part.strip()]

    # Nothing to remove or rebuild: avoid turning "" into ".".
    if not sentences:
        return text

    if len(sentences) > 1:
        del sentences[random.randrange(len(sentences))]
    return ". ".join(sentences) + "."

In [11]:
def contaminate_dataframe(df, contamination_rate=0.1):
    """Contaminate a fraction of the rows of a dataset.

    Works on a copy; the input DataFrame is not modified. Rows are chosen by
    position, so the function behaves the same whatever the DataFrame's index
    looks like (non-contiguous, non-integer, ...).

    Args:
        df: DataFrame holding the texts in a 'combined_text' column.
        contamination_rate: Fraction of rows to contaminate, between 0 and 1.
            The number of rows is ``int(len(df) * contamination_rate)``.

    Returns:
        A copy of ``df`` in which the selected rows' 'combined_text' has been
        passed through ``contaminate_text``.

    Raises:
        ValueError: If the 'combined_text' column is missing or
            ``contamination_rate`` is outside [0, 1].
    """
    # Check that the required column is present.
    if 'combined_text' not in df.columns:
        raise ValueError("La colonne 'combined_text' doit être présente dans le DataFrame.")
    if not 0 <= contamination_rate <= 1:
        raise ValueError("contamination_rate must be between 0 and 1.")

    df = df.copy()
    row_count = len(df)
    sample_size = int(row_count * contamination_rate)
    if sample_size == 0:
        return df

    # Sampled values are row positions, not index labels, so they must be
    # used with iloc rather than loc.
    positions = random.sample(range(row_count), sample_size)
    column = df.columns.get_loc('combined_text')
    df.iloc[positions, column] = [
        contaminate_text(value) for value in df.iloc[positions, column]
    ]

    return df

In [13]:
# Load the cleaned dataset, contaminate 10% of its rows and write the result
# to a new CSV file.
file_path = "../../data/cleaned/final_clean_data1.csv"
df = pd.read_csv(file_path)

# Apply the contamination
df_contaminated = contaminate_dataframe(df, contamination_rate=0.1)

# Save the new dataset
df_contaminated.to_csv("../../data/cleaned/data_contaminated3.csv", index=False)