In [26]:
import pandas as pd
import random
import re

In [34]:
def contaminate_text(text):
    """
    Applique toutes les contaminations au texte médical.
    """
    text = incoherence_diagnostic(text)  # Ajoute une incohérence diagnostique
    text = contradiction_interne(text)   # Ajoute des contradictions internes
    text = terminologie_erronee(text)    # Remplace des termes médicaux par des termes erronés
    text = ajout_valeurs_anormales(text) # Ajoute des valeurs médicales anormales
    text = suppression_information(text) # Supprime certaines informations du texte
    return text


In [35]:
def incoherence_diagnostic(text):
    """
    Ajoute une incohérence diagnostique plus subtile et complexe.
    """
    contradictions = [
        "Despite the clear symptoms of cardiomegaly, no enlargement of the heart is noted in the imaging results.",
        "The patient has a normal ejection fraction, but the clinical examination indicates severe heart failure.",
        "CT scan indicates consolidation in the right lung, but the patient is asymptomatic.",
        "Although no acute findings are visible, the patient shows signs of acute pulmonary edema based on history.",
        "The chest x-ray shows bilateral pleural effusion, but auscultation reveals no signs of respiratory distress."
    ]
    # Ajout d'une incohérence subtile en fonction de l'analyse sémantique du texte
    text += " " + random.choice(contradictions)
    return text


In [36]:
def contradiction_interne(text):
    """
    Ajoute une contradiction dans l'interprétation du texte de manière plus subtile.
    """
    patterns = {
        r"\bclear\b": "Despite clear lungs, ground-glass opacities are visible in both lower lobes.",
        r"\bno effusion\b": "Bilateral pleural effusions noted, despite reports stating no effusion.",
        r"\bnormal heart\b": "Heart size appears normal, yet the patient has severe cardiomegaly based on echocardiogram.",
        r"\bno consolidation\b": "The report suggests no consolidation, but significant patchy areas are visible on imaging.",
        r"\blungs are unremarkable\b": "Although the lungs appear unremarkable, subtle signs of fibrosis are detectable."
    }
    
    for pattern, contradiction in patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            # Placer la contradiction en un endroit du texte plus réaliste
            sentence_end = text.rfind(".")
            text = text[:sentence_end] + " " + contradiction + text[sentence_end:]
            return text
    return text


In [37]:
def terminologie_erronee(text):
    """
    Remplace des termes médicaux corrects par des termes incorrects de manière subtile.
    """
    replacements = {
        "pneumothorax": "pneumoventis",
        "pleural effusion": "pleural inflation",
        "cardiomegaly": "cardiodilation",
        "consolidation": "consolidation tissue",
        "aorta": "cardiac aorta",
        "mediastinum": "mediocentrical space",
        "atelectasis": "pulmo-collapse",
        "granuloma": "granitoma",
        "vascularity": "vascular texture",
        "pulmonary edema": "lung wetness",
        "bronchiectasis": "bronchidilation",
        "fibrosis": "fibrotic congestion",
        "calcification": "calcium overload",
        "nodule": "nodal swelling",
        "opacity": "shadow mass",
        "infiltration": "infiltrative process",
        "embolism": "embolic trapping",
        "hemothorax": "hemopleural collection",
        "tracheal deviation": "windpipe shift",
        "hyperinflation": "overexpansion",
        "interstitial lung disease": "interstitium degradation",
        "pulmonary hypertension": "lung blood pressure disorder"
    }
    # On applique maintenant les remplacements de manière subtile et systématique
    pattern = re.compile(r'\b(' + '|'.join(replacements.keys()) + r')\b', re.IGNORECASE)

    def replace_match(match):
        term = match.group(0)
        # Applique une variante subtile du terme médical
        return replacements.get(term.lower(), term)

    return pattern.sub(replace_match, text)


In [38]:
def ajout_valeurs_anormales(text):
    """
    Ajoute des valeurs médicales anormales de manière plus contextuelle.
    """
    abnormal_values = [
        "Ejection fraction of 12%, despite a normal echocardiogram.",
        "O2 saturation at 65%, although the patient is not in distress.",
        "Tracheal deviation of 3.5 cm, despite no signs of airway obstruction.",
        "Blood pressure recorded at 220/130 mmHg, with no apparent hypertensive symptoms.",
        "Heart rate at 35 bpm, in a patient with no underlying cardiovascular conditions.",
        "Temperature of 42°C, despite being on antipyretic therapy.",
        "Blood glucose level of 450 mg/dL, despite no history of diabetes."
    ]
    # Ajout d'une valeur anormale réaliste mais incongrue
    return text + " " + random.choice(abnormal_values)


In [39]:
def suppression_information(text):
    """
    Supprime ou modifie de manière plus subtile une partie du texte.
    """
    sentences = text.split('.')
    sentences = [s.strip() for s in sentences if s.strip()]  # Nettoyer la liste
    if len(sentences) > 1:
        # Supprimer ou remplacer une phrase cruciale
        index_to_remove = random.randint(0, len(sentences) - 1)
        if random.choice([True, False]):
            sentences[index_to_remove] = "Missing clinical information."
        else:
            del sentences[index_to_remove]
    return '. '.join(sentences) + '.'


In [40]:
def contaminate_dataframe(df, contamination_rate=0.1):
    """
    Applique des contaminations sur un pourcentage du dataset.
    """
    df = df.copy()
    # Vérifier la présence de la colonne nécessaire
    if 'combined_text' not in df.columns:
        raise ValueError("La colonne 'combined_text' doit être présente dans le DataFrame.")

    # Sélectionner les indices à contaminer
    indices_to_contaminate = random.sample(range(len(df)), int(len(df) * contamination_rate))
    
    # Appliquer la contamination
    df.loc[indices_to_contaminate, 'combined_text'] = df.loc[indices_to_contaminate, 'combined_text'].apply(contaminate_text)
    
    return df

In [41]:
file_path = "../../data/cleaned/final_clean_data1.csv"
df = pd.read_csv(file_path)

# Appliquer la contamination
df_contaminated = contaminate_dataframe(df, contamination_rate=0.1)

# Sauvegarder le nouveau dataset
df_contaminated.to_csv("../../data/contaminated/data_contaminated3.csv", index=False)

In [42]:
df_contaminated.head

<bound method NDFrame.head of                                           combined_text
0     cardiac silhouette mediastinum size within nor...
1     cardiomediastinal silhouette within normal lim...
2     lung clear epanded heart mediastinum normal ac...
3     increased_opacity within right_upper_lobe poss...
4     interstitial marking diffusely prominent throu...
...                                                 ...
3414  lung clear cardiomediastinal silhouette within...
3415  sternotomy suture bypass graft placed interval...
3416  calcified mediastinal focal area consolidation...
3417  Temperature of 42°C, despite being on antipyre...
3418  lung clear bilaterally focal consolidation ple...

[3419 rows x 1 columns]>