In [102]:
import pandas as pd
import numpy as np
import string
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import gensim.downloader as api
import nltk
nltk.download('punkt_tab')
import spacy
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/mariama/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/mariama/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mariama/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mariama/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [103]:
acronym_definitions = {
    "COPD": "Chronic Obstructive Pulmonary Disease",
    "CT": "Computed Tomography",
    "PPD": "Purified Protein Derivative",
    "PA": "Posterior-Anterior",
    "SVC": "Superior Vena Cava",
    "CABG": "Coronary Artery Bypass Graft",
    "CHF": "Congestive Heart Failure",
    "HIV": "Human Immunodeficiency Virus",
    "MRI": "Magnetic Resonance Imaging",
    "EKG": "Electrocardiogram",
    "AP": "Anteroposterior",
    "PICC": "Peripherally Inserted Central Catheter",
    "TB": "Tuberculosis",
    "MVA": "Motor Vehicle Accident",
    "CHEST": "Thoracic Cavity (context-dependent)",
    "SOB": "Shortness of Breath",
    "MVC": "Motor Vehicle Collision",
    "IJ": "Internal Jugular",
    "ICD": "Implantable Cardioverter Defibrillator",
    "BMT": "Bone Marrow Transplant",
    "DYSPNEA": "Difficulty Breathing",
    "LAB": "Laboratory",
    "PRE": "Preoperative",
    "OP": "Operative Procedure",
    "CXR": "Chest X-Ray",
    "CP": "Chest Pain",
    "KUB": "Kidneys, Ureters, and Bladder",
    "IV": "Intravenous",
    "AML": "Acute Myeloid Leukemia",
    "ECF": "Extracellular Fluid",
    "LUNG": "Lung (context-dependent)",
    "NECK": "Neck (context-dependent)",
    "DKA": "Diabetic Ketoacidosis",
    "IVC": "Inferior Vena Cava",
    "AICD": "Automatic Implantable Cardioverter Defibrillator",
    "NG": "Nasogastric",
    "VP": "Ventriculoperitoneal",
    "PX": "Physical Examination",
    "PT": "Physical Therapy",
    "CA": "Cancer",
    "KV": "Kilovolt",
    "DVT": "Deep Vein Thrombosis",
    "CLL": "Chronic Lymphocytic Leukemia",
    "HAS": "Hypertension Associated Symptoms (context-dependent)",
    "TNF": "Tumor Necrosis Factor",
    "BR": "Breast",
    "SBP": "Systolic Blood Pressure",
    "WBC": "White Blood Cell",
    "ATV": "All-Terrain Vehicle (context-dependent)",
    "SDH": "Subdural Hematoma",
    "CCK": "Cholecystokinin",
    "XRAY": "Radiograph",
    "HX": "History",
    "APNEA": "Temporary Cessation of Breathing",
    "TKA": "Total Knee Arthroplasty",
    "PTX": "Pneumothorax",
    "HF": "Heart Failure",
    "CIWA": "Clinical Institute Withdrawal Assessment",
    "DORV": "Double Outlet Right Ventricle",
    "CVA": "Cerebrovascular Accident",
    "BNP": "B-Type Natriuretic Peptide",
    "PNA": "Pneumonia",
    "POS": "Positive",
    "DX": "Diagnosis",
    "AIDS": "Acquired Immunodeficiency Syndrome",
    "RLL": "Right Lower Lobe",
    "RT": "Radiation Therapy",
    "RUQ": "Right Upper Quadrant",
    "UIP": "Usual Interstitial Pneumonia",
    "GYN": "Gynecology",
    "ESR": "Erythrocyte Sedimentation Rate",
    "AAM": "African American Male",
    "PDA": "Patent Ductus Arteriosus",
    "SVT": "Supraventricular Tachycardia",
    "AFIB": "Atrial Fibrillation",
    "POD": "Postoperative Day",
    "VHR": "Ventral Hernia Repair",
    "UTI": "Urinary Tract Infection",
    "ORIF": "Open Reduction and Internal Fixation",
    "AVN": "Avascular Necrosis",
    "STAB": "Stab Wound (context-dependent)",
    "SP": "Status Post",
    "HCC": "Hepatocellular Carcinoma",
    "SBRT": "Stereotactic Body Radiation Therapy",
    "BCG": "Bacillus Calmette-Guérin",
    "SX": "Symptoms",
    "PM": "Post Mortem (or Pacemaker, context-dependent)",
    "CTA": "Computed Tomography Angiography",
    "AS": "Aortic Stenosis",
    "VSD": "Ventricular Septal Defect",
    "SM": "Small Molecule (context-dependent)",
    "TOF": "Tetralogy of Fallot",
    "PPM": "Permanent Pacemaker",
    "INR": "International Normalized Ratio",
    "Dr.": "doctor"
}

def acronym_means(text):
    new_text = []
    for i in text.split():
        if i.upper() in acronym_definitions:
            new_text.append(acronym_definitions[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)

In [104]:
#definition des stopwords et Lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [105]:
# Fonction pour convertir en minuscules
def to_lowercase(text):
    return text.lower()

In [106]:
# Fonction pour supprimer la ponctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [107]:
# Fonction pour supprimer les stopwords et appliquer la lemmatisation
def remove_stopwords_and_lemmatize(text):
    tokens = word_tokenize(text)
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

In [108]:
import re

def remove_xxx(text):
    # Ne pas supprimer les chiffres suivis de "cm" (par exemple, "12 cm")
    text = re.sub(r'\b(\d+)(?=\s*cm\b)', r'@@\1@@', text)
    text = re.sub(r'\d+', '', text)
    # Remettre les chiffres marqués (ceux suivis de "cm")
    text = text.replace('@@', '')
    text = text.replace("xxxx", "").replace("x", "")
    text = re.sub(r'\s+', ' ', text).strip()
    return text



In [109]:
# Fonction pour générer des n_grame (trigrams)
def generate_n_grams(text, min_count=1, threshold=1):
    
    if isinstance(text, str):
        # Diviser le texte en tokens
        corpus = [text.split()]
    else:
        raise ValueError("Le texte d'entrée doit être une chaîne de caractères.")

    # Modèle de bigrammes
    bigram_model = Phrases(corpus, min_count=min_count, threshold=threshold)
    bigrams = [bigram_model[doc] for doc in corpus]

    # Modèle de trigrammes
    trigram_model = Phrases(bigrams, min_count=min_count, threshold=threshold)
    trigrams = [trigram_model[doc] for doc in bigrams]

    # Conversion en trigrammes
    result = []
    for doc in trigrams:
        for gram in doc:
            if "_" in gram: 
                result.append(gram)
            else:
                result.append(gram)  # Garder les mots non reliés
    return ' '.join(result)

In [110]:
# Fonction principale de prétraitement pour une colonne de texte
def preprocess_text_column(column):
    try:
        column = column.astype(str)
        column = column.apply(acronym_means)
        column = column.apply(to_lowercase)
        column = column.apply(remove_punctuation)
        column = column.apply(remove_stopwords_and_lemmatize)
        column = column.apply(remove_xxx)
        column = column.apply(generate_n_grams)
        return column
    except Exception as e:
        print("Erreur dans le prétraitement :", e)
        return None

In [111]:
#fonction pour combiner des columns
def combine_columns(df, text_columns):
    """
    Combine les colonnes spécifiées d'un DataFrame en une seule colonne nommée 'combined_text',
    puis supprime toutes les autres colonnes.
    Retourne un DataFrame ne contenant que la colonne combinée.
    """
    try:
        # Combiner les colonnes spécifiées dans une seule chaîne de texte par ligne
        df['combined_text'] = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
        # Ne conserver que la colonne combinée
        df = df[['combined_text']]
        return df
    except Exception as e:
        print("Erreur lors de la combinaison des colonnes :", e)
        return None


In [112]:
#fonction pour combiner les dataFrames
"""
def combine_dataframes(df1, df2):
    try:
        if df1.shape[1] != 1 or df2.shape[1] != 1:
            print("Erreur : Les DataFrames doivent avoir une seule colonne.")
            return None
        df1 = df1.iloc[:, 0]
        df2 = df2.iloc[:, 0]
        combined_df = pd.concat([df1, df2], ignore_index=True)
        combined_df = combined_df.to_frame('combined_text')
        return combined_df
    except Exception as e:
        print("Erreur lors de la combinaison des DataFrames :", e)
        return None
"""

'\ndef combine_dataframes(df1, df2):\n    try:\n        if df1.shape[1] != 1 or df2.shape[1] != 1:\n            print("Erreur : Les DataFrames doivent avoir une seule colonne.")\n            return None\n        df1 = df1.iloc[:, 0]\n        df2 = df2.iloc[:, 0]\n        combined_df = pd.concat([df1, df2], ignore_index=True)\n        combined_df = combined_df.to_frame(\'combined_text\')\n        return combined_df\n    except Exception as e:\n        print("Erreur lors de la combinaison des DataFrames :", e)\n        return None\n'

In [113]:
def preprocess_dataframe(df, column_text='combined_text'):
    """
    Applique le prétraitement sur une colonne spécifique du DataFrame (par défaut 'combined_text').

    Args:
        df (DataFrame): Le DataFrame contenant les données à traiter.
        column_text (str): Le nom de la colonne à traiter (par défaut 'combined_text').

    Returns:
        bool: True si le prétraitement a réussi, False sinon.
    """
    try:
        if column_text not in df.columns:
            print(f"Erreur : La colonne '{column_text}' est manquante dans le DataFrame.")
            return False
        
        # Appliquer le prétraitement sur la colonne spécifiée
        df[column_text] = preprocess_text_column(df[column_text])
        if df[column_text].isnull().all():
            print("Erreur : Le prétraitement a échoué, aucune donnée valide.")
            return False

        print("Prétraitement réussi.")
        return True

    except Exception as e:
        print(f"Erreur dans le prétraitement global : {e}")
        return False



In [114]:
def execute_pipeline(csv_path, csv2_path, text_columns):
    """
    Exécute le pipeline complet de combinaison des fichiers, prétraitement des données,
    et renvoi du DataFrame traité ou d'un message d'erreur.

    Args:
        csv_path (str): Chemin vers le fichier CSV.
        excel_path (str): Chemin vers le fichier Excel.
        text_columns1 (list): Colonnes à combiner dans le DataFrame CSV.

    Returns:
        final_dat: Affiche le dataframe final si tout se passe bien .
    """
    try:
        data1 = pd.read_csv(csv_path)
        data2 = pd.read_csv(csv2_path)

        # Étape 2 : Combiner les colonnes de chaque DataFrame
        new_data1 = combine_columns(data1, text_columns)
        if new_data1 is not None and data2 is not None:
            # Étape 4 : Appliquer le prétraitement global sur la colonne 'combined_text'
            success1 = preprocess_dataframe(new_data1, column_text='combined_text')
            success2 = preprocess_dataframe(data2, column_text="Clinician's Notes")
            if success1&success2:
                print("Le pipeline complet a réussi.")
                return new_data1, data2
            else:
                print("Le prétraitement a échoué.")
        else:
            print("Erreur les dataset sont vides")
    except Exception as e:
        print(f"Une erreur s'est produite dans le pipeline : {e}")


In [115]:
csv_path = "final_dataset.csv"
csv2_path = "Radiologists Report.csv"
text_columns = ['abstract_FINDINGS', 'abstract_IMPRESSION']
final_data1, final_data2 = execute_pipeline(csv_path, csv2_path, text_columns)
final_data1.to_csv('clean_data.csv', index=False)
final_data2.to_csv('clean_data2.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_text] = preprocess_text_column(df[column_text])


Prétraitement réussi.
Prétraitement réussi.
Le pipeline complet a réussi.


In [116]:
#nlp = spacy.load("/export/spacy-libs/fr_core_news_sm/fr_core_news_sm-3.1.0")
nlp = spacy.load("fr_core_news_sm")

In [117]:
# Densité des points dans les dimensions de l'espace des plongements lexicaux
from sklearn.neighbors import KernelDensity
def estimate_density(x_vals, y_vals, bandwidth=0.1):
    """
    Estime la densité des points à l'aide du Kernel Density Estimation (KDE) dans le but de pouvoir donner un zoom 
    de la visualisation des points.
    :param x_vals: Liste des valeurs de la dimension 1
    :param y_vals: Liste des valeurs de la dimension 2
    :param bandwidth: Paramètre du noyau KDE, plus petit = plus précis, plus grand = plus lisse
    :return: Coordonnées de la zone de densité élevée
    """
    # Empile les valeurs x et y pour l'estimation KDE
    points = np.vstack([x_vals, y_vals]).T

    # KDE pour estimer la densité
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(points)
    
    # Calcul des densités pour une grille de points
    x_grid = np.linspace(min(x_vals), max(x_vals), 100)
    y_grid = np.linspace(min(y_vals), max(y_vals), 100)
    X, Y = np.meshgrid(x_grid, y_grid)
    grid_points = np.vstack([X.ravel(), Y.ravel()]).T
    densities = np.exp(kde.score_samples(grid_points))

    # Trouver la région la plus dense
    max_density_index = np.argmax(densities)
    max_density_point = grid_points[max_density_index]

    return max_density_point, densities.reshape(X.shape)


In [118]:
def visualisation_with_zoom(x_vals, y_vals, labels, zoom=True, bandwidth=0.1):
    """
    Affiche un nuage de points avec un zoom automatique sur la zone la plus dense.
    :param x_vals: Liste des valeurs de la dimension 1
    :param y_vals: Liste des valeurs de la dimension 2
    :param labels: Liste des labels pour chaque point
    :param zoom: Booléen indiquant si un zoom doit être effectué (par défaut True)
    :param bandwidth: Paramètre de l'estimation de la densité
    """
    # Affichage du premier graphique sans zoom
    plt.figure(figsize=(16, 16))
    plt.scatter(x_vals, y_vals, alpha=0.7, s=50)
    plt.grid(visible=True, linestyle="--", linewidth=0.5, alpha=0.7)
    for i in range(len(labels)):
        plt.annotate(
            labels[i],
            (x_vals[i], y_vals[i]),
            fontsize=9,
            alpha=0.8,
            textcoords="offset points",
            xytext=(5, 5)
        )
    plt.title("Nuage de points des plongements lexicaux - Sans zoom", fontsize=16)
    plt.xlabel("Dimension 1", fontsize=14)
    plt.ylabel("Dimension 2", fontsize=14)
    plt.show()

    # Estimation de la densité
    max_density_point, densities = estimate_density(x_vals, y_vals, bandwidth)

    # Définir les limites du zoom autour de la zone de densité maximale
    zoom_limits = {
        'x': (max_density_point[0] - 1, max_density_point[0] + 1),
        'y': (max_density_point[1] - 1, max_density_point[1] + 1)
    }

    # Affichage avec zoom automatique basé sur la densité
    plt.figure(figsize=(16, 16))
    plt.scatter(x_vals, y_vals, alpha=0.7, s=50)

    # Si le zoom est activé, ajuster les limites
    if zoom:
        plt.xlim(zoom_limits['x'])  # Plage des valeurs pour la dimension 1
        plt.ylim(zoom_limits['y'])  # Plage des valeurs pour la dimension 2

    plt.grid(visible=True, linestyle="--", linewidth=0.5, alpha=0.7)
    for i in range(len(labels)):
        plt.annotate(
            labels[i],
            (x_vals[i], y_vals[i]),
            fontsize=9,
            alpha=0.8,
            textcoords="offset points",
            xytext=(5, 5)
        )
    plt.title("Nuage de points des plongements lexicaux - Avec zoom sur densité", fontsize=16)
    plt.xlabel("Dimension 1", fontsize=14)
    plt.ylabel("Dimension 2", fontsize=14)
    plt.show()




In [119]:
x_vals_pca, y_vals_pca, labels_pca = reduce_dimensions_pca(model.wv, 1000)


visualisation_with_zoom(x_vals_pca, y_vals_pca, labels_pca, zoom=True, bandwidth=0.1)


NameError: name 'reduce_dimensions_pca' is not defined

In [None]:
def key_words(column, top_n=5):
    vectorizer = TfidfVectorizer()
    x = vectorizer.fit_transform(column)
    feature_names = vectorizer.get_feature_names_out()
    keywords_per_text = []
    for doc_idx in range(x.shape[0]):
        # Récupérer les scores TF-IDF pour le texte courant
        tfidf_scores = x[doc_idx].toarray()[0]

        # Associer chaque mot avec son score TF-IDF
        word_scores = [(feature_names[i], tfidf_scores[i]) for i in range(len(feature_names))]

        # Trier les mots par score décroissant
        sorted_words = sorted(word_scores, key=lambda x: x[1], reverse=True)

        # Récupérer les top N mots-clés
        keywords = [word for word, score in sorted_words[:top_n]]
        keywords_per_text.append(keywords)

    return keywords_per_text
    
csv_path = "clean_data.csv"
csv_path2 = "clean_data2.csv"
data = pd.read_csv(csv_path)
data2 = pd.read_csv(csv_path2)
column = preprocess_text_column(data["combined_text"])
column2 = preprocess_text_column(data2["Clinician's Notes"])
data1_kw = key_words(column)
data2_kw = key_words(column2)

data2_kw

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def contamination_function(data1, data2, data1_kw, data2_kw, contamination_rate=0.3):
    contaminated_texts = []
    contamination_log = []

    #pour mesurer la similarité entre les mots-clés
    vectorizer = TfidfVectorizer()
    all_keywords = data1_kw + data2_kw
    tfidf_matrix = vectorizer.fit_transform([' '.join(kw) for kw in all_keywords])
    base_vectors = tfidf_matrix[:len(data1_kw)]
    anomaly_vectors = tfidf_matrix[len(data1_kw):]
    
    for i, base_text in enumerate(data1):
        words = base_text.split()  
        total_words = len(words)
        num_words_to_contaminate = max(1, int(total_words * contamination_rate))  # mots à contaminer
        # Trouver les anomalies les plus similaires (mots clés du text avec toutes les anomalies)
        similarities = cosine_similarity(base_vectors[i], anomaly_vectors)
        best_anomaly_idx = similarities.argmax()
        anomaly_text = data2[best_anomaly_idx]
        anomaly_words = anomaly_text.split()  # les mots de l'anomalie
        # Contaminer les mots du texte de base
        contaminated_words = words[:]
        contaminated_positions = []

        for _ in range(num_words_to_contaminate):
            # Choisir un mot d'anomalie aléatoire
            anomaly_word = random.choice(anomaly_words)
            # Choisir une position aléatoire dans le texte de base
            position = random.randint(0, total_words - 1)

            # Remplacer le mot à cette position
            contaminated_words[position] = anomaly_word
            contaminated_positions.append((position, anomaly_word))

        # Reconstituer le texte contaminé
        contaminated_text = ' '.join(contaminated_words)
        contaminated_texts.append(contaminated_text)

        # Journaliser les contaminations
        contamination_log.append({
            "base_text": base_text,
            "\ncontaminated_text": contaminated_text,
            "\ncontaminated_positions": contaminated_positions,
            "\nanomaly_text": anomaly_text
        })

    return contaminated_texts, contamination_log


In [None]:
base_texts = column
anomaly_texts = column2
base_keywords = data1_kw
anomaly_keywords = data2_kw

contaminated_texts, log = contamination_function(base_texts, anomaly_texts, base_keywords, anomaly_keywords)

In [None]:
base_texts = column
anomaly_texts = column2
base_keywords = data1_kw
anomaly_keywords = data2_kw

contaminated_texts, log = contamination_function(base_texts, anomaly_texts, base_keywords, anomaly_keywords)
for original, contaminated in zip(base_texts, contaminated_texts):
    print("Texte original :")
    print(original)
    print("\nTexte contaminé :")
    print(contaminated)
    print("="*50)

print("Journal des anomalies :")
for entry in log:
    print(entry)

In [None]:
len(log)

In [None]:
len(column)

In [None]:
#jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [None]:
wv = api.load('glove-wiki-gigaword-300')

# 