In [1]:
import gensim.downloader as api
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.decomposition import PCA
# from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler

2025-03-24 16:14:59.921612: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-24 16:14:59.924479: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-24 16:14:59.980893: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-24 16:14:59.982072: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [23]:
csv = "../../data/cleaned/final_clean_data1.csv"
df = pd.read_csv(csv)

In [37]:
heart_keywords = [
    'heart', 'cardiac', 'cardiomegaly', 'cardiothoracic', 'cardiovascular',
    'cardiopulmonary', 'vascular', 'aorta', 'aortic', 'ventricle', 'ventricular',
    'atrium', 'atrial', 'pericardial', 'pericardium', 'enlargement', 'heart_size',
    'mediastinum', 'mediastinal', 'silhouette', 'congestive', 'congestion',
    'vascularity', 'vascular_congestion', 'cardiac_border', 'cardiomegalia',
    'cardiac_enlargement'
]
lung_keywords = [
    'lung', 'lungs', 'pulmonary', 'pleural', 'pleura', 'pneumothorax',
    'infiltrate', 'infiltration', 'opacity', 'opacities', 'consolidation',
    'emphysema', 'hyperinflation', 'interstitial', 'atelectasis', 'edema',
    'bronchial', 'bronchi', 'airspace', 'alveolar', 'fibrosis', 'pneumonia',
    'trachea', 'hilar', 'effusion', 'hemithorax', 'costophrenic', 'parenchymal',
    'bilateral', 'unilateral', 'lobar', 'lobes', 'upper_lobe', 'lower_lobe'
]

def assign_subject_code(text):
    heart_score = sum(1 for word in heart_keywords if word in text)
    lung_score = sum(1 for word in lung_keywords if word in text)
    
    if lung_score > heart_score:
        return 1  # Poumon
    elif heart_score > lung_score:
        return 2  # Cœur
    else:
        return 0  # Inconnu

df['sujet'] = df['combined_text'].apply(assign_subject_code)

In [38]:
# Séparer les datasets
df_lung = df[df['sujet'] == 1].reset_index(drop=True)
df_heart = df[df['sujet'] == 2].reset_index(drop=True)

# (Optionnel) Afficher les tailles
print(f" Poumon : {len(df_lung)} lignes")
print(f" Cœur : {len(df_heart)} lignes")

 Poumon : 2815 lignes
 Cœur : 604 lignes


In [39]:
df_lung

Unnamed: 0,combined_text,sujet
0,cardiac silhouette mediastinum size within nor...,1
1,cardiomediastinal silhouette within normal lim...,1
2,increased_opacity within right_upper_lobe poss...,1
3,interstitial marking diffusely prominent throu...,1
4,heart size pulmonary vascularity appear within...,1
...,...,...
2810,heart size normal lung clear normal pneumonia ...,1
2811,lung clear cardiomediastinal silhouette within...,1
2812,sternotomy suture bypass graft placed interval...,1
2813,calcified mediastinal focal area consolidation...,1


In [31]:
def key_words(column, top_n=20):
    """
    En utilisant la métrique tf-dif, la fonction retourne les top_n premiers mots-clés dans le dataset.
    """
    vectorizer = TfidfVectorizer()
    x = vectorizer.fit_transform(column)
    feature_names = vectorizer.get_feature_names_out()
    
    kw_per_text = []
    for doc_idx in range(x.shape[0]):
        # récupérer les scores tf-dif pour le texte courant
        tfidf_scores = x[doc_idx].toarray()[0]

        # associer chaque mot avec son score tf-dif
        word_scores = [(feature_names[i], tfidf_scores[i]) for i in range(len(feature_names))]

        # trier les mots par score décroissant
        sorted_words = sorted(word_scores, key=lambda x: x[1], reverse=True)

        # récupérer les top_n mots-clés
        keywords = [word for word, score in sorted_words[:top_n]]
        kw_per_text.append(keywords)

    return kw_per_text

In [32]:
key_words(df_lung['combined_text'])

[['edema',
  'cardiac',
  'mediastinum',
  'chest',
  'evidence',
  'normal',
  'silhouette',
  'consolidation',
  'limit',
  'within',
  'pulmonary',
  'size',
  'focal',
  'pneumothora',
  'pleural',
  'effusion',
  '02010',
  '09',
  '10',
  '10th'],
 ['inflated',
  'normally',
  'bone',
  'within',
  'upper',
  'granuloma',
  'process',
  'calcified',
  'lung',
  'stable',
  'acute',
  'without',
  'right',
  'evidence',
  'cardiomediastinal',
  'airspace',
  'contour',
  'silhouette',
  'disease',
  'abnormality'],
 ['increased_opacity',
  'left_5th_rib_may',
  'represent_focal_airspace_disease',
  'focal_consolidation',
  'right_upper_lobe',
  'associated',
  'atelectasis',
  'overlying',
  'mass',
  'may_represent',
  'possible',
  'lesion',
  'recommend',
  'opacity',
  'midlung',
  'tomography',
  'computed',
  'evaluation',
  'bone',
  'posterior'],
 ['diffusely',
  'fibrosis',
  'throughout',
  'diffuse',
  'marking',
  'prominent',
  'visible',
  'interstitial',
  'normal',