<a href="https://colab.research.google.com/github/sathishbandaru/6671/blob/master/mini_project_text_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

In [3]:
cxr_df=pd.read_csv('/content/drive/My Drive/Mini Project/cxrfolder/cxr_df.csv')
labeled_df=pd.read_csv('/content/drive/My Drive/Mini Project/cxrfolder/labeled_reports.csv')

In [4]:
labeled_df['path']=cxr_df['path']

In [5]:
sample_df=labeled_df.sample(10)

In [6]:
labeled_df.columns

Index(['Report Impression', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
       'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
       'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
       'Fracture', 'Support Devices', 'No Finding', 'path'],
      dtype='object')

In [7]:
atelectasis_df=labeled_df[labeled_df['Atelectasis']==1]

In [8]:
len(atelectasis_df)

31042

In [9]:
reports=atelectasis_df['Report Impression']
len(reports)

31042

In [10]:
import spacy
from tqdm import tqdm

def extract_atelectasis_observations(reports):
    nlp = spacy.load("en_core_web_sm")

    atelectasis_synonyms = [
        "partial lung collapse",
        "lung collapse",
        "partial lung collapses",
        "bibasilar atelectatic changes persist",
        "mild left basal atelectasis",
        "lung volume loss",
        "collapsed lung segment",
        "alveolar collapse",
        "pulmonary collapse",
        "subsegmental atelectasis",
        "compression atelectasis",
        "obstructive atelectasis",
        "resorption atelectasis",
        "atelectasis",
        "atelectatic"
    ]

    atelectasis_observations = []

    for report in tqdm(reports):
        try:
            doc = nlp(report)
        except Exception as e:
            print(f"Error processing report: {report}")
            print(f"Error details: {e}")
            continue

        atelectasis_info = ""
        for sent in doc.sents:
            try:

                if any(synonym in sent.text.lower() for synonym in atelectasis_synonyms):

                    independent_clause = None
                    for clause in sent.noun_chunks:
                        if "atelectasis" in clause.text.lower() or any(synonym in clause.text.lower() for synonym in atelectasis_synonyms):
                            independent_clause = clause.text
                            break


                    atelectasis_info += (independent_clause or sent.text) + " "
            except Exception as e:
                print(f"Error processing sentence: {sent.text}")
                print(f"Error details: {e}")

        atelectasis_observations.append(atelectasis_info.strip() or "No atelectasis information")

    return atelectasis_observations


result = extract_atelectasis_observations(reports)


100%|██████████| 31042/31042 [11:20<00:00, 45.62it/s]


In [11]:
len(result)

31042

In [12]:
atelectasis_df['text']=result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  atelectasis_df['text']=result


In [13]:
atelectasis_df['text'].value_counts()

Unnamed: 0_level_0,count
text,Unnamed: 1_level_1
atelectasis,6251
No atelectasis information,1997
bibasilar atelectasis,1072
lower lobe atelectasis,831
compressive atelectasis,806
...,...
most likely chronic atelectasis,1
increased bilateral mild atelectasis and lower lung volumes status-post extubation,1
atelectasis Right infrahilar atelectasis,1
adjacent relaxation atelectasis persist,1


In [14]:
atelectasis_df1=atelectasis_df[atelectasis_df['text']!="atelectasis"]
atelectasis_df1=atelectasis_df1[atelectasis_df1['text']!="No atelectasis information"]

In [15]:
atelectasis_df1['text'].value_counts()

Unnamed: 0_level_0,count
text,Unnamed: 1_level_1
bibasilar atelectasis,1072
lower lobe atelectasis,831
compressive atelectasis,806
Bibasilar atelectasis,805
basilar atelectasis,585
...,...
Equivocal minimal new linear atelectasis,1
right lung atelectasis left lower lung volume loss,1
"Thin, plate-like bilateral atelectasis",1
Opacification in the right lower lobe could be atelectasis alone or there could be pneumonia. Left lower lobe is chronically atelectatic reflected in elevation of the left hemidiaphragm and leftward mediastinal shift.,1


In [16]:
atelectasis_df1=atelectasis_df1.sample(frac=1)

In [17]:
atelectasis_df_final=atelectasis_df1[['path','text','Report Impression']]

In [18]:
atelectasis_df_code=atelectasis_df1[['path','text']]

In [19]:
atelectasis_df_code.head(9)

Unnamed: 0,path,text
86678,../input/curated-cxr-report-generation-dataset...,Minimal patchy atelectasis
16630,../input/curated-cxr-report-generation-dataset...,mild basilar atelectasis
48462,../input/curated-cxr-report-generation-dataset...,adjacent atelectasis
31732,../input/curated-cxr-report-generation-dataset...,the right suggesting atelectasis
49412,../input/curated-cxr-report-generation-dataset...,minimal linear bibasilar atelectasis
65995,../input/curated-cxr-report-generation-dataset...,basilar atelectasis
10947,../input/curated-cxr-report-generation-dataset...,Compressive atelectasis
25080,../input/curated-cxr-report-generation-dataset...,most likely reflecting atelectasis
13732,../input/curated-cxr-report-generation-dataset...,Mild bibasal atelectasis


In [20]:
atelectasis_df_final.to_csv('atelectasis.csv',index=False)
atelectasis_df_code.to_csv('atelectasis_captions.txt',index=False)

In [21]:
atelectasis_df_full=atelectasis_df_final[['path','Report Impression']]

In [22]:
atelectasis_df_full.to_csv('atelectasis.txt',index=False)

In [23]:
len(atelectasis_df_full)

22794

In [24]:
atelectasis_df_final.head()

Unnamed: 0,path,text,Report Impression
86678,../input/curated-cxr-report-generation-dataset...,Minimal patchy atelectasis,Left-sided Port-A-Cath tip terminates at the S...
16630,../input/curated-cxr-report-generation-dataset...,mild basilar atelectasis,Single portable AP upright chest radiograph de...
48462,../input/curated-cxr-report-generation-dataset...,adjacent atelectasis,Mild to moderate pulmonary edema and large rig...
31732,../input/curated-cxr-report-generation-dataset...,the right suggesting atelectasis,Endotracheal tube terminates 5.6 cm above the ...
49412,../input/curated-cxr-report-generation-dataset...,minimal linear bibasilar atelectasis,"Since a recent radiograph of ___, the patient ..."
