In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load radiology report dataset from Kaggle
df = pd.read_csv('ReportsDATASET.csv')

In [4]:
# Preprocessing
def preprocess_text(text):
    # Tokenization
    sentences = sent_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence)
        # Removing stopwords, special characters, and numerical data
        words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha()]
        preprocessed_sentences.append(' '.join(words))
    return ' '.join(preprocessed_sentences)

In [5]:
print(df.columns)

Index(['Text'], dtype='object')


In [6]:
# Apply preprocessing to radiology report
df['clean_text'] = df['Text'].apply(preprocess_text)

In [7]:
# Extract TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

In [8]:
# Compute pairwise cosine similarity
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [9]:
# TextRank for Extractive Summarization
def textrank_summary(text, num_sentences=3):
    from summa import summarizer
    return summarizer.summarize(text, words=num_sentences)

In [10]:
# Generate summaries for radiology reports using TextRank
df['summary'] = df['Text'].apply(lambda x: textrank_summary(x))

# # Save the summaries to a text file
# df['summary'].to_csv('ReportsDATASET.txt', index=False, header=None, sep='\n')


In [11]:
!pip install summa

Defaulting to user installation because normal site-packages is not writeable


In [12]:
# evaluationg through rogue evaluation metric
def evaluate_summaries(reference_summaries, generated_summaries):
    rouge = Rouge()
    scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)
    return scores

In [13]:
def evaluate_summaries(reference_summaries, generated_summaries):
    rouge = Rouge()
    scores = rouge.get_scores(generated_summaries, reference_summaries)
    avg_scores = {k: sum(v) / len(v) for k, v in scores.items()}
    return avg_scores


In [14]:
def evaluate_summaries(reference_summaries, generated_summaries):
    rouge = Rouge()
    scores = []
    for hyp, ref in zip(generated_summaries, reference_summaries):
        try:
            score = rouge.get_scores(hyp, ref)
            scores.append(score)
        except ValueError:
            print(f"Error calculating ROUGE score for hypothesis '{hyp}' and reference '{ref}'")
            scores.append(None)
    avg_scores = {k: sum(v) / len(v) for k, v in scores.items() if v is not None}
    return avg_scores

In [16]:
# Save summarized dataset
# df.to_csv('reportsdataset_reports.csv', index=False)
df.to_csv('summarized_reports.csv', index=False)




In [17]:
 # Load summarized dataset from file
# df = pd.read_csv('reportsdataset_reports.csv')
df = pd.read_csv('summarized_reports.csv')
#  # Remove rows with NaN values in the 'summary' column
# df = df.dropna(subset=['summary'])

# Print summarized radiology reports
print(df.head())

                                                Text  \
0  \nChest PA-Lat XR\n\nImaging Study\nXray Chest...   
1  EXAM(S): Chest, 2 views, frontal and lateral\n...   
2  \nExam\nXray Chest PA and Lateral\n\nDate\nXXX...   
3  \nRADIOLOGY REPORT\n\nExamination\nPA and late...   
4  \nChest PA-Lat XR\n\nImaging Study\nXray Chest...   

                                          clean_text  \
0  chest xr imaging study xray chest pa and later...   
1  exam s chest view frontal and lateral date xxx...   
2  exam xray chest pa and lateral date xxxx histo...   
3  radiology report examination pa and lateral vi...   
4  chest xr imaging study xray chest pa and later...   

                        summary  
0          Normal chest x-XXXX.  
1  No acute pulmonary findings.  
2                           NaN  
3                           NaN  
4     Xray Chest PA and Lateral  


In [19]:
import pandas as pd

# Load summarized dataset from file
df = pd.read_csv('summarized_reports.csv')

# # Remove rows with NaN values in the 'summary' column
# df = df.dropna(subset=['summary'])

# Print summarized radiology reports
print(df['summary'])

0                    Normal chest x-XXXX.
1            No acute pulmonary findings.
2                                     NaN
3                                     NaN
4               Xray Chest PA and Lateral
                      ...                
1979                                  NaN
1980                                  NaN
1981                                  NaN
1982    Possible right upper lobe cavity.
1983                                 XXXX
Name: summary, Length: 1984, dtype: object
