# Index of contents 

1. [Pre-process the reports](#1.-Pre-process-the-reports) <br>
2. [Ngram analysis](#2.-Ngram-analysis)<br>
3. [Searching for a keyword](#3.-Searching-for-a-keyword)

# Steps

1. Prepare the reports to be processed into a csv file with 2 columns - `caseID` and `Report text` and save as `ReportText.csv`
2. Pre-process them to remove the stop words, split them into lines and save the `mapping`
3. Perform ngram analysis to understand the most significant words used in the reports
4. Use ngram analysis results to find the keyword of the abnormality you want
5. Understand the negations and their usage in the reports from the ngram analysis
6. Use the keyword and its negations to find the reports related to the abnormality

# 1. Pre-process the reports

In [1]:
import numpy as np
import pandas as pd

def split_report_text_to_lines(txt, para_splitter='\r\n', line_splitter='.'):
    """Split a raw report into its non-blank lines.

    The text is cut into paragraphs on ``para_splitter`` and every paragraph
    is then cut into lines on ``line_splitter``. Lines that are empty or
    consist only of whitespace are dropped; all other lines are returned
    as-is (leading/trailing whitespace is kept).

    Args:
        txt: Report text. Any non-string value (e.g. ``None`` or the NaN
            that pandas produces for an empty cell) is treated as an empty
            report.
        para_splitter: Separator between paragraphs.
        line_splitter: Separator between lines inside a paragraph.

    Returns:
        list[str]: The non-blank lines in their original order.
    """
    # An empty CSV cell arrives as a float NaN, which has no .split();
    # report it as "no lines" instead of raising AttributeError.
    if not isinstance(txt, str):
        return []

    # split the paragraphs to lines
    lines = []
    for para in txt.split(para_splitter):
        lines.extend(para.split(line_splitter))

    # remove every blank line, not just the first ' ' entry
    return [line for line in lines if line.strip()]

# Read the reports table (switch to pd.read_excel when the input is an Excel file).
# The first column holds the case IDs, the second one the report text.
reports = pd.read_csv('ReportText.csv')
patient_ids = np.array(reports.iloc[:, 0])
report_texts = np.array(reports.iloc[:, 1])

# Build {case ID -> list of lower-cased report lines}
mapping = {}
for pat_id, raw_text in zip(patient_ids, report_texts):
    report_lines = split_report_text_to_lines(raw_text)

    # Reports with at most one line are treated as empty and left out
    if len(report_lines) <= 1:
        continue

    # Drop the first (unnecessary) line and lower-case everything else
    mapping[pat_id] = [line.lower() for line in report_lines[1:]]

# np.save stores the dict as a pickled object inside the .npy file
np.save('mapping.npy', mapping)

# 2. Ngram analysis

In [2]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
    
# Tokens that carry no signal for the n-gram analysis: laterality/location
# words, report section headers, size qualifiers and punctuation tokens
# ('``' is how word_tokenize renders an opening double quote).
_WORDS_TO_REMOVE = frozenset([
    'left', 'right', 'middle', 'lobe', 'lobes',
    'upper', 'lower', 'bilateral',
    '(', ')', ':', ';', '>', '<', '``', '&',
    'results', 'impression', 'adv', 'conclusion', 'seen', 'small', 'large',
    'mediastinum', 'mediastinal', 'trachea', 'bronchi', 'main', 'taken',
])

# English stop words; read from the NLTK corpus on first use only, because
# remove_words is called once per report line.
_STOP_WORDS = None


# Remove the unnecessary words from a line
def remove_words(l):
    """Tokenize a line and drop stop words and uninformative tokens.

    Filtering is done on whole tokens. Removing the words with
    ``str.replace`` on the raw line would also cut them out of longer
    words (e.g. 'large' out of 'enlarged', 'bronchi' out of
    'bronchiectasis') and distort the n-gram counts.

    Args:
        l: One report line (the comparison is case-sensitive; the removal
            list and the stop words are lower-case).

    Returns:
        list[str]: The remaining tokens in their original order.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))

    # tokenize the words
    words = word_tokenize(l)

    # remove stop words and the unnecessary words
    return [
        w for w in words
        if w not in _WORDS_TO_REMOVE and w not in _STOP_WORDS
    ]

# Return uni, bi, tri and quad grams
def get_ngrams(lines):
    """Count the 1- to 4-grams over a collection of report lines.

    Every line has its commas replaced by spaces, is reduced to its
    informative tokens with ``remove_words`` and then contributes its
    n-grams to the running totals. N-grams never span two lines.

    Args:
        lines: Iterable of report lines (strings).

    Returns:
        tuple[Counter, Counter, Counter, Counter]: Frequencies of the
        unigrams, bigrams, trigrams and quadgrams, in that order. Keys are
        tuples of tokens.
    """
    # Initiate the counters
    ug_frequencies, bg_frequencies = Counter(), Counter()
    tg_frequencies, qg_frequencies = Counter(), Counter()

    # find ngrams for each line and add them to the totals
    for l in lines:
        # remove unnecessary words
        words = remove_words(l.replace(',', ' '))

        # Counter.update only touches the n-grams of this line, whereas
        # `counter += Counter(...)` also sweeps the whole accumulated
        # counter for non-positive entries on every line.
        ug_frequencies.update(ngrams(words, 1))
        bg_frequencies.update(ngrams(words, 2))
        tg_frequencies.update(ngrams(words, 3))
        qg_frequencies.update(ngrams(words, 4))

    return ug_frequencies, bg_frequencies, tg_frequencies, qg_frequencies

In [15]:
# load the preprocessed mapping
# NOTE: allow_pickle=True unpickles the file content; only load a
# mapping.npy that you produced yourself.
mapping = np.load('mapping.npy', allow_pickle=True)[()]

# collate lines of all reports
all_lines = []
for patient_id in mapping:
    all_lines += mapping[patient_id]

unigram_freq, bigram_freq, trigram_freq, quadgram_freq = get_ngrams(all_lines)

# Number of most frequent n-grams of each order to export
TOP_N = 100

# Build one pair of columns (n-gram, frequency) per n-gram order.
# The (ngram_tuple, count) pairs are ragged, so converting them with
# np.array raises ValueError on NumPy >= 1.24. Plain object Series avoid
# that, and pandas pads shorter columns with NaN when an order has fewer
# than TOP_N distinct n-grams (or none at all).
columns = {}
for label, frequencies in (('Unigram', unigram_freq),
                           ('Bigram', bigram_freq),
                           ('Trigram', trigram_freq),
                           ('Quadgram', quadgram_freq)):
    top = frequencies.most_common(TOP_N)
    columns[label + 's'] = pd.Series(
        [gram for gram, _ in top], dtype=object)
    columns[label + ' frequencies'] = pd.Series(
        [count for _, count in top], dtype=object)

df = pd.DataFrame(columns)
# Fix the column order explicitly
df = df[['Unigrams','Unigram frequencies','Bigrams','Bigram frequencies',
         'Trigrams','Trigram frequencies','Quadgrams','Quadgram frequencies']]
df.to_excel('ngramAnalysis.xlsx', index=False)

Open `ngramAnalysis.xlsx` and study the occurrence of the various keywords, their negations, etc.

# 3. Searching for a keyword 

In [3]:
def check_presence(lines, ORsearchTags, negations, ANDsearchTags = None):
    """Tell whether any line positively mentions the searched finding.

    A line counts as a hit when it
      * contains at least one of ``ORsearchTags``,
      * contains every one of ``ANDsearchTags`` (when given), and
      * contains none of the ``negations``.
    All checks are plain substring tests on the line.

    Args:
        lines: Lines of one report.
        ORsearchTags: Substrings of which at least one must occur.
        negations: Substrings that disqualify a line.
        ANDsearchTags: Optional substrings that must all occur.

    Returns:
        bool: True as soon as one line qualifies, otherwise False.
    """
    for text in lines:
        # OR condition: at least one tag has to be there
        if not any(tag in text for tag in ORsearchTags):
            continue

        # AND condition: every required tag has to be there
        if ANDsearchTags is not None:
            if not all(tag in text for tag in ANDsearchTags):
                continue

        # A negation anywhere in the line cancels the match
        if any(neg in text for neg in negations):
            continue

        # First qualifying line settles the answer
        return True

    return False

In [4]:
# Reload the {case ID -> report lines} dict written by the pre-processing step
mapping = np.load("mapping.npy", allow_pickle=True).item()

# Words that turn a mention into a negative finding (padded with spaces so
# that only whole words match)
negations = [" no ", " not ", " normal ", " unremarkable ",
            " clear ", " absent ", " absence "]

# A report is counted as fibrosis when at least one of these searches hits
fibrosis_criteria = [
    # ground glass haze
    {"ORsearchTags": ["patchy", "haz"], "ANDsearchTags": ["ground glass"]},
    # traction bronchiectasis
    {"ORsearchTags": ["traction", "dilatation"], "ANDsearchTags": ["bronchi"]},
    # reticulations
    {"ORsearchTags": ["reticula"]},
    # direct reference to fibrosis
    {"ORsearchTags": ["fibrotic", "fibrosis"]},
]

fibrosis_cases = [
    pat_id
    for pat_id, report_lines in mapping.items()
    if any(
        check_presence(report_lines, negations=negations, **criterion)
        for criterion in fibrosis_criteria
    )
]
print("Number of cases with fibrosis - ", len(fibrosis_cases))

Number of cases with fibrosis -  2631


In [5]:
# Write the IDs of the fibrosis cases as a single-column CSV
df = pd.DataFrame(fibrosis_cases, columns=["Patient ID's with fibrosis"])
df.to_csv('fibrosis_IDs.csv', index=False)