In [1]:
import pandas as pd
import numpy as np
import random

import nltk

import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import torch

In [2]:
from nltk.corpus import stopwords
stopwords_list = set(stopwords.words('english'))
#stopwords_list

In [3]:
#Reading the metadata csv file
meta_data = pd.read_csv('combined_cxr_metadata.csv')
meta_data.head()

Unnamed: 0,Index,filename,OpacityScoreGlobal,GeographicScoreGlobal,BrixiaScoreGlobal,OpacityScoreGlobalFromBrixia,Dataset,patientid,offset,sex,...,location,clinical_notes,other_notes,Manufacturer,PhotometricInterpretation,ConsensusTestset,Sex,AgeAtStudyDateFiveYear,TableMotion,TableAngle
0,0,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,1.3,2.3,,,1,2,0.0,M,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
1,1,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,3.0,4.0,,,1,2,3.0,M,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
2,2,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,3.0,4.7,,,1,2,5.0,M,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
3,3,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,3.7,5.7,,,1,2,6.0,M,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
4,4,nejmc2001573_f1a.jpeg,2.0,2.0,2.0,1.21,1,4,0.0,F,...,"Changhua Christian Hospital, Changhua City, Ta...",diffuse infiltrates in the bilateral lower lungs,,,,,,,,


In [4]:
meta_data.drop(['Index'], axis=1, inplace=True)

In [5]:
meta_data.columns

Index(['filename', 'OpacityScoreGlobal', 'GeographicScoreGlobal',
       'BrixiaScoreGlobal', 'OpacityScoreGlobalFromBrixia', 'Dataset',
       'patientid', 'offset', 'sex', 'age', 'finding', 'RT_PCR_positive',
       'survival', 'intubated', 'intubation_present', 'went_icu', 'in_icu',
       'needed_supplemental_O2', 'extubated', 'temperature', 'pO2_saturation',
       'leukocyte_count', 'neutrophil_count', 'lymphocyte_count', 'view',
       'modality', 'date', 'location', 'clinical_notes', 'other_notes',
       'Manufacturer', 'PhotometricInterpretation', 'ConsensusTestset', 'Sex',
       'AgeAtStudyDateFiveYear', 'TableMotion', 'TableAngle'],
      dtype='object')

In [6]:
#total entries
len(meta_data)

7162

In [7]:
#Nan values in different columns
meta_data.isna().sum()

filename                           0
OpacityScoreGlobal                 0
GeographicScoreGlobal           4695
BrixiaScoreGlobal               2402
OpacityScoreGlobalFromBrixia    7097
Dataset                            0
patientid                          1
offset                          7084
sex                             7086
age                             7094
finding                         7069
RT_PCR_positive                 7069
survival                        7127
intubated                       7138
intubation_present              7138
went_icu                        7137
in_icu                          7144
needed_supplemental_O2          7150
extubated                       7162
temperature                     7150
pO2_saturation                  7141
leukocyte_count                 7159
neutrophil_count                7161
lymphocyte_count                7160
view                            4172
modality                        2374
date                               1
l

In [8]:
print(f' total available clinical notes:  {len(meta_data) - meta_data.clinical_notes.isna().sum() } ')
print(f' total available other notes:  {len(meta_data) - meta_data.other_notes.isna().sum() } ')

 total available clinical notes:  161 
 total available other notes:  183 


In [9]:
# filling of NaN values with empty strings
meta_data['clinical_notes'] = meta_data['clinical_notes'].fillna('')
meta_data['other_notes'] = meta_data['other_notes'].fillna('')

In [10]:
meta_data['clinical_notes']

0       On January 22, 2020, a 65-year-old man with a ...
1       On January 22, 2020, a 65-year-old man with a ...
2       On January 22, 2020, a 65-year-old man with a ...
3       On January 22, 2020, a 65-year-old man with a ...
4        diffuse infiltrates in the bilateral lower lungs
                              ...                        
7157                                                     
7158                                                     
7159                                                     
7160                                                     
7161                                                     
Name: clinical_notes, Length: 7162, dtype: object

In [11]:
#checking whether NaN entries in clinical_notes and other_notes
print(meta_data['clinical_notes'].isna().sum())
print(meta_data['other_notes'].isna().sum())

0
0


## NLP Text Data Augumentation

Usually four methods have been used for text data augumentation:

synonym_replacement, random_deletion, random_swap, random_insertion

Random deletion performs very poorly on clinical notes and results non-useful entries after agumenatation. Not included in the present work. 

codes have been adapted from https://maelfabien.github.io/machinelearning/NLP_8/#data-augmentation-techniques

Note that I have performed text agumentation only on clinical notes.

In [12]:
from nltk.corpus import wordnet 

def get_synonyms(word):
    """
    Get synonyms of a word
    """
    synonyms = set()
    
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym) 
    
    if word in synonyms:
        synonyms.remove(word)
    
    return list(synonyms)

In [13]:
def synonym_replacement(words, n):
    '''
    Synonym Replacement: Randomly choose n words from the sentence that are not stop words. 
    Replace each of these words with one of its synonyms chosen at random.
    
    '''
    words = words.split()
    
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stopwords_list]))
    random.shuffle(random_word_list)
    num_replaced = 0
    
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        
        if num_replaced >= n: #only replace up to n words
            break

    sentence = ' '.join(new_words)

    return sentence

In [14]:
def random_deletion(words, p):
    '''
    Random Deletion: Randomly remove each word in the sentence with probability p.
    
    '''

    words = words.split()
    
    #obviously, if there's only one word, don't delete it
    if len(words) == 1:
        return words

    #randomly delete words with probability p
    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)

    #if you end up deleting all words, just return a random word
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    sentence = ' '.join(new_words)
    
    return sentence

In [15]:
def swap_word(new_words):
    
    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0
    
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        
        if counter > 3:
            return new_words
    
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 
    return new_words

def random_swap(words, n):
    
    '''
    Random Swap: Randomly choose two words in the sentence and swap their positions. Do this n times.
    
    '''
    
    words = words.split()
    new_words = words.copy()
    
    for _ in range(n):
        new_words = swap_word(new_words)
        
    sentence = ' '.join(new_words)
    
    return sentence

In [16]:
def random_insertion(words, n):
    '''
    Random Insertion: Find a random synonym of a random word in the sentence that is not a stop word. 
    Insert that synonym into a random position in the sentence. Do this n times.
    
    '''
    
    words = words.split()
    new_words = words.copy()
    
    for _ in range(n):
        add_word(new_words)
        
    sentence = ' '.join(new_words)
    return sentence

def add_word(new_words):
    
    synonyms = []
    counter = 0
    
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
        
    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

In [17]:
meta_data

Unnamed: 0,filename,OpacityScoreGlobal,GeographicScoreGlobal,BrixiaScoreGlobal,OpacityScoreGlobalFromBrixia,Dataset,patientid,offset,sex,age,...,location,clinical_notes,other_notes,Manufacturer,PhotometricInterpretation,ConsensusTestset,Sex,AgeAtStudyDateFiveYear,TableMotion,TableAngle
0,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,1.30,2.3,,,1,2,0.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
1,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,3.00,4.0,,,1,2,3.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
2,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,3.00,4.7,,,1,2,5.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
3,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,3.70,5.7,,,1,2,6.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
4,nejmc2001573_f1a.jpeg,2.00,2.0,2.0,1.21,1,4,0.0,F,52.0,...,"Changhua Christian Hospital, Changhua City, Ta...",diffuse infiltrates in the bilateral lower lungs,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7157,10409101678672828001.jpg,0.93,,1.0,,2,P1071,,,,...,A.O. SPEDALI CIVILI - BRESCIA,,,SIEMENS,MONOCHROME2,0.0,M,10.0,STATIC,0.0
7158,282205201992683717.jpg,3.42,,10.0,,2,P1071,,,,...,SPEDALI CIVILI BRESCIA,,,CARESTREAM HEALTH,MONOCHROME1,0.0,M,10.0,,
7159,9504330565338059035.jpg,4.80,,15.0,,2,P0193,,,,...,Spedali Civili di Brescia,,,SIEMENS,MONOCHROME2,0.0,M,14.0,STATIC,0.0
7160,8824145367570793429.jpg,5.07,,16.0,,2,P0193,,,,...,SPEDALI CIVILI BRESCIA,,,CARESTREAM HEALTH,MONOCHROME1,0.0,M,14.0,,


In [18]:
# Some analysis on clinical notes such as empty entry, total words 
a = 0
clinical_notes_tot_words = []
for index, row in meta_data.iterrows():
    #print(index, row)
    #print(index, row['clinical_notes'])
    row_entry  = row['clinical_notes']
    if len(row_entry) ==0:
        a = a+1
    else:
        tot_words = len(row_entry.split())
        clinical_notes_tot_words.append(tot_words)
        if tot_words < 3:
            print(index,row_entry)
print(a)
print(clinical_notes_tot_words)
print('\n')

print(sorted(clinical_notes_tot_words))

104 ?effusions/atelectasis bilaterally
105 ?atelectais/effusions?
212 ?bibasilar atelectasis
514 pacer pads
627 ?pleural effsions
722 Bibasilar pneumonia
1224 rotated
1227 ?pleural effusions
1467 ABDOMEN
1541 pp
1702 Abdomen
1757 Abdomen
1787 Abdomen
1827 Abdomen
7001
[125, 160, 160, 113, 7, 15, 93, 93, 139, 75, 31, 33, 35, 48, 67, 67, 67, 55, 29, 15, 46, 17, 33, 27, 24, 21, 7, 45, 45, 53, 120, 79, 65, 68, 54, 69, 67, 109, 37, 32, 53, 9, 50, 107, 107, 36, 36, 55, 55, 19, 38, 20, 13, 15, 37, 37, 101, 37, 28, 86, 86, 7, 80, 102, 111, 252, 88, 28, 23, 25, 39, 62, 6, 28, 50, 57, 62, 30, 39, 31, 46, 35, 23, 18, 19, 24, 23, 20, 21, 18, 55, 146, 7, 8, 9, 9, 2, 1, 8, 5, 4, 4, 6, 3, 2, 4, 3, 3, 6, 5, 4, 6, 6, 3, 6, 3, 2, 3, 2, 9, 7, 2, 3, 7, 7, 7, 13, 8, 6, 7, 9, 3, 4, 4, 6, 3, 4, 4, 3, 5, 5, 1, 2, 3, 7, 4, 11, 1, 1, 10, 1, 3, 4, 5, 3, 3, 1, 1, 1, 4, 3]


[1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,

In [19]:
meta_data

Unnamed: 0,filename,OpacityScoreGlobal,GeographicScoreGlobal,BrixiaScoreGlobal,OpacityScoreGlobalFromBrixia,Dataset,patientid,offset,sex,age,...,location,clinical_notes,other_notes,Manufacturer,PhotometricInterpretation,ConsensusTestset,Sex,AgeAtStudyDateFiveYear,TableMotion,TableAngle
0,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,1.30,2.3,,,1,2,0.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
1,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,3.00,4.0,,,1,2,3.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
2,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,3.00,4.7,,,1,2,5.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
3,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,3.70,5.7,,,1,2,6.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
4,nejmc2001573_f1a.jpeg,2.00,2.0,2.0,1.21,1,4,0.0,F,52.0,...,"Changhua Christian Hospital, Changhua City, Ta...",diffuse infiltrates in the bilateral lower lungs,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7157,10409101678672828001.jpg,0.93,,1.0,,2,P1071,,,,...,A.O. SPEDALI CIVILI - BRESCIA,,,SIEMENS,MONOCHROME2,0.0,M,10.0,STATIC,0.0
7158,282205201992683717.jpg,3.42,,10.0,,2,P1071,,,,...,SPEDALI CIVILI BRESCIA,,,CARESTREAM HEALTH,MONOCHROME1,0.0,M,10.0,,
7159,9504330565338059035.jpg,4.80,,15.0,,2,P0193,,,,...,Spedali Civili di Brescia,,,SIEMENS,MONOCHROME2,0.0,M,14.0,STATIC,0.0
7160,8824145367570793429.jpg,5.07,,16.0,,2,P0193,,,,...,SPEDALI CIVILI BRESCIA,,,CARESTREAM HEALTH,MONOCHROME1,0.0,M,14.0,,


### Data augumentation steps

n_iter is the variable decides the number of loops for each type of data augumenation 

if the total words in the clincal notes is between 2 and 5, total agumenations will be 2; between 5 and 10, three augumeation with 2, 4, 6 words; between 10 and 25, three augumenation with 2, 4, 8 words and above 25, 5 augumnenations with 4, 8, 12, 16, 20 words.

Each row with the clinical notes (without empty strings) appended to the end of the dataframe with augumented clinical notes. Only clinical notes will be changed for these rows. 

Each iteration generates 585 augumented texts. Total augumented texts from three different augumentation process will be 1755. 

Random deletion performs very poorly on clinical notes and results non-useful entries after agumenatation. Not included in the present work.


In [20]:
n_iter = 1 # Change this variable for more augumenation.

In [21]:
def append_random_swap_augtxt(index, row_entry, q, new_df):
    x1=meta_data.loc[[index],:]
    augmented_text = random_swap(row_entry, q)
    x1.loc[index,'clinical_notes'] = augmented_text
    new_df = pd.concat([new_df,x1], ignore_index=True)
    #print(index,row_entry)
    #print(augmented_text)
    #print('\n')
    return new_df
   
new_df = meta_data.copy()
count_a, count_b, count_c, count_d = 0, 0, 0, 0


for i in range(n_iter):
    for index, row in meta_data.iterrows():
        row_entry  = row['clinical_notes']
    
        if len(row_entry) !=0:
            tot_words = len(row_entry.split())
        
            if tot_words > 1 and tot_words < 5:
                count_a += 1
                new_df = append_random_swap_augtxt(index, row_entry, 2, new_df)
            
            elif tot_words > 5 and tot_words < 10:
                count_b += 1
                new_df = append_random_swap_augtxt(index, row_entry, 2, new_df)
                new_df = append_random_swap_augtxt(index, row_entry, 4, new_df)
                new_df = append_random_swap_augtxt(index, row_entry, 6, new_df)

            elif tot_words > 10 and tot_words < 25:
                count_c += 1
                new_df = append_random_swap_augtxt(index, row_entry, 2, new_df)
                new_df = append_random_swap_augtxt(index, row_entry, 4, new_df)
                new_df = append_random_swap_augtxt(index, row_entry, 8, new_df)
            
            else:
                count_d += 1
                new_df = append_random_swap_augtxt(index, row_entry, 4, new_df)
                new_df = append_random_swap_augtxt(index, row_entry, 8, new_df)
                new_df = append_random_swap_augtxt(index, row_entry, 12, new_df)
                new_df = append_random_swap_augtxt(index, row_entry, 16, new_df)
                new_df = append_random_swap_augtxt(index, row_entry, 20, new_df)
            
print(count_a, count_b, count_c, count_d)
total = count_a + 3 * count_b + 3 * count_c + 5 * count_d
print("total augumentations:", total)

32 26 20 83
total augumentations: 585


In [22]:
#checking the total length of the dataframe after adding agumented text.
len(new_df)

7747

In [23]:
new_df['clinical_notes']

0       On January 22, 2020, a 65-year-old man with a ...
1       On January 22, 2020, a 65-year-old man with a ...
2       On January 22, 2020, a 65-year-old man with a ...
3       On January 22, 2020, a 65-year-old man with a ...
4        diffuse infiltrates in the bilateral lower lungs
                              ...                        
7742                                              Abdomen
7743                                              Abdomen
7744                                              Abdomen
7745                             included lungs upper not
7746                             left basilar atelectasis
Name: clinical_notes, Length: 7747, dtype: object

In [24]:
def append_synonym_replace_augtxt(index, row_entry, q, new_df):
    x1=meta_data.loc[[index],:]
    augmented_text = synonym_replacement(row_entry, q)
    x1.loc[index,'clinical_notes'] = augmented_text
    new_df = pd.concat([new_df,x1], ignore_index=True)
    #print(index,row_entry)
    #print(augmented_text)
    #print('\n')
    return new_df
   
count_a, count_b, count_c, count_d = 0, 0, 0, 0

for i in range(n_iter):
    for index, row in meta_data.iterrows():
        row_entry  = row['clinical_notes']
    
        if len(row_entry) !=0:
            tot_words = len(row_entry.split())
        
            if tot_words > 1 and tot_words < 5:
                count_a += 1
                new_df = append_synonym_replace_augtxt(index, row_entry, 2, new_df)
            
            elif tot_words > 5 and tot_words < 10:
                count_b += 1
                new_df = append_synonym_replace_augtxt(index, row_entry, 2, new_df)
                new_df = append_synonym_replace_augtxt(index, row_entry, 4, new_df)
                new_df = append_synonym_replace_augtxt(index, row_entry, 6, new_df)

            elif tot_words > 10 and tot_words < 25:
                count_c += 1
                new_df = append_synonym_replace_augtxt(index, row_entry, 2, new_df)
                new_df = append_synonym_replace_augtxt(index, row_entry, 4, new_df)
                new_df = append_synonym_replace_augtxt(index, row_entry, 8, new_df)
            
            else:
                count_d += 1
                new_df = append_synonym_replace_augtxt(index, row_entry, 4, new_df)
                new_df = append_synonym_replace_augtxt(index, row_entry, 8, new_df)
                new_df = append_synonym_replace_augtxt(index, row_entry, 12, new_df)
                new_df = append_synonym_replace_augtxt(index, row_entry, 16, new_df)
                new_df = append_synonym_replace_augtxt(index, row_entry, 20, new_df)
            
print(count_a, count_b, count_c, count_d)
total = count_a + 3 * count_b + 3 * count_c + 5 * count_d
print("total augumentations:", total)

32 26 20 83
total augumentations: 585


In [25]:
len(new_df)

8332

In [26]:
def append_random_insertion_augtxt(index, row_entry, q, new_df):
    x1=meta_data.loc[[index],:]
    augmented_text = random_insertion(row_entry, q)
    x1.loc[index,'clinical_notes'] = augmented_text
    new_df = pd.concat([new_df,x1], ignore_index=True)
    #print(index,row_entry)
    #print(augmented_text)
    #print('\n')
    return new_df
   
count_a, count_b, count_c, count_d = 0, 0, 0, 0


for i in range(n_iter):
    for index, row in meta_data.iterrows():
        row_entry  = row['clinical_notes']
    
        if len(row_entry) !=0:
            tot_words = len(row_entry.split())
        
            if tot_words > 1 and tot_words < 5:
                count_a += 1
                new_df = append_random_insertion_augtxt(index, row_entry, 2, new_df)
            
            elif tot_words > 5 and tot_words < 10:
                count_b += 1
                new_df = append_random_insertion_augtxt(index, row_entry, 2, new_df)
                new_df = append_random_insertion_augtxt(index, row_entry, 4, new_df)
                new_df = append_random_insertion_augtxt(index, row_entry, 6, new_df)

            elif tot_words > 10 and tot_words < 25:
                count_c += 1
                new_df = append_random_insertion_augtxt(index, row_entry, 2, new_df)
                new_df = append_random_insertion_augtxt(index, row_entry, 4, new_df)
                new_df = append_random_insertion_augtxt(index, row_entry, 8, new_df)
            
            else:
                count_d += 1
                new_df = append_random_insertion_augtxt(index, row_entry, 4, new_df)
                new_df = append_random_insertion_augtxt(index, row_entry, 8, new_df)
                new_df = append_random_insertion_augtxt(index, row_entry, 12, new_df)
                new_df = append_random_insertion_augtxt(index, row_entry, 16, new_df)
                new_df = append_random_insertion_augtxt(index, row_entry, 20, new_df)

print(count_a, count_b, count_c, count_d)
total = count_a + 3 * count_b + 3 * count_c + 5 * count_d
print("total augumentations:", total)

32 26 20 83
total augumentations: 585


In [27]:
len(new_df)

8917

In [28]:
#new_df with appended agumented texts. 
new_df

Unnamed: 0,filename,OpacityScoreGlobal,GeographicScoreGlobal,BrixiaScoreGlobal,OpacityScoreGlobalFromBrixia,Dataset,patientid,offset,sex,age,...,location,clinical_notes,other_notes,Manufacturer,PhotometricInterpretation,ConsensusTestset,Sex,AgeAtStudyDateFiveYear,TableMotion,TableAngle
0,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,1.3,2.3,,,1,2,0.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
1,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,3.0,4.0,,,1,2,3.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
2,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,3.0,4.7,,,1,2,5.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
3,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,3.7,5.7,,,1,2,6.0,M,65.0,...,"Cho Ray Hospital, Ho Chi Minh City, Vietnam","On January 22, 2020, a 65-year-old man with a ...",,,,,,,,
4,nejmc2001573_f1a.jpeg,2.0,2.0,2.0,1.21,1,4,0.0,F,52.0,...,"Changhua Christian Hospital, Changhua City, Ta...",diffuse infiltrates in the bilateral lower lungs,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8912,ralo_sbm_1733.jpg,2.0,3.5,,,3,S12864,,,,...,,stomach stand stomach venter stomach support s...,,,,,,,,
8913,ralo_sbm_1733.jpg,2.0,3.5,,,3,S12864,,,,...,,support hold up support stomach stand stomach ...,,,,,,,,
8914,ralo_sbm_1733.jpg,2.0,3.5,,,3,S12864,,,,...,,hold up hold up stand support stomach support ...,,,,,,,,
8915,ralo_sbm_1755.jpg,6.0,8.0,,,3,S12902,,,,...,,upper non lungs non not included,NGT,,,,,,,


## Text Preprocessing

In [29]:
#preprocess functions
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = text.lower() 
    text=  text.strip()  
    text=  re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text= re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # Removes url
    text = re.compile(r'<[^>]+>').sub('', text) #Removes HTML tags:
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text) # Single character removal
    text = re.sub(r'\s+', ' ', text) # Remove multiple spaces
    return text

# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

def finalpreprocess(string):
    final_preprocessed_text = lemmatizer(stopword(preprocess(string)))
    return final_preprocessed_text

In [30]:
df = new_df.copy()
df['preprocessed_clinical_notes']=df['clinical_notes'].map(lambda s:finalpreprocess(s)) 
#df['preprocessed_other_notes']=meta_data['other_notes'].map(lambda s:finalpreprocess(s)) 
df

Unnamed: 0,filename,OpacityScoreGlobal,GeographicScoreGlobal,BrixiaScoreGlobal,OpacityScoreGlobalFromBrixia,Dataset,patientid,offset,sex,age,...,clinical_notes,other_notes,Manufacturer,PhotometricInterpretation,ConsensusTestset,Sex,AgeAtStudyDateFiveYear,TableMotion,TableAngle,preprocessed_clinical_notes
0,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,1.3,2.3,,,1,2,0.0,M,65.0,...,"On January 22, 2020, a 65-year-old man with a ...",,,,,,,,,january year old man history hypertension type...
1,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,3.0,4.0,,,1,2,3.0,M,65.0,...,"On January 22, 2020, a 65-year-old man with a ...",,,,,,,,,january year old man history hypertension type...
2,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,3.0,4.7,,,1,2,5.0,M,65.0,...,"On January 22, 2020, a 65-year-old man with a ...",,,,,,,,,january year old man history hypertension type...
3,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,3.7,5.7,,,1,2,6.0,M,65.0,...,"On January 22, 2020, a 65-year-old man with a ...",,,,,,,,,january year old man history hypertension type...
4,nejmc2001573_f1a.jpeg,2.0,2.0,2.0,1.21,1,4,0.0,F,52.0,...,diffuse infiltrates in the bilateral lower lungs,,,,,,,,,diffuse infiltrate bilateral low lung
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8912,ralo_sbm_1733.jpg,2.0,3.5,,,3,S12864,,,,...,stomach stand stomach venter stomach support s...,,,,,,,,,stomach stand stomach venter stomach support s...
8913,ralo_sbm_1733.jpg,2.0,3.5,,,3,S12864,,,,...,support hold up support stomach stand stomach ...,,,,,,,,,support hold support stomach stand stomach ven...
8914,ralo_sbm_1733.jpg,2.0,3.5,,,3,S12864,,,,...,hold up hold up stand support stomach support ...,,,,,,,,,hold hold stand support stomach support venter...
8915,ralo_sbm_1755.jpg,6.0,8.0,,,3,S12902,,,,...,upper non lungs non not included,NGT,,,,,,,,upper non lung non include


In [31]:
X_train = df['preprocessed_clinical_notes']
X_train

0       january year old man history hypertension type...
1       january year old man history hypertension type...
2       january year old man history hypertension type...
3       january year old man history hypertension type...
4                   diffuse infiltrate bilateral low lung
                              ...                        
8912    stomach stand stomach venter stomach support s...
8913    support hold support stomach stand stomach ven...
8914    hold hold stand support stomach support venter...
8915                           upper non lung non include
8916           basilar basilary leave basilar atelectasis
Name: preprocessed_clinical_notes, Length: 8917, dtype: object

### Text Feature Extraction using CountVectorizer, TfidfVectorizer

The authors of the paper used the BoW model by citing the importance given to specific keywords in the clinical notes written by the nurse/doctor that can help identify symptoms of COVID-19. Since the main focus is on those keywords and it’s frequency in the text, BoW model has been used instead of RNNs and other NLP models. Hence, count vectorizer and tfidf vectorizer have been generated

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

count_vectors = CountVectorizer(max_features=10000, ngram_range=(1,3))
tfidf_vectors = TfidfVectorizer(max_features=10000, ngram_range=(1,3))

X_train_vectors_count = count_vectors.fit_transform(X_train)
X_train_vectors_tfidf = tfidf_vectors.fit_transform(X_train) 


In [33]:
type(X_train_vectors_tfidf)

scipy.sparse.csr.csr_matrix

In [34]:
X_train_vectors_tfidf_narray = X_train_vectors_tfidf.toarray()
print(X_train_vectors_tfidf_narray .shape)
tensor_X_train_vectors_tfidf_narray  = torch.from_numpy(X_train_vectors_tfidf_narray )
print(tensor_X_train_vectors_tfidf_narray.shape)
tensor_X_train_vectors_tfidf_narray

(8917, 10000)
torch.Size([8917, 10000])


tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0691, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64)

In [35]:
X_train_vectors_count_narray = X_train_vectors_count.toarray()
print(X_train_vectors_count_narray .shape)
tensor_X_train_vectors_count_narray  = torch.from_numpy(X_train_vectors_count_narray )
print(tensor_X_train_vectors_count_narray.shape)
tensor_X_train_vectors_count_narray

(8917, 10000)
torch.Size([8917, 10000])


tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])