## Pre-processing

The pre-processing steps include:
1. Select those pubmed records for which citation information is available
2. Tokenize abstracts and keywords using NLTK
3. Detect acronyms and replace them with full form
4. Trigram transformation
5. Part of speech (POS) tagging, lemmatization, stop-words removal using NLTK

In [1]:
from Bio import Medline
import nltk.data
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import gensim
import numpy as np
import nltk
import re
import pandas as pd
%matplotlib inline
from IPython.core.pylabtools import figsize
from matplotlib import pyplot as plt
figsize(11, 9)

#### Load in the data

In [2]:
fin = open('SWI_QSM_papers.txt','r')
records = Medline.parse(fin)
papers=list(records)
print('Found {0} paper records.'.format(len(papers)))
fin.close()
abstract = papers[0]['AB']

Found 945 paper records.


#### Extract the citation information
Create query keys using the first 7 words in title + year of publication. Note that sometimes, for the same paper, the year may appear to be different in Pubmed and google scholar search results. When extracting the citation information, we need to try the keys with year+/-1. 

In [3]:
citation_data=pd.read_pickle('CitationData.pickle')

In [4]:
#tokenizer using regular expression for citation extraction
tokenizer_word_reg = RegexpTokenizer("[\w]+")

In [5]:
#citation data
citation=dict()
year = dict()
for i in range(citation_data.shape[0]):
    temp = tokenizer_word_reg.tokenize(citation_data.iloc[i,:]['title'].lower().replace('-',' '))    
    key = '_'.join(temp[0:7]) if len(temp)>=7 else '_'.join(temp)
    key = key + '_' + str(citation_data.iloc[i,:]['year'])
    citation[key]=citation_data.iloc[i,:]['num_citations']

#### Tokenize the title, keywords and abstracts

In [6]:
def acronym_detection(text, delimiter='_', option=1):
    # function for detection potential acronyms. 
    # output: dictionary containing {acronym: full-form}
    # text are assumed to be tokenized already: [tokens]
    if option==2: #keywords
        text_temp = []
        for word in text:
            if ',' in word or '(' in word:
                word = nltk.word_tokenize(word)
                for w in word:
                    text_temp.append(w)
            else:
                text_temp.append(word)
        text = text_temp
    N= len(text) 
    terms=dict()
    for ind, word in enumerate(text):
        if word.isupper(): 
            m = len(word)
            if ind>=m+1:
                words_before = text[(ind- m -1):(ind-1)]
                a1 = [w[0].upper() for w in words_before]
                a1 = ''.join(a1)
                if a1==word:
                    terms[a1]=delimiter.join([w.lower() for w in words_before])
            if ind<=N-2-m:
                words_after = text[(ind+2):(ind+2+m)]
                a2 = [w[0].upper() for w in words_after]
                a2 = ''.join(a2)
                if a2==word:
                    terms[a2]=delimiter.join([w.lower() for w in words_after])
    return terms

Keywords may contain acronyms. The acronyms will be detected and subsequently removed, with the full form joined by '_'. 

In [7]:
def keyword_tokenizer(keywords, deliminator='_'):
    temp = keywords
    for j, kw in enumerate(temp):
        if ',' in kw:
            kw1=kw.split(',')
            for kw11 in kw1:
                if kw11.islower():
                    kw = kw11.strip().replace(' ', deliminator) if ' ' in kw11 else kw11
                    break
        kw = re.sub(r'\([^)]*\)', '', kw).strip().replace('-',' ').replace(' ', deliminator) #remove contents in parentheses
        temp[j] = kw.lower()
    return temp

In [8]:
N=len(papers)
texts = [] #abstract tokens
tokens_t = [] #titles token
tokens_k = [] #keywords token
num_citation = [] 
year = []
exclusion_list = [] #papers without keywords or abstracts or number of citations
acronyms = dict()
authors = []

In [9]:
count=0
for i in range(0, N):
    if 'AB' not in papers[i].keys():
        exclusion_list.append(i) #abstract is missing
        continue;
    ### 1. tokenize the titles. 
    temp_t = tokenizer_word_reg.tokenize(papers[i]['TI'].lower().replace('-',' ')) #initial tokens of the title
    
    ### 2. check the number of citations and year of publication
    key = '_'.join(temp_t[0:7]) if len(temp_t)>=7 else '_'.join(temp_t)
    
    for yy in [-1,0,1]:
        query_key = key + '_' + str(int(papers[i]['DA'][:4])+yy)
        temp_citation = citation.get(query_key,-1)
        if temp_citation != -1:
            break
    
    if temp_citation==-1:
        exclusion_list.append(i) #citation not available
        count+=1
        continue;
        
    tokens_t.append(temp_t)
    authors.append(papers[i]['AU'])
    num_citation.append(temp_citation)
    year.append(papers[i]['DA'][:4])
            
    ### 3. tokenize the abstract
    abstract = papers[i]['AB'].replace('-',' ') # remove the hiphen, for better detection of N-gram.
    abs_token= nltk.word_tokenize(abstract.lower()) # tokenize the abstract
    
    if 'OT' in papers[i].keys():
        ### 4. check acronyms in keywords
        acronyms_temp = acronym_detection(papers[i]['OT'], delimiter='_', option=2)
        if len(acronyms_temp)>0:
            acronyms.update(acronyms_temp)
        ### tokenize keywords
        tokens_k.append(keyword_tokenizer(papers[i]['OT']))
    else:
        tokens_k.append([])

    ### 5. check acronyms in abstract
    acronyms_temp = acronym_detection(abs_token, delimiter='_', option=1)
    if len(acronyms_temp)>0:
        acronyms.update(acronyms_temp)
    
    ### 6. tokenize the abstract after removing contents in parentheses
    abstract = re.sub(r'\([^)]*\)', '', abstract)
    abs_token= nltk.word_tokenize(abstract.lower()) # tokenize the abstract
    texts.append(abs_token)        
print('Finished tokenizing {} papers. There are {} papers without abstracts, {} papers without citation info.'
      .format(N-len(exclusion_list), len(exclusion_list)-count, count))
print('Detected {} acronyms.'.format(len(acronyms)))

Finished tokenizing 265 papers. There are 50 papers without abstracts, 630 papers without citation info.
Detected 13 acronyms.


#### Dealing with acronyms
Acronyms or abbreviations are usually used in research articles. We extract the acronyms which are defined in keywords or abstracts, and replace the acronyms with the full-form.

In [10]:
acronyms

{'CC': 'ccg_ccs',
 'DGM': 'deep_grey_matter',
 'DGMPM': 'deep_gray_matter_parcellation_map',
 'DTI': 'diffusion_tensor_imaging',
 'ICA': 'independent_component_analysis',
 'MRI': 'magnetic_resonance_imaging',
 'MSA': 'magnetic_susceptibility_anisotropy',
 'PDE': 'partial_differential_equation',
 'QSM': 'quantitative_susceptibility_mapping',
 'SS': 'segmentation_sophisticated harmonic artifact reduction for phase data',
 'STI': 'susceptibility_tensor_imaging',
 'SWI': 'susceptibility_weighted_imaging',
 'TBI': 'traumatic_brain_injury'}

In [11]:
#### 7. replace acronyms with full form. 
abbreviation = acronyms.keys()
texts1=[]
for sublist in texts:
    temp = []
    for item in sublist:
        if item.upper() in abbreviation:
            temp.append(acronyms[item.upper()])
        else:
            temp.append(item)
    texts1.append(temp)

#### Trigram transform

In [12]:
# Trigram transform
bigram = gensim.models.Phrases(texts1, threshold = 20)
trigram= gensim.models.Phrases(bigram[texts1], threshold = 20)

In [13]:
print(trigram[bigram[texts1[0]]])

['susceptibility_weighted_imaging', 'is', 'a', 'magnetic_resonance_imaging', 'technique', 'that', 'enhances', 'image', 'contrast', 'by', 'using', 'the', 'susceptibility', 'differences_between', 'tissues', '.', 'it', 'is', 'created', 'by', 'combining', 'both', 'magnitude', 'and', 'phase', 'in', 'the', 'gradient_echo', 'data', '.', 'susceptibility_weighted_imaging', 'is', 'sensitive', 'to', 'both', 'paramagnetic', 'and', 'diamagnetic', 'substances', 'which', 'generate', 'different', 'phase_shift', 'in', 'magnetic_resonance_imaging', 'data', '.', 'susceptibility_weighted_imaging', 'images', 'can_be', 'displayed', 'as', 'a', 'minimum', 'intensity', 'projection', 'that', 'provides', 'high_resolution', 'delineation', 'of', 'the', 'cerebral', 'venous', 'architecture', ',', 'a', 'feature', 'that', 'is', 'not', 'available', 'in', 'other', 'magnetic_resonance_imaging', 'techniques', '.', 'as', 'such', ',', 'susceptibility_weighted_imaging', 'has_been', 'widely', 'applied', 'to', 'diagnose', 'var

#### POS tagging

In [14]:
pos_option=1 #no JJ
texts2=[]
english_stops = set(stopwords.words('english'))

for i in range(len(texts1)):
    # POS tagging
    text_tag = nltk.pos_tag(list(trigram[bigram[texts1[i]]]))
    text_temp = []
    for word, tag in text_tag:
        if pos_option==1:
            if 'NN' in tag:
                text_temp.append(word)
        elif pos_option==2:
            if 'NN' in tag or 'VB' in tag:
                text_temp.append(word)
        else:
            if 'NN' in tag or 'VB' in tag or 'JJ' in tag:
                text_temp.append(word)
    # lemmatization, remove stop-words            
    lemmatizer = nltk.WordNetLemmatizer()
    text_temp = [lemmatizer.lemmatize(word) for word in text_temp]
    # remove stop-words, remove numbers
    text_temp = [word for word in text_temp if word not in english_stops and not word.isdigit() and word!= '%']
    
    #combine the abstract tokens and keywords tokens
    texts2.append(text_temp + tokens_k[i])

In [15]:
print(texts2[0])

['susceptibility_weighted_imaging', 'magnetic_resonance_imaging', 'technique', 'image', 'contrast', 'susceptibility', 'tissue', 'magnitude', 'phase', 'gradient_echo', 'data', 'susceptibility_weighted_imaging', 'substance', 'phase_shift', 'data', 'image', 'intensity', 'projection', 'high_resolution', 'delineation', 'architecture', 'feature', 'technique', 'abnormality', 'susceptibility_weighted_imaging', 'blood', 'mineral', 'deposition', 'reason', 'has_been', 'image', 'pathology', 'hemorrhage', 'traumatic_brain_injury', 'stroke', 'neoplasm', 'multiple_sclerosis', 'susceptibility_weighted_imaging', 'measure', 'magnetic_susceptibility', 'limitation', 'development', 'treat', 'susceptibility', 'isotropic', 'treat', 'susceptibility', 'tensor', 'quantity', 'this_article', 'principle', 'susceptibility_weighted_imaging', 'research', 'application', 'mechanism', 'brain', 'susceptibility', 'property', 'implementation', 'focus', 'brain', 'imaging', 'magnetic_resonance_imaging', 'magnetic_susceptibil

#### Saving the results

In [16]:
import pickle

In [17]:
pickle.dump((texts2, tokens_t, tokens_k, num_citation, year, acronyms), open("data/tokens_ab_key.p",'wb')) 