In [49]:
import datetime
import io
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from nltk import word_tokenize
from nltk.corpus import stopwords

## Read SQL output into DF

In [115]:
def get_df_from_psv(fout=None):
    """Load the pipe-separated SQL export into a cleaned DataFrame and pickle it.

    Reads ``data/output.psv``, flattens line breaks that occur inside a field,
    merges sectioned abstracts (several rows sharing one pmid) into one row and
    adds a ``content`` column combining title and abstract.

    Parameters
    ----------
    fout : str, optional
        Path of the pickle file to write. When omitted (or falsy) a
        timestamped file ``data/datadf_<timestamp>.pkl`` is written instead.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``pmid``, ``title``, ``abstract``, ``label`` and
        ``content``; ``content`` is always a string (never NaN).
    """
    with open('data/output.psv', encoding="utf-8") as f:
        raw_text = f.read()
    # A newline that is NOT followed by a digit is taken to be a line break
    # inside a field (the next record would start with a digit), so it is
    # replaced by a space to put each record back on one line.
    flat_text = re.sub(r'\n(?![0-9])', ' ', raw_text)

    # Malformed rows are skipped rather than aborting the load.
    # bad lines: [4399, 170914, 247161, 264812, 384966, 400521, 522365, 527597, 651243, 747036]
    # `on_bad_lines` is the pandas >= 1.3 spelling; `error_bad_lines` was
    # removed in pandas 2.0 and is kept only as a fallback for old installs.
    try:
        df = pd.read_csv(io.StringIO(flat_text), sep='|', on_bad_lines='skip')
    except TypeError:
        df = pd.read_csv(io.StringIO(flat_text), sep='|', error_bad_lines=False)

    # at least one row had the title in label col.
    df['art_arttitle'] = np.where(
        df['art_arttitle'].isnull() & df['label'].notnull(),
        df['label'],
        df['art_arttitle'],
    )

    df.columns = ['pmid', 'title', 'abstract', 'label']
    df['abstract'] = df['abstract'].fillna('')

    # combine sectioned abstracts
    pmid_counts = df['pmid'].value_counts()
    is_sectioned = df['pmid'].isin(pmid_counts.index[pmid_counts > 1])
    single_pmid = df[~is_sectioned]
    # .copy() so the assignment below modifies an independent frame instead of
    # a slice of `df` (avoids SettingWithCopyWarning / silent no-op).
    mult_pmid = df[is_sectioned].copy()
    if not mult_pmid.empty:
        mult_pmid['abstract'] = (
            mult_pmid.groupby('pmid')['abstract'].transform(lambda parts: ' '.join(parts))
        )
        # every row of a pmid now carries the same joined abstract; keep one
        mult_pmid = mult_pmid.drop_duplicates(subset=['pmid', 'abstract'])
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    data_df = pd.concat([single_pmid, mult_pmid])

    # A missing title must not turn the whole content into NaN, so the title
    # is only prepended when it exists and differs from the abstract.
    title = data_df['title'].fillna('')
    has_distinct_title = (title != '') & (title != data_df['abstract'])
    data_df['content'] = np.where(
        has_distinct_title,
        title + ' ' + data_df['abstract'],
        data_df['abstract'],
    )

    if not fout:
        fout = "data/datadf_"+str(datetime.datetime.now()).replace(' ','_').replace(':','')+".pkl"
    data_df.to_pickle(fout)

    return data_df

## Remove stopwords
Hyphens: when building the embedding space, split each hyphenated word into its non-hyphenated constituents. When looking up a hyphenated word, sum the vectors of its constituents.

In [463]:
_STOPWORD_SET = None  # built lazily by _get_stopword_set() on first use


def _get_stopword_set():
    """Return the stop-word set, building and caching it on the first call."""
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        english = stopwords.words('english')
        # Lower-case and Capitalised variants, plus possessive suffixes.
        # NOTE: apostrophes are stripped before tokenisation in
        # remove_stopwords, so the "'s" / "’s" entries can never match there;
        # they are kept so the set equals the original list.
        _STOPWORD_SET = frozenset(
            english + ["'s", "’s"] + [w.capitalize() for w in english]
        )
    return _STOPWORD_SET


def remove_stopwords(s):
    """Strip stop words, non-letter characters and 1-character tokens from `s`.

    Steps, in order:
      1. hyphens/slashes between two letters become spaces;
      2. everything except ASCII letters, whitespace and parentheses is removed;
      3. the text is tokenised and stop words / single-character tokens are
         dropped (parentheses are always kept at this stage);
      4. parentheses left with nothing inside them are removed.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        Remaining tokens joined by single spaces.
    """
    stop_set = _get_stopword_set()

    # split hyphenated and slashed words. Lookarounds (instead of consuming the
    # neighbouring letters) let consecutive separators such as "a-b-c" all be
    # split; a consuming pattern would skip the second one.
    s = re.sub(r'(?<=[a-zA-Z])[\-\/](?=[a-zA-Z])', ' ', s)
    s = re.sub(r'[^A-Za-z\s\(\)]+', '', s)
    tokens = word_tokenize(s)
    kept = [
        tok for tok in tokens
        if tok in ('(', ')') or (tok not in stop_set and len(tok) > 1)
    ]
    okstr = ' '.join(kept)
    #only remove parentheses which are empty
    okstr = re.sub(r'\(\s*\)','',okstr)
    return okstr

## Identify abbreviation full-forms from text, and expand

In [29]:
# Module-level log of every expansion found: cleaned abbreviation -> list of
# candidate full forms, accumulated across all calls to abbr_expander.
ABBR_LIST = defaultdict(list)


def abbr_expander(s):
    """Replace parenthesised abbreviations in `s` with their inferred full form.

    An abbreviation is a parenthesised token starting with a capital letter,
    e.g. ``(MRI)`` or ``(RCTs)``. Its expansion is guessed as the N words that
    precede the parenthesis, where N is the number of capital letters in the
    abbreviation. The `` (ABBR)`` marker is removed and every stand-alone
    occurrence of the abbreviation is replaced by the expansion.

    Side effect: each (abbreviation, expansion) pair is appended to the
    module-level ``ABBR_LIST``.

    Parameters
    ----------
    s : str
        Text to process.

    Returns
    -------
    str
        Text with abbreviations expanded. Abbreviations with no preceding
        words (e.g. at the very start of the text) are left untouched.
    """
    outs = s
    for abbr in re.findall(r'\(\s?[A-Z]\s?[A-Za-z]+\s?\)', s):  # Misses acronyms with hyphen
        ix = s.index(abbr)
        abbr_clean = re.sub(r'[^A-Za-z]', '', abbr)  # keep only letters, discard all other chars
        # Number of capital letters ~ number of tokens to look back for the
        # expansion. A trailing plural "s" is lower-case and therefore already
        # excluded from this count, so no further adjustment is needed.
        abbr_len = len(re.sub(r'[a-z]', '', abbr_clean))
        is_plural = abbr_clean[-1] == 's'

        # Words before the parenthesis. s[:ix] is empty when the abbreviation
        # opens the text, in which case there is nothing to expand from.
        # TODO: also treat (-) and (/) as word-breaks when looking back.
        preceding_words = s[:ix].split()
        if not preceding_words:
            continue
        expanded = ' '.join(preceding_words[-abbr_len:])
        ABBR_LIST[abbr_clean].append(expanded)

        # Drop the parenthesised definition before expanding the other uses.
        outs = outs.replace(' ' + abbr, '')

        # Only replace the abbreviation as a whole word so that e.g. "CT" is
        # not expanded inside "ACTIVE".
        if is_plural:
            # plural definition: "RCT" and "RCTs" both become the expansion
            pattern = r'\b' + re.escape(abbr_clean[:-1]) + r's?\b'
        else:
            # singular definition: a plural use keeps its trailing "s"
            pattern = r'\b' + re.escape(abbr_clean) + r'(?=s?\b)'
        # A callable replacement keeps any backslashes in `expanded` literal.
        outs = re.sub(pattern, lambda _m, _full=expanded: _full, outs)

    return outs


## Get doc-per-line file

In [42]:
def clean_line_generator(fout=None):
    """Write the cleaned corpus to a text file, one document per line.

    Loads the data via ``get_df_from_psv()`` and, for each ``content`` entry,
    expands abbreviations, lower-cases capitals that follow ". ", removes stop
    words, and writes the result as one line.

    Parameters
    ----------
    fout : str, optional
        Path of the text file to write. When omitted (or falsy) a timestamped
        file ``data/oapl_<timestamp>.txt`` is written, mirroring the default
        naming used by ``get_df_from_psv``.
        NOTE(review): the function previously wrote to an undefined global
        ``f``; the default file name here was chosen in review — adjust it if
        a different location is expected.

    Returns
    -------
    str
        The path that was written.
    """
    if not fout:
        fout = "data/oapl_" + str(datetime.datetime.now()).replace(' ', '_').replace(':', '') + ".txt"

    data_df = get_df_from_psv()
    with open(fout, 'w', encoding='utf-8') as f:
        for abst in data_df.content:
            # Non-string content (e.g. NaN) becomes an empty line so that
            # line N of the file still corresponds to row N of the frame.
            if not isinstance(abst, str):
                abst = ''
            abst = abbr_expander(abst)
            # Make post-period uppercase chars lower
            abst = re.sub(r"(?<=\. )[A-Z]", lambda t: t.group().lower(), abst)
            abst = remove_stopwords(abst)
            f.write(abst + '\n')
    return fout

# TODO : Tag and keep only NOUNS and NOUNPHRASES





## Generator (iterator) for reading abstracts

In [53]:
class sentenceGenerator:
    """Re-iterable stream of whitespace-tokenised lines from a text file.

    Each iteration re-opens the file, so the object can be iterated more than
    once.

    Parameters
    ----------
    f : str
        Path of the text file; each line is treated as one document.
    limit : int, optional
        Maximum number of lines to yield. ``None`` or ``0`` means no limit.
    """
    def __init__(self, f, limit=None):
        self.f = f
        self.li = limit

    def __iter__(self):
        # The context manager closes the file when iteration finishes, stops
        # early at the limit, or the generator is discarded.
        with open(self.f) as handle:
            for count, line in enumerate(handle):
                if self.li and count >= self.li:
                    break
                yield line.split()
            

In [60]:
# Train a Word2Vec model on the doc-per-line file and report the wall-clock
# time taken (seconds) for the whole cell.
import time
start = time.time()
fname = 'data/oapl_sample.txt'
# Streams the file line by line as token lists, so it is not loaded at once.
sg=sentenceGenerator(fname)
# NOTE(review): `size=` looks like the gensim 3.x keyword (gensim 4 renamed it
# to `vector_size`) — confirm against the installed gensim version.
# NOTE(review): no seed is passed, so results presumably vary between runs.
model = Word2Vec(sg, size=100, window=5, min_count=1, workers=4)
# Keep a handle on the trained vectors; later cells query `word_vectors`.
word_vectors = model.wv

duration = time.time()-start
print(duration)

5.824838876724243


In [61]:
# Display the vocabulary mapping (token -> Vocab entry) of the trained vectors.
# NOTE(review): this shows the whole mapping; a slice would keep the output short.
# NOTE(review): `.vocab` appears to be gensim 3.x API — confirm for newer versions.
word_vectors.vocab

{'Buccal': <gensim.models.keyedvectors.Vocab at 0x1c85cc28208>,
 'cellulitis': <gensim.models.keyedvectors.Vocab at 0x1c85cc28508>,
 'reevaluated': <gensim.models.keyedvectors.Vocab at 0x1c85cc28a08>,
 'studied': <gensim.models.keyedvectors.Vocab at 0x1c85cc28588>,
 'children': <gensim.models.keyedvectors.Vocab at 0x1c85c880088>,
 'prospectively': <gensim.models.keyedvectors.Vocab at 0x1c85c880748>,
 'acute': <gensim.models.keyedvectors.Vocab at 0x1c85c880248>,
 'buccal': <gensim.models.keyedvectors.Vocab at 0x1c85c880288>,
 'median': <gensim.models.keyedvectors.Vocab at 0x1c85c8806c8>,
 'age': <gensim.models.keyedvectors.Vocab at 0x1c85c8803c8>,
 'months': <gensim.models.keyedvectors.Vocab at 0x1c85c8808c8>,
 'Fifty': <gensim.models.keyedvectors.Vocab at 0x1c85c880c48>,
 'five': <gensim.models.keyedvectors.Vocab at 0x1c85c880d08>,
 'percent': <gensim.models.keyedvectors.Vocab at 0x1c85c880c88>,
 'patients': <gensim.models.keyedvectors.Vocab at 0x1c85c880808>,
 'bacteremic': <gensim.mo

In [62]:
# Sanity check: list the tokens closest to 'cerebral' with their similarity scores.
# Tokens are case-sensitive here ('Cerebral' shows up as a separate neighbour).
word_vectors.most_similar('cerebral')

[('vasospasm', 0.8492591381072998),
 ('microbleeds', 0.839863121509552),
 ('hypoperfusion', 0.8316357135772705),
 ('Cerebral', 0.8221175074577332),
 ('Duret', 0.8086600303649902),
 ('cranio', 0.7962836027145386),
 ('infarct', 0.791989803314209),
 ('ischaemic', 0.7911811470985413),
 ('infarcts', 0.7869277000427246),
 ('Mesenteric', 0.7867780923843384)]