In [219]:
from nltk import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import re
import numpy as np
from collections import defaultdict

## Some wrangling

In [53]:
# Re-write abstracts.csv as a two-column TSV: everything before the FIRST comma on a
# line is the pmid, everything after it is the content (which may itself contain commas).
# Lines with no comma at all are skipped.
out = []

# Context managers guarantee both handles are closed even if reading or writing raises.
with open('abstracts.csv', 'r', encoding='utf-8') as txt:
    for line in txt:
        pmid, comma, content = line.partition(',')
        if not comma:
            # No comma on this line, so there is no pmid/content boundary to split on.
            continue
        # `content` still carries the line's trailing newline, so writelines() below
        # emits one record per line without adding separators.
        out.append(pmid + '\t' + content)

with open('abstracts_kv.tsv', 'w', encoding='utf-8') as f:
    f.writelines(out)

# Load the (pmid, content) TSV.
# NOTE(review): read_csv treats the first line of the file as a header row by default and the
# rename below overwrites it -- confirm the file really starts with a header, otherwise the
# first record is silently lost.
df = pd.read_csv('abstracts_kv.tsv',sep='\t')
df.columns = ['pmid', 'content']

# Split content on the literal sequence '.,' (title<->abstract delimiter). The pattern is a
# raw string so that '\.' is a regex escape rather than an invalid Python string escape, and
# expand=True returns the pieces as integer-labelled columns (0, 1, ...) directly instead of
# building one pd.Series per row.
df2 = df.content.str.split(r'\.,', expand=True)
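Note: 8,180 PMIDs contain more than one `.,` delimiter. For now only the first two fields (title and abstract) are kept and any further fields are ignored.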
# Keep only the first two split fields. The explicit copy makes df2 an independent frame,
# so the column assignments below do not write into a slice of the previous df2.
df2 = df2[[0,1]].copy()
df2.columns = ['title','abstract']
df2['pmid'] = df['pmid']

# Handle abstracts which are null because the title<->abstract delimiter was not '.,':
# in that case the whole content landed in `title`, so reuse it as the abstract.
df2['abstract'] = np.where(df2.abstract.isnull(), df2.title, df2.abstract)
df3 = df2.dropna(how='any')

# Combine sectioned abstracts (several rows sharing the same pmid).
s = df3.pmid.value_counts()
has_multiple_rows = df3.pmid.isin(s.index[s > 1])
mult_pmid = df3[has_multiple_rows].copy()  # copy: a column is assigned on this frame below
single_pmid = df3[~has_multiple_rows]

# Every row of a pmid group receives the space-joined abstract of the whole group ...
mult_pmid2 = mult_pmid.groupby(['pmid']).abstract.transform(lambda x: ' '.join(x))
mult_pmid['abstract'] = mult_pmid2
# ... so the rows of a group are now duplicates on (pmid, abstract); keep the first of each.
mult_pmid = mult_pmid.drop_duplicates(subset=['pmid','abstract'])

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent row-wise stack.
data_df = pd.concat([single_pmid, mult_pmid])
# Prefix the title unless the abstract *is* the title (the null-abstract fallback above).
data_df['content'] = np.where(data_df.title!=data_df.abstract, data_df.title+' '+data_df.abstract, data_df.abstract)

# data_df.to_pickle('data_df.pkl')

data = data_df[['pmid', 'content']]

## Remove stopwords
**Hyphens:** when generating the hyperspace, split each hyphenated word into its non-hyphenated constituents. When looking up a hyphenated word, sum the vectors of its constituents.

In [463]:
# NLTK's English stopword list plus the two possessive suffix forms.
STOPWORDS = stopwords.words('english')+["'s", "’s"]

def remove_stopwords(s):
    """Strip non-letter characters and stopwords from ``s``.

    Steps, in order:
      1. A hyphen or slash sitting directly between two letters becomes a space.
      2. Every character that is not an ASCII letter, whitespace or a parenthesis is deleted.
      3. The text is tokenized with ``word_tokenize``; a token is kept if it is a parenthesis,
         or if it is longer than one character and not in ``STOPWORDS`` (the comparison is
         case-sensitive).
      4. Parentheses left with nothing but whitespace between them are removed.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Split hyphenated and slashed words. Lookarounds are used so the neighbouring letters
    # are not consumed by the match: with consuming groups, the 'D' in 'vitamin-D-deficient'
    # is used up by the first match, the second hyphen is never split, and the next step
    # then glues the pieces into 'Ddeficient'.
    s = re.sub(r'(?<=[a-zA-Z])[\-\/](?=[a-zA-Z])', ' ', s)
    s = re.sub(r'[^A-Za-z\s\(\)]+', '', s)
    tokens = word_tokenize(s)
    kept = [
        tok for tok in tokens
        if tok in ('(', ')') or (tok not in STOPWORDS and len(tok) > 1)
    ]
    okstr = ' '.join(kept)
    # Only remove parentheses which are empty
    okstr = re.sub(r'\(\s*\)','',okstr)
    return okstr

## Get Abbreviation in Dict

In [467]:
# Maps a cleaned abbreviation (letters only) to the list of expansions recorded for it;
# abbr_expander() appends one entry per parenthesised abbreviation it expands.
ABBR_LIST = defaultdict(list)

# A parenthesised token that starts with a capital letter, e.g. "(MRI)", "( CTs )".
# Misses acronyms with hyphen.
_ABBR_PATTERN = re.compile(r'\(\s?[A-Z]\s?[A-Za-z]+\s?\)')


def _replace_abbreviation(text, abbr, expansion):
    """Replace ``abbr`` in ``text`` where it is a whole word, optionally followed by a plural 's'.

    The trailing 's' is not consumed, so "CTs" becomes ``expansion + 's'``. Occurrences inside
    longer words (e.g. "CT" in "ACTH") are left untouched.
    """
    pattern = r'\b' + re.escape(abbr) + r'(?=s?\b)'
    # A callable replacement keeps any backslash in `expansion` from being read as an escape.
    return re.sub(pattern, lambda _match: expansion, text)


def abbr_expander(s):
    """Expand parenthesised abbreviations in ``s`` using the words that precede them.

    For every match of the abbreviation pattern, the number of capital letters N is taken as
    the number of words the abbreviation stands for, and the N whitespace-separated tokens
    immediately before the parenthesis are used as the expansion. The expansion is appended to
    the module-level ``ABBR_LIST`` and substituted for every whole-word occurrence of the
    abbreviation, including the parenthesised one. For an abbreviation ending in 's', the form
    without that 's' is substituted as well.

    Parameters
    ----------
    s : str
        Text to process.

    Returns
    -------
    str
        ``s`` with abbreviations replaced by their expansions; unchanged if none are found.
    """
    outs = s
    for abbr in _ABBR_PATTERN.findall(s):
        ix = s.index(abbr)  # first occurrence of this exact parenthesised form
        abbr_clean = re.sub(r'[^A-Za-z]', '', abbr) # keep only letters, discard all other chars
        # Number of capital letters ~ number of tokens to look back for the expansion.
        # Lower-case letters, including a plural 's', are already excluded from this count,
        # so no further adjustment is needed (and the count is always >= 1).
        abbr_len = len(re.sub(r'[a-z]', '', abbr_clean))
        # Everything before the parenthesis, from the very first character of the text.
        expanded = ' '.join(s[:ix].split()[-abbr_len:])
        if not expanded:
            # Nothing precedes the abbreviation, so there is nothing to expand it with.
            continue
        ABBR_LIST[abbr_clean].append(expanded)
        outs = _replace_abbreviation(outs, abbr_clean, expanded)
        if abbr_clean[-1] == 's':
            # Also expand the form without the trailing 's'.
            outs = _replace_abbreviation(outs, abbr_clean[:-1], expanded)
    return outs


## Get doc-per-line file

In [469]:
def clean_line_generator(out_file=None):
    """Write one cleaned abstract per line to ``out_file``.

    Despite the name this is not a generator: for each entry of ``data_df.content`` it removes
    stopwords, expands abbreviations, strips the remaining parentheses, collapses runs of
    whitespace and writes the result followed by a newline.

    Parameters
    ----------
    out_file : writable text file object, optional
        Destination for the cleaned lines. When omitted, the module-level handle ``f`` is
        used, which keeps the original zero-argument call working.

    Side effects
    ------------
    Empties the module-level ``ABBR_LIST`` before processing, so it ends up holding only the
    abbreviations recorded during this run.
    """
    if out_file is None:
        out_file = f  # legacy call style: rely on the handle opened by the caller
    # Reset the shared abbreviation table in place. Assigning a new defaultdict to a local
    # name here would only shadow the global that abbr_expander() actually appends to.
    ABBR_LIST.clear()
    for abst in data_df.content:
        abst1 = remove_stopwords(abst)
        outs = abbr_expander(abst1)
        # Parentheses must survive until after abbr_expander(), which needs them to detect
        # abbreviations, so they are stripped here rather than inside it.
        outs = re.sub(r'[\(\)]', '', outs)
        outs = re.sub(r'\s{2,}', ' ', outs)
        out_file.write(outs+'\n')

# Explicit encoding, matching the other file handles opened in this notebook.
with open('one-abstract-per-line.txt','w', encoding='utf-8') as f:
    clean_line_generator(f)
    