In [1]:
#adapted from http://mlbernauer.com/R/20160131-document-retrieval-sklearn.html

In [10]:
import os
import pandas as pd
import sklearn
import numpy as np
import nltk
import nltk.data

import re
import time
import codecs
from Bio import Medline

import gensim
from gensim import corpora, models, similarities
from collections import defaultdict

In [11]:
os.getcwd()

'/Users/ptighe/Documents/Python Projects/AcutePainOntology/medline_testing'

In [12]:
path='pubmed_result_medline.txt'

In [13]:
#Taken directly from reference:

In [14]:
#Dictionary of medline terms: https://www.nlm.nih.gov/bsd/mms/medlineelements.html
def read_medline_data(filename):
    recs = Medline.parse(open(filename, 'r'))
    text = pd.DataFrame(columns = ["pmid", "articletitle", "journaltitle", "authors", "affiliation", "grant",
                                   "abstract", "pubdate"])
    count = 0
    for rec in recs:
        try:
            abstr = rec["AB"]
            atitle = rec["TI"]
            auths = rec["AU"]
            pubdate = rec["DP"]
            jtitle = rec["JT"]
            grant=rec["GR"]
            pmid = rec['PMID']
            affil = rec['AD']
            text = text.append(pd.DataFrame([[pmid,atitle, jtitle, auths, affil,grant, abstr,pubdate]],
                                     columns=["pmid", "articletitle", "journaltitle", "authors", "affiliation", 
                                              "grant", "abstract", "pubdate"]),ignore_index=True)            
        except:
            pass
    return text

In [222]:
# Read in MEDLINE formatted text
papers = read_medline_data(path)

In [223]:
papers.loc[0,'abstract']

'BACKGROUND: Consensus indicates that a comprehensive,multimodal, holistic approach is foundational to the practice of acute pain medicine (APM),but lack of uniform, evidence-based clinical pathways leads to undesirable variability throughout U. S. healthcare systems. Acute pain studies are inconsistently synthesized to guide educational programs. Advanced practice techniques involving regional anesthesia assume the presence of a physician-led, multidisciplinary acute pain service,which is often unavailable or inconsistently applied.This heterogeneity of educational and organizational standards may result in unnecessary patient pain and escalation of healthcare costs. METHODS: A multidisciplinary panel was nominated through the APM Shared Interest Group of the American Academy of Pain Medicine. The panel met in Chicago, IL, in July 2014, to identify gaps and set priorities in APM research and education. RESULTS: The panel identified three areas of critical need: 1) an open-source acute

In [224]:
#Now some items for cleaning the abstract text
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
subheadings =[u"OBJECTIVES", u"OBJECTIVE", u"AIMS", 
              u"METHODS AND RESULTS", u"CONCLUSIONS", u"RESULTS", u"METHODS", u"INTRODUCTION", u"BACKGROUND"]
stoplist.extend(['\x0c', '\n'])
stoplist.extend(subheadings)

In [225]:
papers['split_abstract'] = papers['abstract'].str.split()

In [226]:
papers['split_abstract']

0     [BACKGROUND:, Consensus, indicates, that, a, c...
1     [Older, adults, are, at, an, increased, risk, ...
2     [OBJECTIVES:, Prior, work, on, postoperative, ...
3     [BACKGROUND:, Given, their, ability, to, proce...
4     [BACKGROUND:, Despite, the, widespread, popula...
5     [The, objective, of, this, study, was, to, det...
6     [OBJECTIVE/BACKGROUND:, Prior, work, has, not,...
7     [Although, prior, work, has, investigated, the...
8     [The, American, Academy, of, Pain, Medicine, a...
9     [OBJECTIVE:, The, goal, of, this, project, was...
10    [BACKGROUND:, In, recent, years,, the, field, ...
11    [OBJECTIVE:, The, purpose, of, this, project, ...
12    [INTRODUCTION:, Although, more, than, 30, mill...
Name: split_abstract, dtype: object

In [229]:
papers['ca'] = papers['split_abstract'].apply(lambda x: [item for item in x if item not in stoplist])

In [230]:
papers['ca']

0     [BACKGROUND:, Consensus, indicates, comprehens...
1     [Older, adults, increased, risk, develop, freq...
2     [OBJECTIVES:, Prior, work, postoperative, pain...
3     [BACKGROUND:, Given, ability, process, highly,...
4     [BACKGROUND:, Despite, widespread, popularity,...
5     [The, objective, study, determine, effects, ag...
6     [OBJECTIVE/BACKGROUND:, Prior, work, addressed...
7     [Although, prior, work, investigated, interpla...
8     [The, American, Academy, Pain, Medicine, Ameri...
9     [OBJECTIVE:, The, goal, project, explore, asso...
10    [BACKGROUND:, In, recent, years,, field, acute...
11    [OBJECTIVE:, The, purpose, project, determine,...
12    [INTRODUCTION:, Although, 30, million, patient...
Name: ca, dtype: object

In [None]:
def cleanup:(text, stopwordlist):
        words = text.split()
        mw = [w for w in words if not w in stopwordlist] #remove unwanted words

In [209]:
for word in stoplist: papers['cleaned_abstract']= papers['abstract'].str.replace(word,'')
# for word in stoplist: papers['cleaned_articletitle']= papers['articletitle'].str.replace(word,'')
   

In [210]:
papers['cleaned_abstract']

0     : Consensus indicates that a comprehensive,mul...
1     Older adults are at an increased risk to devel...
2     OBJECTIVES: Prior work on postoperative pain t...
3     : Given their ability to process highly dimens...
4     : Despite the widespread popularity of social ...
5     The objective of this study was to determine t...
6     OBJECTIVE/: Prior work has not addressed sex d...
7     Although prior work has investigated the inter...
8     The American Academy of Pain Medicine and the ...
9     OBJECTIVE: The goal of this project was to exp...
10    : In recent years, the field of acute pain med...
11    OBJECTIVE: The purpose of this project was to ...
12    INTRODUCTION: Although more than 30 million pa...
Name: cleaned_abstract, dtype: object

In [185]:
RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation]) 
papers['cleaned_abstract'] = papers['cleaned_abstract'].str.replace(RE_PUNCTUATION," ").str.lower()
papers['cleaned_abstract'] = papers['cleaned_abstract'].str.replace("\s+"," ").str.strip()
papers['cleaned_articletitle'] = papers['cleaned_articletitle'].str.replace(RE_PUNCTUATION," ").str.lower()

In [190]:
# papers.head(1)
papers['cleaned_abstract']

0     [BACKGROUND:, Consensus, indicates, that, a, c...
1     [Older, adults, are, at, an, increased, risk, ...
2     [OBJECTIVES:, Prior, work, on, postoperative, ...
3     [BACKGROUND:, Given, their, ability, to, proce...
4     [BACKGROUND:, Despite, the, widespread, popula...
5     [The, objective, of, this, study, was, to, det...
6     [OBJECTIVE/BACKGROUND:, Prior, work, has, not,...
7     [Although, prior, work, has, investigated, the...
8     [The, American, Academy, of, Pain, Medicine, a...
9     [OBJECTIVE:, The, goal, of, this, project, was...
10    [BACKGROUND:, In, recent, years,, the, field, ...
11    [OBJECTIVE:, The, purpose, of, this, project, ...
12    [INTRODUCTION:, Although, more, than, 30, mill...
Name: cleaned_abstract, dtype: object

In [None]:
def pull_sentences(filename):
    """
        Breaks abstract into sentences
        """

    print "\nTokenizing abstract\n\n"
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    fp = open(filename)
    data = fp.read()
    return tokenizer.tokenize(data.decode('utf-8'))


In [None]:
def create_word2vec_sentence(sentence_list, stoplist):
    print "\nCreating word2vec sentences\n\n"
    retList = list()
    toolbar_width = len(sentence_list)
    for i, sentence in enumerate(sentence_list):

        p = str((float(i+1)/toolbar_width)*100)[:4]
        sys.stdout.write("\r%s%%" %p)
        sys.stdout.flush()

        sentLst = [x.strip(",").rstrip(".").strip(":").lower() for x in sentence.split(" ") if x.strip(",").strip(".").strip(":").lower() not in stoplist]
        retList.append(sentLst)
    return retList