In [1]:
#adapted from http://mlbernauer.com/R/20160131-document-retrieval-sklearn.html

In [43]:
import os
import pandas as pd
import sklearn
import numpy as np
import nltk
import nltk.data

import re
import time
import codecs
from Bio import Medline

import gensim
from gensim import corpora, models, similarities
from collections import defaultdict

In [16]:
os.getcwd()

'/Users/ptighe/Documents/Python Projects/AcutePainOntology/medline_testing'

In [62]:
path='pubmed_result_medline.txt'

In [63]:
#Taken directly from reference:

In [64]:
#Dictionary of medline terms: https://www.nlm.nih.gov/bsd/mms/medlineelements.html
def read_medline_data(filename):
    recs = Medline.parse(open(filename, 'r'))
    text = pd.DataFrame(columns = ["pmid", "articletitle", "journaltitle", "authors", "affiliation", "grant",
                                   "abstract", "pubdate"])
    count = 0
    for rec in recs:
        try:
            abstr = rec["AB"]
            atitle = rec["TI"]
            auths = rec["AU"]
            pubdate = rec["DP"]
            jtitle = rec["JT"]
            grant=rec["GR"]
            pmid = rec['PMID']
            affil = rec['AD']
            text = text.append(pd.DataFrame([[pmid,atitle, jtitle, auths, affil,grant, abstr,pubdate]],
                                     columns=["pmid", "articletitle", "journaltitle", "authors", "affiliation", 
                                              "grant", "abstract", "pubdate"]),ignore_index=True)            
        except:
            pass
    return text

In [65]:
# Read in MEDLINE formatted text
papers = read_medline_data(path)

In [70]:
papers.loc[0,'abstract']

'BACKGROUND: Consensus indicates that a comprehensive,multimodal, holistic approach is foundational to the practice of acute pain medicine (APM),but lack of uniform, evidence-based clinical pathways leads to undesirable variability throughout U. S. healthcare systems. Acute pain studies are inconsistently synthesized to guide educational programs. Advanced practice techniques involving regional anesthesia assume the presence of a physician-led, multidisciplinary acute pain service,which is often unavailable or inconsistently applied.This heterogeneity of educational and organizational standards may result in unnecessary patient pain and escalation of healthcare costs. METHODS: A multidisciplinary panel was nominated through the APM Shared Interest Group of the American Academy of Pain Medicine. The panel met in Chicago, IL, in July 2014, to identify gaps and set priorities in APM research and education. RESULTS: The panel identified three areas of critical need: 1) an open-source acute

In [74]:
#Now some items for cleaning the abstract text
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
subheadings =["AIMS:", "METHODS AND RESULTS:", "CONCLUSIONS:", "RESULTS:", "METHODS:", "INTRODUCTION:", "BACKGROUND:"]
stoplist.extend(['\x0c', '\n'])
stoplist.extend(subheadings)
stoplist

[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u'her',
 u'hers',
 u'herself',
 u'it',
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'then',
 u'once',
 u'here',
 u'there',
 u'when',
 u'where',
 u'why',
 u'how',
 u'all

In [75]:
#http://mlbernauer.com/R/20160131-document-retrieval-sklearn.html
#But this doesn't seem to work??
def clean_text(text):
    for i in stoplist:
        text.replace(i, ' ')
    clean_text = re.sub('[^a-zA-Z]+', ' ', text)
    return clean_text.lower()

In [76]:
#http://mlbernauer.com/R/20160131-document-retrieval-sklearn.html
#Not sure, still see BACKGROUND etc
papers['clean_abstract'] = papers['abstract'].apply(clean_text)
papers['clean_articletitle'] = papers['articletitle'].apply(clean_text)
papers.head(1)

Unnamed: 0,pmid,articletitle,journaltitle,authors,affiliation,grant,abstract,pubdate,clean_abstract,clean_articletitle
0,26535424,Acute Pain Medicine in the United States: A St...,"Pain medicine (Malden, Mass.)","[Tighe P, Buckenmaier CC 3rd, Boezaart AP, Car...","Department of Anesthesiology, University of Fl...","[K23 GM 102697/GM/NIGMS NIH HHS/United States,...",BACKGROUND: Consensus indicates that a compreh...,2015 Sep,background consensus indicates that a comprehe...,acute pain medicine in the united states a sta...


In [None]:
def pull_sentences(filename):
    """
        Breaks abstract into sentences
        """

    print "\nTokenizing abstract\n\n"
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    fp = open(filename)
    data = fp.read()
    return tokenizer.tokenize(data.decode('utf-8'))


In [None]:
def create_word2vec_sentence(sentence_list, stoplist):
    print "\nCreating word2vec sentences\n\n"
    retList = list()
    toolbar_width = len(sentence_list)
    for i, sentence in enumerate(sentence_list):

        p = str((float(i+1)/toolbar_width)*100)[:4]
        sys.stdout.write("\r%s%%" %p)
        sys.stdout.flush()

        sentLst = [x.strip(",").rstrip(".").strip(":").lower() for x in sentence.split(" ") if x.strip(",").strip(".").strip(":").lower() not in stoplist]
        retList.append(sentLst)
    return retList