In [35]:
#adapted from http://mlbernauer.com/R/20160131-document-retrieval-sklearn.html

In [36]:
import os
import pandas as pd
import sklearn
import numpy as np

import nltk
import nltk.data
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import re
import string
import time
import codecs
from Bio import Medline

import gensim
from gensim import corpora, models, similarities
from collections import defaultdict

In [37]:
os.getcwd()

'/Users/ptighe/Documents/Python Projects/AcutePainOntology/medline_testing'

In [38]:
path='pubmed_result_medline.txt'

In [39]:
#Taken directly from reference:

In [40]:
#Dictionary of medline terms: https://www.nlm.nih.gov/bsd/mms/medlineelements.html
def read_medline_data(filename):
    recs = Medline.parse(open(filename, 'r'))
    text = pd.DataFrame(columns = ["pmid", "articletitle", "journaltitle", "authors", "affiliation", "grant",
                                   "abstract", "pubdate"])
    count = 0
    for rec in recs:
        try:
            abstr = rec["AB"]
            atitle = rec["TI"]
            auths = rec["AU"]
            pubdate = rec["DP"]
            jtitle = rec["JT"]
            grant=rec["GR"]
            pmid = rec['PMID']
            affil = rec['AD']
            text = text.append(pd.DataFrame([[pmid,atitle, jtitle, auths, affil,grant, abstr,pubdate]],
                                     columns=["pmid", "articletitle", "journaltitle", "authors", "affiliation", 
                                              "grant", "abstract", "pubdate"]),ignore_index=True)            
        except:
            pass
    return text

In [41]:
# Read in MEDLINE formatted text
papers = read_medline_data(path)

In [42]:
papers.loc[0,'abstract']

'BACKGROUND: Consensus indicates that a comprehensive,multimodal, holistic approach is foundational to the practice of acute pain medicine (APM),but lack of uniform, evidence-based clinical pathways leads to undesirable variability throughout U. S. healthcare systems. Acute pain studies are inconsistently synthesized to guide educational programs. Advanced practice techniques involving regional anesthesia assume the presence of a physician-led, multidisciplinary acute pain service,which is often unavailable or inconsistently applied.This heterogeneity of educational and organizational standards may result in unnecessary patient pain and escalation of healthcare costs. METHODS: A multidisciplinary panel was nominated through the APM Shared Interest Group of the American Academy of Pain Medicine. The panel met in Chicago, IL, in July 2014, to identify gaps and set priorities in APM research and education. RESULTS: The panel identified three areas of critical need: 1) an open-source acute

In [43]:
#Need to find a list of subheadings to remove?

shp = re.compile(r'^[A-Z\d]+$')

subheadings =[u"OBJECTIVES:", u"OBJECTIVE:", u"AIMS:", u"OBJECTIVE/BACKGROUND:",
              u"METHODS AND RESULTS:", u"CONCLUSIONS:", u"RESULTS:", u"METHODS:", u"INTRODUCTION:", u"BACKGROUND:"]

In [44]:
#Now some items for cleaning the abstract text
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
stoplist.extend(['\x0c', '\n'])
subheadings =[u"OBJECTIVES:", u"OBJECTIVE:", u"AIMS:", u"OBJECTIVE/BACKGROUND:",
              u"METHODS AND RESULTS:", u"CONCLUSIONS:", u"RESULTS:", u"METHODS:", u"INTRODUCTION:", u"BACKGROUND:"]

In [45]:
papers['split_abstract'] = papers['abstract'].str.split()
papers['split_abstract'] = papers['split_abstract'].apply(lambda x: [item for item in x if item not in subheadings])
papers['split_abstract'] = papers['split_abstract'].apply(lambda x: ' '.join(x).lower().split())
papers['split_abstract'] = papers['split_abstract'].apply(lambda x: [item for item in x if item not in stoplist])

In [46]:
papers['cleaned_abstract']=papers['split_abstract'].apply(lambda x: ' '.join(x))
papers['cleaned_abstract']=papers['cleaned_abstract'].str.replace("[^a-zA-Z]"," ").str.replace("\s+"," ").str.strip()

In [47]:
papers['cleaned_abstract']

0     consensus indicates comprehensive multimodal h...
1     older adults increased risk develop frequent p...
2     prior work postoperative pain trajectories exa...
3     given ability process highly dimensional datas...
4     despite widespread popularity social media lit...
5     objective study determine effects age sex type...
6     prior work addressed sex differences incidence...
7     although prior work investigated interplay dem...
8     american academy pain medicine american societ...
9     goal project explore association post anesthes...
10    recent years field acute pain medicine apm wit...
11    purpose project determine whether machine lear...
12    although million patients united states underg...
Name: cleaned_abstract, dtype: object

In [87]:
#Can try this later
stemmer = nltk.SnowballStemmer("english")
lemmer = nltk.WordNetLemmatizer()
papers['stemab_p']= papers['cleaned_abstract'].str.split()
papers['stemab']= papers['cleaned_abstract'].map(lambda x: lemmer.lemmatize(x))

In [88]:
papers['stemab']

0     consensus indicates comprehensive multimodal h...
1     older adults increased risk develop frequent p...
2     prior work postoperative pain trajectories exa...
3     given ability process highly dimensional datas...
4     despite widespread popularity social media lit...
5     objective study determine effects age sex type...
6     prior work addressed sex differences incidence...
7     although prior work investigated interplay dem...
8     american academy pain medicine american societ...
9     goal project explore association post anesthes...
10    recent years field acute pain medicine apm wit...
11    purpose project determine whether machine lear...
12    although million patients united states underg...
Name: stemab, dtype: object

In [233]:
papers['split_abstract']

0     [consensus, indicates, comprehensive,multimoda...
1     [older, adults, increased, risk, develop, freq...
2     [prior, work, postoperative, pain, trajectorie...
3     [given, ability, process, highly, dimensional,...
4     [despite, widespread, popularity, social, medi...
5     [objective, study, determine, effects, age,, s...
6     [prior, work, addressed, sex, differences, inc...
7     [although, prior, work, investigated, interpla...
8     [american, academy, pain, medicine, american, ...
9     [goal, project, explore, association, post-ane...
10    [recent, years,, field, acute, pain, medicine,...
11    [purpose, project, determine, whether, machine...
12    [although, 30, million, patients, united, stat...
Name: split_abstract, dtype: object

In [218]:
papers['cleaned_abstract']

0     consensus indicates comprehensive multimodal h...
1     older adults increased risk develop frequent p...
2     prior work postoperative pain trajectories exa...
3     given ability process highly dimensional datas...
4     despite widespread popularity social media lit...
5     objective study determine effects age sex type...
6     prior work addressed sex differences incidence...
7     although prior work investigated interplay dem...
8     american academy pain medicine american societ...
9     goal project explore association post anesthes...
10    recent years field acute pain medicine apm wit...
11    purpose project determine whether machine lear...
12    although million patients united states underg...
Name: cleaned_abstract, dtype: object

In [197]:
papers['split_articletitle'] = papers['articletitle'].str.split()
papers['split_articletitle'] = papers['split_articletitle'].apply(lambda x: ' '.join(x).lower().split())
papers['split_articletitle'] = papers['split_articletitle'].apply(lambda x: [item for item in x if item not in stoplist])
papers['cleaned_articletitle']=papers['split_articletitle'].apply(lambda x: ' '.join(x))
papers['cleaned_articletitle']=papers['cleaned_articletitle'].str.replace("[^a-zA-Z]"," ").str.replace("\s\s+"," ")

In [203]:
papers['cleaned_articletitle']

0      acute pain medicine united states status report 
1     age differences cytokine expression conditions...
2     time onset sustained postoperative pain relief...
3     teaching machine feel postoperative pain combi...
4     painful tweet text sentiment community structu...
5     clinically derived early postoperative pain tr...
6     sex differences incidence severe pain events f...
7     geospatial analysis hospital consumer assessme...
8                   acute pain medicine anesthesiology 
9     rough starts smooth finishes correlations post...
10              evolution practice acute pain medicine 
11    use machine learning classifiers predict reque...
12    primary payer status associated use nerve bloc...
Name: cleaned_articletitle, dtype: object

In [None]:
def pull_sentences(filename):
    """
        Breaks abstract into sentences
        """

    print "\nTokenizing abstract\n\n"
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    fp = open(filename)
    data = fp.read()
    return tokenizer.tokenize(data.decode('utf-8'))


In [None]:
def create_word2vec_sentence(sentence_list, stoplist):
    print "\nCreating word2vec sentences\n\n"
    retList = list()
    toolbar_width = len(sentence_list)
    for i, sentence in enumerate(sentence_list):

        p = str((float(i+1)/toolbar_width)*100)[:4]
        sys.stdout.write("\r%s%%" %p)
        sys.stdout.flush()

        sentLst = [x.strip(",").rstrip(".").strip(":").lower() for x in sentence.split(" ") if x.strip(",").strip(".").strip(":").lower() not in stoplist]
        retList.append(sentLst)
    return retList