In [1]:
#adapted from http://mlbernauer.com/R/20160131-document-retrieval-sklearn.html

In [1]:
import os
import pandas as pd
import sklearn
import numpy as np
import nltk
import nltk.data

import re
import time
import codecs
from Bio import Medline

import gensim
from gensim import corpora, models, similarities
from collections import defaultdict

In [2]:
os.getcwd()

'/Users/ptighe/Documents/Python Projects/AcutePainOntology/medline_testing'

In [3]:
path='pubmed_result_medline.txt'

In [4]:
#Taken directly from reference:

In [5]:
#Dictionary of medline terms: https://www.nlm.nih.gov/bsd/mms/medlineelements.html
def read_medline_data(filename):
    recs = Medline.parse(open(filename, 'r'))
    text = pd.DataFrame(columns = ["pmid", "articletitle", "journaltitle", "authors", "affiliation", "grant",
                                   "abstract", "pubdate"])
    count = 0
    for rec in recs:
        try:
            abstr = rec["AB"]
            atitle = rec["TI"]
            auths = rec["AU"]
            pubdate = rec["DP"]
            jtitle = rec["JT"]
            grant=rec["GR"]
            pmid = rec['PMID']
            affil = rec['AD']
            text = text.append(pd.DataFrame([[pmid,atitle, jtitle, auths, affil,grant, abstr,pubdate]],
                                     columns=["pmid", "articletitle", "journaltitle", "authors", "affiliation", 
                                              "grant", "abstract", "pubdate"]),ignore_index=True)            
        except:
            pass
    return text

In [6]:
# Read in MEDLINE formatted text
papers = read_medline_data(path)

In [7]:
papers.loc[0,'abstract']

'BACKGROUND: Consensus indicates that a comprehensive,multimodal, holistic approach is foundational to the practice of acute pain medicine (APM),but lack of uniform, evidence-based clinical pathways leads to undesirable variability throughout U. S. healthcare systems. Acute pain studies are inconsistently synthesized to guide educational programs. Advanced practice techniques involving regional anesthesia assume the presence of a physician-led, multidisciplinary acute pain service,which is often unavailable or inconsistently applied.This heterogeneity of educational and organizational standards may result in unnecessary patient pain and escalation of healthcare costs. METHODS: A multidisciplinary panel was nominated through the APM Shared Interest Group of the American Academy of Pain Medicine. The panel met in Chicago, IL, in July 2014, to identify gaps and set priorities in APM research and education. RESULTS: The panel identified three areas of critical need: 1) an open-source acute

In [8]:
#Now some items for cleaning the abstract text
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
subheadings =[u"OBJECTIVES", u"OBJECTIVE", u"AIMS", 
              u"METHODS AND RESULTS", u"CONCLUSIONS", u"RESULTS", u"METHODS", u"INTRODUCTION", u"BACKGROUND"]
stoplist.extend(['\x0c', '\n'])
stoplist.extend(subheadings)

In [9]:
papers['split_abstract'] = papers['abstract'].str.split()

In [10]:
papers['split_abstract']

0     [BACKGROUND:, Consensus, indicates, that, a, c...
1     [Older, adults, are, at, an, increased, risk, ...
2     [OBJECTIVES:, Prior, work, on, postoperative, ...
3     [BACKGROUND:, Given, their, ability, to, proce...
4     [BACKGROUND:, Despite, the, widespread, popula...
5     [The, objective, of, this, study, was, to, det...
6     [OBJECTIVE/BACKGROUND:, Prior, work, has, not,...
7     [Although, prior, work, has, investigated, the...
8     [The, American, Academy, of, Pain, Medicine, a...
9     [OBJECTIVE:, The, goal, of, this, project, was...
10    [BACKGROUND:, In, recent, years,, the, field, ...
11    [OBJECTIVE:, The, purpose, of, this, project, ...
12    [INTRODUCTION:, Although, more, than, 30, mill...
Name: split_abstract, dtype: object

In [47]:
from nltk.tokenize import MWETokenizer
from nltk.tokenize import sent_tokenize, word_tokenize

papers['ta']= papers['abstract'].apply(lambda x: word_tokenize(x))

In [48]:
papers['ta']

0     [BACKGROUND, :, Consensus, indicates, that, a,...
1     [Older, adults, are, at, an, increased, risk, ...
2     [OBJECTIVES, :, Prior, work, on, postoperative...
3     [BACKGROUND, :, Given, their, ability, to, pro...
4     [BACKGROUND, :, Despite, the, widespread, popu...
5     [The, objective, of, this, study, was, to, det...
6     [OBJECTIVE/BACKGROUND, :, Prior, work, has, no...
7     [Although, prior, work, has, investigated, the...
8     [The, American, Academy, of, Pain, Medicine, a...
9     [OBJECTIVE, :, The, goal, of, this, project, w...
10    [BACKGROUND, :, In, recent, years, ,, the, fie...
11    [OBJECTIVE, :, The, purpose, of, this, project...
12    [INTRODUCTION, :, Although, more, than, 30, mi...
Name: ta, dtype: object

In [None]:
def pull_sentences(filename):
    """
        Breaks abstract into sentences
        """

    print "\nTokenizing abstract\n\n"
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    fp = open(filename)
    data = fp.read()
    return tokenizer.tokenize(data.decode('utf-8'))


In [None]:
def create_word2vec_sentence(sentence_list, stoplist):
    print "\nCreating word2vec sentences\n\n"
    retList = list()
    toolbar_width = len(sentence_list)
    for i, sentence in enumerate(sentence_list):

        p = str((float(i+1)/toolbar_width)*100)[:4]
        sys.stdout.write("\r%s%%" %p)
        sys.stdout.flush()

        sentLst = [x.strip(",").rstrip(".").strip(":").lower() for x in sentence.split(" ") if x.strip(",").strip(".").strip(":").lower() not in stoplist]
        retList.append(sentLst)
    return retList

In [15]:
papers['cleaned_abstract'].concordance('pain')

KeyError: 'cleaned_abstract'

In [16]:
from collections import Counter

In [18]:
wc = Counter(" ".join(papers['abstract'].values.tolist()).split(" ")).items()
wc

dict_items([('mixed', 1), ('data.', 1), ('decision', 3), ('emotion', 1), ('hour', 6), ('correlation', 4), ('1.16', 1), ('adult', 4), ('POD', 6), ('receiver', 2), ('rating', 4), ('identified', 2), ('review,', 1), ('node', 1), ('hospital.', 1), ('considered', 1), ('containment,', 1), ('foundational', 1), ('examine', 3), ('gradient-boosted', 2), ('inpatient', 1), ('accuracies', 1), ('how', 2), ('male', 3), ('6.5', 1), ('algorithm', 1), ('efficiency', 2), ('conjunction', 1), ('pain-related', 5), ('age,', 3), ('nonambulatory', 2), ('scores.', 4), ('purpose', 1), ('h)', 2), ('(TNF-alpha', 1), ('extend', 1), ('comparisons', 1), ('METHODS:', 5), ('what', 1), ('baseline', 2), ('recent', 1), ('clinical', 10), ('step', 1), ('debilitating', 1), ('evaluated,', 1), ('care', 2), ('Assessment', 1), ('geographic,', 1), ('SPE,', 2), ('these', 3), ('43,806', 1), ('LASSO', 2), ('set', 4), ('response', 1), ('communications', 1), ('five', 1), ('38%', 1), ('requests', 1), ('diseases.', 1), ('Our', 1), ('work

In [50]:
#Count words
papers.abstract.apply(lambda x: pd.value_counts(x.split(" "))).sum(axis=0)

"Always"           1
"pain"             1
(0.0018            1
(0.02              1
(0.14),            1
(0.25).            1
(0.26),            1
(0.4)              1
(0.43),            1
(0.45),            1
(0.87,             1
(2,                1
(22.45%)           1
(25.09%)           1
(8                 1
(91,708)           1
(95%               5
(99%               2
(</=4/10).         1
(APM)              1
(APM),but          1
(APS)              1
(CPT),             1
(FHP),             1
(HCAHPS)           1
(IL-4,             1
(IQR,              1
(LASSO),           1
(NRS)              1
(OR,               3
                  ..
versus             1
vexing             1
vital              1
vs                 2
warrants           1
was               22
we                 8
weighted.          1
well               7
well-validated     1
were              21
what               1
when               3
where              1
whether            1
which              5
widespread   

In [52]:
papers.ta.apply(lambda x: pd.value_counts(x).sum(axis=0))

0     277
1     223
2     285
3     263
4     319
5     297
6     292
7     228
8     150
9     305
10    195
11    286
12    394
Name: ta, dtype: int64