In [1]:
import Bio
import os

In [2]:
from Bio import Entrez
from bs4 import BeautifulSoup

In [3]:
def fetch_pubmed_abstracts(pmids):
    """Download PubMed records for the given IDs through ``Entrez.efetch``.

    Args:
        pmids: PubMed ID(s), passed unchanged as the ``id`` argument of
            ``Entrez.efetch`` (the notebook passes one comma-separated string).

    Returns:
        The value of ``handle.read()`` for the efetch response. The notebook
        parses this as XML in a later cell.
    """
    # Contact address sent with the request; this is still the placeholder value.
    Entrez.email = "Your.Name.Here@example.org"
    handle = Entrez.efetch(db="pubmed", id=pmids, rettype="abstract")
    try:
        return handle.read()
    finally:
        # Always release the response handle, even if reading fails.
        handle.close()

In [40]:
# Directory containing the input files, relative to the notebook's working directory.
data = './data'
# Path of the text file that is read line by line to obtain the PubMed IDs.
gold_set_unlabelled = os.path.join(data, 'pmids_test_set_unlabeled.txt')

In [41]:
# Read the PMID file inside a context manager so the file handle is closed
# as soon as the lines have been loaded.
with open(gold_set_unlabelled, 'r') as pmid_file:
    lines = pmid_file.readlines()

In [42]:
# Build the comma-separated ID string for efetch. Blank lines (for example a
# trailing empty line at the end of the file) are skipped so that no empty
# IDs end up in the query.
pmids = ",".join(pmid for pmid in (line.strip() for line in lines) if pmid)

In [43]:
# Debug override: uncomment to fetch a single PMID instead of the full list.
# pmids = '28618929'
# Fetch every record in one efetch call; `output` is the response body as read from the handle.
output = fetch_pubmed_abstracts(pmids)

In [44]:
# Parse the efetch response using BeautifulSoup's 'xml' parser feature.
soup = BeautifulSoup(output, 'xml')

In [45]:
def _text_or_default(locate_tag, default):
    """Return ``.text`` of the tag produced by ``locate_tag()``, or ``default``.

    An AttributeError raised while walking the tag path or while reading
    ``.text`` is treated as "element not present".
    """
    try:
        return locate_tag().text
    except AttributeError:
        return default


# Parallel lists: position k in each list describes the k-th PubmedArticle element.
pmids_ = []
titles_ = []
abstracts_ = []

for article in soup.find_all('PubmedArticle'):
    # A missing PMID becomes None; a missing title or abstract becomes ''.
    pmid = _text_or_default(lambda: article.MedlineCitation.PMID, None)
    title = _text_or_default(lambda: article.MedlineCitation.Article.ArticleTitle, '')
    abstract = _text_or_default(lambda: article.MedlineCitation.Article.Abstract, '')

    pmids_.append(pmid)
    titles_.append(title)
    abstracts_.append(abstract)

In [46]:
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser

In [47]:
from metamap_fetch import metamap_fetch
from metamap_words import metamap_matched_words

In [48]:
# Send "<title> <abstract>" of every article through metamap_fetch, keeping
# both the value it returns and the words extracted from it. Positions line
# up with titles_ / abstracts_.
disease_dictionary = []
metamap_json_list = []
for doc_index, title_and_abstract in enumerate(zip(titles_, abstracts_)):
    mm_json = metamap_fetch(' '.join(title_and_abstract))
    metamap_json_list.append(mm_json)
    disease_dictionary.append(metamap_matched_words(mm_json))
    # Progress marker on every tenth document.
    if not doc_index % 10:
        print(doc_index)

0
10
20
30


In [49]:
import string, json

def metamap_matched_words_local(output):
    """Collect the words of every mapping candidate's preferred name.

    Args:
        output: JSON text with the nesting walked below:
            ``AllDocuments`` -> ``Document`` -> ``Utterances`` -> ``Phrases``
            -> ``Mappings`` -> ``MappingCandidates`` -> ``CandidatePreferred``.

    Returns:
        A flat list of words in traversal order, duplicates kept. Each
        preferred name is lower-cased, every ``string.punctuation`` character
        is replaced by a space, and words of two characters or fewer are
        dropped. An empty list is returned when ``output`` cannot be decoded
        as JSON.

    Raises:
        KeyError: if decoded JSON lacks one of the keys listed above.
    """
    # Maps each punctuation character to a space so names split cleanly into words.
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    try:
        mm_op_json = json.loads(output)
    except (ValueError, TypeError):
        # Undecodable (or non-text) input yields no words. Returning a list
        # keeps the result usable as one document's token list.
        return []

    words_list = []
    for doc in mm_op_json["AllDocuments"]:
        for utts in doc["Document"]["Utterances"]:
            for phr in utts["Phrases"]:
                # Phrases with an empty/falsy "Mappings" value contribute nothing.
                for mappings in phr["Mappings"] or []:
                    for mapping in mappings["MappingCandidates"]:
                        preferred = mapping["CandidatePreferred"].lower().translate(translator)
                        words_list.extend(
                            word for word in preferred.split() if len(word) > 2
                        )

    return words_list

# Rebuild the per-document word lists from the stored MetaMap results using
# the local extractor defined above.
disease_dictionary = [
    metamap_matched_words_local(stored_json) for stored_json in metamap_json_list
]

In [50]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [51]:
# Build the token <-> integer id vocabulary from the per-document word lists.
dct = Dictionary(disease_dictionary)

# Prune the vocabulary. Per the gensim documentation, no_below is the minimum
# number of documents a token must occur in and no_above the maximum fraction
# of documents, so 1.0 applies no upper limit.
dct.filter_extremes(no_below=3, no_above=1.0)

# Reassign ids so they are contiguous after pruning.
# NOTE(review): filter_extremes may already do this internally — confirm for the installed gensim version.
dct.compactify()

In [52]:
# One bag-of-words vector per document, produced by the pruned dictionary.
corpus = list(map(dct.doc2bow, disease_dictionary))

# Fit the TF-IDF weighting on that bag-of-words corpus.
model = TfidfModel(corpus)

In [53]:
len(dct)

36

In [54]:
import numpy as np

In [55]:
# Dense matrix of zeros: one row per document, one column per dictionary entry.
document_matrix = np.zeros((len(disease_dictionary), len(dct)))

In [56]:
# Copy each document's TF-IDF weights into its row. Columns for which the
# model returns no pair keep their initial value.
for row, bow in enumerate(corpus):
    for token_id, weight in model[bow]:
        document_matrix[row, token_id] = weight

In [57]:
from sklearn.cluster import AffinityPropagation

In [58]:
# NOTE: despite the variable name, the fitted model is AffinityPropagation, not k-means.
# The name is kept because later cells read `kmeans.labels_`.
kmeans = AffinityPropagation(damping=0.51, max_iter=500).fit(document_matrix)

In [59]:
kmeans.labels_

array([3, 0, 1, 0, 1, 0, 0, 3, 2, 2, 3, 7, 3, 2, 3, 7, 4, 4, 2, 7, 7, 3,
       3, 7, 7, 7, 0, 6, 5, 6, 1, 2, 7, 7, 4, 4])

In [60]:
# Print the word lists of the documents, grouped by cluster. Iterate over the
# labels that were actually assigned rather than a fixed range, so no empty
# groups are printed and no cluster is skipped when there are many of them.
for j in sorted(set(kmeans.labels_.tolist())):
    print('----------')
    for i, label in enumerate(kmeans.labels_):
        if label == j:
            print(disease_dictionary[i])

----------
['motor', 'neuron', 'disease', 'cardiac', 'arrest', 'disease', 'malignant', 'neoplasms', 'primary', 'malignant', 'neoplasm', 'metabolic', 'diseases', 'nervous', 'system', 'disorder', 'nervous', 'system', 'disorder', 'alzheimer', 'disease', 'amyotrophic', 'lateral', 'sclerosis', 'spinal', 'muscular', 'atrophy', 'motor', 'neuron', 'disease', 'disease', 'spinal', 'muscular', 'atrophy', 'amyotrophic', 'lateral', 'sclerosis']
['motor', 'neuron', 'disease', 'disease', 'heredodegenerative', 'disorders', 'nervous', 'system', 'pure', 'hereditary', 'spastic', 'paraplegia', 'spinal', 'muscular', 'atrophy', 'amyotrophic', 'lateral', 'sclerosis', 'motor', 'neuron', 'disease', 'lower', 'allogenic', 'disease', 'spinal', 'muscular', 'atrophy', 'motor', 'neuron', 'disease', 'upper', 'motor', 'neuron', 'disease', 'upper']
['motor', 'neuron', 'disease', 'disease', 'malnutrition', 'neurodegenerative', 'disorders', 'spinal', 'muscular', 'atrophy', 'neuro', 'degenerative', 'disease', 'spinal', 'm

In [61]:
import tempfile

In [62]:
# Scratch check of NamedTemporaryFile: created with delete=False, and removed
# explicitly with os.remove in a later cell.
in_file = tempfile.NamedTemporaryFile(mode="wb", delete=False)

In [63]:
in_file.name

'/var/folders/jn/v5lkcznn19n7fcj98z5jrll00000gn/T/tmp461koggh'

In [64]:
# NOTE(review): `delete` does not appear in the documented NamedTemporaryFile API;
# this attribute may not exist on every Python version — confirm before relying on it.
in_file.delete

False

In [65]:
in_file.close()

In [69]:
os.path.exists(in_file.name)

False

In [68]:
# Remove the temporary file created with delete=False. The existence check
# keeps the cell re-runnable: without it a second run raises FileNotFoundError.
if os.path.exists(in_file.name):
    os.remove(in_file.name)