In [1]:
import Bio
import os

In [2]:
from Bio import Entrez
from bs4 import BeautifulSoup

In [3]:
def fetch_pubmed_abstracts(pmids):
    Entrez.email = "Your.Name.Here@example.org"
    handle = Entrez.efetch(db="pubmed", id=pmids, rettype="abstract")
    return handle.read()

In [4]:
data = './data'
gold_set_unlabelled = os.path.join(data, 'pmids_gold_set_unlabeled.txt')

In [5]:
lines = open(gold_set_unlabelled, 'r').readlines()

In [6]:
pmids = ",".join([line.strip() for line in lines])

In [7]:
# pmids = '28618929'
output = fetch_pubmed_abstracts(pmids)

In [8]:
soup = BeautifulSoup(output, 'xml')

In [9]:
pmids_ = []
titles_ = []
abstracts_ = []

for i, record in enumerate(soup.find_all('PubmedArticle')):
    try:
        pmid = record.MedlineCitation.PMID.text
    except AttributeError:
        pmid = None
    try:
        title = record.MedlineCitation.Article.ArticleTitle.text
    except AttributeError:
        title = ''
    try:
        abstract = record.MedlineCitation.Article.Abstract.AbstractText.text
    except AttributeError:
        abstract = ''
    pmids_.append(pmid)
    titles_.append(title)
    abstracts_.append(abstract)

In [None]:
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser

In [10]:
from metamap_fetch import metamap_fetch
from metamap_words import metamap_matched_words

In [None]:
disease_dictionary = []

for i, (title, abstract) in enumerate(zip(titles_, abstracts_)):
    mm_json = metamap_fetch(title + ' ' + abstract)
    disease_dictionary.append(metamap_matched_words(mm_json))
    print(i)
    if i == 5:
        break

0
1
2


In [13]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [14]:
dct = Dictionary(disease_dictionary)

corpus = [dct.doc2bow(line) for line in disease_dictionary]

model = TfidfModel(corpus)

In [15]:
len(dct)

156

In [16]:
import numpy as np

In [17]:
document_matrix = np.zeros((len(disease_dictionary), len(dct)))

In [18]:
for i, disease_words in enumerate(corpus):
    disease_tfidf = model[disease_words]
    for (dct_id, score) in disease_tfidf:
        document_matrix[i][dct_id] = score

In [19]:
from sklearn.cluster import AffinityPropagation

In [41]:
kmeans = AffinityPropagation(damping=0.51, max_iter=500).fit(document_matrix)

In [42]:
kmeans.labels_

array([1, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 3, 3, 3,
       3, 0, 3, 2, 3, 1, 3, 3, 3, 2, 2, 0, 2, 2, 2, 0, 2, 3, 2, 3, 2, 2,
       3, 0, 0, 3, 2, 2, 2, 0, 2, 1, 3, 3, 2, 3, 2, 2, 3, 3, 0, 1, 3, 2,
       2, 2, 2])

In [43]:
for i, label in enumerate(kmeans.labels_ == 3):
    if label:
        print(disease_dictionary[i])

['bardet', 'biedl', 'syndrome', 'bardet', 'biedl', 'syndrome', 'obesity', 'pigmentary', 'retinopathy', 'hypogonadism', 'laurence', 'moon', 'syndrome', 'retinopathy', 'syndrome', 'end', 'stage', 'renal', 'disease', 'end', 'stage', 'renal', 'disease']
['laurence', 'moon', 'bardet', 'biedl', 'syndrome', 'syndrome', 'retinal', 'dystrophy', 'obesity', 'retinal', 'dystrophy', 'syndrome']
['bardet', 'biedl', 'syndrome', 'autonomic', 'dysfunction', 'hypotensive', 'episode', 'end', 'stage', 'renal', 'disease', 'end', 'stage', 'renal', 'disease', 'renal', 'failure']
['alzheimer', 'disease', 'disorders', 'alzheimer', 'disease', 'alzheimer', 'disease', 'alzheimer', 'disease']
['dlb']
['alzheimer', 'disease']
['frontotemporal', 'dementia', 'frontotemporal', 'dementia', 'alzheimer', 'disease', 'alzheimer', 'disease', 'frontotemporal', 'dementia', 'frontotemporal', 'dementia', 'alzheimer', 'disease', 'frontotemporal', 'dementia', 'frontotemporal', 'dementia', 'disease', 'frontotemporal', 'dementia', 

In [None]:
gmm.predict(document_matrix)

In [None]:
for i, label in enumerate(kmeans.labels_ == 1):
    if label:
        print(disease_dictionary[i])

In [None]:
kmeans.get_params()

In [None]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=5).fit(document_matrix)

In [None]:
from sklearn.cluster import Birch

In [None]:
brch = Birch(n_clusters=None).fit(document_matrix)
brch.predict(document_matrix)

In [None]:
from sklearn.cluster import MeanShift
from sklearn.mixture import BayesianGaussianMixture

In [None]:
mean_shift = MeanShift(bandwidth=20).fit(document_matrix)

In [None]:
mean_shift.predict(document_matrix)

In [None]:
bgmm = BayesianGaussianMixture(n_components=50).fit(document_matrix)

In [None]:
bgmm.predict(document_matrix)