# Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import requests

In [3]:
from sklearn.cluster import AgglomerativeClustering

# Scraping

In [4]:
def get_paper_hyperlinks(query):
    """Return the article URLs listed on the first PubMed results page for `query`.

    Parameters
    ----------
    query : str
        Free-text search terms; runs of whitespace are collapsed and the
        words are joined with '+' in the request URL.

    Returns
    -------
    list of str
        Absolute PubMed URLs, one per `<a class="docsum-title">` element
        found on the results page, in page order.

    Raises
    ------
    requests.HTTPError
        If PubMed answers with an error status, instead of silently parsing
        an error page into an empty result list.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    # quote_plus encodes spaces as '+' (same as the previous manual join) and
    # additionally escapes characters such as '&' or '#' that would otherwise
    # corrupt the query string.
    encoded_query = quote_plus(' '.join(query.split()))
    # NOTE(review): 'filter=simsearch1.fha' presumably restricts results to
    # entries with abstracts and 'size=100' the page size - confirm against PubMed.
    url = 'https://pubmed.ncbi.nlm.nih.gov/?term=' + encoded_query + '&filter=simsearch1.fha&size=100'
    # A timeout prevents the notebook from hanging forever on a stalled connection.
    search_results = requests.get(url, timeout=30)
    search_results.raise_for_status()
    page_source = BeautifulSoup(search_results.content, 'html.parser')
    # find_all is the supported spelling; findAll is a deprecated alias.
    papers = page_source.find_all('a', {'class': "docsum-title"})
    hyperlinks = ['https://pubmed.ncbi.nlm.nih.gov' + paper['href'] for paper in papers]
    return hyperlinks

In [5]:
def get_abstract(url):
    """Download one article page and extract its title and abstract.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.

    Returns
    -------
    list
        `[title, url, abstract]`. `title` is the stripped text of the
        `<h1 class="heading-title">` element and `abstract` the stripped text
        of the `<div id="enc-abstract">` element; either is an empty string
        when the element is not present on the page.
    """
    # A timeout prevents one stalled request from blocking the whole scrape.
    paper_page = requests.get(url, timeout=30)
    page_source = BeautifulSoup(paper_page.content, 'html.parser')
    title_tag = page_source.find('h1', {'class': 'heading-title'})
    abstract_tag = page_source.find('div', {'id': 'enc-abstract'})
    # find() returns None when the element is missing; fall back to '' so a
    # single page without an abstract does not abort the surrounding loop.
    title = title_tag.text.strip() if title_tag is not None else ''
    abstract = abstract_tag.text.strip() if abstract_tag is not None else ''
    return [title, url, abstract]

In [6]:
def scrape_pages(query, n):
    """Scrape up to `n` search results for `query` into a DataFrame.

    The hyperlinks returned by `get_paper_hyperlinks` are truncated to the
    first `n`, each one is fetched with `get_abstract` (with a tqdm progress
    bar), and the resulting rows are returned as a DataFrame with the
    columns 'Title', 'URL' and 'Abstract'.
    """
    records = []
    for link in tqdm(get_paper_hyperlinks(query)[:n]):
        records.append(get_abstract(link))
    return pd.DataFrame(records, columns=['Title', 'URL', 'Abstract'])

In [7]:
# Run configuration for the scraping step.
scrape = False  # True: scrape PubMed and write abstracts.xlsx; False: load the saved abstracts.xlsx
query = 'neurodegenerative diseases'  # search terms handed to scrape_pages
n = 100  # upper bound on the number of search results that are scraped

In [8]:
# Either reuse the spreadsheet written by an earlier run or scrape afresh
# and save the result for next time.
cache_file = 'abstracts.xlsx'
if not scrape:
    data = pd.read_excel(cache_file)
else:
    data = scrape_pages(query, n)
    data.to_excel(cache_file, index=False)

In [9]:
# Preview the first two abstracts, separated by a blank line.
first_abstract, second_abstract = data['Abstract'][0], data['Abstract'][1]
print('\n\n'.join([first_abstract, second_abstract]))

Neurodegenerative disorders are characterized by progressive loss of selectively vulnerable populations of neurons, which contrasts with select static neuronal loss because of metabolic or toxic disorders. Neurodegenerative diseases can be classified according to primary clinical features (e.g., dementia, parkinsonism, or motor neuron disease), anatomic distribution of neurodegeneration (e.g., frontotemporal degenerations, extrapyramidal disorders, or spinocerebellar degenerations), or principal molecular abnormality. The most common neurodegenerative disorders are amyloidoses, tauopathies, α-synucleinopathies, and TDP-43 proteinopathies. The protein abnormalities in these disorders have abnormal conformational properties. Growing experimental evidence suggests that abnormal protein conformers may spread from cell to cell along anatomically connected pathways, which may in part explain the specific anatomical patterns observed at autopsy. In this review, we detail the human pathology o

# Corpus Cleaning

In [10]:
def process_text(text):
    """Normalize a piece of text for word counting.

    The characters , . : ? / ; are turned into spaces, the characters
    [ ] ( ) & are deleted, runs of whitespace are collapsed to single
    spaces (leading/trailing whitespace removed) and the result is
    lower-cased.
    """
    # One translation table replaces the chain of per-symbol substitutions.
    to_space = ',.:?/;'
    to_delete = '[]()&'
    table = str.maketrans(to_space, ' ' * len(to_space), to_delete)
    collapsed = ' '.join(text.translate(table).split())
    return collapsed.lower()

In [11]:
def get_freq(processed_data):
    """Count how often each whitespace-separated word occurs across all texts.

    Parameters
    ----------
    processed_data : iterable of str
        Texts to be split on whitespace and counted.

    Returns
    -------
    list
        `[freq, freq_df]` where `freq` is a dict mapping word -> total count
        and `freq_df` is a DataFrame indexed by word with a single 'Count'
        column, sorted by descending count.
    """
    counts = {}
    for document in processed_data:
        for token in document.split():
            counts[token] = counts.get(token, 0) + 1
    table = pd.DataFrame.from_dict(counts, orient='index', columns=['Count'])
    table = table.sort_values(by='Count', ascending=False)
    return [counts, table]

In [12]:
# Clean every abstract, count word occurrences over the whole corpus, and
# remember the ten most frequent words so they can be excluded.
processed_data = list(map(process_text, data['Abstract']))
freq, freq_df = get_freq(processed_data)
removed_words = freq_df.index[:10].tolist()

In [13]:
# Remove the most frequent words (listed in `removed_words`) from both the
# frequency table and the frequency dict.
# Dropping by label instead of slicing by position keeps this cell idempotent:
# running it a second time no longer discards ten further rows from `freq_df`
# or raises KeyError on words that were already deleted from `freq`.
freq_df = freq_df.drop(index=removed_words, errors='ignore')
for removed_word in removed_words:
    freq.pop(removed_word, None)

In [14]:
# Show the high-frequency words taken from the top of the frequency table.
print(removed_words)

['the', 'of', 'and', 'in', 'diseases', 'neurodegenerative', 'to', 'a', 'disease', 'are']


# Abstract Vectorization

In [15]:
def compute_vector(text, freq, freq_df):
    """Build the word-count vector of a single text.

    Parameters
    ----------
    text : str
        Text whose whitespace-separated words are counted.
    freq : dict
        Vocabulary; only words that are keys of this dict are counted.
    freq_df : DataFrame
        Its index fixes the order of the entries in the returned vector.

    Returns
    -------
    list of int
        One count per word in `freq_df.index`, in that order.
    """
    counts = {term: 0 for term in freq}
    for token in text.split():
        if token in counts:
            counts[token] += 1
    return [counts[term] for term in freq_df.index]

In [16]:
def get_vectors(processed_data, freq, freq_df):
    """Stack the word-count vectors of all texts into one NumPy array.

    Each text in `processed_data` is converted with `compute_vector`; the
    result has one row per text, in input order.
    """
    return np.array([compute_vector(document, freq, freq_df) for document in processed_data])

In [17]:
# Count matrix: one row per abstract, one column per word in freq_df (same order as freq_df.index).
vectors = get_vectors(processed_data, freq, freq_df)

In [18]:
# Peek at the count vectors of the first two abstracts.
print(vectors[0], vectors[1])

[0 0 1 ... 0 0 0] [1 4 2 ... 0 0 0]


# Agglomerative Clustering with Cosine Distance

In [19]:
# Complete-linkage agglomerative clustering of the abstract vectors under cosine distance.
# scikit-learn 1.2 renamed the `affinity` argument to `metric` and 1.4 removed
# the old name, so try the current spelling first and fall back to the legacy
# one on older installations (which reject `metric` with a TypeError).
try:
    model = AgglomerativeClustering(n_clusters=6, linkage='complete', metric='cosine')
except TypeError:
    model = AgglomerativeClustering(n_clusters=6, linkage='complete', affinity='cosine')
model.fit(vectors)

AgglomerativeClustering(affinity='cosine', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='complete', memory=None, n_clusters=6)

In [20]:
# Cluster id assigned to each row of `vectors` by the fitted model.
labels = pd.Series(model.labels_)
# Distinct cluster ids together with the number of abstracts in each.
np.unique(labels, return_counts=True)

(array([0, 1, 2, 3, 4, 5], dtype=int64),
 array([32, 24, 13, 11,  5, 15], dtype=int64))

# Most Unique Words in Clusters

Many words may be characteristic of a cluster, but here we keep only the words that occur exactly once in the whole corpus: each such word appears in a single abstract and therefore belongs to exactly one cluster.

In [21]:
# Move the words into a regular 'index' column; the fresh integer index then
# equals each word's column position in `vectors`. Keep only the words whose
# corpus-wide count is exactly one.
word_table = freq_df.reset_index()
unique_words = word_table[word_table['Count'].eq(1)]

In [22]:
# Sum the word-count vectors of all abstracts in each cluster, giving one
# aggregate row per cluster. The number of clusters is read from the model
# rather than hard-coded so this cell stays in sync with the clustering setup.
vectors_cluster = np.array([
    vectors[labels.values == label_index].sum(axis=0)
    for label_index in range(model.n_clusters)
])

In [23]:
# For every corpus-unique word, pick out its per-cluster counts: one row per
# word, one column per cluster. `unique_words.index` holds the words' column
# positions in `vectors_cluster`.
unique_one_hot = np.transpose(vectors_cluster[:, unique_words.index])
# A word that occurs once in the corpus has a single 1 in its row, so the
# position of the row maximum is the cluster that contains it.
unique_cluster_labels = unique_one_hot.argmax(axis=1)
# assign() returns a new frame; writing a column into the filtered slice
# directly would trigger pandas' SettingWithCopyWarning.
unique_words = unique_words.assign(Cluster=unique_cluster_labels)

In [24]:
# Number of corpus-unique words assigned to each cluster.
unique_words['Cluster'].value_counts()

0    483
1    324
2    248
5    209
3    107
4     60
Name: Cluster, dtype: int64

In [25]:
# Report, for every cluster, the last three corpus-unique words assigned to it.
# The cluster count comes from the model instead of a hard-coded 6, and the
# words are selected with a single .loc lookup rather than chained indexing.
print('Unique Words')
for label_index in range(model.n_clusters):
    cluster_words = unique_words.loc[unique_words['Cluster'] == label_index, 'index'].tolist()
    print('Cluster ' + str(label_index) + ': ' + ', '.join(cluster_words[-3:]))

Unique Words
Cluster 0: advantageous, administration, designing
Cluster 1: redundant, accidental, reputed
Cluster 2: converge, autoinflammatory, contrary
Cluster 3: begins, opinion, displays
Cluster 4: whereas, examination, accompany
Cluster 5: world, amyloid-ß, class
