# Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import requests

In [74]:
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import cosine

# Scraping

In [3]:
def get_paper_hyperlinks(query):
    url = 'https://pubmed.ncbi.nlm.nih.gov/?term=' + '+'.join(query.split()) + '&filter=simsearch1.fha&size=100'
    search_results = requests.get(url)
    page_source = BeautifulSoup(search_results.content, 'html.parser')
    papers = page_source.findAll('a', {'class': "docsum-title"})
    hyperlinks = ['https://pubmed.ncbi.nlm.nih.gov' + paper['href'] for paper in papers]
    return hyperlinks

In [4]:
def get_abstract(url):
    paper_page = requests.get(url)
    page_source = BeautifulSoup(paper_page.content, 'html.parser')
    title = page_source.find('h1', {'class': 'heading-title'}).text.strip()
    abstract = page_source.find('div', {'id': 'enc-abstract'}).text.strip()
    return [title, url, abstract]

In [5]:
def scrape_pages(query, n):
    links = get_paper_hyperlinks(query)[:n]
    abstracts = [get_abstract(link) for link in tqdm(links)]
    scraped_data = pd.DataFrame(abstracts, columns=['Title', 'URL', 'Abstract'])
    return scraped_data

In [6]:
scrape = False
query = 'neurodegenerative diseases'
n = 100

In [7]:
if scrape:
    data = scrape_pages(query, n)
    data.to_excel('abstracts.xlsx', index=False)
else:
    data = pd.read_excel('abstracts.xlsx')

In [8]:
print(data['Abstract'][0] + '\n\n' + data['Abstract'][1])

Neurodegenerative disorders are characterized by progressive loss of selectively vulnerable populations of neurons, which contrasts with select static neuronal loss because of metabolic or toxic disorders. Neurodegenerative diseases can be classified according to primary clinical features (e.g., dementia, parkinsonism, or motor neuron disease), anatomic distribution of neurodegeneration (e.g., frontotemporal degenerations, extrapyramidal disorders, or spinocerebellar degenerations), or principal molecular abnormality. The most common neurodegenerative disorders are amyloidoses, tauopathies, α-synucleinopathies, and TDP-43 proteinopathies. The protein abnormalities in these disorders have abnormal conformational properties. Growing experimental evidence suggests that abnormal protein conformers may spread from cell to cell along anatomically connected pathways, which may in part explain the specific anatomical patterns observed at autopsy. In this review, we detail the human pathology o

# Corpus Cleaning

In [9]:
def process_text(text):
    symbols = [',', '.', ':', '?', '/', ';', '[', ']', '(', ')', '&']
    symbols_rep = [' ', ' ', ' ', ' ', ' ', ' ' , '', '', '', '', '']
    for symbol, symbol_rep in zip(symbols, symbols_rep):
        text = text.replace(symbol, symbol_rep)
    text = ' '.join(text.split())
    text = text.lower()
    return text

In [10]:
def get_freq(processed_data):
    freq = {}
    for text in processed_data:
        for word in text.split():
            if (word in freq):
                freq[word] += 1
            else:
                freq[word] = 1
    freq_df = pd.DataFrame.from_dict(freq, orient='index', columns=['Count'])
    freq_df.sort_values(by='Count', ascending=False, inplace=True)
    return [freq, freq_df]

In [11]:
processed_data = [process_text(text) for text in data['Abstract']]
freq, freq_df = get_freq(processed_data)
removed_words = list(freq_df[:10].index)

In [12]:
freq_df = freq_df[10:]
for removed_word in removed_words:
    del freq[removed_word]

In [13]:
print(removed_words)

['the', 'of', 'and', 'in', 'diseases', 'neurodegenerative', 'to', 'a', 'disease', 'are']


# Abstract Vectorization

In [14]:
def compute_vector(text, freq, freq_df):
    freq_abstract = dict.fromkeys(freq, 0)
    for word in text.split():
        if (word in freq):
            freq_abstract[word] += 1
    vector = []
    for word in freq_df.index:
        vector.append(freq_abstract[word])
    return vector

In [15]:
def get_vectors(processed_data, freq, freq_df):
    vectors = []
    for text in processed_data:
        vectors.append(compute_vector(text, freq, freq_df))
    return np.array(vectors)

In [16]:
vectors = get_vectors(processed_data, freq, freq_df)

In [29]:
print(vectors[0], vectors[1])

[0 0 1 ... 0 0 0] [1 4 2 ... 0 0 0]


# Clustering (DBSCAN with Cosine Similarity)

In [161]:
model = DBSCAN(eps=0.7, min_samples=2, metric=cosine)
model.fit(vectors)

DBSCAN(algorithm='auto', eps=0.7, leaf_size=30,
       metric=<function cosine at 0x000000DC7FABAE58>, metric_params=None,
       min_samples=2, n_jobs=None, p=None)

In [162]:
model.labels_

array([ 0,  0,  1,  0, -1,  0, -1, -1,  0,  0, -1, -1,  2,  0,  0, -1,  3,
        0,  0, -1, -1,  0,  0, -1,  0,  1,  0, -1, -1, -1,  2,  0, -1,  0,
        0,  1, -1,  0,  0, -1,  0,  0,  0, -1,  0,  0, -1, -1, -1,  0, -1,
        4, -1, -1, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0, -1,  0, -1,  0, -1, -1,  0, -1,  0,  3,  0, -1, -1,
        0,  0,  0,  0,  4,  0,  0,  0,  0, -1,  0,  0, -1,  3,  0],
      dtype=int64)

In [164]:
np.unique(model.labels_, return_counts=True)

(array([-1,  0,  1,  2,  3,  4], dtype=int64),
 array([33, 57,  3,  2,  3,  2], dtype=int64))

eps = 0.7, n_samples = 2

In [89]:
x = 1 - cosine_similarity(vectors)

In [94]:
x.flatten().mean()

0.8267062785555128