# Finding Needles in a Paper Haystack

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/satyaborg/satyaborg.github.io/blob/main/assets/notebooks/paper-haystack.ipynb)

- Code for the [blog post](https://satyaborg.com/posts/research-clustering/) of the same name
- Cluster and visualize papers from NeurIPS 2021

In [None]:
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

from IPython.display import clear_output

## Data acquisition

In [None]:
# Which proceedings to scrape. `year` stays a string because it is spliced into the index URL
# and the exported file name; set it to another edition to retrieve that year's papers instead.
base_url = "https://proceedings.neurips.cc"
conf = "neurips"
year = "2021"

In [None]:
# Download the proceedings index page. The timeout stops a stalled connection from hanging the
# notebook, and raise_for_status() surfaces an HTTP error here rather than as an empty parse later.
html = requests.get(f"{base_url}/paper/{year}", timeout=30)
html.raise_for_status()

In [None]:
# Parse the downloaded index page into a searchable tree.
soup = BeautifulSoup(html.text, features='html.parser')

In [None]:
# Collect title, authors and link for every paper listed on the index page.
# This only walks the already-downloaded HTML, so it finishes quickly.
papers = []

start = time.time()

for paper in soup.find_all('li'):
    anchor = paper.find('a')
    authors = paper.find('i')
    # List items without a link or without an author line are not paper entries; skip them.
    if anchor is None or authors is None:
        continue

    title = anchor.text
    link = anchor.get('href')
    if not title or not link:
        continue

    papers.append(dict(
        title=title,
        authors=authors.text,
        # hrefs start with "/", so strip it to avoid a double slash after the host name
        link=f"{base_url}/{link.lstrip('/')}",
        conf=conf,
        year=year,
    ))

print(f"Completed in {time.time() - start} secs")

Completed in 0.10488343238830566 secs


In [None]:
# Tabulate the scraped entries (one row per paper), print the row/column count as a
# sanity check, and preview the first rows.
df_papers = pd.DataFrame(papers)
print(df_papers.shape)
df_papers.head()

(2334, 5)


Unnamed: 0,title,authors,link,conf,year
0,Beyond Value-Function Gaps: Improved Instance-...,"Christoph Dann, Teodor Vanislavov Marinov, Meh...",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021
1,Learning One Representation to Optimize All Re...,"Ahmed Touati, Yann Ollivier",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021
2,Matrix factorisation and the interpretation of...,"Nick Whiteley, Annie Gray, Patrick Rubin-Delanchy",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021
3,UniDoc: Unified Pretraining Framework for Docu...,"Jiuxiang Gu, Jason Kuen, Vlad I Morariu, Hando...",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021
4,Finding Discriminative Filters for Specific De...,"Liangbin Xie, Xintao Wang, Chao Dong, Zhongang...",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021


Get the abstracts

In [None]:
# Fetch every paper page and extract its abstract. This issues one HTTP request per paper,
# so it is the slow step of the notebook.
abstracts = []

start = time.time()

for paper in papers:
    try:
        link = paper['link']
        # The timeout keeps a single stalled request from blocking the whole loop.
        paper_details = requests.get(link, timeout=30)
        paper_details.raise_for_status()
        paper_soup = BeautifulSoup(paper_details.text, 'html.parser')

        headings = paper_soup.find_all('h4')
        headings_text = [h.text.strip().lower() for h in headings]
        i = headings_text.index('abstract') # match h4 with Abstract

        # The abstract is expected two siblings after its heading.
        # (Named `abstract_node` so the builtin `next` is not shadowed in the kernel.)
        abstract_node = headings[i].next_sibling.next_sibling

        abstract = abstract_node.text
        abstracts.append(abstract)

    except Exception as e:
        # Best effort: report the failure and keep `abstracts` aligned with `papers`.
        print(e, paper['title'], paper['link'])
        abstracts.append(None)

print(f"Completed in {time.time() - start} secs")

Completed in 281.3236002922058 secs


Clean the HTML tags

In [None]:
# Strip any leftover markup from the abstracts. Pages that failed to download were recorded
# as None by the scraping loop; those are passed through unchanged because the parser only
# accepts text and would raise on them.
df_papers['abstract'] = abstracts
df_papers['abstract'] = df_papers.abstract.apply(
    lambda x: BeautifulSoup(x, "lxml").text if isinstance(x, str) else x
)

Concatenate `title` + `abstract` to form the `body`

In [None]:
# Build the text that gets embedded: "<title> <abstract>". When the abstract is missing
# (not a string), fall back to the title alone so the literal word "None" never ends up
# in the document body.
df_papers['body'] = df_papers.apply(
    lambda row: f"{row['title']} {row['abstract']}" if isinstance(row['abstract'], str) else str(row['title']),
    axis=1,
)

In [None]:
# Display the table, which now also carries the `abstract` and `body` columns.
df_papers

Unnamed: 0,title,authors,link,conf,year,abstract,body
0,Beyond Value-Function Gaps: Improved Instance-...,"Christoph Dann, Teodor Vanislavov Marinov, Meh...",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021,We provide improved gap-dependent regret bound...,Beyond Value-Function Gaps: Improved Instance-...
1,Learning One Representation to Optimize All Re...,"Ahmed Touati, Yann Ollivier",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021,We introduce the forward-backward (FB) represe...,Learning One Representation to Optimize All Re...
2,Matrix factorisation and the interpretation of...,"Nick Whiteley, Annie Gray, Patrick Rubin-Delanchy",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021,"Given a graph or similarity matrix, we conside...",Matrix factorisation and the interpretation of...
3,UniDoc: Unified Pretraining Framework for Docu...,"Jiuxiang Gu, Jason Kuen, Vlad I Morariu, Hando...",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021,Document intelligence automates the extraction...,UniDoc: Unified Pretraining Framework for Docu...
4,Finding Discriminative Filters for Specific De...,"Liangbin Xie, Xintao Wang, Chao Dong, Zhongang...",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021,Recent blind super-resolution (SR) methods typ...,Finding Discriminative Filters for Specific De...
...,...,...,...,...,...,...,...
2329,Unlabeled Principal Component Analysis,"Yunzhen Yao, Liangzu Peng, Manolis Tsakiris",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021,We introduce robust principal component analys...,Unlabeled Principal Component Analysis We intr...
2330,Causal-BALD: Deep Bayesian Active Learning of ...,"Andrew Jesson, Panagiotis Tigas, Joost van Ame...",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021,Estimating personalized treatment effects from...,Causal-BALD: Deep Bayesian Active Learning of ...
2331,Scalable Rule-Based Representation Learning fo...,"Zhuo Wang, Wei Zhang, Ning Liu, Jianyong Wang",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021,"Rule-based models, e.g., decision trees, are w...",Scalable Rule-Based Representation Learning fo...
2332,Bridging Non Co-occurrence with Unlabeled In-t...,"NA DONG, Yongqiang Zhang, Mingli Ding, Gim Hee...",https://proceedings.neurips.cc//paper/2021/has...,neurips,2021,Deep networks have shown remarkable results in...,Bridging Non Co-occurrence with Unlabeled In-t...


In [None]:
# optional
# df_papers.to_csv('neurips21.csv', index=False)

## Clustering

In [None]:
!pip install bertopic
!pip install --upgrade joblib==1.1.0
clear_output()

In [None]:
# One plain string per paper, in DataFrame row order; str() guards against non-string cells.
docs = [str(body) for body in df_papers.body.tolist()]

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from umap import UMAP

In [None]:
# Load the sentence-embedding model by name; the recorded output shows its files being
# downloaded when this cell ran.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Embed every document body once; the result is reused for topic modelling and for the 2D projection.
embeddings = sentence_model.encode(docs, show_progress_bar=True)

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

In [None]:
# Fit BERTopic on the precomputed embeddings. Topic terms are drawn from unigrams and
# bigrams with English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))
topic_model = BERTopic(vectorizer_model=vectorizer_model)
topic_model = topic_model.fit(docs, embeddings)

In [None]:
# Show each topic id with its terms and their weights.
topic_model.get_topics()

{-1: [('data', 0.009940528789107237),
  ('learning', 0.009240378930968872),
  ('model', 0.008843291252728142),
  ('models', 0.00859652959006869),
  ('neural', 0.007054295430212847),
  ('training', 0.006598818004227983),
  ('methods', 0.006444560054011029),
  ('networks', 0.006246437679664433),
  ('method', 0.0060055804780751185),
  ('propose', 0.005686909835149047)],
 0: [('learning', 0.015551546622991217),
  ('reinforcement', 0.014286977495735929),
  ('reinforcement learning', 0.014256522062384607),
  ('policy', 0.014161268634678273),
  ('rl', 0.013332452602941592),
  ('agents', 0.010305083410798648),
  ('algorithm', 0.01014509558938816),
  ('regret', 0.009715104669111393),
  ('algorithms', 0.009061469336128544),
  ('problem', 0.008331993892280962)],
 1: [('graph', 0.05848209255457942),
  ('gnns', 0.02634758614964423),
  ('graphs', 0.022775285586574035),
  ('graph neural', 0.02020216910739186),
  ('node', 0.01876000048381857),
  ('gnn', 0.017337038903555244),
  ('networks', 0.01523007

In [None]:
# Project the document embeddings to 2D for plotting. UMAP is stochastic, so the seed is
# fixed to make the layout (and the exported figure) reproducible across runs.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

In [None]:
#@title Function override
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from umap import UMAP
from typing import List


def _select_rows(df, mask, annotation=None):
    """Copy the rows of `df` picked by `mask` and prepare them for one scatter trace.

    The copy gets a fresh 0..n-1 index and an empty `text` column. When `annotation`
    is given, one extra row carrying it as `text` is appended at the mean x / mean y
    of the selected rows.
    """
    # With a fresh index the label `len(selection)` is guaranteed to be unused, so the
    # assignment below always appends. Without the reset, a slice keeps the parent's
    # labels and the assignment can overwrite a real document row.
    selection = df.loc[mask, :].reset_index(drop=True)
    selection["text"] = ""
    if annotation is not None:
        # Values follow the column order: topic, doc, doc_label, x, y, text
        selection.loc[len(selection), :] = [None, None, None, selection.x.mean(), selection.y.mean(), annotation]
    return selection


def visualize_documents(topic_model,
                        docs: List[str],
                        topics: List[int] = None,
                        embeddings: np.ndarray = None,
                        reduced_embeddings: np.ndarray = None,
                        sample: float = None,
                        hide_annotations: bool = False,
                        hide_document_hover: bool = False,
                        hover_text_labels: List[str] = None,
                        custom_labels: bool = False,
                        width: int = 1200,
                        height: int = 750):
    """ Visualize documents and their topics in 2D

    Variant of BERTopic's document plot in which the hover text of each point is
    taken from `hover_text_labels` instead of the full document.

    Arguments:
        topic_model: A fitted BERTopic instance.
        docs: The documents you used when calling either `fit` or `fit_transform`
        topics: A selection of topics to visualize.
                Not to be confused with the topics that you get from `.fit_transform`.
                For example, if you want to visualize only topics 1 through 5:
                `topics = [1, 2, 3, 4, 5]`. Defaults to all topics.
        embeddings: The embeddings of all documents in `docs`.
        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.
                            When given, no dimensionality reduction is run.
        sample: The fraction of documents in each topic that you would like to keep.
                `None` or a value above 1 keeps everything. Topics with fewer than
                100 documents are always kept whole.
        hide_annotations: Hide the names of the traces on top of each cluster.
        hide_document_hover: Hide the hover text of the points.
        hover_text_labels: One hover label per document, aligned with `docs`
                           (for example the paper titles). Defaults to `docs`.
        custom_labels: Whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        The plotly figure.

    Examples:
    ```python
    fig = visualize_documents(topic_model, docs,
                              reduced_embeddings=reduced_embeddings,
                              hover_text_labels=titles)
    fig.write_html("path/to/file.html")
    ```
    """
    topic_per_doc = topic_model.topics_

    # Without dedicated labels, hover over the documents themselves
    if hover_text_labels is None:
        hover_text_labels = docs

    # Sample the data to optimize for visualization and dimensionality reduction
    if sample is None or sample > 1:
        sample = 1

    topic_array = np.array(topic_per_doc)
    indices = []
    for topic in set(topic_per_doc):
        s = np.where(topic_array == topic)[0]
        size = len(s) if len(s) < 100 else int(len(s) * sample)
        indices.extend(np.random.choice(s, size=size, replace=False))
    indices = np.array(indices)

    df = pd.DataFrame({"topic": topic_array[indices]})
    df["doc"] = [docs[index] for index in indices]
    df["doc_label"] = [hover_text_labels[index] for index in indices]

    # Use the supplied 2D coordinates when available; otherwise reduce the (given or
    # freshly extracted) embeddings of the sampled documents with UMAP.
    if reduced_embeddings is not None:
        embeddings_2d = reduced_embeddings[indices]
    else:
        if embeddings is not None:
            embeddings_to_reduce = embeddings[indices]
        else:
            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_

    unique_topics = set(topic_per_doc)
    if topics is None:
        topics = unique_topics

    # Combine data
    df["x"] = embeddings_2d[:, 0]
    df["y"] = embeddings_2d[:, 1]

    # Prepare text and names
    if topic_model.custom_labels_ is not None and custom_labels:
        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]
    else:
        names = [f"{topic}_" + "_".join([word for word, value in topic_model.get_topic(topic)][:3]) for topic in unique_topics]

    # Visualize
    fig = go.Figure()

    # Outliers and non-selected topics
    non_selected_topics = set(unique_topics).difference(topics)
    if len(non_selected_topics) == 0:
        non_selected_topics = [-1]

    selection = _select_rows(df, df.topic.isin(non_selected_topics), "Other documents")

    fig.add_trace(
        go.Scattergl(
            x=selection.x,
            y=selection.y,
            hovertext=selection.doc_label if not hide_document_hover else None,
            hoverinfo="text",
            mode='markers+text',
            name="other",
            showlegend=False,
            marker=dict(color='#CFD8DC', size=5, opacity=0.5)
        )
    )

    # Selected topics
    for name, topic in zip(names, unique_topics):
        if topic in topics and topic != -1:
            selection = _select_rows(df, df.topic == topic, None if hide_annotations else name)

            fig.add_trace(
                go.Scattergl(
                    x=selection.x,
                    y=selection.y,
                    hovertext=selection.doc_label if not hide_document_hover else None,
                    hoverinfo="text",
                    text=selection.text,
                    mode='markers+text',
                    name=name,
                    textfont=dict(
                        size=12,
                    ),
                    marker=dict(size=5, opacity=0.5)
                )
            )

    # Add grid in a 'plus' shape
    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))
    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))
    fig.add_shape(type="line",
                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
                  line=dict(color="#CFD8DC", width=2))
    fig.add_shape(type="line",
                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
                  line=dict(color="#9E9E9E", width=2))
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)

    # Stylize layout
    fig.update_layout(
        template="simple_white",
        title={
            'text': "<b>Documents and Topics</b>",
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height
    )

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig

In [None]:
# Paper titles serve as the hover text; they must line up one-to-one with `docs`.
titles = df_papers['title'].tolist()
assert len(docs) == len(titles)

fig = visualize_documents(
    topic_model,
    docs=docs,
    reduced_embeddings=reduced_embeddings,
    hover_text_labels=titles,
    width=1500,
    height=900,
)

In [None]:
# Render the figure as this cell's output.
fig

In [None]:
# Export the figure to an HTML file named after the configured year.
fig.write_html(f"neurips-{year}.html") # export to HTML

EOF