In [6]:
import numpy as np
import torch
import json
from sentence_transformers import util
def search_papers(title, abstract, model_name):
    """Print the stored papers whose embeddings are most similar to ``title``'s.

    Reads ``../embeddings/high_dimension/{model_name}_high_dim.json``, a JSON
    object whose keys are paper titles and whose values are embedding vectors.
    The vector stored under ``title`` is used as the query and is ranked
    against every stored vector with ``util.semantic_search``.

    Args:
        title: Title of the query paper; must be a key of the embeddings file.
        abstract: Not used by this function; kept in the signature so existing
            callers keep working.
        model_name: Prefix of the embeddings file name, e.g. ``"allenai_specter"``.

    Returns:
        None. Results are written to stdout.

    Raises:
        KeyError: If ``title`` has no entry in the embeddings file.
    """
    embeddings_path = f'../embeddings/high_dimension/{model_name}_high_dim.json'
    # Context manager guarantees the file handle is released even if parsing fails.
    with open(embeddings_path) as f:
        data = json.load(f)

    if title not in data:
        raise KeyError(f"No embedding stored for {title!r} in {embeddings_path}")

    query_embedding = torch.as_tensor(data[title])

    # Keys and values are taken from the same dict, so position i in `titles`
    # corresponds to row i of `corpus_embedding`.
    titles = list(data.keys())
    corpus_embedding = torch.tensor(list(data.values()))

    # semantic_search returns one hit list per query; there is a single query here.
    search_hits = util.semantic_search(query_embedding, corpus_embedding)[0]

    print("\n\nPaper:", title)
    print("Most similar papers:")
    for hit in search_hits:
        # Raw hit dict ('corpus_id', 'score') is printed before the formatted line.
        print(hit)
        related_paper = titles[hit['corpus_id']]
        print("{:.2f}\t{}".format(hit['score'], related_paper))

In [7]:
# Example query: look up the neighbours of one paper in the allenai_specter embeddings.
# The second argument is the paper's abstract, passed positionally as `abstract`.
search_papers("Visual Analytics in Digital Pathology: Challenges and Opportunities", "The advances in high-throughput digitization, digital pathology systems, and quantitative image analysis opened new horizons in pathology. The diagnostic work of the pathologists and their role is likely to be augmented with computer-assistance and more quantitative information at hand. The recent success of artificial intelligence (AI) and computer vision methods demonstrated that in the coming years machines will support pathologists in typically tedious and highly subjective tasks and also in better patient stratification. In spite of clear future improvements in the diagnostic workflow, questions on how to effectively support the pathologists and how to integrate current data sources and quantitative information still persist. In this context, Visual Analytics (VA) - as the discipline that aids users to solve complex problems with an interactive and visual approach - can play a vital role to support the cognitive skills of pathologists and the large volumes of data available. To identify the main opportunities to employ VA in digital pathology systems, we conducted a survey with 20 pathologists to characterize the diagnostic practice and needs from a user perspective. From our findings, we discuss how VA can leverage quantitative image data to empower pathologists with new advanced digital pathology systems.", "allenai_specter")



Paper: Visual Analytics in Digital Pathology: Challenges and Opportunities
Most similar papers:
{'corpus_id': 127, 'score': 0.9999996423721313}
1.00	Visual Analytics in Digital Pathology: Challenges and Opportunities
{'corpus_id': 70, 'score': 0.9584764838218689}
0.96	Visual Analytics in Histopathology Diagnostics: a Protocol-Based Approach
{'corpus_id': 61, 'score': 0.9015693664550781}
0.90	PATHONE: From one Thousand Patients to one Cell
{'corpus_id': 74, 'score': 0.864912748336792}
0.86	Discovering Medical Knowledge Using Visual Analytics
{'corpus_id': 207, 'score': 0.8586125373840332}
0.86	Visual Analytics for the Exploration and Assessment of Segmentation Errors
{'corpus_id': 133, 'score': 0.8464146852493286}
0.85	RegistrationShop: An Interactive 3D Medical Volume Registration System
{'corpus_id': 184, 'score': 0.8454501628875732}
0.85	A Visual Analytics Approach for Patient Stratification and Biomarker Discovery
{'corpus_id': 84, 'score': 0.840019941329956}
0.84	Survey of Labeli

In [54]:
from sentence_transformers import SentenceTransformer
# Sentence encoder; NOTE(review): presumably the same model that produced the
# stored vectors loaded below - confirm before mixing the two.
embedder = SentenceTransformer('allenai-specter')

# Load the precomputed embeddings. The context manager closes the file handle
# as soon as the JSON has been parsed.
with open('../embeddings/high_dimension/allenai_specter_high_dim.json') as f:
    data = json.load(f)

# Keys and values come from the same dict, so the two lists are index-aligned.
doc_titles = list(data.keys())
embeddings = list(data.values())

In [None]:
"""
This is a simple application for sentence embeddings: clustering

Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Sentence encoder used to embed the demo corpus below
embedder = SentenceTransformer('allenai-specter')

# Small demo corpus of example sentences
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'A man is eating pasta.',
    'The girl is carrying a baby.',
    'The baby is carried by the woman',
    'A man is riding a horse.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah is running behind its prey.',
    'A cheetah chases prey on across a field.',
]
corpus_embeddings = embedder.encode(corpus)

# Rescale every row so that it has unit L2 norm
row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings = corpus_embeddings / row_norms

# Agglomerative clustering: no fixed cluster count, a distance threshold is given instead
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
cluster_assignment = clustering_model.fit(corpus_embeddings).labels_

# Group the sentences by the cluster label assigned to them
clustered_sentences = {}
for sentence, label in zip(corpus, cluster_assignment):
    clustered_sentences.setdefault(label, []).append(sentence)

# Labels start at 0; they are shown 1-based
for label, members in clustered_sentences.items():
    print("Cluster ", label + 1)
    print(members)
    print("")