In [None]:
%pip install sentence-transformers

In [None]:
# Install seaborn into the kernel's environment; it is used for the t-SNE scatter plot.
%pip install seaborn

In [None]:
from tinydb import TinyDB, Query

# Open the TinyDB file and read every record from its 'articles' table.
db = TinyDB('db.json')
table = db.table('articles')

articles = table.all()

print(f'loaded {len(articles)} articles')

# Sentinel value that marks a record as having no abstract (filtered out below).
NO_ABSTRACT = 'No abstract available.'

# Keep only usable abstracts. `.get` is used so that a record with no 'abstract'
# field, or with a None/empty one, is skipped instead of raising KeyError here
# or handing a non-text value to the encoder in the next cell.
sentences = [
    abstract
    for abstract in (article.get('abstract') for article in articles)
    if abstract and abstract != NO_ABSTRACT
]

print(f'retaining {len(sentences)} articles')

In [None]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained sentence-embedding model to load.
MODEL_NAME = 'sentence-transformers/allenai-specter'

# Instantiate the encoder, then embed all retained abstracts in a single call.
model = SentenceTransformer(MODEL_NAME)
embeddings = model.encode(sentences)

# Show the resulting embedding matrix.
print(embeddings)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import seaborn as sns

# Reduce embedding dimensionality using t-SNE for visualization.
# t-SNE is stochastic, so the seed is fixed to make the figure reproducible
# across kernel restarts.
TSNE_SEED = 42

# Perplexity must be smaller than the number of samples; cap it so a small
# corpus does not raise. 30 is scikit-learn's default, so larger corpora are
# unaffected.
tsne_perplexity = min(30, len(embeddings) - 1)

viz = TSNE(
    n_components=2,
    max_iter=10000,
    perplexity=tsne_perplexity,
    random_state=TSNE_SEED,
).fit_transform(embeddings)

# Plot the 2-D projection, one point per abstract.
ax = sns.scatterplot(x=viz[:, 0], y=viz[:, 1], s=100)
ax.set(title='t-SNE projection of abstract embeddings', xlabel='t-SNE 1', ylabel='t-SNE 2');

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Fit a 2-component PCA on the embeddings, then project the same embeddings
# through it. `pca` ends up holding the projected coordinates (one row per
# embedding, two columns), not the estimator.
projector = PCA(n_components=2).fit(embeddings)
pca = projector.transform(embeddings)

# Scatter the first principal component against the second on a wide figure.
first_component, second_component = pca[:, 0], pca[:, 1]
fig = plt.figure(figsize=(8, 3))
plt.scatter(first_component, second_component)
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

X = embeddings

# Partition the embeddings into 3 clusters. The seed is fixed because k-means
# starts from random centroids and would otherwise give different clusters on
# each run.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)

# Cluster centroids, expressed in the original embedding space.
centers = kmeans.cluster_centers_

# Project points and centroids through the same fitted PCA so that both land
# in the same 2-D plane. Plotting two raw embedding columns would only show an
# arbitrary slice of the space.
projection = PCA(n_components=2, random_state=42).fit(X)
X_2d = projection.transform(X)
centers_2d = projection.transform(centers)

# Colour each point by its assigned cluster and mark the centroids with red crosses.
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=kmeans.labels_, cmap='viridis')
plt.scatter(centers_2d[:, 0], centers_2d[:, 1], marker='x', color='red')
plt.title('K-means clusters (PCA projection)')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt

# Build the Ward-linkage merge tree with SciPy and draw it as a dendrogram.
linkage_matrix = linkage(embeddings, method='ward')
dendrogram(linkage_matrix)
plt.show()

# Assign every embedding to one of 8 Ward-linkage clusters with scikit-learn.
# NOTE(review): `model` is also the name given to the SentenceTransformer in the
# embedding cell; once this cell has run it refers to the clustering estimator.
model = AgglomerativeClustering(n_clusters=8, linkage='ward')
labels = model.fit_predict(embeddings)  # fits the estimator and returns its labels_

print(labels)