In [None]:
import pickle
import spacy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import networkx as nx

In [None]:
# Directory (relative to this notebook) that holds the input data files
DATA_FOLDER = "../../data/"

In [None]:
# Read the pickled papers collection from the data folder.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only point this at files from a trusted source.
papers_path = f'{DATA_FOLDER}papers.pickle'
with open(papers_path, 'rb') as pickle_file:
    papers = pickle.load(pickle_file)

In [None]:
# Build the working table from the loaded object (keys become columns,
# which is the from_dict default made explicit here)
papers_df = pd.DataFrame.from_dict(papers, orient="columns")

In [None]:
# Show the freshly built table as this cell's output for a quick visual check
papers_df

In [None]:
# Give every row a sequential integer identifier: 0, 1, ..., len(papers_df) - 1
row_count = len(papers_df)
papers_df['paper_id'] = np.arange(row_count)

In [None]:
# Keep only the rows whose abstract is present (missing abstracts are discarded)
has_abstract = papers_df['abstract'].notna()
papers_df = papers_df[has_abstract]

In [None]:
# Pull the abstract texts and their matching identifiers out of the table as
# two parallel plain-Python lists (same row order in both)
abstracts, paper_ids = (
    papers_df[column].to_list() for column in ('abstract', 'paper_id')
)

In [None]:
# Load the spaCy pipeline that is used below to turn abstracts into Doc objects
SPACY_MODEL_NAME = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Compute similarity matrix (setup)
# Parse every abstract into a spaCy Doc. nlp.pipe streams the texts through the
# pipeline in batches, which is faster than calling nlp() once per abstract and
# yields the Docs in the same order as the input texts.
docs = list(nlp.pipe(abstracts))
n = len(docs)
# n x n matrix that receives the pairwise similarity scores
similarity_matrix = np.zeros((n, n))

In [None]:
# Fill the pairwise similarity matrix.
# spaCy's default Doc.similarity is a cosine similarity, so score(i, j) equals
# score(j, i): evaluate each unordered pair once (diagonal included) and mirror
# the value, which halves the number of similarity calls.
for i in range(n):
    for j in range(i, n):
        score = docs[i].similarity(docs[j])
        similarity_matrix[i, j] = score
        similarity_matrix[j, i] = score


In [None]:
# Visualize: Heatmap
# Draw the full similarity matrix, with paper IDs as tick labels on both axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    similarity_matrix,
    xticklabels=paper_ids,
    yticklabels=paper_ids,
    cmap="viridis",
    ax=heatmap_ax,
)
heatmap_ax.set_title("Abstract Similarity Heatmap")
heatmap_ax.set_xlabel("Paper ID")
heatmap_ax.set_ylabel("Paper ID")
plt.show()

In [None]:
# Visualize: Graph
threshold = 0.7  # Adjust threshold as needed
G = nx.Graph()

# Add nodes with paper IDs
for paper_id in paper_ids:
    G.add_node(paper_id)

# Add edges based on similarity threshold.
# Only the upper triangle (j > i) is scanned, so each pair of papers is
# considered once and no self-loops are created.
for i in range(n):
    for j in range(i + 1, n):
        if similarity_matrix[i, j] > threshold:
            G.add_edge(paper_ids[i], paper_ids[j], weight=similarity_matrix[i, j])

# Draw the graph.
# spring_layout starts from random positions; a fixed seed (same value as the
# t-SNE cell's random_state) makes the figure reproducible across runs.
pos = nx.spring_layout(G, seed=42)
# with_labels=True already labels each node with its own ID
nx.draw(G, pos, with_labels=True,
        node_color="lightblue", node_size=500, font_size=10)
plt.title("Abstract Similarity Graph")
plt.show()

In [None]:
# Visualize: Dimensionality Reduction with Annotations
# Each row of the similarity matrix is fed to t-SNE as that paper's feature vector.
n_papers = similarity_matrix.shape[0]
# scikit-learn requires perplexity < n_samples. Keep the library default (30.0)
# whenever there are enough papers and shrink it only for small corpora, where
# the default would otherwise raise a ValueError.
perplexity = min(30.0, n_papers - 1)
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
tsne_results = tsne.fit_transform(similarity_matrix)

plt.figure(figsize=(12, 8))
sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], s=100, color='pink')

# Annotate points with paper_id
for i, paper_id in enumerate(paper_ids):
    plt.text(tsne_results[i, 0] + 0.04, tsne_results[i, 1] + 0.04,  # Offset for readability
             str(paper_id), fontsize=9, ha='center', va='center')

plt.title("Abstract Similarity (t-SNE Projection)")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.show()

# Using Plotly for Interactive Graph Visualization

In [None]:
import plotly.graph_objects as go
import networkx as nx

# Create a graph
threshold = 0.7  # Adjust threshold as needed
G = nx.Graph()

# Add nodes with paper IDs; each node also stores a string form of its ID
for paper_id in paper_ids:
    G.add_node(paper_id, label=str(paper_id))

# Add edges based on similarity threshold (upper triangle only: each pair once)
for i in range(n):
    for j in range(i + 1, n):
        if similarity_matrix[i, j] > threshold:
            G.add_edge(paper_ids[i], paper_ids[j], weight=similarity_matrix[i, j])

# Get node positions. The layout is seeded so the figure is reproducible.
pos = nx.spring_layout(G, seed=42)

# Prepare data for Plotly.
# All edges share one line trace: the two endpoints of an edge are followed by
# a None entry, which separates that segment from the next one.
edge_x = []
edge_y = []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# Node coordinates and labels, in G.nodes() order
node_x = []
node_y = []
node_labels = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_labels.append(f"Paper ID: {node}")

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=node_labels,
    textposition="top center",
    hoverinfo='text',
    marker=dict(
        color='blue',
        size=10,
        line=dict(width=2)
    )
)

# Create Plotly Figure
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        # The title font is set through title.font: the older layout-level
        # `titlefont` attribute is deprecated and rejected by recent Plotly
        # releases.
        title=dict(text="Abstract Similarity Graph", font=dict(size=16)),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=0, l=0, r=0, t=40),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
    ),
)

fig.show()


# Using PyVis for Interactive Graph Visualization


In [None]:
from pyvis.network import Network

# Create a PyVis network configured for notebook rendering (dark canvas, white text)
net = Network(
    notebook=True,
    height="800px",
    width="100%",
    bgcolor="#222222",
    font_color="white",
)

# Copy every node of G into the PyVis network, labelled with its paper ID
for node_id, _node_attrs in G.nodes(data=True):
    net.add_node(
        node_id,
        label=str(node_id),
        title=f"Paper ID: {node_id}",
        color="lightblue",
    )

# Copy every edge of G, passing its stored weight as the edge value
for source, target, edge_attrs in G.edges(data=True):
    net.add_edge(source, target, value=edge_attrs['weight'])

# Customize layout: physics solver, physics control panel, then write and show the HTML
net.force_atlas_2based()
net.show_buttons(filter_=['physics'])
net.show("similarity_graph.html")