In [1]:
import os

import torch
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans

from preprocess import get_csv

In [2]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used later in the notebook to encode the abstracts.
# The checkpoint files are fetched on first use (see the download log below).
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

Downloading .gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [3]:
# Location of the arXiv metadata CSV that the rest of the notebook reads.
file_path = 'arxiv-metadata-oai-snapshot.csv'

# Only invoke get_csv() when the CSV is not already on disk.
# NOTE(review): get_csv() is assumed to write to `file_path` -- confirm in preprocess.py.
if os.path.exists(file_path):
    print("The file already exists.")
else:
    print("Creating CSV from JSON file.")
    get_csv()

# Load the metadata into a DataFrame.
df = pd.read_csv(file_path)

# Print the leading rows as a quick check that the load succeeded.
print(df.head())

The file already exists.
                                               title       categories  \
0  Calculation of prompt diphoton production cros...           hep-ph   
1           Sparsity-certifying Graph Decompositions    math.CO cs.CG   
2  The evolution of the Earth-Moon system based o...   physics.gen-ph   
3  A determinant of Stirling cycle numbers counts...          math.CO   
4  From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...  math.CA math.FA   

                                            abstract    versions  
0  A fully differential calculation in perturbati...  2007-07-24  
1  We describe a new algorithm, the $(k,\ell)$-pe...  2008-12-13  
2  The evolution of Earth-Moon system is describe...  2008-01-13  
3  We show that a determinant of Stirling cycle n...  2007-03-31  
4  In this paper we show how to compute the $\Lam...  2007-04-02  


In [26]:
# Text to embed (abstracts) and the human-readable label for each row (titles).
corpus = df["abstract"]
title = df["title"]

# On-disk cache for the abstract embeddings; encoding the corpus is expensive,
# so it is computed once and reused on later runs.
embeddings_path = 'embeddings.npy'

if not os.path.exists(embeddings_path):
    # Cache miss: encode every abstract and persist the resulting array.
    print("Computing embeddings for the corpus.")
    # Pass a plain list so encode() indexes positionally, independent of the
    # DataFrame's index labels.
    corpus_embeddings = model.encode(corpus.tolist(), show_progress_bar=True)
    np.save(embeddings_path, corpus_embeddings)
else:
    print("The file already exists.")
    # The cache is a plain numeric array, so pickle support is not needed;
    # leaving it disabled avoids executing code from a tampered cache file.
    corpus_embeddings = np.load(embeddings_path)

# A stale cache (built from a different CSV) would otherwise only surface later
# as a confusing length-mismatch error when cluster labels are attached to df.
if len(corpus_embeddings) != len(corpus):
    raise ValueError(
        "{} holds {} embeddings but the DataFrame has {} rows; "
        "delete the cache file to recompute.".format(
            embeddings_path, len(corpus_embeddings), len(corpus)
        )
    )

print(corpus_embeddings.shape)

The file already exists.
(2367176, 384)


In [11]:
num_clusters = 20


def cluster_titles(embeddings, titles, n_clusters, random_state=42):
    """Cluster embeddings with KMeans and group the matching titles.

    Parameters
    ----------
    embeddings : array-like
        One embedding per document, passed directly to ``KMeans.fit``.
    titles : iterable of str
        Titles in the same order as the rows of ``embeddings``.
    n_clusters : int
        Number of clusters to fit.
    random_state : int, optional
        Seed for KMeans initialisation so cluster ids are reproducible
        across runs.

    Returns
    -------
    tuple
        ``(fitted_model, labels, grouped_titles)`` where ``labels`` is the
        fitted model's ``labels_`` and ``grouped_titles[k]`` lists the titles
        assigned to cluster ``k``.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(embeddings)
    labels = kmeans.labels_

    grouped_titles = [[] for _ in range(n_clusters)]
    # Iterate positionally so the pairing does not depend on index labels.
    for title_text, cluster_id in zip(titles, labels):
        grouped_titles[cluster_id].append(title_text)
    return kmeans, labels, grouped_titles


clustering_model, cluster_assignment, clustered_sentences = cluster_titles(
    corpus_embeddings, title, num_clusters
)

In [23]:
# Attach each paper's cluster id as a column named after the cluster count.
cluster_column = "clustering-{}".format(num_clusters)
df[cluster_column] = cluster_assignment
df

Unnamed: 0,title,categories,abstract,versions,clustering-20
0,Calculation of prompt diphoton production cros...,hep-ph,A fully differential calculation in perturbati...,2007-07-24,6
1,Sparsity-certifying Graph Decompositions,math.CO cs.CG,"We describe a new algorithm, the $(k,\ell)$-pe...",2008-12-13,17
2,The evolution of the Earth-Moon system based o...,physics.gen-ph,The evolution of Earth-Moon system is describe...,2008-01-13,8
3,A determinant of Stirling cycle numbers counts...,math.CO,We show that a determinant of Stirling cycle n...,2007-03-31,17
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,math.CA math.FA,In this paper we show how to compute the $\Lam...,2007-04-02,18
...,...,...,...,...,...
2367171,On the origin of the irreversibility line in t...,supr-con cond-mat.supr-con,We report on measurements of the angular depen...,1996-08-26,12
2367172,Nonlinear Response of HTSC Thin Film Microwave...,supr-con cond-mat.supr-con,The non-linear microwave surface impedance of ...,1996-08-31,12
2367173,Critical State Flux Penetration and Linear Mic...,supr-con cond-mat.supr-con,The vortex contribution to the dc field (H) de...,1996-09-03,12
2367174,Density of States and NMR Relaxation Rate in A...,supr-con cond-mat.supr-con,We show that the density of states in an aniso...,1996-09-18,12


In [24]:
# Persist the labelled DataFrame; the cluster count is encoded in the file name.
clustered_csv_path = "arxiv-metadata-oai-snapshot-clustering-{}.csv".format(num_clusters)
df.to_csv(clustered_csv_path, index=False)

In [25]:
# Reload the labelled CSV from disk (replaces the in-memory df) and display it.
clustered_csv_path = "arxiv-metadata-oai-snapshot-clustering-{}.csv".format(num_clusters)
df = pd.read_csv(clustered_csv_path)
df

Unnamed: 0,title,categories,abstract,versions,clustering-20
0,Calculation of prompt diphoton production cros...,hep-ph,A fully differential calculation in perturbati...,2007-07-24,6
1,Sparsity-certifying Graph Decompositions,math.CO cs.CG,"We describe a new algorithm, the $(k,\ell)$-pe...",2008-12-13,17
2,The evolution of the Earth-Moon system based o...,physics.gen-ph,The evolution of Earth-Moon system is describe...,2008-01-13,8
3,A determinant of Stirling cycle numbers counts...,math.CO,We show that a determinant of Stirling cycle n...,2007-03-31,17
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,math.CA math.FA,In this paper we show how to compute the $\Lam...,2007-04-02,18
...,...,...,...,...,...
2367171,On the origin of the irreversibility line in t...,supr-con cond-mat.supr-con,We report on measurements of the angular depen...,1996-08-26,12
2367172,Nonlinear Response of HTSC Thin Film Microwave...,supr-con cond-mat.supr-con,The non-linear microwave surface impedance of ...,1996-08-31,12
2367173,Critical State Flux Penetration and Linear Mic...,supr-con cond-mat.supr-con,The vortex contribution to the dc field (H) de...,1996-09-03,12
2367174,Density of States and NMR Relaxation Rate in A...,supr-con cond-mat.supr-con,We show that the density of states in an aniso...,1996-09-18,12
