In [1]:
import pandas as pd
from arxiv_api import arxiv_query

import string

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = arxiv_query('quant-ph','20240201','20240209')

In [3]:
df

Unnamed: 0,Title,ID,Published,Updated,Summary,Author,Comments,Journal_Ref,Link,Primary_Category,Categories,DOI,License,Affiliation
0,Non-Markovian Dynamics in Fiber Delay-line Buf...,http://arxiv.org/abs/2402.00274v1,2024-02-01T01:50:51Z,2024-02-01T01:50:51Z,We study the non-Markovian effect on a two-p...,"[\nKim Fook Lee\n, \nPrem Kumar\n]","21 pages, 4 figures",,http://arxiv.org/abs/2402.00274v1,quant-ph,[quant-ph],,,[]
1,Practical No-Switching Continuous-Variable Qua...,http://arxiv.org/abs/2402.00277v1,2024-02-01T02:02:56Z,2024-02-01T02:02:56Z,Continuous-variable quantum key distribution...,"[\nJiale Mi\n, \nYiming Bian\n, \nLu Fan\n, \n...","15 pages, 10 figures",,http://arxiv.org/abs/2402.00277v1,quant-ph,[quant-ph],,,[]
2,A study of chaos and randomness in quantum sys...,http://arxiv.org/abs/2402.00287v1,2024-02-01T02:35:01Z,2024-02-01T02:35:01Z,How classical chaos emerges from the underly...,[\nSreeram PG\n],PhD Thesis,,http://arxiv.org/abs/2402.00287v1,quant-ph,[quant-ph],,,[]
3,Quantum Information Geometry with Non-Hermitia...,http://arxiv.org/abs/2402.00374v1,2024-02-01T06:28:21Z,2024-02-01T06:28:21Z,Information geometry is the application of d...,"[\nWangjun Lu\n, \nZhao-Hui Peng\n, \n HongTao\n]",,,http://arxiv.org/abs/2402.00374v1,quant-ph,[quant-ph],,,[]
4,Error-Tolerant Amplification and Simulation of...,http://arxiv.org/abs/2402.00379v1,2024-02-01T06:55:59Z,2024-02-01T06:55:59Z,Cat-state qubits formed by photonic cat stat...,"[\nYe-Hong Chen\n, \nZhi-Cheng Shi\n, \nFranco...","7 pages, 7 figures, comments are welcome",,http://arxiv.org/abs/2402.00379v1,quant-ph,[quant-ph],,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,Designing three-way entangled and nonlocal two...,http://arxiv.org/abs/2402.05080v2,2024-02-07T18:33:28Z,2024-02-08T06:02:20Z,Entanglement with single-particle states is ...,"[\nDinesh Kumar Panda\n, \nColin Benjamin\n]","13 pages, 10 figures",,http://arxiv.org/abs/2402.05080v2,quant-ph,"[quant-ph, cond-mat.dis-nn, cs.SY, eess.SY, ph...",,,[]
271,Resources of the Quantum World,http://arxiv.org/abs/2402.05474v1,2024-02-08T08:05:02Z,2024-02-08T08:05:02Z,This book delves into the burgeoning field o...,[\nGilad Gour\n],"956 Pages (including appendices), Preliminary ...",,http://arxiv.org/abs/2402.05474v1,quant-ph,"[quant-ph, cs.IT, math-ph, math.IT, math.MP]",,,[]
272,The $φ^n$ trajectory bootstrap,http://arxiv.org/abs/2402.05778v1,2024-02-08T16:09:06Z,2024-02-08T16:09:06Z,The Green's functions $G_n=\langle\phi^n\ran...,[\nWenliang Li\n],"21 pages, 9 figures",,http://arxiv.org/abs/2402.05778v1,hep-th,"[hep-th, cond-mat.stat-mech, hep-lat, nucl-th,...",,,[]
273,Magic Class and the Convolution Group,http://arxiv.org/abs/2402.05780v1,2024-02-08T16:12:16Z,2024-02-08T16:12:16Z,The classification of many-body quantum stat...,"[\nKaifeng Bu\n, \nArthur Jaffe\n, \nZixia Wei\n]",6+2 pages,,http://arxiv.org/abs/2402.05780v1,quant-ph,"[quant-ph, cond-mat.stat-mech, hep-th, math-ph...",,,[]


In [4]:
df['Summary']

0        We study the non-Markovian effect on a two-p...
1        Continuous-variable quantum key distribution...
2        How classical chaos emerges from the underly...
3        Information geometry is the application of d...
4        Cat-state qubits formed by photonic cat stat...
                             ...                        
270      Entanglement with single-particle states is ...
271      This book delves into the burgeoning field o...
272      The Green's functions $G_n=\langle\phi^n\ran...
273      The classification of many-body quantum stat...
274      Quantum entanglement is a fundamentally non-...
Name: Summary, Length: 275, dtype: object

In [5]:
def simple_cleaner(my_string):
    #remove punctuation
    my_string = my_string.translate(str.maketrans(" "," ",string.punctuation))
    
    #lower case, split, remove stopwords
    my_string = [w for w in my_string.lower().split() if w not in stop_words]
    
    return " ".join(my_string)

In [6]:
df['cleaned_text'] = df['Summary'].apply(simple_cleaner)

In [7]:
tfidf_text = TfidfVectorizer(min_df=5, max_df=0.7)

vectors_text = tfidf_text.fit_transform(df['cleaned_text'])

In [8]:
vectors_text.shape

(275, 1092)

### NMF

In [9]:
from sklearn.decomposition import NMF

In [10]:
nmf_text_model = NMF(n_components=10)

W_text_matrix = nmf_text_model.fit_transform(vectors_text)
H_text_matrix = nmf_text_model.components_

In [11]:
def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print("  %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))

In [12]:
display_topics(nmf_text_model, tfidf_text.get_feature_names_out())


Topic 00
  energy (2.09)
  model (1.63)
  wave (1.27)
  equation (1.20)
  harmonic (1.19)

Topic 01
  classical (1.94)
  algorithm (1.85)
  problem (1.65)
  optimization (1.13)
  algorithms (0.90)

Topic 02
  states (5.91)
  state (3.07)
  graph (2.72)
  operations (1.06)
  entangled (1.02)

Topic 03
  qubits (2.93)
  qubit (2.51)
  errors (2.00)
  error (1.71)
  superconducting (1.70)

Topic 04
  topological (6.00)
  phase (3.00)
  phases (2.96)
  magnetic (1.75)
  boundary (1.73)

Topic 05
  entanglement (8.14)
  negativity (1.22)
  entropy (1.19)
  correlations (0.91)
  measurement (0.90)

Topic 06
  optical (1.70)
  mode (1.47)
  coupling (1.27)
  cavity (1.16)
  field (1.15)

Topic 07
  key (4.09)
  protocol (2.97)
  rate (2.67)
  secret (2.53)
  protocols (2.35)

Topic 08
  systems (2.83)
  dynamics (1.90)
  spin (1.68)
  control (1.03)
  open (0.91)

Topic 09
  neural (3.25)
  models (1.94)
  networks (1.85)
  model (1.73)
  learning (1.71)


### SVDf

In [13]:
from sklearn.decomposition import TruncatedSVD

In [14]:
svd_text_model = TruncatedSVD(n_components=10)

W_svd_text_matrix = svd_text_model.fit_transform(vectors_text)
H_svd_text_matrix = svd_text_model.components_

In [15]:
display_topics(svd_text_model, tfidf_text.get_feature_names_out())


Topic 00
  states (0.75)
  entanglement (0.70)
  state (0.56)
  systems (0.51)
  model (0.45)

Topic 01
  entanglement (12.38)
  states (5.97)
  topological (5.27)
  state (5.01)
  phase (4.05)

Topic 02
  states (11.59)
  entanglement (11.49)
  graph (7.58)
  state (5.31)
  protocol (3.85)

Topic 03
  qubit (21.20)
  qubits (19.69)
  superconducting (16.32)
  errors (12.02)
  photon (11.38)

Topic 04
  topological (23.44)
  states (12.43)
  phases (10.90)
  phase (9.35)
  graph (8.66)

Topic 05
  entanglement (211.04)
  topological (119.99)
  phase (91.32)
  phases (60.46)
  lattice (49.62)

Topic 06
  neural (62.43)
  optical (58.02)
  networks (47.43)
  topological (46.61)
  mode (37.66)

Topic 07
  key (8.76)
  protocol (7.27)
  secret (7.05)
  rate (6.89)
  codes (6.59)

Topic 08
  systems (32.62)
  spin (19.69)
  dynamics (18.98)
  states (16.24)
  control (15.43)

Topic 09
  model (233.00)
  models (212.50)
  neural (159.13)
  decoherence (137.76)
  rate (137.37)


### LDA

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
count_text_vectorizer = CountVectorizer(min_df=5, max_df=0.7)

count_text_vectors = count_text_vectorizer.fit_transform(df['cleaned_text'])

In [18]:
from sklearn.decomposition import LatentDirichletAllocation

In [19]:
lda_text_model = LatentDirichletAllocation(n_components=6)

W_lda_text_matrix = lda_text_model.fit_transform(count_text_vectors)
H_lda_text_matrix = lda_text_model.components_

In [45]:
pd.DataFrame(W_lda_text_matrix)[0].sort_values(ascending=False)

139    0.992163
27     0.991052
132    0.989614
159    0.988468
121    0.987826
         ...   
2      0.001538
58     0.001535
178    0.001467
235    0.001450
105    0.001411
Name: 0, Length: 275, dtype: float64

In [47]:
ranked_list = pd.DataFrame(W_lda_text_matrix)[0].sort_values(ascending=False).index

In [129]:
pd.DataFrame(W_lda_text_matrix)[0].iloc[ranked_list].reset_index(drop=True)

0      0.992163
1      0.991052
2      0.989614
3      0.988468
4      0.987826
         ...   
270    0.001538
271    0.001535
272    0.001467
273    0.001450
274    0.001411
Name: 0, Length: 275, dtype: float64

In [131]:
[i for i in pd.DataFrame(W_lda_text_matrix)[0]]

[0.9828541978571613,
 0.0018636912192771817,
 0.0015384116547627683,
 0.0028552439159449427,
 0.003064400207663696,
 0.0020429259760347714,
 0.5533922372119962,
 0.003167135179428334,
 0.0016475256368910745,
 0.0031665693256508493,
 0.0031767128899725158,
 0.003176928436440112,
 0.0017341929538547676,
 0.0021265141565457424,
 0.0021604816224278386,
 0.0016622955544369584,
 0.1850965499989846,
 0.0018676603257610734,
 0.002432944947490659,
 0.004193325587532155,
 0.0022688068846920036,
 0.003364020450865573,
 0.0033016613721896446,
 0.001625807085439015,
 0.002961964783544094,
 0.0028533895067845666,
 0.002762730514377466,
 0.9910517132145006,
 0.3815398480510734,
 0.002075597989246535,
 0.002851796574512234,
 0.0026541033284699493,
 0.008353706799138862,
 0.0021885011937196656,
 0.002630336296492943,
 0.004399007799340676,
 0.0017441028939672576,
 0.0017995642736541664,
 0.0017150085405913896,
 0.00275024374648161,
 0.0018846517819152424,
 0.0020659288789857946,
 0.003040809618838888,


In [50]:
df.iloc[ranked_list]

Unnamed: 0,Title,ID,Published,Updated,Summary,Author,Comments,Journal_Ref,Link,Primary_Category,Categories,DOI,License,Affiliation,cleaned_text
139,The Casimir effect at the nucleus,http://arxiv.org/abs/2402.01776v1,2024-02-01T14:57:24Z,2024-02-01T14:57:24Z,"In this report, the impact of the Casimir ef...",[\nFrank Kowol\n],"26 pages, 7 figures",,http://arxiv.org/abs/2402.01776v1,physics.atom-ph,"[physics.atom-ph, quant-ph]",,,[],report impact casimir effect nearnuclear envir...
27,Continuously Distributing Entanglement in Quan...,http://arxiv.org/abs/2402.01527v1,2024-02-02T16:14:50Z,2024-02-02T16:14:50Z,Small interconnected quantum processors can ...,"[\nLars Talsma\n, \nÁlvaro G. Iñesta\n, \nStep...",8 pages with 4 figures (main text); 11 pages a...,,http://arxiv.org/abs/2402.01527v1,quant-ph,[quant-ph],,,[],small interconnected quantum processors collab...
132,All graph state verification protocols are com...,http://arxiv.org/abs/2402.01445v1,2024-02-02T14:37:26Z,2024-02-02T14:37:26Z,Graph state verification protocols allow mul...,"[\nLéo Colisson\n, \nDamian Markham\n, \nRaja ...",48 pages,,http://arxiv.org/abs/2402.01445v1,quant-ph,"[quant-ph, cs.CR, 81P94, 94A60]",,,[],graph state verification protocols allow multi...
159,A metronome spin stabilizes time-crystalline d...,http://arxiv.org/abs/2402.04078v1,2024-02-06T15:31:17Z,2024-02-06T15:31:17Z,We investigate a disorder-free quantum Ising...,"[\nNiklas Euler\n, \nAdrian Braemer\n, \nLuca ...",,,http://arxiv.org/abs/2402.04078v1,quant-ph,"[quant-ph, cond-mat.stat-mech]",,,[],investigate disorderfree quantum ising chain s...
121,Quantum Nonlocality: how does Nature do it?,http://arxiv.org/abs/2402.00725v1,2024-02-01T16:16:10Z,2024-02-01T16:16:10Z,We answer the question asked by Nicolas Gisi...,[\nMarian Kupczynski\n],"19 pages, 100 references",,http://arxiv.org/abs/2402.00725v1,quant-ph,"[quant-ph, physics.hist-ph]",,,[],answer question asked nicolas gisin article sc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,A study of chaos and randomness in quantum sys...,http://arxiv.org/abs/2402.00287v1,2024-02-01T02:35:01Z,2024-02-01T02:35:01Z,How classical chaos emerges from the underly...,[\nSreeram PG\n],PhD Thesis,,http://arxiv.org/abs/2402.00287v1,quant-ph,[quant-ph],,,[],classical chaos emerges underlying quantum wor...
58,State Dependent and Independent Uncertainty Re...,http://arxiv.org/abs/2402.03159v1,2024-02-05T16:28:29Z,2024-02-05T16:28:29Z,To understand the direct impact of noncommut...,[\n Sahil\n],"18 pages, no figure",,http://arxiv.org/abs/2402.03159v1,quant-ph,[quant-ph],,,[],understand direct impact noncommutativity inco...
178,Magnetic field effects on the Kitaev model cou...,http://arxiv.org/abs/2402.05516v1,2024-02-08T09:51:37Z,2024-02-08T09:51:37Z,Open quantum systems display unusual phenome...,"[\nKiyu Fukui\n, \nYasuyuki Kato\n, \nYukitosh...","16 pages, 10 figures",,http://arxiv.org/abs/2402.05516v1,cond-mat.str-el,"[cond-mat.str-el, quant-ph]",,,[],open quantum systems display unusual phenomena...
235,Curriculum reinforcement learning for quantum ...,http://arxiv.org/abs/2402.03500v1,2024-02-05T20:33:00Z,2024-02-05T20:33:00Z,The key challenge in the noisy intermediate-...,"[\nYash J. Patel\n, \nAkash Kundu\n, \nMateusz...","32 pages, 11 figures, 6 tables. Accepted at IC...",,http://arxiv.org/abs/2402.03500v1,quant-ph,"[quant-ph, cs.AI, cs.LG]",,,[],key challenge noisy intermediatescale quantum ...


In [20]:
display_topics(lda_text_model, tfidf_text.get_feature_names_out())


Topic 00
  time (1.73)
  states (1.39)
  entanglement (1.38)
  state (0.96)
  effect (0.89)

Topic 01
  classical (1.23)
  approach (1.04)
  model (1.03)
  problem (0.96)
  using (0.80)

Topic 02
  graph (1.72)
  circuits (1.40)
  one (1.29)
  circuit (1.27)
  algorithms (1.26)

Topic 03
  states (3.11)
  entanglement (2.21)
  state (1.43)
  system (1.05)
  systems (0.95)

Topic 04
  key (2.18)
  rate (1.17)
  detection (1.12)
  method (1.08)
  protocol (1.06)

Topic 05
  qubits (1.43)
  qubit (1.15)
  systems (1.09)
  two (0.78)
  state (0.78)


### Visualizing LDA results

In [21]:
import pyLDAvis.lda_model

In [22]:
lda_display = pyLDAvis.lda_model.prepare(lda_text_model, count_text_vectors,
                                        count_text_vectorizer, sort_topics=False)

In [52]:
pyLDAvis.display(lda_display)

In [None]:
def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print("  %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))

In [128]:
features = tfidf_text.get_feature_names_out()

def relevant_terms(lda_text_model, features):
    term_list = []
    for topic, words in enumerate(lda_text_model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        term_list.append(", ".join([features[largest[i]] for i in range(0, 5)]))

    return term_list

In [126]:
[", ".join([features[largest[i]] for i in range(0, 5)])]

['qubits, qubit, systems, two, state']

In [127]:
relevant_terms(lda_text_model, features)

(5,
 ['time, states, entanglement, state, effect',
  'classical, approach, model, problem, using',
  'graph, circuits, one, circuit, algorithms',
  'states, entanglement, state, system, systems',
  'key, rate, detection, method, protocol',
  'qubits, qubit, systems, two, state'])