In [1]:
!pip install contractions



In [2]:
import numpy as np
import pandas as pd
import spacy
from tqdm.notebook import tqdm
import contractions
import re
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
df = pd.read_parquet("/kaggle/input/dataset-with-embeddings/cs_papers_wo_embeddings.parquet")
df['categories'] = df['categories'].apply(lambda x: x.replace(', ', ''))
print(df.shape[0])

90583


In [17]:
df["abstract"].iloc[0]

'  In a quantum mechanical model, Diosi, Feldmann and Kosloff arrived at a conjecture stating that the limit of the entropy of certain mixtures is the relative entropy as system size goes to infinity. The conjecture is proven in this paper for density matrices. The first proof is analytic and uses the quantum law of large numbers. The second one clarifies the relation to channel capacity per unit cost for classical-quantum channels. Both proofs lead to generalization of the conjecture. '

In [18]:
def expand_contractions(sentence):
    contractions_expanded = [contractions.fix(word) for word in sentence.split()]
    return ' '.join(contractions_expanded)

def lower_case(sentence):
    return ' '.join([word.lower() for word in sentence.split()])
def remove_punctuation(sentence):
    return ' '.join([re.sub(r'[^\w\s]', '', word) for word in sentence.split()])

def preprocess(sentence):
    return lower_case(remove_punctuation(expand_contractions(sentence)))

In [19]:
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing abstracts"):
    df.at[idx, "abstract"] = preprocess(row["abstract"])

Processing abstracts:   0%|          | 0/90583 [00:00<?, ?it/s]

In [11]:
lda_weights=pd.read_parquet('/kaggle/input/dataset-with-embeddings/lda_weights.parquet')

In [7]:
tfidf=TfidfVectorizer()

In [22]:
tfidf_mat=tfidf.fit_transform(df[df["topics"]==12]["abstract"])

In [23]:
tfidf_mat_arr=tfidf_mat.toarray()

In [24]:
tfidf_avg=np.mean(tfidf_mat_arr,axis=1)

In [25]:
tfidf_sort=np.argsort(tfidf_avg)

In [26]:
#topic 12
words=tfidf.get_feature_names_out()
for i in tfidf_sort[:50]:
    print(words[i])

exterior
158q
dattani
factual
factory
anticodeoptimal
bipolarization
citecohl16
boil
exponentialfamily
constantsize
f_xy
benchmarked
3quasiregular
apexminorfree
1888n
2category
000003211504l
bigo1k
clauss
amazon
103110
23consistency
1131109128
distance2
bisimulationcoherent
2delta2
170log2n2
expk2epsilon
exponentiating
1877
aaronsons
deceivingly
champ
determinative
blocktests
158889
decorrelated
f_ijabcd
bcjrtrellis
31nquasicrosses
cccr
expleftomegaleftlrightright
a1cup2a
deltabit
absolutes
0q1
brahma
communityfinding
antiselfdual


In [89]:
# topic 10
words=tfidf.get_feature_names_out()
for i in tfidf_sort[:50]:
    print(words[i])

1209
18th
0791
netcs
25m
automl
railroad
blundell
campaigners
albert
additives
1240
recomputing
1306
albums
reclab
appearence
23
anomigan
rbp
knowledges
ndd
dau
036
distillate
banditrepair
discontinuousness
cybernetics
029
affecting
accusation
30s
luigovkeo60
gsns
interpolants
195k
girault
05365v2
insuring
0_3
crab
anthropomorphism
cnlvr
cryptology
mmff
intrinsically
colimits
dogma
agentbased
19nm


In [14]:
lda=LatentDirichletAllocation(n_components=20,max_iter=100,random_state=42,verbose=3)

In [15]:
tfidf_mat.shape

(90583, 189918)

In [16]:
lda_mat = lda.fit_transform(tfidf_mat)

iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100
iteration: 17 of max_iter: 100
iteration: 18 of max_iter: 100
iteration: 19 of max_iter: 100
iteration: 20 of max_iter: 100
iteration: 21 of max_iter: 100
iteration: 22 of max_iter: 100
iteration: 23 of max_iter: 100
iteration: 24 of max_iter: 100
iteration: 25 of max_iter: 100
iteration: 26 of max_iter: 100
iteration: 27 of max_iter: 100
iteration: 28 of max_iter: 100
iteration: 29 of max_iter: 100
iteration: 30 of max_iter: 100
iteration: 31 of max_iter: 100
iteration: 32 of max_iter: 100
iteration: 33 of 

In [17]:
lda_mat.shape

(90583, 20)

In [13]:
lda_mat_arg=lda_weights.to_numpy().argmax(axis=1)

In [14]:
from scipy import stats as st
st.mode(lda_mat_arg)

ModeResult(mode=10, count=49338)

In [15]:
for i in range(20):
    print(f"{i} {(lda_mat_arg == i).sum()}")

0 0
1 3
2 2
3 2
4 3
5 2
6 4
7 7
8 1
9 3
10 49338
11 1
12 32468
13 4
14 3
15 3
16 5
17 8731
18 3
19 0


In [20]:
df["topics"]=lda_mat_arg

In [21]:
df[df["topics"]==12]

Unnamed: 0,id,authors,title,categories,abstract,update_date,degree,num_citations,num_references,topics
0,0704.0046,"I. Csiszar, F. Hiai, D. Petz",A limit relation for entropy and channel capac...,"q, u, a, n, t, -, p, h, , c, s, ., I, T, , m...",in a quantum mechanical model diosi feldmann a...,2009-11-13,1,1,0,12
2,0704.0098,"Jack Raymond, David Saad",Sparsely-spread CDMA - a statistical mechanics...,"c, s, ., I, T, , m, a, t, h, ., I, T",sparse code division multiple access cdma a va...,2009-11-13,2,1,1,12
3,0704.0108,Sergey Gubin,Reducing SAT to 2-SAT,"c, s, ., C, C",description of a polynomial time reduction of ...,2007-05-23,5,1,4,12
4,0704.0213,Ketan D. Mulmuley Hariharan Narayanan,Geometric Complexity Theory V: On deciding non...,"c, s, ., C, C",this article has been withdrawn because it has...,2012-09-28,1,1,0,12
5,0704.0218,Yuri Pritykin,On Almost Periodicity Criteria for Morphic Seq...,"c, s, ., D, M, , c, s, ., L, O",in some particular cases we give criteria for ...,2007-05-23,2,0,2,12
...,...,...,...,...,...,...,...,...,...,...
90578,quant-ph/9909094,"E. Knill, R. Laflamme",Quantum Computation and Quadratically Signed W...,"q, u, a, n, t, -, p, h, , c, s, ., C, C",we prove that quantum computation is polynomia...,2007-05-23,4,4,0,12
90579,quant-ph/9910033,"Edith Hemaspaandra (RIT), Lane A. Hemaspaandra...",Almost-Everywhere Superiority for Quantum Comp...,"q, u, a, n, t, -, p, h, , c, s, ., C, C",simon as extended by brassard and hoyer shows ...,2007-05-23,2,2,1,12
90580,quant-ph/9910087,"Adrian Kent (DAMTP, University of Cambridge)",Unconditionally Secure Commitment of a Certifi...,"q, u, a, n, t, -, p, h, , c, s, ., C, R",in a secure bit commitment protocol involving ...,2009-10-31,1,0,1,12
90581,quant-ph/9911043,"Lucien Hardy (The Perimeter Institute), Adrian...",Cheat Sensitive Quantum Bit Commitment,"q, u, a, n, t, -, p, h, , c, s, ., C, R",we define cheat sensitive cryptographic protoc...,2009-10-31,5,4,1,12


In [45]:
tfidf.get_feature_names_out().shape

(189918,)

In [20]:
lda_df=pd.DataFrame(lda_mat)

In [23]:
lda_df.to_parquet("lda_weights.parquet",index=False)