# Importing necessary packages

In [None]:
import pandas as pd
import json
from collections import Counter
import dask.bag as db
import string
import numpy as np
import json
import pyarrow as pa
import lancedb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from tqdm import tqdm
from datetime import datetime
from sklearn.cluster import KMeans
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Mounting Drive to access the JSON dataset file

In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

In [None]:
import dask.bag as db
import json
file_name = db.read_text('/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json').map(json.loads)

# Building a Stratified Sample of 100K records based on the category in the dataset

In [None]:
# Define the columns to extract
cols = ['id', 'title', 'abstract', 'categories', 'authors', 'comments', 'update_date']

# Initialize a Counter to store unique categories
category_counter = Counter()

# Load data and extract relevant fields + count categories
data = []
with open(file_name, encoding='latin-1') as f:
    for line in f:
        doc = json.loads(line)
        categories = doc.get('categories', '').strip()
        if categories and ' ' not in categories:  # Include only rows with a single category
            category_counter.update([categories])
            data.append([
                doc.get('id'),
                doc.get('title', ''),
                doc.get('abstract', ''),
                categories,
                doc.get('authors', ''),
                doc.get('comments', ''),
                doc.get('update_date', '')
            ])

# Print unique categories
unique_categories = list(category_counter.keys())
print(f"Total unique categories: {len(unique_categories)}")
print("Unique categories:")
print(unique_categories)

# Optionally, print the top 20 most common categories with their counts
print("\nTop 20 categories by frequency:")
for category, count in category_counter.most_common(20):
    print(f"{category}: {count}")

# Convert data to DataFrame
df = pd.DataFrame(data, columns=cols)

# Clean the DataFrame
df['abstract'] = df['abstract'].str.strip().str.lower()
df['title'] = df['title'].str.strip().str.lower()
df['categories'] = df['categories'].str.strip()
df['authors'] = df['authors'].str.strip()
df['comments'] = df['comments'].str.strip()
df['update_date'] = pd.to_datetime(df['update_date'], errors='coerce')

# Drop rows with missing abstracts or titles
df = df.dropna(subset=['abstract', 'title'])

# Filter out categories with fewer than 10 samples
category_counts = df['categories'].value_counts()
valid_categories = category_counts[category_counts >= 50].index
print(f"\nTotal valid categories: {len(valid_categories)}")

# Filter the DataFrame to include only valid categories
df_filtered = df[df['categories'].isin(valid_categories)]

# Stratified sampling based on categories
if len(df_filtered) < 100000:
    raise ValueError(f"Not enough data to sample 100,000 rows. Available: {len(df_filtered)}")

stratified_sample, _ = train_test_split(
    df_filtered,
    train_size=100000,
    stratify=df_filtered['categories'],
    random_state=62
)

# Reset index
stratified_sample = stratified_sample.reset_index(drop=True)

# Save the final sample to a CSV file
stratified_sample.to_csv('stratified_sample.csv', index=False)

# Display summary
print(f"\nFinal dataset size: {len(stratified_sample)}")
print("Category distribution in the sample:")
print(stratified_sample['categories'].value_counts())


Total unique categories: 149
Unique categories:
['hep-ph', 'physics.gen-ph', 'math.CO', 'cond-mat.mes-hall', 'gr-qc', 'cond-mat.mtrl-sci', 'astro-ph', 'math.NT', 'hep-th', 'hep-ex', 'math.NA', 'nlin.PS', 'math.RA', 'cond-mat.str-el', 'physics.pop-ph', 'nucl-th', 'math.FA', 'cs.DS', 'math.DS', 'physics.soc-ph', 'math.AG', 'math.OA', 'math.PR', 'math.DG', 'physics.optics', 'math.GR', 'nlin.SI', 'math.SG', 'physics.data-an', 'cs.CC', 'math.GT', 'quant-ph', 'cond-mat.other', 'math.CV', 'math.AP', 'cond-mat.supr-con', 'math.RT', 'cond-mat.stat-mech', 'q-bio.OT', 'physics.plasm-ph', 'nlin.CG', 'nucl-ex', 'cond-mat.soft', 'physics.comp-ph', 'math.MG', 'math.QA', 'physics.bio-ph', 'physics.chem-ph', 'math.AT', 'physics.geo-ph', 'q-bio.BM', 'math.OC', 'cs.CR', 'physics.class-ph', 'q-bio.PE', 'q-bio.NC', 'physics.atom-ph', 'math.GM', 'hep-lat', 'math.CA', 'physics.atm-clus', 'cs.PF', 'physics.acc-ph', 'math.SP', 'nlin.CD', 'physics.hist-ph', 'physics.flu-dyn', 'cond-mat.dis-nn', 'cs.CV', 'cs.LG'

# Filtering the sample further to exclude categories with unique counts less than 20 to maintain a stratified distribution throughout

In [None]:
# Load your DataFrame
df = pd.read_csv('stratified_sample.csv')

# Get category counts
category_counts = df['categories'].value_counts()

# Identify categories with 20 or more occurrences
valid_categories = category_counts[category_counts >= 20].index

# Filter DataFrame to keep only rows with valid categories
df_filtered = df[df['categories'].isin(valid_categories)]

# Display the number of rows before and after filtering
print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {len(df_filtered)}")

# Save the filtered DataFrame to a new CSV file
df_filtered.to_csv('filtered_stratified_sample.csv', index=False)

# Display summary
print("Categories with fewer than 20 records have been removed.")
print("Updated category distribution:")
print(df_filtered['categories'].value_counts())


Original dataset size: 100000
Filtered dataset size: 99942
Categories with fewer than 20 records have been removed.
Updated category distribution:
astro-ph    7184
hep-ph      6495
quant-ph    5203
hep-th      4697
cs.CV       3533
            ... 
q-fin.PM      27
q-fin.TR      27
cs.MS         26
q-bio.CB      26
cs.OS         22
Name: categories, Length: 145, dtype: int64


# Splitting the sample into Train(70K), Validation(15K), Test(15K)

In [20]:
df_filtered.head()

Unnamed: 0,id,title,abstract,categories,authors,comments,update_date
0,astro-ph/0607362,the virial balance of clumps and cores in molecular clouds,"we study the instantaneous virial balance of clumps and cores (ccs) in 3d\nsimulations of driven, mhd, isothermal molecular clouds (mcs). the models\nrepresent a range of magnetic field strengths in mcs from subcritical to\nnon-magnetic regimes. we identify ccs at different density thresholds, and for\neach object, we calculate all the terms that enter the eulerian form of the\nvirial theorem (evt). a cc is considered gravitationally bound when the\ngravitational term in the evt is larger than the amount for the system to be\nvirialized, which is more stringent than the condition that it be large enough\nto make the total volume energy negative. we also calculate, quantities\ncommonly used in the observations to indicate the state of gravitational\nboundedness of ccs such as the jeans number j_c, the mass-to magnetic flux\nratio mu_c, and the virial parameter alpha_vir. our results show that: a) ccs\nare dynamical out-of-equilibrium structures. b) the surface energies are of the\nsame order than their volume counterparts c) ccs are either in the process of\nbeing compressed or dispersed by the velocity field. yet, not all ccs that have\na compressive net kinetic energy are gravitationally bound. d) there is no\n1-to-1 correspondence between the state of gravitational boundedness of a cc as\ndescribed by the virial analysis or as implied by the classical indicators. in\ngeneral, in the virial analysis, we observe that only the inner regions of the\nobjects are gravitationally bound, whereas j_c, alpha_vir, and mu_c estimates\ntend to show that they are more gravitationally bound at the lowest threshold\nlevels and more magnetically supercritical. g) we observe, in the non-magnetic\nsimulation, the existence of a bound core with structural and dynamical\nproperties that resemble those of the bok globule barnard 68 (b68).",astro-ph,"Sami Dib (1,2), Jongsoo Kim (2), Enrique Vazquez-Semadeni (1), Andreas\n Burkert (3), Mohsen Shadmehri (4,5) ((1) CRyA-UNAM, (2) KASI, (3) USM, (4)\n DCU, (5) Ferdowsi Univ.)","Accepted to ApJ. Discussion substantially enlarged, a few corrections\n and additional figures. Main conclusions unchanged",2011-02-11
1,2108.03495,game theory and machine learning in uavs-assisted wireless communication\n networks: a survey,"in recent years, unmanned aerial vehicles (uavs) have been used in fields\nsuch as architecture, business delivery, military and civilian theaters, and\nmany others. with increased applications comes the increased demand for\nadvanced algorithms for resource allocation and energy management. as is well\nknown, game theory and machine learning are two powerful tools already widely\nused in the wireless communication field and there are numerous surveys of game\ntheory and machine learning usage in wireless communication. existing surveys\nhowever focus either on game theory or machine learning and due to this fact,\nthe current article surveys both game-theoretic and machine learning algorithms\nfor use by uavs in wireless communication networks (u-wcns). we also discuss\nhow to combine game theory and machine learning for solving problems in u-wcns\nand identify several future research directions.",cs.MA,"M. Zhou, Y. Guan, M. Hayajneh, K. Niu, and C. Abdallah",,2021-08-10
2,1607.08427,gyroscope precession along bound equatorial plane orbits around a kerr\n black hole,the precession of a test gyroscope along stable bound equatorial plane orbits\naround a kerr black hole is analyzed and the precession angular velocity of the\ngyro's parallel transported spin vector and the increment in precession angle\nafter one orbital period is evaluated. the parallel transported marck frame\nwhich enters this discussion is shown to have an elegant geometrical\nexplanation in terms of the electric and magnetic parts of the killing-yano\n2-form and a wigner rotation effect.,gr-qc,"Donato Bini, Andrea Geralico, Robert T. Jantzen",16 pages; revtex macros; 3 eps figures,2016-09-28
3,1405.0627,a stochastic approach for quantifying immigrant integration: the spanish\n test case,"we apply stochastic process theory to the analysis of immigrant integration.\nusing a unique and detailed data set from spain, we study the relationship\nbetween local immigrant density and two social and two economic immigration\nquantifiers for the period 1999-2010. as opposed to the classic time-series\napproach, by letting immigrant density play the role of ""time"", and the\nquantifier the role of ""space"" it become possible to analyze the behavior of\nthe quantifiers by means of continuous time random walks. two classes of\nresults are obtained. first we show that social integration quantifiers evolve\nfollowing pure diffusion law, while the evolution of economic quantifiers\nexhibit ballistic dynamics. second we make predictions of best and worst case\nscenarios taking into account large local fluctuations. our stochastic process\napproach to integration lends itself to interesting forecasting scenarios\nwhich, in the hands of policy makers, have the potential to improve political\nresponses to integration problems. for instance, estimating the standard\nfirst-passage time and maximum-span walk reveals local differences in\nintegration performance for different immigration scenarios. thus, by\nrecognizing the importance of local fluctuations around national means, this\nresearch constitutes an important tool to assess the impact of immigration\nphenomena on municipal budgets and to set up solid multi-ethnic plans at the\nmunicipal level as immigration pressure build.",physics.soc-ph,"Elena Agliari, Adriano Barra, Pierluigi Contucci, Rickard Sandell,\n Cecilia Vernia",,2016-02-02
4,2201.08949,temporal aggregation for adaptive rgbt tracking,"visual object tracking with rgb and thermal infrared (tir) spectra available,\nshorted in rgbt tracking, is a novel and challenging research topic which draws\nincreasing attention nowadays. in this paper, we propose an rgbt tracker which\ntakes spatio-temporal clues into account for robust appearance model learning,\nand simultaneously, constructs an adaptive fusion sub-network for cross-modal\ninteractions. unlike most existing rgbt trackers that implement object tracking\ntasks with only spatial information included, temporal information is further\nconsidered in this method. specifically, different from traditional siamese\ntrackers, which only obtain one search image during the process of picking up\ntemplate-search image pairs, an extra search sample adjacent to the original\none is selected to predict the temporal transformation, resulting in improved\nrobustness of tracking performance.as for multi-modal tracking, constrained to\nthe limited rgbt datasets, the adaptive fusion sub-network is appended to our\nmethod at the decision level to reflect the complementary characteristics\ncontained in two modalities. to design a thermal infrared assisted rgb tracker,\nthe outputs of the classification head from the tir modality are taken into\nconsideration before the residual connection from the rgb modality. extensive\nexperimental results on three challenging datasets, i.e. vot-rgbt2019, gtot and\nrgbt210, verify the effectiveness of our method. code will be shared at\n\textcolor{blue}{\emph{https://github.com/zhangyong-tang/taat}}.",cs.CV,"Zhangyong Tang, Tianyang Xu, and Xiao-Jun Wu","12 pages, 10 figures",2022-02-01


In [None]:
# Load the stratified sample
df = df_filtered

# Split into train (70k) and remaining (30k)
train_df, remaining_df = train_test_split(
    df,
    train_size=70000,
    stratify=df['categories'],
    random_state=42
)

# Split remaining into validation (15k) and test (15k)
val_df, test_df = train_test_split(
    remaining_df,
    test_size=0.5,
    stratify=remaining_df['categories'],
    random_state=42
)

# Save the splits to CSV files
train_df.to_csv('train_df.csv', index=False)
val_df.to_csv('val_df.csv', index=False)
test_df.to_csv('test_df.csv', index=False)

# Display summary
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")


Train set size: 70000
Validation set size: 14971
Test set size: 14971


# Preprocessing data by performing necessary cleaning operations (Lowercasing, Lemmatizing, Removing punctuations, whitespace, special characters)

In [None]:


# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the stratified sample dataset
df = pd.read_csv('train_df.csv')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Lowercasing
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize and remove stopwords, then lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply cleaning to relevant fields
df['cleaned_title'] = df['title'].apply(clean_text)
df['cleaned_authors'] = df['authors'].apply(clean_text)
df['cleaned_categories'] = df['categories'].apply(clean_text)
df['cleaned_abstract'] = df['abstract'].apply(clean_text)
df['cleaned_comments'] = df['comments'].apply(clean_text)

# Create the enhanced text field and remove newlines
df['enhanced_text'] = df.apply(lambda row: f"""
Title: {row['cleaned_title']} [SEP]
Authors: {row['cleaned_authors']} [SEP]
Categories: {row['cleaned_categories']} [SEP]
Abstract: {row['cleaned_abstract']} [SEP]
Comments: {row['cleaned_comments']} [SEP]
Updated on: {row['update_date']}
""".replace('\n', ' ').strip(), axis=1)

pd.set_option('display.max_colwidth', None)

# Display the first few rows to verify the enhanced text field
print(df[['id', 'enhanced_text']].head())

# Save the updated DataFrame to a new CSV file
df.to_csv('enhanced_stratified_sample_train.csv', index=False)

print("Enhanced text field created and saved successfully.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prern\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prern\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\prern\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


           id  \
0  1606.03119   
1   1306.5681   
2   1208.4287   
3   1006.0121   
4  2111.05140   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [19]:
df.head()

Unnamed: 0,id,title,abstract,categories,authors,comments,update_date,cleaned_title,cleaned_authors,cleaned_categories,cleaned_abstract,cleaned_comments,enhanced_text
0,1606.03119,derivations and centroids of four dimensional associative algebras,"in this paper, we focus on derivations and centroids of four dimensional\nassociative algebras. using an existing classification result of low\ndimensional associative algebras, we describe the derivations and centroids of\nfour dimensional associative algebras. we also identify algebra(s) that belong\nto the characteristically nilpotent class among the algebras of four\ndimensional associative algebras.",math.RA,"A.O. Abdulkareem, M.A. Fiidow and I.S. Rakhimov","20 pages, 2 tables, Accepted in International Journal of Pure and\n Applied Mathematics",2017-02-21,derivation centroid four dimensional associative algebra,ao abdulkareem fiidow rakhimov,mathra,paper focus derivation centroid four dimensional associative algebra using existing classification result low dimensional associative algebra describe derivation centroid four dimensional associative algebra also identify algebra belong characteristically nilpotent class among algebra four dimensional associative algebra,20 page 2 table accepted international journal pure applied mathematics,Title: derivation centroid four dimensional associative algebra [SEP] Authors: ao abdulkareem fiidow rakhimov [SEP] Categories: mathra [SEP] Abstract: paper focus derivation centroid four dimensional associative algebra using existing classification result low dimensional associative algebra describe derivation centroid four dimensional associative algebra also identify algebra belong characteristically nilpotent class among algebra four dimensional associative algebra [SEP] Comments: 20 page 2 table accepted international journal pure applied mathematics [SEP] Updated on: 2017-02-21
1,1306.5681,coupled-cluster studies of infinite nuclear matter,"the aim of this work is to develop the relevant formalism for performing\ncoupled-cluster (cc) calculations in nuclear matter and neutron star matter,\nincluding thereby important correlations to infinite order in the interaction\nand testing modern nuclear forces based on chiral effective field theory. our\nformalism includes the exact treatment of the so-called pauli operator in a\npartial wave expansion of the equation of state. nuclear and neutron matter\ncalculations are done using a coupled particle-particle and hole-hole ladder\napproximation. the coupled ladder equations are derived as an approximation of\ncc theory, leaving out particle-hole and non-linear diagrams from the cc\ndoubles amplitude equation. this study is a first step toward cc calculations\nfor nuclear and neutron matter. we present results for both symmetric nuclear\nmatter and pure neutron matter employing state-of-the-art nucleon-nucleon\ninteractions based on chiral effective field theory. we employ also the newly\noptimized chiral interaction [a. ekstr\""om et al., phys. rev. lett. 110, 192502\n(2013)] to study infinite nuclear matter. the ladder approximation method and\ncorresponding results are compared with conventional brueckner-hartree-fock\ntheory.",nucl-th,"G. Baardsen, A. Ekstr\""om, G. Hagen and M. Hjorth-Jensen","18 pages, 14 figures",2013-11-18,coupledcluster study infinite nuclear matter,g baardsen ekstrom g hagen hjorthjensen,nuclth,aim work develop relevant formalism performing coupledcluster cc calculation nuclear matter neutron star matter including thereby important correlation infinite order interaction testing modern nuclear force based chiral effective field theory formalism includes exact treatment socalled pauli operator partial wave expansion equation state nuclear neutron matter calculation done using coupled particleparticle holehole ladder approximation coupled ladder equation derived approximation cc theory leaving particlehole nonlinear diagram cc double amplitude equation study first step toward cc calculation nuclear neutron matter present result symmetric nuclear matter pure neutron matter employing stateoftheart nucleonnucleon interaction based chiral effective field theory employ also newly optimized chiral interaction ekstrom et al phys rev lett 110 192502 2013 study infinite nuclear matter ladder approximation method corresponding result compared conventional bruecknerhartreefock theory,18 page 14 figure,Title: coupledcluster study infinite nuclear matter [SEP] Authors: g baardsen ekstrom g hagen hjorthjensen [SEP] Categories: nuclth [SEP] Abstract: aim work develop relevant formalism performing coupledcluster cc calculation nuclear matter neutron star matter including thereby important correlation infinite order interaction testing modern nuclear force based chiral effective field theory formalism includes exact treatment socalled pauli operator partial wave expansion equation state nuclear neutron matter calculation done using coupled particleparticle holehole ladder approximation coupled ladder equation derived approximation cc theory leaving particlehole nonlinear diagram cc double amplitude equation study first step toward cc calculation nuclear neutron matter present result symmetric nuclear matter pure neutron matter employing stateoftheart nucleonnucleon interaction based chiral effective field theory employ also newly optimized chiral interaction ekstrom et al phys rev lett 110 192502 2013 study infinite nuclear matter ladder approximation method corresponding result compared conventional bruecknerhartreefock theory [SEP] Comments: 18 page 14 figure [SEP] Updated on: 2013-11-18
2,1208.4287,regularity and uniqueness of the heat ow of biharmonic maps,"in this paper, we first establish regularity of the heat flow of biharmonic\nmaps into the unit sphere $s^l\subset\mathbb r^{l+1}$ under a smallness\ncondition of renormalized total energy. for the class of such solutions to the\nheat flow of biharmonic maps, we prove the properties of uniqueness, convexity\nof hessian energy, and unique limit at time infinity. we establish both\nregularity and uniqueness for the class of weak solutions $u$ to the heat flow\nof biharmonic maps into any compact riemannian manifold $n$ without boundary\nsuch that $\nabla^2 u\in l^q_tl^p_x$ for some $p>n/2$ and $q>2$ satisfying\n(1.13).",math.AP,"Jay Hineman, Tao Huang, Changyou Wang","Two errors in the proof of proposition 2.2 have been fixed, as a\n consequence the range of the power $p$ through the main theorems of the paper\n is required to $p>3/2$",2012-09-25,regularity uniqueness heat ow biharmonic map,jay hineman tao huang changyou wang,mathap,paper first establish regularity heat flow biharmonic map unit sphere slsubsetmathbb rl1 smallness condition renormalized total energy class solution heat flow biharmonic map prove property uniqueness convexity hessian energy unique limit time infinity establish regularity uniqueness class weak solution u heat flow biharmonic map compact riemannian manifold n without boundary nabla2 uin lqtlpx pn2 q2 satisfying 113,two error proof proposition 22 fixed consequence range power p main theorem paper required p32,Title: regularity uniqueness heat ow biharmonic map [SEP] Authors: jay hineman tao huang changyou wang [SEP] Categories: mathap [SEP] Abstract: paper first establish regularity heat flow biharmonic map unit sphere slsubsetmathbb rl1 smallness condition renormalized total energy class solution heat flow biharmonic map prove property uniqueness convexity hessian energy unique limit time infinity establish regularity uniqueness class weak solution u heat flow biharmonic map compact riemannian manifold n without boundary nabla2 uin lqtlpx pn2 q2 satisfying 113 [SEP] Comments: two error proof proposition 22 fixed consequence range power p main theorem paper required p32 [SEP] Updated on: 2012-09-25
3,1006.0121,small-scale behaviour in deterministic reaction models,"in a recent paper published in this journal [j. phys. a: math. theor. 42\n(2009) 495004] we studied a one-dimensional particles system where nearest\nparticles attract with a force inversely proportional to a power \alpha of\ntheir distance and coalesce upon encounter. numerics yielded a distribution\nfunction h(z) for the gap between neighbouring particles, with\nh(z)=z^{\beta(\alpha)} for small z and \beta(\alpha)>\alpha. we can now prove\nanalytically that in the strict limit of z\to 0, \beta=\alpha for \alpha>0,\ncorresponding to the mean-field result, and we compute the length scale where\nmean-field breaks down. more generally, in that same limit correlations are\nnegligible for any similar reaction model where attractive forces diverge with\nvanishing distance. the actual meaning of the measured exponent \beta(\alpha)\nremains an open question.",cond-mat.stat-mech,Paolo Politi and Daniel ben-Avraham,Six pages. Section 2 has been rewritten. Accepted for publication in\n Journal of Physics A: Mathematical and Theoretical,2010-09-07,smallscale behaviour deterministic reaction model,paolo politi daniel benavraham,condmatstatmech,recent paper published journal j phys math theor 42 2009 495004 studied onedimensional particle system nearest particle attract force inversely proportional power alpha distance coalesce upon encounter numerics yielded distribution function hz gap neighbouring particle hzzbetaalpha small z betaalphaalpha prove analytically strict limit zto 0 betaalpha alpha0 corresponding meanfield result compute length scale meanfield break generally limit correlation negligible similar reaction model attractive force diverge vanishing distance actual meaning measured exponent betaalpha remains open question,six page section 2 rewritten accepted publication journal physic mathematical theoretical,Title: smallscale behaviour deterministic reaction model [SEP] Authors: paolo politi daniel benavraham [SEP] Categories: condmatstatmech [SEP] Abstract: recent paper published journal j phys math theor 42 2009 495004 studied onedimensional particle system nearest particle attract force inversely proportional power alpha distance coalesce upon encounter numerics yielded distribution function hz gap neighbouring particle hzzbetaalpha small z betaalphaalpha prove analytically strict limit zto 0 betaalpha alpha0 corresponding meanfield result compute length scale meanfield break generally limit correlation negligible similar reaction model attractive force diverge vanishing distance actual meaning measured exponent betaalpha remains open question [SEP] Comments: six page section 2 rewritten accepted publication journal physic mathematical theoretical [SEP] Updated on: 2010-09-07
4,2111.0514,two-dimensional phonon polaritons in multilayers of hexagonal boron\n nitride from a macroscopic phonon model,"phonon polaritons (phps) in freestanding and supported multilayers (muls) of\nhexagonal boron nitride (hbn) are systematically studied using a macroscopic\noptical-phonon model. the php properties such as confinement, group velocity,\npropagation quality factor (pqf) and wavelength scaling are studied. owing to\nthe nonlocal high-frequency screening, there is an upper frequency limit making\nthe two-dimensional (2d) phps have a frequency band, and also a maximum pqf\noccurs near the centre frequency. the substrate's dielectric response should be\nincluded to accurately calculate the php properties. while the simple\nelectrostatic approximation (esa) is a proper treatment for php frequencies\n$\omega$ above $\omega_0$ (e.g. $\omega>1.03\omega_0$ for the 30-layers;\n$\omega_0$ is the $\gamma$ point optical phonon frequency), it fails to\ndescribe the php properties near $\omega_0$ and the effect of retardation\nshould be included for an accurate description. the php wavelength versus the\nlayer thickness near $\omega_0$ deviates significantly from a linear scaling\nlaw given by the esa due to strong phonon-photon coupling. the calculated php\ndispersion, pqf and scaling are compared with experimental data of a number of\nspectroscopic studies and good agreement is obtained. while the frequency of\nincident light should be near the centre frequency to maximize the pqf, the php\nwavelength, confinement and propagation length can be engineered by varying the\nmul thickness and its dielectric environment.",cond-mat.mes-hall,Jian-zhong Zhang,18 pages and 9 figures,2022-03-14,twodimensional phonon polaritons multilayers hexagonal boron nitride macroscopic phonon model,jianzhong zhang,condmatmeshall,phonon polaritons phps freestanding supported multilayers muls hexagonal boron nitride hbn systematically studied using macroscopic opticalphonon model php property confinement group velocity propagation quality factor pqf wavelength scaling studied owing nonlocal highfrequency screening upper frequency limit making twodimensional 2d phps frequency band also maximum pqf occurs near centre frequency substrate dielectric response included accurately calculate php property simple electrostatic approximation esa proper treatment php frequency omega omega0 eg omega103omega0 30layers omega0 gamma point optical phonon frequency fails describe php property near omega0 effect retardation included accurate description php wavelength versus layer thickness near omega0 deviate significantly linear scaling law given esa due strong phononphoton coupling calculated php dispersion pqf scaling compared experimental data number spectroscopic study good agreement obtained frequency incident light near centre frequency maximize pqf php wavelength confinement propagation length engineered varying mul thickness dielectric environment,18 page 9 figure,Title: twodimensional phonon polaritons multilayers hexagonal boron nitride macroscopic phonon model [SEP] Authors: jianzhong zhang [SEP] Categories: condmatmeshall [SEP] Abstract: phonon polaritons phps freestanding supported multilayers muls hexagonal boron nitride hbn systematically studied using macroscopic opticalphonon model php property confinement group velocity propagation quality factor pqf wavelength scaling studied owing nonlocal highfrequency screening upper frequency limit making twodimensional 2d phps frequency band also maximum pqf occurs near centre frequency substrate dielectric response included accurately calculate php property simple electrostatic approximation esa proper treatment php frequency omega omega0 eg omega103omega0 30layers omega0 gamma point optical phonon frequency fails describe php property near omega0 effect retardation included accurate description php wavelength versus layer thickness near omega0 deviate significantly linear scaling law given esa due strong phononphoton coupling calculated php dispersion pqf scaling compared experimental data number spectroscopic study good agreement obtained frequency incident light near centre frequency maximize pqf php wavelength confinement propagation length engineered varying mul thickness dielectric environment [SEP] Comments: 18 page 9 figure [SEP] Updated on: 2022-03-14


In [None]:
train_df = pd.read_csv('enhanced_stratified_sample_train.csv')
val_df = pd.read_csv('val_df.csv')
test_df = pd.read_csv('test_df.csv')

# Creating TF-IDF Vectors

In [None]:

tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf_train_matrix = tfidf_vectorizer.fit_transform(train_df['enhanced_text'])
# tfidf_val_matrix = tfidf_vectorizer.transform(val_df['abstract_cleaned'])
# tfidf_test_matrix = tfidf_vectorizer.transform(test_df['abstract_cleaned'])

In [7]:
# Recommendation Function
def recommend_tfidf(query_abstract, top_k=5):
    # Transform query into TF-IDF vector
    query_vector = tfidf_vectorizer.transform([query_abstract])

    # Compute cosine similarities with train set
    similarities = cosine_similarity(query_vector, tfidf_train_matrix).flatten()

    # Get top-k indices
    top_indices = similarities.argsort()[-top_k:][::-1]
    top_scores = similarities[top_indices]
    # Return top-k rows from the train_df
    return train_df.iloc[top_indices],top_scores

In [8]:
def precision_at_k(recommendations, true_category, k):
    relevant = sum(1 for category in recommendations["categories"].tolist() if category == true_category)
    return relevant / k
def recall_at_k(recommendations, true_category, all_relevant_count, k):
    relevant = sum(1 for category in recommendations["categories"].tolist() if category == true_category)
    return relevant / all_relevant_count if all_relevant_count > 0 else 0
def mean_reciprocal_rank(recommendations, true_category):
    for i, category in enumerate(recommendations["categories"].tolist()):
        if category == true_category:
            return 1 / (i + 1)  # Rank is 1-based
    return 0  # No relevant document found


In [None]:

# Define custom schema with vector as FixedSizeList
vector_dim = 10000  # Replace with the actual dimension of your TF-IDF vectors
custom_schema = pa.schema([
    pa.field("id", pa.int32()),
    pa.field("vector", pa.list_(pa.float32(), vector_dim)),  # FixedSizeList for vectors
    pa.field("title", pa.string()),
    pa.field("categories", pa.string()),
    pa.field("abstract", pa.string()),
    pa.field("authors",pa.string()),
    pa.field("comments",pa.string()),
    pa.field("update_date",pa.string()),
    pa.field("enhanced_text",pa.string())
])


# Creating the schema for storing the TF-IDF Vectors in LanceDB

In [None]:
# Assume tfidf_train_matrix and train_df are defined
tfidf_vectors = tfidf_train_matrix.toarray().astype('float32')

# Define the batch size
batch_size = 500  # Adjust based on your system's memory capacity

# Connect to LanceDB
db = lancedb.connect("lancedb_directory")

# Create or open the table in LanceDB
tbl = db.create_table("tfidf_vectors", schema=custom_schema, mode="overwrite")

# Prepare for batch insertion
batch_data = []

# Convert train_df to a list of records for efficient iteration
train_records = train_df.to_dict(orient="records")

for idx, (vector, row) in enumerate(zip(tfidf_vectors, train_records)):
    record = {
        "id": idx,
        "vector": vector.tolist(),
        "title": row["title"],
        "categories": row["categories"],
        "abstract": row["abstract"],
        "authors": row["authors"],
        "comments": row["comments"],
        "update_date": row["update_date"],
        "enhanced_text": row["enhanced_text"]
    }
    batch_data.append(record)

    # Insert batch when the specified size is reached
    if len(batch_data) == batch_size:
        batch_df = pd.DataFrame(batch_data)
        tbl.add(batch_df)
        batch_data = []  # Clear the batch_data list

# Insert any remaining data after the loop
if batch_data:
    batch_df = pd.DataFrame(batch_data)
    tbl.add(batch_df)


# Cleaning & preprocessing the test data

In [11]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the stratified sample dataset
df = pd.read_csv('test_df.csv')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Lowercasing
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize and remove stopwords, then lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


df['cleaned_abstract'] = df['abstract'].apply(clean_text)


test_df = df


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prern\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prern\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\prern\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [12]:
tfidf_test_matrix = tfidf_vectorizer.transform(test_df['cleaned_abstract'])

In [13]:
# from tqdm import tqdm  # For progress tracking

# # Precompute category counts in the training data
# category_counts = train_df["categories"].value_counts().to_dict()

# # Initialize metrics
# total_precision = 0
# total_recall = 0
# total_mrr = 0
# num_queries = len(test_df)

# # Process queries in batches
# batch_size = 500  # Adjust based on available memory
# for start in tqdm(range(0, num_queries, batch_size), desc="Processing Batches"):
#     end = min(start + batch_size, num_queries)

#     # Extract batch vectors and true categories
#     batch_vectors = tfidf_test_matrix[start:end].toarray()
#     batch_categories = test_df.iloc[start:end]["categories"].values

#     for idx, (query_vector, true_category) in enumerate(zip(batch_vectors, batch_categories)):
#         # Generate recommendations
#         recommendations = tbl.search(query_vector).metric("cosine").limit(5).to_pandas()  # Efficient LanceDB search

#         # Compute metrics
#         precision = precision_at_k(recommendations, true_category, k=5)
#         all_relevant_count = category_counts.get(true_category, 0)
#         recall = recall_at_k(recommendations, true_category, all_relevant_count, k=5)
#         mrr = mean_reciprocal_rank(recommendations, true_category)

#         total_precision += precision
#         total_recall += recall
#         total_mrr += mrr

#         # Log details for the current query
#         print(f"Processed Query {start + idx + 1}/{num_queries}:")
#         print(f" - True Category: {true_category}")
#         print(f" - Recommendations: {recommendations[['categories', 'title']]}")
#         print(f" - Precision@5: {precision:.2f}, Recall@5: {recall:.2f}, MRR: {mrr:.2f}")

# # Average metrics
# avg_precision = total_precision / num_queries
# avg_recall = total_recall / num_queries
# avg_mrr = total_mrr / num_queries

# print("\nFinal Metrics:")
# print(f"Average Precision@5: {avg_precision:.2f}")
# print(f"Average Recall@5: {avg_recall:.2f}")
# print(f"Average MRR: {avg_mrr:.2f}")


# Batch Testing with all the ground truth measures

In [None]:
# Precompute category counts in the training data
category_counts = train_df["categories"].value_counts().to_dict()

# Perform clustering on train embeddings (for clustering ground truth)
num_clusters = 20  # Adjust as needed
train_embeddings = tfidf_train_matrix.toarray()  # Or use SBERT embeddings
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
train_df['cluster'] = kmeans.fit_predict(train_embeddings)

# Initialize metrics for each ground truth method
metrics = {
    "category": {"precision": 0, "recall": 0, "mrr": 0},
    "clustering": {"precision": 0, "recall": 0, "mrr": 0},
    "similarity": {"precision": 0, "recall": 0, "mrr": 0},
    "temporal": {"precision": 0, "recall": 0, "mrr": 0},
}

# Define similarity threshold and temporal parameters
similarity_threshold = 0.7
time_window_days = 365  # 1-year window for temporal evaluation
weights = {"category": 0.3, "cluster": 0.2, "similarity": 0.3, "temporal": 0.2}

# Number of queries
num_queries = len(test_df)

# Functions for different ground truth evaluations
def precision_at_k(recommendations, true_label, k=5):
    relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
    return relevant / k

def recall_at_k(recommendations, true_label, all_relevant_count, k=5):
    relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
    return relevant / all_relevant_count if all_relevant_count > 0 else 0

def mean_reciprocal_rank(recommendations, true_label):
    for i, label in enumerate(recommendations["categories"].tolist()):
        if label == true_label:
            return 1 / (i + 1)
    return 0

def temporal_score(query_date, rec_date, window=time_window_days):
    rec_date = pd.to_datetime(rec_date, errors='coerce')
    return 1 if pd.notnull(rec_date) and abs((query_date - rec_date).days) <= window else 0

def process_query(query_vector, true_category, query_date):
    result = {
        "category": {"precision": 0, "recall": 0, "mrr": 0},
        "clustering": {"precision": 0, "recall": 0, "mrr": 0},
        "similarity": {"precision": 0, "recall": 0, "mrr": 0},
        "temporal": {"precision": 0, "recall": 0, "mrr": 0},
    }
    
    # Generate recommendations
    recommendations = tbl.search(query_vector).metric("cosine").limit(5).to_pandas()
    recommendation_vectors = np.vstack(recommendations['vector'].tolist())

    # Category-Based Evaluation
    all_relevant_count = category_counts.get(true_category, 0)
    result["category"]["precision"] = precision_at_k(recommendations, true_category, k=5)
    result["category"]["recall"] = recall_at_k(recommendations, true_category, all_relevant_count, k=5)
    result["category"]["mrr"] = mean_reciprocal_rank(recommendations, true_category)

    # Clustering-Based Evaluation
    true_cluster = train_df[train_df["categories"] == true_category]["cluster"].iloc[0]
    result["clustering"]["precision"] = precision_at_k(recommendations, true_cluster, k=5)
    result["clustering"]["recall"] = recall_at_k(recommendations, true_cluster, all_relevant_count, k=5)
    result["clustering"]["mrr"] = mean_reciprocal_rank(recommendations, true_cluster)

    # Similarity-Based Evaluation
    cosine_similarities = cosine_similarity(query_vector.reshape(1, -1), recommendation_vectors)[0]
    relevant_similar = sum(1 for score in cosine_similarities[:5] if score >= similarity_threshold)
    result["similarity"]["precision"] = relevant_similar / 5
    result["similarity"]["recall"] = relevant_similar / all_relevant_count if all_relevant_count > 0 else 0
    result["similarity"]["mrr"] = mean_reciprocal_rank(recommendations, true_category)

    # Temporal-Based Evaluation
    relevant_temporal = sum(1 for rec_date in recommendations["update_date"][:5] if temporal_score(query_date, rec_date))
    result["temporal"]["precision"] = relevant_temporal / 5
    result["temporal"]["recall"] = relevant_temporal / all_relevant_count if all_relevant_count > 0 else 0
    result["temporal"]["mrr"] = mean_reciprocal_rank(recommendations, true_category)

    return result

# Process queries in batches with parallel processing
batch_size = 500
for start in tqdm(range(0, num_queries, batch_size), desc="Processing Batches"):
    end = min(start + batch_size, num_queries)

    # Extract batch vectors and true categories
    batch_vectors = tfidf_test_matrix[start:end].toarray()
    batch_categories = test_df.iloc[start:end]["categories"].values
    batch_dates = pd.to_datetime(test_df.iloc[start:end]["update_date"], errors='coerce').values

    # Parallel processing of each query in the batch
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_query, query_vector, true_category, query_date)
                   for query_vector, true_category, query_date in zip(batch_vectors, batch_categories, batch_dates)]
        
        for future in as_completed(futures):
            result = future.result()
            for key in metrics:
                for metric in metrics[key]:
                    metrics[key][metric] += result[key][metric]

# Compute average metrics
for method in metrics:
    metrics[method]["precision"] /= num_queries
    metrics[method]["recall"] /= num_queries
    metrics[method]["mrr"] /= num_queries

# Display final metrics
print("\nFinal Metrics:")
for method, scores in metrics.items():
    print(f"\n{method.capitalize()} Ground Truth:")
    print(f" - Average Precision@5: {scores['precision']:.2f}")
    print(f" - Average Recall@5: {scores['recall']:.2f}")
    print(f" - Average MRR: {scores['mrr']:.2f}")


Processing Batches: 100%|██████████| 30/30 [2:01:40<00:00, 243.36s/it]


Final Metrics:

Category Ground Truth:
 - Average Precision@5: 0.44
 - Average Recall@5: 0.00
 - Average MRR: 0.60

Clustering Ground Truth:
 - Average Precision@5: 0.00
 - Average Recall@5: 0.00
 - Average MRR: 0.00

Similarity Ground Truth:
 - Average Precision@5: 0.00
 - Average Recall@5: 0.00
 - Average MRR: 0.60

Temporal Ground Truth:
 - Average Precision@5: 0.18
 - Average Recall@5: 0.00
 - Average MRR: 0.60





In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Function to get recommendations for a manual abstract input
def get_recommendations(manual_abstract, top_k=5):
    # Convert the manual abstract to a TF-IDF vector
    query_vector = tfidf_vectorizer.transform([manual_abstract])

    # Compute cosine similarity between the query vector and the training embeddings
    similarities = cosine_similarity(query_vector, tfidf_train_matrix)[0]

    # Get the indices of the top_k most similar entries
    top_indices = similarities.argsort()[::-1][:top_k]

    # Retrieve the recommended titles, abstracts, and similarity scores
    recommendations = train_df.iloc[top_indices][['title', 'cleaned_abstract','cleaned_categories']]
    recommendations['similarity_score'] = similarities[top_indices]

    return recommendations

# Manually provide an abstract
manual_abstract = """zeroshot quantization zsq promising compressing accelerating deep neural networks data training fullprecision models inaccessible zsq network quantization performed using synthetic samples performance quantized models depends heavily quality synthetic samples nonetheless synthetic samples constructed existing zsq methods easily fitted models accordingly quantized models obtained methods suffer significant performance degradation hard samples address issue propose hard sample synthesizing training hast specifically hast pays attention hard samples synthesizing samples makes synthetic samples hard fit training quantized models hast aligns features extracted fullprecision quantized models ensure similarity features extracted models extensive experiments hast significantly outperforms existing zsq methods achieving performance comparable models quantized real data"""

# Get the top 5 recommendations
top_recommendations = get_recommendations(manual_abstract, top_k=5)

# Display the recommendations
print("\nTop Recommendations:")
for idx, row in top_recommendations.iterrows():
    print(f"\nTitle: {row['title']}")
    print(f"Category: {row['cleaned_categories']}")
    print(f"Similarity Score: {row['similarity_score']:.4f}")


Top Recommendations:

Title: diversifying sample generation for accurate data-free quantization
Category: cscv
Similarity Score: 0.3478

Title: quantized w-algebra of ${\frak sl}(2,1)$ : a construction from the
  quantization of screening operators
Category: mathqa
Similarity Score: 0.2876

Title: training deep networks with synthetic data: bridging the reality gap by
  domain randomization
Category: cscv
Similarity Score: 0.2839

Title: splitting of a doubly quantized vortex through intertwining in
  bose-einstein condensates
Category: condmatsoft
Similarity Score: 0.2675

Title: towards semi-supervised learning of automatic post-editing:
  data-synthesis by infilling mask with erroneous tokens
Category: cscl
Similarity Score: 0.2671
