In [1]:
# Installation of libraries

!pip install pandas          # Data handling
!pip install spacy           # NLP tasks
!pip install scikit-learn    # ML tools
!pip install nltk            # Text preprocessing
!pip install gensim          # Word embeddings (turn words into numbers so that similar words have similar vectors
!pip install transformers    # Pretrained models (T5)

!python -m spacy download xx_ent_wiki_sm  # Multilingual NER model

Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.2
    Uninstalling scipy-1.15.2:
      Successfully uninstalled scipy-1.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.
nilearn 0.11.1 req

In [2]:
#Load data from Kaggle and prepare the text corpus

import pandas as pd

# Load JSON dataset (change path if necessary)
json_path = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'
df = pd.read_json(json_path, lines=True)

# Reduce to 100 rows for performance/testing purposes
df = df.head(100)

# Combine title and abstract into one text field for analysis
df['text'] = df['title'] + '. ' + df['abstract']

In [3]:
# Clean and preprocess the text (remove stopwords, lowercase, lemmatize)

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Define English stopwords
stop_words = set(stopwords.words('english'))

# Define a preprocessing function
def preprocess(text):
    # Lowercase and tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove punctuation and stopwords, keep only alphabetic tokens
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    return ' '.join(tokens)

# Apply the preprocessing to the text column
df['clean_text'] = df['text'].apply(preprocess)



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Extract concepts using spaCy Named Entity Recognition (NER)

import spacy

# Load a multilingual pre-trained NER model
nlp = spacy.load("xx_ent_wiki_sm")

# Extract short entities (less than 5 words) as concepts
def extract_concepts(text):
    doc = nlp(text)
    return list(set(ent.text for ent in doc.ents if len(ent.text.split()) < 5))

# Apply the NER function to each row
df['concepts'] = df['text'].apply(extract_concepts)



In [5]:
# Extract key concepts using TF-IDF (Term Frequency - Inverse Document Frequency)

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a vectorizer that selects the top 1000 important words
vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the cleaned text into TF-IDF features
X_tfidf = vectorizer.fit_transform(df['clean_text'])

# Retrieve the feature names (vocabulary)
feature_names = vectorizer.get_feature_names_out()

In [6]:
# Classify concepts into categories using a simple RandomForest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Create fake categories for demonstration purposes
categories = ['person', 'place', 'event']
df['category'] = [categories[i % 3] for i in range(len(df))]

# Encode the category labels into numeric values
le = LabelEncoder()
y = le.fit_transform(df['category'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a RandomForest model on the features
clf = RandomForestClassifier()
clf.fit(X_train, y_train)


In [7]:
# Generate questions from extracted concepts using a T5 model

from transformers import pipeline

# Load the T5 model for question generation
generator = pipeline("text2text-generation", model="valhalla/t5-small-qa-qg-hl")

# Define a function to generate a question based on a concept and its context
def generate_question(concept, context):
    # Highlight the concept in the context for the model
    marked = context.replace(concept, f"<hl>{concept}<hl>", 1)
    # Create the input prompt
    prompt = f"Generate a question: {marked}"
    # Generate the question
    result = generator(prompt, max_length=64, do_sample=False)
    return result[0]['generated_text']



2025-04-15 04:20:37.717574: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744690837.910838      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744690837.969588      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0


In [8]:
# Apply the question generator and print results

questions = []

# Loop through each row of the dataset
for i, row in df.iterrows():
    text = row['text']
    concepts = row['concepts']
    
    # Skip rows with no detected concepts
    if not concepts:
        continue

    # Use the first detected concept
    concept = concepts[0]
    
    try:
        # Generate a question based on the concept and text
        question = generate_question(concept, text)
        questions.append({
            "citation": text,
            "concept": concept,
            "question": question
        })

        # Print the result
        print(f"{i+1}. Citation:\n{text}\n")
        print(f"   Concept: {concept}")
        print(f"   Question: {question}\n{'-'*80}\n")

        # Stop after generating 100 questions
        if len(questions) >= 100:
            break
    except Exception as e:
        # Skip rows that cause errors
        print(f"Erreur à la ligne {i}: {e}")
        continue


1. Citation:
Calculation of prompt diphoton production cross sections at Tevatron and
  LHC energies.   A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
more detailed tests with CDF and DO data. Predictions are shown for
distributions of diphoton pairs produced at the energy of the Large Hadron
Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs
boson are contrasted with those produced from QCD processes at the LHC, showing
that en

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


11. Citation:
Computing genus 2 Hilbert-Siegel modular forms over $\Q(\sqrt{5})$ via
  the Jacquet-Langlands correspondence.   In this paper we present an algorithm for computing Hecke eigensystems of
Hilbert-Siegel cusp forms over real quadratic fields of narrow class number
one. We give some illustrative examples using the quadratic field
$\Q(\sqrt{5})$. In those examples, we identify Hilbert-Siegel eigenforms that
are possible lifts from Hilbert eigenforms.


   Concept: Hilbert-Siegel
   Question: What modular forms are computed via the Jacquet-Langlands correspondence?
--------------------------------------------------------------------------------

12. Citation:
Distribution of integral Fourier Coefficients of a Modular Form of Half
  Integral Weight Modulo Primes.   Recently, Bruinier and Ono classified cusp forms $f(z) := \sum_{n=0}^{\infty}
a_f(n)q ^n \in S_{\lambda+1/2}(\Gamma_0(N),\chi)\cap \mathbb{Z}[[q]]$ that does
not satisfy a certain distribution property for modulo odd

Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors


58. Citation:
Intelligent Life in Cosmology.   I shall present three arguments for the proposition that intelligent life is
very rare in the universe. First, I shall summarize the consensus opinion of
the founders of the Modern Synthesis (Simpson, Dobzhanski, and Mayr) that the
evolution of intelligent life is exceedingly improbable. Second, I shall
develop the Fermi Paradox: if they existed they'd be here. Third, I shall show
that if intelligent life were too common, it would use up all available
resources and die out. But I shall show that the quantum mechanical principle
of unitarity (actually a form of teleology!) requires intelligent life to
survive to the end of time. Finally, I shall argue that, if the universe is
indeed accelerating, then survival to the end of time requires that intelligent
life, though rare, to have evolved several times in the visible universe. I
shall argue that the acceleration is a consequence of the excess of matter over
antimatter in the universe. I sha

In [9]:
# Calculate Text Preprocessing Accuracy

def evaluate_preprocessing(sample_size=50):
    correct = 0
    for i, row in df.sample(sample_size).iterrows():
        original_text = row['text']
        processed_text = row['clean_text']
        
        # Checks if preprocessing removed stopwords, punctuation, and lowercase
        has_issues = (
            any(c.isupper() for c in processed_text) or          # Uppercase check
            any(c in string.punctuation for c in processed_text) or  # Punctuation check
            any(word in stop_words for word in processed_text.split())  # Stopword check
        )
        
        if not has_issues:
            correct += 1
    
    return correct / sample_size  # Returns % of correctly preprocessed texts

In [10]:
# Calculate Concept Extraction Accuracy

def evaluate_ner(sample_size=50):
    correct = 0
    for i, row in df.sample(sample_size).iterrows():
        extracted_concepts = row['concepts']
        
        # Checks if NER extracted at least one valid concept
        if len(extracted_concepts) > 0:  
            correct += 1
    
    return correct / sample_size  # Returns % of texts with valid extractions

In [11]:
# Calculate Classification Accuracy (RandomForest)

from sklearn.metrics import accuracy_score

# Predict on test set and compare with true labels
y_pred = clf.predict(X_test)  
classification_accuracy = accuracy_score(y_test, y_pred)  # Standard accuracy metric

In [12]:
# Calculate Question Generation Accuracy

def evaluate_question_generation(sample_size=20):
    good_questions = 0
    for i, question_data in enumerate(questions[:sample_size]):
        concept = question_data['concept']
        question = question_data['question']
        
        # Checks if question is relevant and grammatically valid
        if (
            concept.lower() in question.lower() and  # Concept appears in question
            question.endswith("?") and              # Ends with question mark
            len(question.split()) > 3               # Not too short
        ):
            good_questions += 1
    
    return good_questions / sample_size  # Returns % of valid questions

In [13]:
# 1. Run all evaluation functions first
preprocessing_accuracy = evaluate_preprocessing(sample_size=50)
ner_accuracy = evaluate_ner(sample_size=50)
classification_accuracy = accuracy_score(y_test, clf.predict(X_test))
qg_accuracy = evaluate_question_generation(sample_size=20)

# 2. Then compute weighted average
overall_accuracy = (
    0.2 * preprocessing_accuracy +
    0.3 * ner_accuracy +
    0.3 * classification_accuracy +
    0.2 * qg_accuracy
)

# 3. Print all results
print(f"Text Preprocessing Accuracy: {preprocessing_accuracy:.2%}")
print(f"NER Accuracy: {ner_accuracy:.2%}")
print(f"Classification Accuracy: {classification_accuracy:.2%}")
print(f"Question Generation Accuracy: {qg_accuracy:.2%}")
print(f"\nOverall Pipeline Accuracy: {overall_accuracy:.2%}")

Text Preprocessing Accuracy: 100.00%
NER Accuracy: 92.00%
Classification Accuracy: 30.00%
Question Generation Accuracy: 5.00%

Overall Pipeline Accuracy: 57.60%
