In [1]:
# Dependencies Installation
!pip install dask[bag]        # For loading and processing large JSON files in parallel (arXiv data).
!pip install nltk             # For text preprocessing (stopwords like a,the,of.. removal).
!pip install yake             # For extracting keywords (concepts) from abstracts.
!pip install spacy            # For Named Entity Recognition (NER) using `en_core_web_sm`.
!pip install scikit-learn     # For ML tasks if extended.
!pip install transformers     # For question generation using pre-trained T5 model.
!python -m spacy download en_core_web_sm  # English model for spaCy (used to extract named entities).

Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0 kB)
Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading segtok-1.5.11-py3-none-any.whl (24 kB)
Installing collected packages: segtok, yake
Successfully installed segtok-1.5.11 yake-0.4.8
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab not

In [2]:
#Data Loading and Filtering
import dask.bag as db
import json

# Load JSON data with Dask and parse each line as a JSON object
docs = db.read_text('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json').map(json.loads)

# Keep only Computer Science articles
def trim(x):
    return {
        'id': x['id'],
        'title': x['title'],
        'abstract': x['abstract'],
        'categories': x['categories'],
        'update_date': x['update_date']
    }

docs_cs = (docs
           .filter(lambda x: "cs" in str(x["categories"]) and "physics" not in str(x["categories"]))
           .map(trim)
           .compute())

#Data Preprocessing
import pandas as pd
df = pd.DataFrame(docs_cs)
df = df.dropna(subset=["abstract"]).reset_index(drop=True)
df.head()

Unnamed: 0,id,title,abstract,categories,update_date
0,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,2008-12-13
1,704.0046,A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldma...",quant-ph cs.IT math.IT,2009-11-13
2,704.0047,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is...,cs.NE cs.AI,2009-09-29
3,704.005,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emi...,cs.NE cs.AI,2007-05-23
4,704.0062,On-line Viterbi Algorithm and Its Relationship...,"In this paper, we introduce the on-line Vite...",cs.DS,2010-01-25


In [3]:
# Cleaning Data
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Clean abstract: lowercase, remove punctuation, remove stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['cleaned_abstract'] = df['abstract'].apply(clean_text)
df[['abstract', 'cleaned_abstract']].head()


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,abstract,cleaned_abstract
0,"We describe a new algorithm, the $(k,\ell)$-...",describe new algorithm kellpebble game colors ...
1,"In a quantum mechanical model, Diosi, Feldma...",quantum mechanical model diosi feldmann koslof...
2,The intelligent acoustic emission locator is...,intelligent acoustic emission locator describe...
3,Part I describes an intelligent acoustic emi...,part describes intelligent acoustic emission l...
4,"In this paper, we introduce the on-line Vite...",paper introduce online viterbi algorithm decod...


In [4]:
# Keyword Extraction with YAKE
import yake

# Sample 10,000 abstracts to reduce processing time
sample_df = df.sample(10000, random_state=42)

kw_extractor = yake.KeywordExtractor(lan="en", n=1, top=5)

# Extract top 5 single-word keywords from each abstract
sample_df['concepts'] = sample_df['cleaned_abstract'].apply(lambda x: [kw[0] for kw in kw_extractor.extract_keywords(x)])

sample_df[['cleaned_abstract', 'concepts']].head()


Unnamed: 0,cleaned_abstract,concepts
555077,raman spectroscopy photonic modality based ine...,"[raman, achieve, sers, system, spectroscopy]"
394218,paper propose gametheoretic solution parking p...,"[paper, space, solution, parking, approach]"
325072,wellknown fractal signals appear many fields s...,"[traffic, signals, computer, flows, results]"
494267,motivated various computational applications i...,"[nested, expectations, estimator, motivated, s..."
457170,reproduction numbers widely used estimation pr...,"[reproduction, numbers, distributed, spreading..."


In [5]:
# Named Entity Recognition with spaCy

import spacy
nlp = spacy.load("en_core_web_sm")

texts = sample_df['abstract'].tolist()
entities = []

# Extract named entities from each abstract
for doc in nlp.pipe(texts, batch_size=50):
    ents = [(ent.text, ent.label_) for ent in doc.ents]
    entities.append(ents)

sample_df['entities'] = entities
sample_df[['abstract', 'entities']].head()

Unnamed: 0,abstract,entities
555077,"Raman spectroscopy, a photonic modality base...","[(Raman, PERSON), (Surface-Enhanced Raman, ORG..."
394218,"In this paper, we propose a game-theoretic s...","[(Nash, ORG)]"
325072,It is well-known that fractal signals appear...,"[(LAN, ORG), (WWW, ORG), (VBR, ORG), (first, O..."
494267,Motivated by various computational applicati...,"[(Monte Carlo, PERSON)]"
457170,Reproduction numbers are widely used for the...,"[(SIS, ORG), (SIR, ORG)]"


In [6]:
# Question Generation with Hugging Face Transformers
from transformers import pipeline

# Load pre-trained T5 question generator
qg = pipeline("text2text-generation", model="valhalla/t5-small-qa-qg-hl")

def generate_question(text, keyword):
    highlighted = text.replace(keyword, f"<hl>{keyword}<hl>")
    input_text = f"generate question: {highlighted}"
    output = qg(input_text, max_length=64)[0]['generated_text']
    return output

# Highlight a keyword and generate a question about it (example)
text_sample = sample_df.iloc[0]['cleaned_abstract']
keyword_sample = sample_df.iloc[0]['concepts'][0]
print("Keyword :", keyword_sample)
print("Generated Question :", generate_question(text_sample, keyword_sample))

2025-04-14 19:28:53.439210: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744658933.645423      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744658933.707194      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0


Keyword : raman
Generated Question : What is the spectroscopy algorithm that uses metal nanostructures to detect a weak tissue?


In [7]:
# Batch Question Generation

# Make sure concepts are lists
sample_df['concepts'] = sample_df['concepts'].apply(lambda x: x if isinstance(x, list) else [])

# Generate up to 3 questions per abstract using the top keywords
def gen_qs(row):
    abstract = row['cleaned_abstract']
    keywords = row['concepts'][:3]
    questions = []
    for kw in keywords:
        try:
            q = generate_question(abstract, kw)
            questions.append(q)
        except Exception as e:
            questions.append(f"Error: {str(e)}")
    return questions

# Apply to a small sample (100) for quick testing
sample_df = sample_df.head(100)  # pour tester rapidement
sample_df['questions'] = sample_df.apply(gen_qs, axis=1)

# Print result of quick testing 
sample_df[['cleaned_abstract', 'concepts', 'questions']].head()



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,cleaned_abstract,concepts,questions
555077,raman spectroscopy photonic modality based ine...,"[raman, achieve, sers, system, spectroscopy]",[What is the spectroscopy algorithm that uses ...
394218,paper propose gametheoretic solution parking p...,"[paper, space, solution, parking, approach]",[What paper propose gametheoretic solution par...
325072,wellknown fractal signals appear many fields s...,"[traffic, signals, computer, flows, results]",[What is the name of the flow of fractal signa...
494267,motivated various computational applications i...,"[nested, expectations, estimator, motivated, s...",[What is the expected outcome of a novel monte...
457170,reproduction numbers widely used estimation pr...,"[reproduction, numbers, distributed, spreading...",[What type of numbers provide finegrained anal...


In [8]:
#Print final Results

# Display abstract, extracted concepts, and generated questions
for idx, row in sample_df.iterrows():
    print(f"\n Abstract {idx}:\n{row['abstract']}\n")
    print(f" Concepts: {', '.join(row['concepts'])}\n")
    print(" Generated Questions:")
    for i, question in enumerate(row['questions'], 1):
        print(f"  {i}. {question}")
    print("=" * 80)


 Abstract 555077:
  Raman spectroscopy, a photonic modality based on the inelastic backscattering
of coherent light, is a valuable asset to the intraoperative sensing space,
offering non-ionizing potential and highly-specific molecular fingerprint-like
spectroscopic signatures that can be used for diagnosis of pathological tissue
in the dynamic surgical field. Though Raman suffers from weakness in intensity,
Surface-Enhanced Raman Spectroscopy (SERS), which uses metal nanostructures to
amplify Raman signals, can achieve detection sensitivities that rival
traditional photonic modalities. In this study, we outline a robotic Raman
system that can reliably pinpoint the location and boundaries of a tumor
embedded in healthy tissue, modeled here as a tissue-mimicking phantom with
selectively infused Gold Nanostar regions. Further, due to the relative dearth
of collected biological SERS or Raman data, we implement transfer learning to
achieve 100% validation classification accuracy for Gold 

In [9]:
# Calculate Keyword Extraction Accuracy
def evaluate_keyword_extraction(sample_size=50):
    correct = 0
    total = 0
    
    # Randomly sample abstracts
    for _, row in sample_df.sample(sample_size).iterrows():
        # Count as correct if keywords (concepts) are not empty
        if len(row['concepts']) > 0:
            correct += 1
        total += 1

    return correct / total  # Return basic accuracy

# Run and print accuracy
keyword_accuracy = evaluate_keyword_extraction()
print(f"Keyword Extraction Accuracy: {keyword_accuracy:.2%}")

Keyword Extraction Accuracy: 100.00%


In [10]:
# Calculate NER Accuracy
def evaluate_ner(sample_size=50):
    correct = 0
    total = 0
    
    for _, row in sample_df.sample(sample_size).iterrows():
        entities = row['entities']
        
        # Count non-empty entity lists as correct
        if len(entities) > 0:
            correct += 1
        total += 1
    
    return correct / total
    
# Run and print accuracy
ner_accuracy = evaluate_ner()
print(f"NER Accuracy: {ner_accuracy:.2%}")

NER Accuracy: 86.00%


In [11]:
# Calculate Question Generation Accuracy
def evaluate_question_generation(sample_size=20):
    good_questions = 0
    total_questions = 0
    
    for _, row in sample_df.sample(sample_size).iterrows():
        for question in row['questions']:
            # 1. Is the question grammatically correct?
            # 2. Is it relevant to the abstract?
            # 3. Does it make sense given the keyword?
            if not question.startswith("Error"):
                good_questions += 1
            total_questions += 1
    
    return good_questions / total_questions if total_questions > 0 else 0

qg_accuracy = evaluate_question_generation()
print(f"Question Generation Accuracy: {qg_accuracy:.2%}")

Question Generation Accuracy: 100.00%


In [12]:
# Calculate Overall Pipeline Accuracy Estimate
overall_accuracy = (
    0.4 * keyword_accuracy + 
    0.3 * ner_accuracy + 
    0.3 * qg_accuracy
)

print(f"\nOverall Pipeline Accuracy Estimate: {overall_accuracy:.2%}")


Overall Pipeline Accuracy Estimate: 95.80%
