In [1]:
# Dask for parallel data processing
!pip install dask[bag]

# NLP toolkit
!pip install nltk

# Keyword extraction
!pip install yake

# Advanced NLP library
!pip install spacy

# Machine learning tools
!pip install scikit-learn

# Pretrained models (BERT, T5, etc.)
!pip install transformers

# Download English model for spaCy
!python -m spacy download en_core_web_sm

# Evaluation metrics and data handling
!pip install rouge-score bert-score pandas pyarrow

# Extra scoring and sentence similarity
!pip install bert-score sentence-transformers


Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0 kB)
Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading segtok-1.5.11-py3-none-any.whl (24 kB)
Installing collected packages: segtok, yake
Successfully installed segtok-1.5.11 yake-0.4.8
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab not

In [2]:
import json  # To parse JSON lines
import re  # For whole-word keyword matching
import warnings  # To report model failures without aborting the run

import dask.bag as db  # For parallel text processing
import pandas as pd  # For data manipulation
import yake  # For keyword extraction
from tqdm import tqdm  # For progress bars
from transformers import pipeline  # For summarization and question generation

# The arXiv snapshot stores one JSON document per line: read it as text, then parse
raw_lines = db.read_text('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json')
docs = raw_lines.map(json.loads)

# Pull the first 100 parsed documents into memory
docs_sampled = docs.take(100)

# Working frame: only title and abstract, with incomplete rows dropped
df = pd.DataFrame(docs_sampled).loc[:, ['title', 'abstract']].dropna()

# Register the `progress_apply` methods on pandas objects
tqdm.pandas()

# Summarization pipeline backed by the t5-small model and tokenizer
summarizer = pipeline("summarization", tokenizer="t5-small", model="t5-small")

# Function to summarize text
def summarize_text(text):
    """Summarize one abstract with the notebook-level ``summarizer`` pipeline.

    The text is stripped, newlines are replaced by spaces, and the prefix
    ``"summarize: "`` is prepended. The generation budget is derived from the
    whitespace word count of that prompt: ``max_length`` is half the count
    clamped to the range 20..80, and ``min_length`` is half of ``max_length``
    but never below 20.

    Args:
        text: Abstract text to summarize.

    Returns:
        The generated summary. The input ``text`` is returned unchanged when it
        is blank or when the summarizer raises, so downstream steps always
        receive a string.
    """
    flattened = text.strip().replace("\n", " ")
    if not flattened:
        # Nothing to summarize; skip the model call instead of generating filler.
        return text

    # NOTE(review): the summarization pipeline may already add a task prefix
    # from the model config, which would duplicate this one — confirm.
    input_text = "summarize: " + flattened
    input_len = len(input_text.split())
    max_len = min(80, max(20, input_len // 2))
    min_len = max(20, max_len // 2)
    try:
        summary = summarizer(
            input_text, max_length=max_len, min_length=min_len, do_sample=False
        )[0]['summary_text']
    except Exception as exc:
        # Best-effort: keep the original text, but make the failure visible.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
        warnings.warn(
            f"Summarization failed; keeping original text ({exc!r})",
            RuntimeWarning,
            stacklevel=2,
        )
        summary = text
    return summary

# YAKE keyword extractor configured for English, n=1 (single words), top=5 results
kw_extractor = yake.KeywordExtractor(top=5, n=1, lan="en")

# Summarize every abstract (with a progress bar) into the `cleaned_abstract` column
abstract_column = df['abstract']
df['cleaned_abstract'] = abstract_column.progress_apply(summarize_text)

# Extract keywords from text
def extract_keywords(text):
    """Return the keywords that the notebook-level ``kw_extractor`` finds in ``text``.

    Args:
        text: Text to extract keywords from.

    Returns:
        A list with the first element of every item returned by
        ``kw_extractor.extract_keywords(text)``; an empty list if extraction
        raises.
    """
    try:
        return [kw[0] for kw in kw_extractor.extract_keywords(text)]
    except Exception as exc:
        # Best-effort: an extraction failure yields no keywords, but is reported.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
        warnings.warn(
            f"Keyword extraction failed; returning no keywords ({exc!r})",
            RuntimeWarning,
            stacklevel=2,
        )
        return []

# Text-to-text pipeline used for question generation
qg = pipeline(task="text2text-generation", model="valhalla/t5-small-qa-qg-hl")

# Extract keywords from each summary (with a progress bar) into `concepts`
summary_column = df['cleaned_abstract']
df['concepts'] = summary_column.progress_apply(extract_keywords)

# Generate question for a keyword in text
def generate_question(text, keyword):
    """Generate one question about ``keyword`` using the notebook-level ``qg`` pipeline.

    One occurrence of ``keyword`` in ``text`` is wrapped in ``<hl>`` markers and
    the result is sent to the model behind a ``"generate question: "`` prefix.

    Args:
        text: Passage the question should be about.
        keyword: Term to highlight; must occur in ``text``.

    Returns:
        The generated question, or ``""`` when ``keyword`` is empty, does not
        occur in ``text``, or the model call raises.
    """
    if not keyword or keyword not in text:
        return ""

    # Prefer an occurrence that is a whole word, so a keyword such as "art" is
    # not highlighted inside "particle"; otherwise use the first substring hit.
    whole_word = re.search(r"(?<!\w)" + re.escape(keyword) + r"(?!\w)", text)
    start = whole_word.start() if whole_word else text.index(keyword)
    end = start + len(keyword)
    highlighted = f"{text[:start]}<hl>{keyword}<hl>{text[end:]}"
    input_text = f"generate question: {highlighted}"

    try:
        return qg(input_text, max_length=64, do_sample=False)[0]['generated_text']
    except Exception as exc:
        # Best-effort: a failed generation yields "", but the failure is reported.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
        warnings.warn(
            f"Question generation failed for keyword {keyword!r} ({exc!r})",
            RuntimeWarning,
            stacklevel=2,
        )
        return ""

# Generate questions for top 3 keywords
def gen_qs(row):
    """Build questions for the first three non-empty keywords of a row.

    Args:
        row: Mapping/Series with ``cleaned_abstract`` and ``concepts`` entries.

    Returns:
        One ``generate_question`` result per non-empty keyword among the first
        three entries of ``row['concepts']``.
    """
    summary_text = row['cleaned_abstract']
    questions = []
    for term in row['concepts'][:3]:
        if not term:
            # Falsy keywords (e.g. empty strings) are skipped.
            continue
        questions.append(generate_question(summary_text, term))
    return questions

# Run question generation row by row (with a progress bar) and store the lists
question_lists = df.progress_apply(gen_qs, axis=1)
df['questions'] = question_lists

# Display first 3 examples
def display_row(row):
    """Print one processed row: summary, keywords, then numbered questions.

    Args:
        row: Mapping/Series with ``cleaned_abstract``, ``concepts`` and
            ``questions`` entries.
    """
    separator = "=" * 80

    print(" Abstract (shortened):")
    print(row['cleaned_abstract'])
    print("\n Concepts:", row['concepts'])
    print("\n Questions:")

    # Questions are numbered starting from 1.
    number = 0
    for question in row['questions']:
        number += 1
        print(f"  {number}. {question}")

    print("\n" + separator + "\n")

# Show at most the first three processed rows
preview_count = min(3, len(df))
for i in range(preview_count):
    preview_row = df.iloc[i]
    display_row(preview_row)

2025-04-15 04:27:35.646334: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744691255.877909      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744691255.940430      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda:0
 11%|█         | 11/100 [00:06<00:54,  1.62it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [00:54<00:00,  1.85it/s]
100%|██████████| 100/100 [00:00<00:00, 391.33it/s]


config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0
100%|██████████| 100/100 [00:40<00:00,  2.44it/s]

 Abstract (shortened):
a fully differential calculation is presented for the production of massive photon pairs at hadron colliders . all next-to-leading order perturbative contributions are included . the region of phase space is specified in which the calculation is most reliable . predictions are made for more detailed tests with CDF and DO data 

 Concepts: ['calculation', 'CDF', 'colliders', 'order', 'included']

 Questions:
  1. What is presented for the production of massive photon pairs at hadron colliders?
  2. What data does DO have?
  3. What is the production of massive photon pairs at?


 Abstract (shortened):
a new algorithm, the $(k,ell)$-pebble game with colors, is used to obtain a characterization of the family of graphs and algorithmic solutions to a family of problems . special instances of sparse graphs have received

 Concepts: ['k,ell', 'family', 'graphs', 'algorithm', 'pebble']

 Questions:
  1. What is the name of the $-pebble game?
  2. What is the k,ell $-pebb




In [3]:
from rouge_score import rouge_scorer

# Evaluate the summarization using ROUGE-L score
def evaluate_summarization(sample_size=10, data=None):
    """Average ROUGE-L F1 of the summaries over the first ``sample_size`` rows.

    The original ``abstract`` is used as the reference and ``cleaned_abstract``
    as the generated text, so the score measures overlap with the source text
    rather than agreement with a human-written summary.

    Args:
        sample_size: Number of leading rows to score.
        data: DataFrame with ``abstract`` and ``cleaned_abstract`` columns.
            Defaults to the notebook-level ``df``.

    Returns:
        Mean ROUGE-L F-measure, or 0.0 when no rows are scored.
    """
    frame = df if data is None else data

    # Only ROUGE-L is reported, so it is the only variant computed.
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_scores = [
        scorer.score(row['abstract'], row['cleaned_abstract'])['rougeL'].fmeasure
        for _, row in frame.head(sample_size).iterrows()
    ]

    if not rouge_scores:
        # Empty frame or non-positive sample size: avoid dividing by zero.
        return 0.0
    return sum(rouge_scores) / len(rouge_scores)

In [4]:
def evaluate_keyword_extraction(sample_size=10, data=None):
    """Average keyword precision over the first ``sample_size`` rows.

    No human-annotated keywords are available, so the "ground truth" is mocked:
    the first three whitespace-separated tokens of each lowercased abstract.
    Treat the score as a smoke test, not as a real quality measure.

    NOTE(review): this function is defined again in the following cell; the
    later definition replaces this one when the notebook runs top to bottom.
    Consider deleting one of the two.

    Args:
        sample_size: Number of leading rows to score.
        data: DataFrame with ``abstract`` and ``concepts`` columns. Defaults to
            the notebook-level ``df``.

    Returns:
        Mean per-row precision (matching keywords divided by the number of
        predicted keywords; 0 for a row without predictions), or 0.0 when no
        rows are scored.

    Side effects:
        Writes a ``true_keywords`` column onto the evaluated frame.
    """
    frame = df if data is None else data

    # Mock "true" keywords (replace with human-annotated labels if available)
    frame['true_keywords'] = frame['abstract'].apply(
        lambda x: list(set(x.lower().split()[:3]))  # Simulated ground truth
    )

    precisions = []
    for _, row in frame.head(sample_size).iterrows():
        true_keywords = set(row['true_keywords'])
        predicted_keywords = row['concepts']
        # The ground truth is lowercased, so predictions must be compared in
        # lowercase too; otherwise a keyword like "CDF" can never match.
        predicted_lower = {kw.lower() for kw in predicted_keywords}
        correct = len(predicted_lower & true_keywords)
        precision = correct / len(predicted_keywords) if predicted_keywords else 0
        precisions.append(precision)

    if not precisions:
        # Empty frame or non-positive sample size: avoid dividing by zero.
        return 0.0
    return sum(precisions) / len(precisions)

In [5]:
def evaluate_keyword_extraction(sample_size=10, data=None):
    """Average keyword precision over the first ``sample_size`` rows.

    No human-annotated keywords are available, so the "ground truth" is
    simulated: the first three whitespace-separated tokens of each lowercased
    abstract. Treat the score as a smoke test, not as a real quality measure.

    NOTE(review): a function with this same name is also defined in the
    preceding cell; this later definition is the one in effect when the
    notebook runs top to bottom. Consider deleting one of the two.

    Args:
        sample_size: Number of leading rows to score.
        data: DataFrame with ``abstract`` and ``concepts`` columns. Defaults to
            the notebook-level ``df``.

    Returns:
        Mean per-row precision (matching keywords divided by the number of
        predicted keywords; 0 for a row without predictions), or 0.0 when no
        rows are scored.

    Side effects:
        Writes a ``true_keywords`` column onto the evaluated frame.
    """
    frame = df if data is None else data

    # Simulate ground truth: first 3 words of the abstract as "true" keywords
    frame['true_keywords'] = frame['abstract'].apply(
        lambda x: list(set(x.lower().split()[:3]))
    )

    precisions = []
    for _, row in frame.head(sample_size).iterrows():
        true_keywords = set(row['true_keywords'])
        predicted_keywords = row['concepts']
        # The ground truth is lowercased, so predictions must be compared in
        # lowercase too; otherwise a keyword like "CDF" can never match.
        predicted_lower = {kw.lower() for kw in predicted_keywords}
        correct = len(predicted_lower & true_keywords)  # True positives
        precision = correct / len(predicted_keywords) if predicted_keywords else 0
        precisions.append(precision)

    if not precisions:
        # Empty frame or non-positive sample size: avoid dividing by zero.
        return 0.0
    return sum(precisions) / len(precisions)  # Average precision

In [6]:
def evaluate_qg_automated(sample_size=10, data=None):
    """Share of generated questions that mention at least one extracted keyword.

    This is a purely lexical heuristic: a question counts as valid when any
    keyword of its row occurs in it, compared case-insensitively. Empty
    questions (failed generations) are counted in the total and are never valid.

    Args:
        sample_size: Number of leading rows to score.
        data: DataFrame with ``concepts`` and ``questions`` columns. Defaults
            to the notebook-level ``df``.

    Returns:
        ``valid / total`` over all questions in the sampled rows, or 0 when
        there are no questions.
    """
    frame = df if data is None else data

    total = 0
    valid = 0

    for _, row in frame.head(sample_size).iterrows():
        # Drop empty keywords: "" is a substring of every string and would
        # otherwise mark every question of the row as valid.
        keywords = [kw.lower() for kw in row['concepts'] if kw]

        for q in row['questions']:
            question = q.lower()
            if any(kw in question for kw in keywords):
                valid += 1
            total += 1

    # Avoid division by zero when no questions were generated.
    return valid / total if total > 0 else 0

In [7]:
# Step 1: score each pipeline stage on the same number of leading rows
EVAL_SAMPLE_SIZE = 10
summarization_accuracy = evaluate_summarization(sample_size=EVAL_SAMPLE_SIZE)
keyword_accuracy = evaluate_keyword_extraction(sample_size=EVAL_SAMPLE_SIZE)
qg_accuracy = evaluate_qg_automated(sample_size=EVAL_SAMPLE_SIZE)

# Step 2: blend the three stage scores with fixed weights (tune if needed)
W_SUMMARIZATION, W_KEYWORDS, W_QUESTIONS = 0.4, 0.3, 0.3
overall_accuracy = (
    W_SUMMARIZATION * summarization_accuracy
    + W_KEYWORDS * keyword_accuracy
    + W_QUESTIONS * qg_accuracy
)

# Step 3: report every score as a percentage with two decimals
report_lines = [
    f"\nSummarization Accuracy (ROUGE-L F1): {summarization_accuracy:.2%}",
    f"Keyword Extraction Accuracy (Precision@5): {keyword_accuracy:.2%}",
    f"Question Generation Accuracy: {qg_accuracy:.2%}",
    f"\nOverall Pipeline Accuracy: {overall_accuracy:.2%}",
]
for report_line in report_lines:
    print(report_line)


Summarization Accuracy (ROUGE-L F1): 41.60%
Keyword Extraction Accuracy (Precision@5): 4.00%
Question Generation Accuracy: 86.67%

Overall Pipeline Accuracy: 43.84%
