# 📚 ResearchBot - Model Preparation Notebook
This notebook loads the arXiv metadata, filters computer science papers, and prepares the dataset.

In [2]:
import json
import csv

# Input and output files
input_file = r'C:/Users/rosaa/OneDrive/Documents/nullclass_intern_task3/archive/arxiv-metadata-oai-snapshot.json'
output_file = 'filtered_cs_papers.csv'

# Categories to keep
target_prefix = 'cs.'

# Open input and output
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    # Write CSV headers
    writer.writerow(['id', 'title', 'abstract', 'categories'])

    count = 0
    for line in infile:
        try:
            paper = json.loads(line)
            categories = paper.get('categories', '')
            if target_prefix in categories:
                writer.writerow([
                    paper.get('id', ''),
                    paper.get('title', '').strip(),
                    paper.get('abstract', '').strip(),
                    categories
                ])
                count += 1
            if count >= 5000:
                break  # Optional: stop after 5000 papers
        except json.JSONDecodeError:
            continue

print(f"✅ Done! Saved {count} computer science papers to '{output_file}'.")


✅ Done! Saved 5000 computer science papers to 'filtered_cs_papers.csv'.


In [1]:
# Load and preview filtered CSV
import pandas as pd

df = pd.read_csv("filtered_cs_papers.csv")
df = df.dropna(subset=["title", "abstract"])
df["content"] = df["title"] + ". " + df["abstract"]

df.head()


Unnamed: 0,id,title,abstract,categories,content
0,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,Sparsity-certifying Graph Decompositions. We d...
1,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is describe...,physics.gen-ph,The evolution of the Earth-Moon system based o...
2,704.0021,Molecular Synchronization Waves in Arrays of A...,Spatiotemporal pattern formation in a product-...,nlin.PS physics.chem-ph q-bio.MN,Molecular Synchronization Waves in Arrays of A...
3,704.0033,Convergence of the discrete dipole approximati...,We performed a rigorous theoretical convergenc...,physics.optics physics.comp-ph,Convergence of the discrete dipole approximati...
4,704.0035,Convergence of the discrete dipole approximati...,We propose an extrapolation technique that all...,physics.optics physics.comp-ph,Convergence of the discrete dipole approximati...
