In [2]:
import requests
import pandas as pd
import feedparser
import time
import re
from urllib.parse import quote

def clean_text(text):
    """Clean text by removing extra whitespace and special characters"""
    if not text:
        return ""
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text.strip())
    # Remove special characters that might cause CSV issues
    text = re.sub(r'[^\w\s\.,;:!?()-]', '', text)
    return text

def fetch_arxiv_papers(category_query, max_results=250):
    """Fetch papers from ArXiv for a specific category"""
    base_url = "http://export.arxiv.org/api/query?"
    
    papers = []
    start = 0
    batch_size = 50  # ArXiv limits to 50 per request
    
    while len(papers) < max_results:
        query = f"search_query={quote(category_query)}&start={start}&max_results={batch_size}"
        url = base_url + query
        
        print(f"Fetching papers {start+1}-{start+batch_size} for category: {category_query}")
        
        try:
            response = feedparser.parse(url)
            
            if not response.entries:
                print(f"No more papers found for {category_query}")
                break
            
            for entry in response.entries:
                if len(papers) >= max_results:
                    break
                    
                title = clean_text(entry.title)
                abstract = clean_text(entry.summary)
                
                if title and abstract:  # Only add if both title and abstract exist
                    papers.append({
                        'title': title,
                        'abstract': abstract,
                        'text': f"{title} {abstract}"
                    })
            
            start += batch_size
            time.sleep(1)  # Be respectful to the API
            
        except Exception as e:
            print(f"Error fetching papers: {e}")
            break
    
    return papers[:max_results]

def create_research_dataset():
    """Create the complete research papers dataset"""
    
    # Define categories and their search queries
    categories = {
        'Technology': [
            'cat:cs.AI',  # Artificial Intelligence
            'cat:cs.LG',  # Machine Learning
            'cat:cs.CV',  # Computer Vision
            'cat:cs.NE'   # Neural Networks
        ],
        'Healthcare': [
            'cat:q-bio.QM',  # Quantitative Methods in Biology
            'cat:physics.med-ph',  # Medical Physics
            'all:medical AND all:health',
            'all:clinical AND all:treatment'
        ],
        'Finance': [
            'cat:q-fin.GN',  # General Finance
            'cat:q-fin.CP',  # Computational Finance
            'all:financial AND all:market',
            'all:economics AND all:investment'
        ],
        'Education': [
            'all:education AND all:learning',
            'all:educational AND all:technology',
            'all:teaching AND all:pedagogy',
            'all:curriculum AND all:assessment'
        ],
        'Environment': [
            'cat:physics.ao-ph',  # Atmospheric and Oceanic Physics
            'all:climate AND all:change',
            'all:environmental AND all:sustainability',
            'all:renewable AND all:energy'
        ]
    }
    
    all_papers = []
    
    for category, queries in categories.items():
        print(f"\n{'='*50}")
        print(f"Collecting papers for: {category}")
        print(f"{'='*50}")
        
        category_papers = []
        papers_per_query = 250 // len(queries)  # Distribute evenly across queries
        
        for query in queries:
            papers = fetch_arxiv_papers(query, papers_per_query)
            for paper in papers:
                paper['category'] = category
            category_papers.extend(papers)
            
            if len(category_papers) >= 250:  # Aim for ~250 papers per category
                break
        
        print(f"Collected {len(category_papers)} papers for {category}")
        all_papers.extend(category_papers[:250])  # Limit to 250 per category
    
    # Create DataFrame
    df = pd.DataFrame(all_papers)
    
    # Remove duplicates based on title
    df = df.drop_duplicates(subset=['title'], keep='first')
    
    # Shuffle the data
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    print(f"\n{'='*50}")
    print("DATASET CREATION SUMMARY")
    print(f"{'='*50}")
    print(f"Total papers collected: {len(df)}")
    print("\nCategory distribution:")
    print(df['category'].value_counts())
    
    return df

# Create the dataset
if __name__ == "__main__":
    print("Starting dataset creation...")
    print("This may take 10-15 minutes due to API rate limits...")
    
    dataset = create_research_dataset()
    
    # Save to CSV
    dataset.to_csv('research_papers_dataset.csv', index=False)
    print(f"\nDataset saved as 'research_papers_dataset.csv'")
    
    # Show sample data
    print("\nSample data:")
    print(dataset.head(2).to_string())

Starting dataset creation...
This may take 10-15 minutes due to API rate limits...

Collecting papers for: Technology
Fetching papers 1-50 for category: cat:cs.AI
Fetching papers 51-100 for category: cat:cs.AI
Fetching papers 1-50 for category: cat:cs.LG
Fetching papers 51-100 for category: cat:cs.LG
Fetching papers 1-50 for category: cat:cs.CV
Fetching papers 51-100 for category: cat:cs.CV
Fetching papers 1-50 for category: cat:cs.NE
Fetching papers 51-100 for category: cat:cs.NE
Collected 248 papers for Technology

Collecting papers for: Healthcare
Fetching papers 1-50 for category: cat:q-bio.QM
Fetching papers 51-100 for category: cat:q-bio.QM
Fetching papers 1-50 for category: cat:physics.med-ph
Fetching papers 51-100 for category: cat:physics.med-ph
Fetching papers 1-50 for category: all:medical AND all:health
Fetching papers 51-100 for category: all:medical AND all:health
Fetching papers 1-50 for category: all:clinical AND all:treatment
Fetching papers 51-100 for category: all:cl