In [1]:
import calendar
import csv
import json
import os
import time
import unicodedata
from collections import Counter
from datetime import datetime

import pandas as pd
import requests

## Functions that define the search and respect rate limits (limits are worked around by splitting the search by field of study (FoS), year by year and month by month)

In [2]:
# Root URL of the Semantic Scholar Graph API; endpoint paths such as "/paper/search" are appended to it.
BASE_URL = "https://api.semanticscholar.org/graph/v1"

def format_semantic_scholar_query(mandatory_keywords, optional_keywords):
    """Build the search string from two keyword collections.

    Each mandatory keyword is wrapped in double quotes. Each optional keyword
    is wrapped in double quotes and prefixed with ``|``. All terms are
    separated by single spaces, and surrounding whitespace is stripped from
    the final string.
    """
    required_terms = [f'"{keyword}"' for keyword in mandatory_keywords]
    alternative_terms = [f'|"{keyword}"' for keyword in optional_keywords]

    required_part = ' '.join(required_terms)
    alternative_part = ' '.join(alternative_terms)

    combined = required_part + ' ' + alternative_part
    return combined.strip()

def search_papers(query, year, field_of_study=None, limit=100, offset=0, month_start=None, month_end=None, max_retries=5):
    """Fetch one page of results from the paper search endpoint.

    Args:
        query: String sent as the ``query`` parameter.
        year: Publication year to search.
        field_of_study: Optional value for the ``fieldsOfStudy`` filter.
        limit: Page size sent as ``limit``.
        offset: Index of the first result of the page, sent as ``offset``.
        month_start: Optional first month (1-12). When given, the request is
            restricted with ``publicationDateOrYear`` to a date range inside
            ``year``; otherwise the whole ``year`` is searched.
        month_end: Optional last month (1-12) of the date range. Defaults to
            ``month_start`` so that exactly one month is searched.
        max_retries: Number of times a rate-limited (HTTP 429) request is
            retried before giving up.

    Returns:
        A ``(papers, total)`` tuple taken from the ``data`` and ``total`` keys
        of the JSON response. ``([], 0)`` is returned when the request fails
        or the rate limit persists after ``max_retries`` retries.

    Raises:
        ValueError: If ``month_start``/``month_end`` is not a valid month
            number (raised by ``calendar.monthrange``).
    """
    print(f"Searching papers for year {year}, field of study {field_of_study}, offset: {offset}")
    url = f"{BASE_URL}/paper/search"

    params = {
        "query": query,
        "fields": "paperId,title,abstract,authors,year,citationCount,fieldsOfStudy,venue",
        "limit": limit,
        "offset": offset
    }

    if month_start is not None:
        # Honour month_end when supplied; otherwise search the single month.
        last_month = month_start if month_end is None else month_end
        # monthrange returns (weekday_of_first_day, number_of_days); it handles leap years.
        last_day = calendar.monthrange(year, last_month)[1]
        date_range = f"{year}-{month_start:02d}-01:{year}-{last_month:02d}-{last_day:02d}"
        print(f"Using date range: {date_range}")
        params["publicationDateOrYear"] = date_range
    else:
        params["year"] = year

    if field_of_study:
        params["fieldsOfStudy"] = field_of_study
    print(params)

    retries = 0
    while True:
        try:
            response = requests.get(url, params=params, timeout=30)
            if response.status_code == 429:  # Rate limit hit
                if retries >= max_retries:
                    print(f"Still rate limited after {max_retries} retries. Giving up on this page.")
                    return [], 0
                retries += 1
                try:
                    retry_after = int(response.headers.get('Retry-After', 10))
                except (TypeError, ValueError):
                    # Header present but not a plain number of seconds.
                    retry_after = 10
                print(f"Rate limited. Waiting {retry_after} seconds...")
                time.sleep(retry_after)
                # Loop instead of recursing so a long rate-limit streak cannot exhaust the call stack.
                continue

            response.raise_for_status()
            data = response.json()
            papers = data.get("data", [])
            total = data.get("total", 0)
            print(f"Number of papers retrieved: {len(papers)}")
            return papers, total
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while searching papers: {e}")
            return [], 0
    
    
def iterate_fields_of_study(mandatory_keywords, optional_keywords, fields_of_study, start_year=1980, end_year=None):
    """Collect papers by querying every (year, field of study, month) combination.

    Results already stored in ``partial_results.json`` are loaded first and
    extended. New papers are de-duplicated on ``paperId`` and the full list is
    checkpointed with ``save_results_to_file`` every ``SAVE_INTERVAL`` new
    papers, after an error, and once more at the end if anything is unsaved.

    Args:
        mandatory_keywords: Keywords passed to ``format_semantic_scholar_query``.
        optional_keywords: Optional keywords passed to the same formatter.
        fields_of_study: Iterable of field-of-study filter values.
        start_year: First year to search (inclusive).
        end_year: Last year to search (inclusive); defaults to the current year.

    Returns:
        The list of all collected paper dictionaries, including any loaded
        from the checkpoint file.
    """
    semantic_search_query_string = format_semantic_scholar_query(mandatory_keywords, optional_keywords)
    all_papers = []
    if end_year is None:
        end_year = datetime.now().year
    papers_since_last_save = 0
    SAVE_INTERVAL = 10
    PAGE_SIZE = 100
    max_papers_per_field = 1000  # cap per (year, field, month) combination

    # Try to resume from an existing checkpoint file.
    try:
        with open("partial_results.json", 'r', encoding='utf-8') as f:
            all_papers = json.load(f)
        print(f"Loaded {len(all_papers)} existing results")
    except FileNotFoundError:
        print("Starting fresh search")
    except (UnicodeDecodeError, json.JSONDecodeError):
        # A truncated or otherwise unreadable checkpoint must not abort the run.
        print("Error reading existing file, starting fresh search")
        all_papers = []

    # Set of known IDs gives constant-time duplicate checks instead of
    # rescanning the whole result list for every incoming paper.
    seen_ids = {paper.get('paperId') for paper in all_papers}

    for year in range(start_year, end_year + 1):
        print(f"\nSearching for year {year}")
        for field in fields_of_study:
            print(f"Searching for field of study: {field}")
            for month_start in range(1, 13):
                print(f'Processing month {month_start}')
                offset = 0
                total_retrieved = 0

                while total_retrieved < max_papers_per_field:
                    try:
                        papers, total = search_papers(
                            semantic_search_query_string,
                            year,
                            field,
                            limit=PAGE_SIZE,
                            offset=offset,
                            month_start=month_start,
                            month_end=month_start
                        )

                        if not papers:
                            break

                        for paper in papers:
                            paper_id = paper['paperId']
                            if paper_id not in seen_ids:
                                seen_ids.add(paper_id)
                                all_papers.append(paper)
                                papers_since_last_save += 1

                        total_retrieved += len(papers)
                        offset += len(papers)

                        # Save periodically
                        if papers_since_last_save >= SAVE_INTERVAL:
                            save_results_to_file(all_papers)
                            papers_since_last_save = 0
                            print(f"Saved {len(all_papers)} total papers")

                        # A short page or an offset past the reported total means this slice is exhausted.
                        if offset >= total or len(papers) < PAGE_SIZE:
                            break

                        time.sleep(3)

                    except Exception as e:
                        print(f"Error during search: {e}")
                        save_results_to_file(all_papers)
                        papers_since_last_save = 0
                        time.sleep(5)
                        break

    # Flush papers collected since the last periodic checkpoint.
    if papers_since_last_save:
        save_results_to_file(all_papers)
    return all_papers

def save_results_to_file(results, filename="partial_results.json"):
    """Save the current results to a JSON file after cleaning each paper.

    The data is written to ``<filename>.tmp`` first and then moved over
    ``filename`` with ``os.replace``, so interrupting the kernel in the middle
    of a write cannot leave a truncated checkpoint behind.

    Args:
        results: List of paper dictionaries; each is passed through
            ``clean_paper_data`` before serialisation.
        filename: Destination path of the JSON file.
    """
    cleaned_results = [clean_paper_data(paper) for paper in results]
    temp_filename = f"{filename}.tmp"
    with open(temp_filename, 'w', encoding='utf-8') as f:
        json.dump(cleaned_results, f, ensure_ascii=False, indent=2)
    # Replace the previous checkpoint only once the new one is fully written.
    os.replace(temp_filename, filename)
    print(f"Saved {len(results)} results to {filename}")

In [3]:
""" no longer used"""

def search_papers_with_pagination(query, year, field_of_study=None, offset=0, max_papers=10000, month_start=None, month_end=None):
    """Collect successive pages from ``search_papers`` into a single list.

    Args:
        query: Query string forwarded to ``search_papers``.
        year: Publication year forwarded to ``search_papers``.
        field_of_study: Optional field-of-study filter.
        offset: Offset of the first page to request.
        max_papers: Stop once at least this many papers have been retrieved.
        month_start: Optional month forwarded to ``search_papers``.
        month_end: Optional month forwarded to ``search_papers``.

    Returns:
        ``(all_papers, total)`` where ``total`` is the value reported by the
        last successful page, or ``0`` if no page was fetched.
    """
    all_papers = []
    total_retrieved = 0
    total = 0  # defined up front so the return is valid even if no page is fetched
    consecutive_errors = 0
    MAX_CONSECUTIVE_ERRORS = 5

    while total_retrieved < max_papers:
        try:
            papers, total = search_papers(
                query,
                year,
                field_of_study,
                month_start=month_start,
                month_end=month_end,
                limit=100,
                offset=offset)
        except requests.exceptions.RequestException as e:
            print(f"Error during pagination: {e}")
            consecutive_errors += 1
            # Give up after repeated failures instead of retrying forever.
            if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                break
            time.sleep(5)
            continue

        consecutive_errors = 0
        if not papers:
            break

        all_papers.extend(papers)
        total_retrieved += len(papers)
        offset += len(papers)

        # Break if we've retrieved all available papers
        if offset >= total or len(papers) < 100:
            break

        # Respect rate limits
        time.sleep(3)

    return all_papers, total  # Return total for better tracking

In [4]:
def process_results(all_papers):
    """Clean the papers, write the result CSVs and return the frequency tables.

    Three semicolon-separated files are written to the working directory:
    ``semantic_scholar_results.csv`` (all papers),
    ``field_of_study_frequencies.csv`` and ``venue_frequencies.csv``.

    Args:
        all_papers: List of paper dictionaries; each is passed through
            ``clean_paper_data``. May be empty.

    Returns:
        ``(df, field_counts, venue_counts)``: the DataFrame of cleaned papers
        and two ``Counter`` objects keyed by field of study and by venue.
    """
    # Clean the data
    cleaned_papers = [clean_paper_data(paper) for paper in all_papers]

    # Convert to DataFrame
    df = pd.DataFrame(cleaned_papers)

    csv_options = dict(index=False, sep=';', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

    # Save main results
    df.to_csv("semantic_scholar_results.csv", escapechar='\\', **csv_options)

    # Count fields of study. Missing values (None/NaN) are skipped rather than
    # counted as a field, and an empty result set (no columns) yields no counts.
    field_counts = Counter()
    if 'fieldsOfStudy' in df.columns:
        for fields in df['fieldsOfStudy']:
            if isinstance(fields, list):
                field_counts.update(fields)
            elif isinstance(fields, str):
                field_counts[fields] += 1
    pd.DataFrame(field_counts.most_common(),
                 columns=['Field of Study', 'Frequency']).to_csv(
                     "field_of_study_frequencies.csv", **csv_options)

    # Count venues, ignoring missing values.
    if 'venue' in df.columns:
        venue_counts = Counter(df['venue'].dropna())
    else:
        venue_counts = Counter()
    pd.DataFrame(venue_counts.most_common(),
                 columns=['Venue', 'Frequency']).to_csv(
                     "venue_frequencies.csv", **csv_options)

    return df, field_counts, venue_counts

### Some cleaning procedures

In [5]:
def clean_text_for_csv(text):
    """Return ``text`` prepared for CSV output; non-strings are returned unchanged.

    Steps, in order:
      1. remove zero-width spaces (U+200B) and byte-order marks (U+FEFF);
      2. replace typographic quotes/dashes via ``clean_ambiguous_unicode``;
      3. apply ``normalize_unicode``;
      4. collapse every run of whitespace (including line breaks) to a single
         space and trim the ends.

    Whitespace is collapsed last so that removing invisible characters cannot
    leave doubled or leading spaces behind.
    """
    if not isinstance(text, str):
        return text

    text = text.replace('\u200b', '').replace('\ufeff', '')
    text = clean_ambiguous_unicode(text)
    text = normalize_unicode(text)

    # str.split() with no argument splits on any whitespace, so '\n' and '\r' are covered too.
    return ' '.join(text.split())
def clean_paper_data(paper):
    """Return a copy of ``paper`` with every string value cleaned.

    Strings are passed through ``clean_text_for_csv``. Lists and dictionaries
    are walked recursively, so strings nested inside them (for example
    dictionaries inside a list value) are cleaned as well. Values of any other
    type are kept as they are. The input dictionary is not modified.
    """
    def _clean_value(value):
        # Recursively clean strings inside arbitrarily nested lists/dicts.
        if isinstance(value, str):
            return clean_text_for_csv(value)
        if isinstance(value, list):
            return [_clean_value(item) for item in value]
        if isinstance(value, dict):
            return {key: _clean_value(item) for key, item in value.items()}
        return value

    return {key: _clean_value(value) for key, value in paper.items()}

def clean_ambiguous_unicode(text):
    """Swap typographic quotes and dashes for their plain ASCII counterparts.

    Curly single quotes become ``'``, curly double quotes become ``"``, and
    en/em dashes become ``-``. Anything that is not a string is returned
    untouched.
    """
    if isinstance(text, str):
        ascii_equivalents = str.maketrans({
            '\u2018': "'",  # Left single quote
            '\u2019': "'",  # Right single quote
            '\u201C': '"',  # Left double quote
            '\u201D': '"',  # Right double quote
            '\u2013': '-',  # En dash
            '\u2014': '-',  # Em dash
        })
        return text.translate(ascii_equivalents)
    return text

def normalize_unicode(text):
    """Return ``text`` in Unicode NFKC form; non-string input is passed through."""
    return unicodedata.normalize('NFKC', text) if isinstance(text, str) else text

def is_duplicate(paper, existing_papers):
    """Tell whether some entry of ``existing_papers`` has the same ``paperId`` as ``paper``."""
    for known in existing_papers:
        if paper['paperId'] == known['paperId']:
            return True
    return False

def merge_paper_data(existing_paper, new_paper):
    """Combine two records of the same paper, filling gaps in the existing one.

    The result starts as a shallow copy of ``existing_paper``. A value from
    ``new_paper`` is taken when its key is absent from ``existing_paper``, or
    when the existing value is falsy and the new value is truthy.
    """
    merged = existing_paper.copy()
    for field, incoming in new_paper.items():
        if field in existing_paper:
            # Keep what we already have unless it is empty and the new value is not.
            if existing_paper[field] or not incoming:
                continue
        merged[field] = incoming
    return merged

# Search keywords and fields of study

In [6]:
# Define keywords and fields of study
# Each mandatory keyword is wrapped in double quotes by format_semantic_scholar_query.
mandatory_keywords = ['reliability', 'resilience', 'power systems', 'capacity utilization']#, 'graph', 'neural network']
optional_keywords = []
fields_of_study = ['Computer Science', 'Engineering', 'Physics', 'Mathematics', 'Business', 'Environmental Science']

# Call the function to iterate through fields of study and years
# (start_year and end_year are left at their defaults: 1980 up to the current year).
all_results = iterate_fields_of_study(mandatory_keywords, optional_keywords, fields_of_study)

# Process the results
# process_results also writes semantic_scholar_results.csv, field_of_study_frequencies.csv
# and venue_frequencies.csv to the working directory.
combined_df, field_freq, venue_freq = process_results(all_results)

print(f"Total papers retrieved: {len(combined_df)}")
print("Results saved to CSV files.")

Starting fresh search

Searching for year 1980
Searching for field of study: Computer Science
Processing month 1
Searching papers for year 1980, field of study Computer Science, offset: 0
Using date range: 1980-01-01:1980-01-31
{'query': '"reliability" "resilience" "power systems" "capacity utilization"', 'fields': 'paperId,title,abstract,authors,year,citationCount,fieldsOfStudy,venue', 'limit': 100, 'offset': 0, 'publicationDateOrYear': '1980-01-01:1980-01-31', 'fieldsOfStudy': 'Computer Science'}
Rate limited. Waiting 10 seconds...
Searching papers for year 1980, field of study Computer Science, offset: 0
Using date range: 1980-01-01:1980-01-31
{'query': '"reliability" "resilience" "power systems" "capacity utilization"', 'fields': 'paperId,title,abstract,authors,year,citationCount,fieldsOfStudy,venue', 'limit': 100, 'offset': 0, 'publicationDateOrYear': '1980-01-01:1980-01-31', 'fieldsOfStudy': 'Computer Science'}
Number of papers retrieved: 0
Processing month 2
Searching papers for

KeyboardInterrupt: 

### When search is interrupted:

In [13]:
def convert_partial_results_to_csv():
    """Convert the checkpoint file ``partial_results.json`` into the CSV outputs.

    The papers are loaded from the JSON checkpoint and handed to
    ``process_results``, which cleans them and writes
    ``semantic_scholar_results.csv``, ``field_of_study_frequencies.csv`` and
    ``venue_frequencies.csv``. Delegating keeps the output format of an
    interrupted run identical to that of a completed run.

    Nothing is written when the checkpoint holds no papers. Any error (for
    example a missing checkpoint file) is reported with a message rather than
    raised. The function returns ``None``.
    """
    try:
        # Load partial results
        with open("partial_results.json", 'r', encoding='utf-8') as f:
            papers = json.load(f)

        if not papers:
            print("No papers found in partial results")
            return

        # Clean the data and write all three CSV files.
        process_results(papers)

        print(f"Successfully converted {len(papers)} papers to CSV format")

    except Exception as e:
        print(f"Error converting partial results: {e}")


convert_partial_results_to_csv()

Successfully converted 18405 papers to CSV format


In [8]:
# Define keywords and fields of study
# Same search as the earlier run cell, with 'graph' and 'neural network' added as mandatory keywords.
mandatory_keywords = ['reliability', 'resilience', 'power systems', 'capacity utilization', 'graph','neural network']
optional_keywords = []
fields_of_study = ['Computer Science', 'Engineering', 'Physics', 'Mathematics', 'Business', 'Environmental Science']
# Now call the function to iterate through fields of study and years

all_results = iterate_fields_of_study(mandatory_keywords, optional_keywords, fields_of_study)

# Process the results
combined_df, field_freq, venue_freq = process_results(all_results)

# Save results to CSV files
# NOTE(review): process_results has already written these three files using ';' as the
# separator and QUOTE_NONNUMERIC quoting; the calls below overwrite them using the
# default to_csv settings (comma-separated). Confirm which format is intended.
combined_df.to_csv("semantic_scholar_results.csv", index=False)
pd.DataFrame(field_freq.most_common(), columns=['Field of Study', 'Frequency']).to_csv("field_of_study_frequencies.csv", index=False)
pd.DataFrame(venue_freq.most_common(), columns=['Venue', 'Frequency']).to_csv("venue_frequencies.csv", index=False)

print(f"Total papers retrieved: {len(combined_df)}")
print("Results saved to CSV files.")


Searching for year 1980
Searching for field of study: Computer Science
Searching papers for year 1980, field of study Computer Science, with query: "reliability" "resilience" "power systems" "capacity utilization" "graph" "neural network", offset: 0
An error occurred while searching papers: 403 Client Error: Forbidden for url: https://api.semanticscholar.org/graph/v1/paper/search?query=%22reliability%22+%22resilience%22+%22power+systems%22+%22capacity+utilization%22+%22graph%22+%22neural+network%22&year=1980&fields=paperId%2Ctitle%2Cabstract%2Cauthors%2Cyear%2CcitationCount%2CfieldsOfStudy%2Cvenue&limit=50&offset=0&fieldsOfStudy=Computer+Science
Searching for field of study: Engineering
Searching papers for year 1980, field of study Engineering, with query: "reliability" "resilience" "power systems" "capacity utilization" "graph" "neural network", offset: 0
An error occurred while searching papers: 403 Client Error: Forbidden for url: https://api.semanticscholar.org/graph/v1/paper/sea

KeyboardInterrupt: 

Script started.
Search query: "reliability"+"resilience"+"power systems"+"capacity utilization"

Searching for year 2000
Searching papers for year 2000 with query: "reliability"+"resilience"+"power systems"+"capacity utilization", offset: 0
Number of papers retrieved for year 2000: 100
Searching papers for year 2000 with query: "reliability"+"resilience"+"power systems"+"capacity utilization", offset: 100
Number of papers retrieved for year 2000: 100
Searching papers for year 2000 with query: "reliability"+"resilience"+"power systems"+"capacity utilization", offset: 200
Number of papers retrieved for year 2000: 13
Total papers retrieved for year 2000: 213

Searching for year 2001
Searching papers for year 2001 with query: "reliability"+"resilience"+"power systems"+"capacity utilization", offset: 0
Number of papers retrieved for year 2001: 100
Searching papers for year 2001 with query: "reliability"+"resilience"+"power systems"+"capacity utilization", offset: 100
Number of papers retrie

In [14]:
# Count how often each field of study occurs across all papers.
# NOTE(review): `df` is not assigned in any cell visible in this notebook;
# presumably it is the results DataFrame (e.g. `combined_df`) — confirm.
all_fields = []
for fields in df['fieldsOfStudy']:
    # A cell may hold a list of fields or a single string; any other value (e.g. None) is skipped.
    if isinstance(fields, list):
        all_fields.extend(fields)
    elif isinstance(fields, str):
        all_fields.append(fields)
    
field_counts = Counter(all_fields)

# Print the fields from most to least frequent.
print("\nField of Study Frequencies:")
for field, count in field_counts.most_common():
    print(f"{field}: {count}")


Field of Study Frequencies:
Computer Science: 884
Engineering: 359
Mathematics: 33
Physics: 13
Medicine: 13
Economics: 6
Business: 5
Environmental Science: 4
Materials Science: 2
Chemistry: 1
Psychology: 1


In [25]:

def count_venue_frequencies(df):
    """Tabulate how often each venue appears in ``df['venue']``.

    Missing values are ignored. The result is a DataFrame with the columns
    ``Venue`` and ``Frequency``, ordered from most to least frequent. If
    ``df`` has no ``venue`` column, a warning is printed and an empty
    DataFrame with those two columns is returned.
    """
    output_columns = ['Venue', 'Frequency']

    if 'venue' not in df.columns:
        print("Warning: 'venue' column not found in the DataFrame.")
        return pd.DataFrame(columns=output_columns)

    tally = Counter(df['venue'].dropna().tolist())
    return pd.DataFrame(tally.most_common(), columns=output_columns)

# Assuming 'df' is your existing DataFrame with the search results
# If it's not named 'df', replace 'df' with the actual name of your DataFrame
# NOTE(review): `df` is not assigned in any cell visible in this notebook;
# presumably it should be the DataFrame returned by process_results — confirm.

# Count venue frequencies
venue_freq_df = count_venue_frequencies(df)

if not venue_freq_df.empty:
    # Display the top 20 most frequent venues
    print("\nTop 20 Most Frequent Venues:")
    print(venue_freq_df.head(20))

    # Save venue frequencies to CSV
    # NOTE(review): written with the default to_csv settings (comma-separated), whereas
    # process_results writes the same file name with sep=';' — confirm the intended format.
    venue_freq_df.to_csv("venue_frequencies.csv", index=False)
    print("Venue frequencies saved to venue_frequencies.csv")
else:
    print("No venue information available.")

# If you want to add the venue frequency to your existing field of study frequency DataFrame
try:
    # NOTE(review): read with the default comma separator; process_results and
    # convert_partial_results_to_csv write this file with sep=';', so it only parses
    # into two columns if it was last written with default to_csv settings — confirm.
    field_freq_df = pd.read_csv("field_of_study_frequencies.csv")  # Load existing field frequencies
    if not venue_freq_df.empty:
        # axis=1 places the two frequency tables side by side as extra columns.
        combined_freq_df = pd.concat([field_freq_df, venue_freq_df], axis=1)
    else:
        combined_freq_df = field_freq_df
    combined_freq_df.to_csv("combined_frequencies.csv", index=False)
    print("Combined frequencies saved to combined_frequencies.csv")
except FileNotFoundError:
    print("field_of_study_frequencies.csv not found. Skipping combined frequencies.")

No venue information available.
Combined frequencies saved to combined_frequencies.csv
