In [8]:
import requests
import pandas as pd

In [9]:
# Step 1: Query Crossref for a broad sample of works and collect the distinct
# subject labels attached to them (refine the query as needed).
URL = "https://api.crossref.org/works?rows=1000"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(URL, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error body as JSON.
response.raise_for_status()
data = response.json()

# Extract subjects from the returned works; a set keeps them unique.
# Works without a 'subject' key contribute nothing.
subjects_set = set()
for item in data['message']['items']:
    subjects_set.update(item.get('subject', []))

# Sort so the processing order (and therefore the output row order) is the same
# on every run; iterating a set of strings directly is not order-stable.
subjects_list = sorted(subjects_set)
subjects = subjects_list
if not subjects_list:
    # NOTE(review): nothing guarantees the sampled works carry a 'subject'
    # field -- confirm against the current Crossref response schema.
    print("Warning: no subjects found in the Crossref sample; later cells will have nothing to process.")

# Page size used for the per-subject queries below.
rows = 100

In [10]:
# Publication years to query: 2017 through 2023 inclusive.
years_to_pull = list(range(2017, 2024))

def get_articles_by_subject(subject, year, rows=rows, max_results=None, timeout=60):
    """Fetch Crossref works matching ``subject`` that were published in ``year``.

    Parameters
    ----------
    subject : str
        Text sent as the ``query.bibliographic`` parameter.
    year : int
        Publication year; used to build the from/until publication-date filter.
    rows : int
        Page size requested from the API.
    max_results : int or None
        Upper bound on the number of works returned. ``None`` (the default)
        means a single page of ``rows`` results and a single request. A larger
        value switches on cursor-based deep paging.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    list of dict
        The raw work records from ``message.items``. A non-200 response is
        printed and ends the fetch early, returning whatever was collected.
    """
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows,
        'query.bibliographic': subject
    }

    if max_results is None:
        max_results = rows

    # Only send a cursor when the caller asks for more than one page, so the
    # default request is unchanged. Without the cursor parameter the response
    # carries no usable next-cursor, so paging must be requested explicitly.
    paginate = max_results > rows
    if paginate:
        params['cursor'] = '*'

    articles = []
    while len(articles) < max_results:
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

        message = response.json().get('message', {})
        items = message.get('items', [])
        if not items:
            break

        articles.extend(items)

        next_cursor = message.get('next-cursor', '')
        # Stop after one page unless paging, when the API gives no cursor,
        # or when a short page signals the end of the result set.
        if not paginate or not next_cursor or len(items) < rows:
            break
        params['cursor'] = next_cursor

    return articles[:max_results]

# Accumulator for the per-subject DataFrames built by the fetch loop.
subject_dataframes = list()

In [11]:
# Loop through each subject and fetch data, producing one row per
# (article, author) pair.
_ARTICLE_COLUMNS = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date',
                    'Author First Name', 'Author Last Name', 'Author Order',
                    'Referenced By', 'Year']


def _first(values, default=''):
    """Return the first element of ``values``, or ``default`` if it is missing or empty."""
    return values[0] if values else default


def _author_rows(article, year):
    """Flatten one Crossref work record into a list of per-author row dicts.

    Works without an 'author' list yield no rows.
    """
    doi = article.get('DOI', '')
    # 'title' / 'container-title' can be present but empty, so plain [0]
    # indexing is not safe here.
    title = _first(article.get('title'))
    container_title = _first(article.get('container-title'))
    publisher = article.get('publisher', '')
    # Only the print publication date is read; works without one get [].
    publish_date = _first(article.get('published-print', {}).get('date-parts'), default=[])
    referenced_by = article.get('is-referenced-by-count', 0)

    return [
        {
            'DOI': doi,
            'Title': title,
            'Container Title': container_title,
            'Publisher': publisher,
            'Publish Date': publish_date,
            'Author First Name': author.get('given', ''),
            'Author Last Name': author.get('family', ''),
            'Author Order': order,
            'Referenced By': referenced_by,
            'Year': year,
        }
        for order, author in enumerate(article.get('author', []), start=1)
    ]


# Start from a clean list so re-running this cell does not append duplicates.
subject_dataframes = []

for subject in subjects_list:
    records = []
    try:
        for year in years_to_pull:
            # Each year's articles are processed exactly once and tagged with
            # the year they were queried for.
            for article in get_articles_by_subject(subject, year, rows=rows):
                records.extend(_author_rows(article, year))
    except Exception as e:
        # Best effort: report the failure and keep whatever was gathered for
        # this subject before it.
        print(f"Error processing subject '{subject}': {str(e)}")

    # Passing columns explicitly keeps the schema even when no rows were found.
    subject_df = pd.DataFrame(records, columns=_ARTICLE_COLUMNS)
    subject_df['Subject'] = subject  # Add a Subject column with the current subject
    subject_dataframes.append(subject_df)

In [12]:
# Stack the per-subject frames into a single DataFrame with a fresh 0..n-1 index.
df = pd.concat(objs=subject_dataframes, axis=0, ignore_index=True)

In [13]:
# Hand-curated grouping of Crossref subject labels into broad topics.
# Each label must appear in exactly one list so that the subject -> topic
# lookup built from these lists does not depend on iteration order.
physical_sciences = ['Discrete Mathematics and Combinatorics', 'Atmospheric Science', 'General Physics and Astronomy',
                     'Physical and Theoretical Chemistry', 'General Chemistry', 'Analytical Chemistry', 'Statistical and Nonlinear Physics',
                     'Condensed Matter Physics', 'Spectroscopy', 'Nuclear and High Energy Physics', 'Atomic and Molecular Physics, and Optics']

life_sciences = ['Molecular Medicine', 'Physiology (medical)', 'Hepatology', 'Embryology', 'Animal Science and Zoology',
                 'Biological Psychiatry', 'Biochemistry', 'Experimental and Cognitive Psychology', 'Infectious Diseases', 'Aquatic Science',
                 'Biochemistry (medical)', 'Genetics', 'Neurology', 'Hematology', 'Health, Toxicology and Mutagenesis', 'Developmental Biology',
                 'Biomedical Engineering', 'Molecular Biology', 'Behavioral Neuroscience', 'Catalysis', 'Biophysics', 'Rheumatology', 'Anatomy',
                 'Endocrinology', 'Cell Biology', 'Immunology and Allergy', 'Genetics (clinical)', 'Pediatrics, Perinatology and Child Health',
                 'Oncology', 'Biomaterials', 'Cardiology and Cardiovascular Medicine', 'Immunology', 'Cancer Research']

# 'Software' is intentionally listed only under computer_science: it used to be
# in both lists and the later one (Computer Science) always won the lookup.
engineering_tech = ['Surfaces and Interfaces', 'Management Science and Operations Research', 'Energy Engineering and Power Technology',
                    'Control and Systems Engineering', 'Artificial Intelligence', 'Hardware and Architecture', 'Waste Management and Disposal',
                    'Modeling and Simulation', 'Signal Processing', 'Mechanical Engineering', 'Environmental Engineering',
                    'Information Systems', 'Computer Graphics and Computer-Aided Design', 'Electrical and Electronic Engineering',
                    'Aerospace Engineering', 'Computer Science Applications', 'Ocean Engineering', 'Computational Mechanics', 'Nuclear Energy and Engineering',
                    'Electronic, Optical and Magnetic Materials', 'Instrumentation']

humanities_arts = ['General Arts and Humanities', 'Visual Arts and Performing Arts', 'History and Philosophy of Science', 'Cultural Studies',
                   'Philosophy', 'Classics', 'Communication', 'Religious studies', 'Museology', 'Music', 'History']

social_sciences = ['General Psychology', 'Geography, Planning and Development', 'Sociology and Political Science', 'Political Science and International Relations',
                   'General Social Sciences', 'Anthropology', 'Demography']

medical_health = ['Family Practice', 'Geriatrics and Gerontology', 'General Dentistry', 'Obstetrics and Gynecology', 'Rehabilitation', 'Medicine (miscellaneous)',
                  'Health Professions (miscellaneous)', 'Psychiatry and Mental health', 'Nephrology', 'Pulmonary and Respiratory Medicine',
                  'Internal Medicine', 'Clinical Biochemistry', 'Orthopedics and Sports Medicine', 'Surgery', 'Dermatology', 'Gastroenterology', 'Urology',
                  'Otorhinolaryngology', 'Radiology, Nuclear Medicine and imaging', 'Ophthalmology']

environmental_sciences = ['Oceanography', 'Soil Science', 'Global and Planetary Change', 'General Environmental Science', 'Pollution', 'Water Science and Technology',
                          'Ecology', 'Environmental Chemistry', 'Ecology, Evolution, Behavior and Systematics']

business_management = ['Marketing', 'Business and International Management', 'Organizational Behavior and Human Resource Management', 'Finance',
                       'Industrial relations', 'Leadership and Management', 'Strategy and Management']

computer_science = ['Computational Theory and Mathematics', 'Information Systems and Management', 'Management Information Systems',
                    'Computer Networks and Communications', 'Theoretical Computer Science', 'Health Informatics', 'Software', 'Computer Science (miscellaneous)',
                    'General Computer Science']

# Subjects returned by Crossref that are not in any hand-curated list.
# A set is built once so each membership test is a hash lookup instead of a
# scan over a freshly concatenated list.
_known_subjects = set().union(
    physical_sciences, life_sciences, engineering_tech, humanities_arts, social_sciences,
    medical_health, environmental_sciences, business_management, computer_science,
)
others = [subject_name for subject_name in subjects if subject_name not in _known_subjects]

categorized_subjects = {
    'Physical Sciences': physical_sciences,
    'Life Sciences': life_sciences,
    'Engineering & Technology': engineering_tech,
    'Humanities & Arts': humanities_arts,
    'Social Sciences': social_sciences,
    'Medical & Health': medical_health,
    'Environmental Sciences': environmental_sciences,
    'Business & Management': business_management,
    'Computer Science': computer_science,
    'Others': others
}

# Reverse mapping from subject to category. If a subject is listed under more
# than one category, the category that comes last in categorized_subjects wins.
subject_to_category = {
    subject_name: category_name
    for category_name, subjects_in_category in categorized_subjects.items()
    for subject_name in subjects_in_category
}

# Map each row's subject to its broader category; anything unmapped becomes
# 'Others'. The result is assigned back in one step rather than calling
# fillna(inplace=True) on df['Topic'], which acts on a temporary under pandas
# Copy-on-Write and would leave df unchanged.
df['Topic'] = df['Subject'].map(subject_to_category).fillna('Others')

In [14]:
# Write the combined table to disk as CSV, leaving out the index column.
df.to_csv(path_or_buf='MED_Crossreff_pull_100.csv', index=False)