In [None]:
import requests
import pandas as pd

# Step 1: Query Crossref for a broad set of results (you can refine this as needed)
URL = "https://api.crossref.org/works?rows=10"
response = requests.get(URL)
data = response.json()

# Extract subjects from the returned works and add to a set for uniqueness
subjects_set = set()
for item in data['message']['items']:
    if 'subject' in item:
        subjects_set.update(item['subject'])

# Convert the set to a list
subjects_list = list(subjects_set)[10:20]
subjects = subjects_list
rows = 10

# Define the years you want to pull data for
years_to_pull = [2021, 2022] 

def get_articles_by_subject(subject, year, rows=rows):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows,
        'query.bibliographic': subject
    }

    articles = []
    cursor = '*'

    while cursor:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])

            if not items:
                break

            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor', '')
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

    return articles

# Initialize an empty list to store DataFrames for each subject
subject_dataframes = []

# Loop through each subject and fetch data
for subject in subjects_list:
    try:
        all_articles = []
        for year in years_to_pull:
            print(subject,year)
            articles = get_articles_by_subject(subject, year, rows=rows)
            all_articles.extend(articles)

            data = {
                'DOI': [],
                'Title': [],
                'Container Title': [],
                'Publisher': [],
                'Publish Date': [],
                'Author First Name': [],
                'Author Last Name': [],
                'Author Order': [],
                'Referenced By': []
            }

            for article in all_articles:
                doi = article.get('DOI', '')
                title = article.get('title', [''])[0]
                container_title = article.get('container-title', [''])[0]
                publisher = article.get('publisher', '')
                publish_date = article.get('published-print', {}).get('date-parts', [[]])[0]
                authors = article.get('author', [])
                referenced_by = article.get('is-referenced-by-count', 0)

                for order, author in enumerate(authors, start=1):
                    first_name = author.get('given', '')
                    last_name = author.get('family', '')

                    data['DOI'].append(doi)
                    data['Title'].append(title)
                    data['Container Title'].append(container_title)
                    data['Publisher'].append(publisher)
                    data['Publish Date'].append(publish_date)
                    data['Author First Name'].append(first_name)
                    data['Author Last Name'].append(last_name)
                    data['Author Order'].append(order)
                    data['Referenced By'].append(referenced_by)

            subject_df = pd.DataFrame(data)
            subject_df.columns = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date', 'Author First Name', 'Author Last Name', 'Author Order', 'Referenced By']
            subject_df['Subject'] = subject  # Add a Subject column with the current subject
            subject_dataframes.append(subject_df)

    except Exception as e:
        print(f"Error processing subject '{subject}': {str(e)}")
        continue

# Concatenate all DataFrames into one big DataFrame
massive_Crossreff_pull = pd.concat(subject_dataframes, ignore_index=True)
massive_Crossreff_pull.head(100)

In [33]:
import requests
import pandas as pd
from tqdm import tqdm

# Step 1: Query Crossref for a broad set of results (you can refine this as needed)
URL = "https://api.crossref.org/works?rows=1000"
response = requests.get(URL)
data = response.json()

# Extract subjects from the returned works and add to a set for uniqueness
subjects_set = set()
for item in data['message']['items']:
    if 'subject' in item:
        subjects_set.update(item['subject'])

# Convert the set to a list
subjects_list = list(subjects_set)

In [34]:
# Define the years you want to pull data for, as well as how many rows per subject/year
rows = 999 # 1 less than max
years_to_pull = range(2017, 2024,1) # +1 Year bc Python

def get_articles_by_subject(subject, year, rows=rows):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows,
        'query.bibliographic': subject
    }

    articles = []
    cursor = '*'

    while cursor:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])

            if not items:
                break

            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor', '')
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

    return articles

# Initialize an empty list to store DataFrames for each subject
subject_dataframes = []

# Prepare a progress bar
total_iterations = len(subjects_list) * len(years_to_pull)
progress_bar = tqdm(total=total_iterations, desc='Processing', dynamic_ncols=True)

# Loop through each subject and fetch data
for subject in subjects_list:
    try:
        all_articles = []
        for year in years_to_pull:
            # print(subject,year)  -- error testing line
            articles = get_articles_by_subject(subject, year, rows=rows)
            all_articles.extend(articles)

            progress_bar.update(1) # Remove later, testing purposes

            data = {
                'DOI': [],
                'Title': [],
                'Container Title': [],
                'Publisher': [],
                'Publish Date': [],
                'Author First Name': [],
                'Author Last Name': [],
                'Author Order': [],
                'Referenced By': []
            }

            for article in all_articles:
                doi = article.get('DOI', '')
                title = article.get('title', [''])[0]
                container_title = article.get('container-title', [''])[0]
                publisher = article.get('publisher', '')
                publish_date = article.get('published-print', {}).get('date-parts', [[]])[0]
                authors = article.get('author', [])
                referenced_by = article.get('is-referenced-by-count', 0)

                for order, author in enumerate(authors, start=1):
                    first_name = author.get('given', '')
                    last_name = author.get('family', '')

                    data['DOI'].append(doi)
                    data['Title'].append(title)
                    data['Container Title'].append(container_title)
                    data['Publisher'].append(publisher)
                    data['Publish Date'].append(publish_date)
                    data['Author First Name'].append(first_name)
                    data['Author Last Name'].append(last_name)
                    data['Author Order'].append(order)
                    data['Referenced By'].append(referenced_by)

            subject_df = pd.DataFrame(data)
            subject_df.columns = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date', 'Author First Name', 'Author Last Name', 'Author Order', 'Referenced By']
            subject_df['Subject'] = subject  # Add a Subject column with the current subject
            subject_dataframes.append(subject_df)

    except Exception as e:
        print(f"Error processing subject '{subject}': {str(e)}")
        continue

# Close the progress bar
progress_bar.close()

# Concatenate all DataFrames into one big DataFrame
massive_Crossreff_pull = pd.concat(subject_dataframes, ignore_index=True)
massive_Crossreff_pull.head(10)

Processing: 100%|██████████| 1645/1645 [2:16:48<00:00,  4.99s/it]  


Unnamed: 0,DOI,Title,Container Title,Publisher,Publish Date,Author First Name,Author Last Name,Author Order,Referenced By,Subject
0,10.4103/2278-9626.198585,Management of patients taking rivaroxaban for ...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Adrian,Curto,1.0,3.0,General Dentistry
1,10.4103/2278-9626.198593,Endodontics and forensic personal identificati...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Hany Mohamed Aly,Ahmed,1.0,2.0,General Dentistry
2,10.4103/2278-9626.198603,Mobile learning practices and preferences a wa...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Jane,Manakil,1.0,3.0,General Dentistry
3,10.4103/2278-9626.198603,Mobile learning practices and preferences a wa...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Roy,George,2.0,3.0,General Dentistry
4,10.4103/2278-9626.198622,Distribution of hypodontia and hyperdontia in ...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Yong,Chen,1.0,0.0,General Dentistry
5,10.4103/2278-9626.198622,Distribution of hypodontia and hyperdontia in ...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Sreekanth Kumar,Mallineni,2.0,0.0,General Dentistry
6,10.4103/2278-9626.198602,The role of acupuncture in the treatment of pr...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Raghad,Hashim,1.0,1.0,General Dentistry
7,10.4103/2278-9626.198602,The role of acupuncture in the treatment of pr...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Reem,Shaltoni,2.0,1.0,General Dentistry
8,10.4103/2278-9626.198602,The role of acupuncture in the treatment of pr...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Luma,Kamal,3.0,1.0,General Dentistry
9,10.4103/2278-9626.198602,The role of acupuncture in the treatment of pr...,European Journal of General Dentistry,Georg Thieme Verlag KG,"[2017, 1]",Faten,Khanfar,4.0,1.0,General Dentistry


In [35]:
massive_Crossreff_pull.to_csv('massive_Crossreff_pull.csv', index=False)