In [2]:
import requests
import pandas as pd

# Step 1: Query Crossref for a broad set of results (you can refine this as needed)
URL = "https://api.crossref.org/works?rows=1000"
response = requests.get(URL)
data = response.json()

# Extract subjects from the returned works and add to a set for uniqueness
subjects_set = set()
for item in data['message']['items']:
    if 'subject' in item:
        subjects_set.update(item['subject'])

# Convert the set to a list
subjects_list = list(subjects_set)
subjects_list

['Health (social science)',
 'Developmental Biology',
 'Nutrition and Dietetics',
 'General Immunology and Microbiology',
 'Nephrology',
 'Organic Chemistry',
 'Computer Graphics and Computer-Aided Design',
 'Marketing',
 'Management, Monitoring, Policy and Law',
 'Neurology',
 'Oceanography',
 'General Economics, Econometrics and Finance',
 'Histology',
 'Surfaces, Coatings and Films',
 'Medicine (miscellaneous)',
 'Hepatology',
 'Metals and Alloys',
 'Immunology',
 'Biochemistry (medical)',
 'Strategy and Management',
 'Speech and Hearing',
 'Computer Networks and Communications',
 'Theoretical Computer Science',
 'Biomedical Engineering',
 'Automotive Engineering',
 'Cultural Studies',
 'Biomaterials',
 'Biochemistry',
 'Visual Arts and Performing Arts',
 'General Neuroscience',
 'Communication',
 'Physical and Theoretical Chemistry',
 'Ecology, Evolution, Behavior and Systematics',
 'Demography',
 'Surgery',
 'Colloid and Surface Chemistry',
 'Control and Systems Engineering',
 'Ma

In [5]:
import requests
import pandas as pd

subject = 'Mechanical Engineering' # Specify the subject you want to search for

rows = 1000

def get_articles_by_subject(subject, year, rows=rows):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows,
        'query.bibliographic': subject  # Specify the subject you want to search for
    }

    articles = []
    cursor = '*'

    while cursor:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])
            
            if not items:
                break
            
            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor', '')
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

    return articles

subject_to_search = 'your_subject_here'  # Replace 'your_subject_here' with the subject you want to search for
years_to_pull = [2020, 2021, 2022, 2023]
all_articles = []

for year in years_to_pull:
    articles = get_articles_by_subject(subject_to_search, year, rows=rows)
    all_articles.extend(articles)

data = {
    'DOI': [],
    'Title': [],
    'Container Title': [],
    'Publisher': [],
    'Publish Date': [],
    'Author First Name': [],
    'Author Last Name': [],
    'Author Order': [],
    'Referenced By': []
}

for article in all_articles:
    doi = article.get('DOI', '')
    title = article.get('title', [''])[0]
    container_title = article.get('container-title', [''])[0]
    publisher = article.get('publisher', '')
    publish_date = article.get('published-print', {}).get('date-parts', [[]])[0]
    authors = article.get('author', [])
    referenced_by = article.get('is-referenced-by-count', 0)

    for order, author in enumerate(authors, start=1):
        first_name = author.get('given', '')
        last_name = author.get('family', '')
        
        data['DOI'].append(doi)
        data['Title'].append(title)
        data['Container Title'].append(container_title)
        data['Publisher'].append(publisher)
        data['Publish Date'].append(publish_date)
        data['Author First Name'].append(first_name)
        data['Author Last Name'].append(last_name)
        data['Author Order'].append(order)
        data['Referenced By'].append(referenced_by)

test = pd.DataFrame(data)
test.columns = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date', 'Author First Name', 'Author Last Name', 'Author Order', 'Referenced By']
test['Subject'] = subject

In [6]:
print(test.shape)
print(test.head())

(2827, 10)
                                       DOI  \
0  10.5810/kentucky/9780813179339.003.0012   
1      10.18574/nyu/9780814764947.001.0001   
2  10.5810/kentucky/9780813179339.003.0022   
3                   10.2307/j.ctv17vf5fj.5   
4                   10.2307/j.ctv17vf5fj.2   

                                               Title          Container Title  \
0            Affectionately, Your Friend and Brother  Liberty Brought Us Here   
1                                       Your Ad Here                            
2  Men of Advanced Views on the Subject of Education  Liberty Brought Us Here   
3                                   Acknowledgements    Here Comes your King!   
4                                           Foreword    Here Comes your King!   

                      Publisher    Publish Date Author First Name  \
0  University Press of Kentucky   [2020, 7, 21]          Susan E.   
1     New York University Press  [2020, 12, 31]           Michael   
2  University Pres

In [9]:
import requests
import pandas as pd

subjects = ['Environmental Engineering', 'Mechanical Engineering']  
rows = 1000
years_to_pull = [2020, 2021, 2022, 2023]  # Define the years you want to pull data for

def get_articles_by_subject(subject, year, rows=rows):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows,
        'query.bibliographic': subject
    }

    articles = []
    cursor = '*'

    while cursor:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])
            
            if not items:
                break
            
            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor', '')
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

    return articles

# Initialize an empty list to store DataFrames for each subject
subject_dataframes = []

# Loop through each subject and fetch data
for subject in subjects:
    all_articles = []
    for year in years_to_pull:
        articles = get_articles_by_subject(subject, year, rows=rows)
        all_articles.extend(articles)

        data = {
            'DOI': [],
            'Title': [],
            'Container Title': [],
            'Publisher': [],
            'Publish Date': [],
            'Author First Name': [],
            'Author Last Name': [],
            'Author Order': [],
            'Referenced By': []
        }

        for article in all_articles:
            doi = article.get('DOI', '')
            title = article.get('title', [''])[0]
            container_title = article.get('container-title', [''])[0]
            publisher = article.get('publisher', '')
            publish_date = article.get('published-print', {}).get('date-parts', [[]])[0]
            authors = article.get('author', [])
            referenced_by = article.get('is-referenced-by-count', 0)

            for order, author in enumerate(authors, start=1):
                first_name = author.get('given', '')
                last_name = author.get('family', '')
                
                data['DOI'].append(doi)
                data['Title'].append(title)
                data['Container Title'].append(container_title)
                data['Publisher'].append(publisher)
                data['Publish Date'].append(publish_date)
                data['Author First Name'].append(first_name)
                data['Author Last Name'].append(last_name)
                data['Author Order'].append(order)
                data['Referenced By'].append(referenced_by)

        subject_df = pd.DataFrame(data)
        subject_df.columns = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date', 'Author First Name', 'Author Last Name', 'Author Order', 'Referenced By']
        subject_df['Subject'] = subject  # Add a Subject column with the current subject
        subject_dataframes.append(subject_df)

# Concatenate all DataFrames into one big DataFrame
final_df = pd.concat(subject_dataframes, ignore_index=True)


In [12]:
# Group by 'Subject' and count the number of distinct titles for each subject
subject_title_counts = final_df.groupby('Subject')['Title'].nunique().reset_index()

# Rename the columns for clarity
subject_title_counts.columns = ['Subject', 'Unique Title Count']

# Display the resulting DataFrame
print(final_df.shape)
print(subject_title_counts)

(61737, 10)
                     Subject  Unique Title Count
0  Environmental Engineering                3712
1     Mechanical Engineering                2462


In [11]:
print(final_df.head())

(61737, 10)
6172
                DOI                                              Title  \
0  10.3397/1/376842  Uncorrelated blocked force determination on pl...   
1  10.3397/1/376842  Uncorrelated blocked force determination on pl...   
2  10.3397/1/376842  Uncorrelated blocked force determination on pl...   
3  10.3397/1/376842  Uncorrelated blocked force determination on pl...   
4  10.3397/1/376842  Uncorrelated blocked force determination on pl...   

                     Container Title  \
0  Noise Control Engineering Journal   
1  Noise Control Engineering Journal   
2  Noise Control Engineering Journal   
3  Noise Control Engineering Journal   
4  Noise Control Engineering Journal   

                                       Publisher   Publish Date  \
0  Institute of Noise Control Engineering (INCE)  [2020, 11, 1]   
1  Institute of Noise Control Engineering (INCE)  [2020, 11, 1]   
2  Institute of Noise Control Engineering (INCE)  [2020, 11, 1]   
3  Institute of Noise Control

In [3]:
import requests
import pandas as pd

subjects = subjects_list # Replace with your list of subjects
rows = 1000
years_to_pull = [2020, 2021, 2022, 2023]  # Define the years you want to pull data for

def get_articles_by_subject(subject, year, rows=rows):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows,
        'query.bibliographic': subject
    }

    articles = []
    cursor = '*'

    while cursor:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])

            if not items:
                break

            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor', '')
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

    return articles

# Initialize an empty list to store DataFrames for each subject
subject_dataframes = []

# Loop through each subject and fetch data
for subject in subjects_list:
    try:
        all_articles = []
        for year in years_to_pull:
            articles = get_articles_by_subject(subject, year, rows=rows)
            all_articles.extend(articles)

            data = {
                'DOI': [],
                'Title': [],
                'Container Title': [],
                'Publisher': [],
                'Publish Date': [],
                'Author First Name': [],
                'Author Last Name': [],
                'Author Order': [],
                'Referenced By': []
            }

            for article in all_articles:
                doi = article.get('DOI', '')
                title = article.get('title', [''])[0]
                container_title = article.get('container-title', [''])[0]
                publisher = article.get('publisher', '')
                publish_date = article.get('published-print', {}).get('date-parts', [[]])[0]
                authors = article.get('author', [])
                referenced_by = article.get('is-referenced-by-count', 0)

                for order, author in enumerate(authors, start=1):
                    first_name = author.get('given', '')
                    last_name = author.get('family', '')

                    data['DOI'].append(doi)
                    data['Title'].append(title)
                    data['Container Title'].append(container_title)
                    data['Publisher'].append(publisher)
                    data['Publish Date'].append(publish_date)
                    data['Author First Name'].append(first_name)
                    data['Author Last Name'].append(last_name)
                    data['Author Order'].append(order)
                    data['Referenced By'].append(referenced_by)

            subject_df = pd.DataFrame(data)
            subject_df.columns = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date', 'Author First Name', 'Author Last Name', 'Author Order', 'Referenced By']
            subject_df['Subject'] = subject  # Add a Subject column with the current subject
            subject_dataframes.append(subject_df)

    except Exception as e:
        print(f"Error processing subject '{subject}': {str(e)}")
        continue

# Concatenate all DataFrames into one big DataFrame
massive_test = pd.concat(subject_dataframes, ignore_index=True)

In [5]:
# Group by 'Subject' and count the number of distinct titles for each subject
subject_title_counts = massive_test.groupby('Subject')['Title'].nunique().reset_index()

# Rename the columns for clarity
subject_title_counts.columns = ['Subject', 'Unique Title Count']

# Display the resulting DataFrame
print(massive_test.shape)
print(subject_title_counts)

(6078017, 10)
                                               Subject  Unique Title Count
0                            Acoustics and Ultrasonics                3736
1                                Aerospace Engineering                3531
2                                                Aging                3014
3    Agricultural and Biological Sciences (miscella...                3041
4                            Agronomy and Crop Science                3664
..                                                 ...                 ...
231                                            Urology                 878
232                                           Virology                3462
233                    Visual Arts and Performing Arts                3077
234                      Waste Management and Disposal                3621
235                       Water Science and Technology                2843

[236 rows x 2 columns]


In [6]:
print(massive_test.head())

                               DOI  \
0  10.1016/j.socscimed.2019.02.004   
1  10.1016/j.socscimed.2020.113505   
2  10.1016/j.socscimed.2020.113505   
3  10.1016/j.socscimed.2020.113417   
4  10.1016/j.socscimed.2020.113466   

                                               Title  \
0  Social capital, social movements and global pu...   
1  Health information provision, health knowledge...   
2  Health information provision, health knowledge...   
3  Neighborhood social capital and adolescents’ i...   
4  The Amish health culture and culturally sensit...   

                 Container Title    Publisher Publish Date Author First Name  \
0  Social Science &amp; Medicine  Elsevier BV    [2020, 7]         Catherine   
1  Social Science &amp; Medicine  Elsevier BV   [2020, 11]             Peter   
2  Social Science &amp; Medicine  Elsevier BV   [2020, 11]          Léontine   
3  Social Science &amp; Medicine  Elsevier BV   [2020, 11]              Jaap   
4  Social Science &amp; Medicine  