In [None]:
import requests
from tqdm import tqdm
import pandas as pd

def fetch_details(doi, cache):
    # If the details for the DOI are already in the cache
    if doi in cache:
        common_details = cache[doi]['common']
        authors = cache[doi]['authors']
    else:
        url = f"https://api.crossref.org/works/{doi}"
        try:
            response = requests.get(url)
            response.raise_for_status() 
            data = response.json()
            message = data['message']

            # Extract common details
            title = message['title'][0] if 'title' in message else None
            abstract = message.get('abstract', None)
            journal = message.get('container-title', [None])[0]
            field = message.get('subject', [None])[0]
            citation_count = message.get('is-referenced-by-count', None)
            date_received = message.get('created', {}).get('date-time', None)
            date_published = message.get('published-online', {}).get('date-parts', [None])[0]
            address = message.get('publisher-location', None)
            language = message.get('language', None)

            common_details = {
                'DOI': doi,
                'Title': title,
                'Abstract': abstract,
                'Journal': journal,
                'Field': field,
                'Citation Count': citation_count,
                'Date Received': date_received,
                'Date Published': date_published,
                'Address': address,
                'Language': language
            }

            authors = message.get('author', [])
            cache[doi] = {'common': common_details, 'authors': authors}

        except requests.RequestException as e:
            print(f"Error fetching DOI {doi}: {e}")
            return []
        except (KeyError, IndexError) as e:
            print(f"Unexpected structure in response for DOI {doi}: {e}")
            return []

    # Getting authors details
    details_list = []
    for idx, author in enumerate(authors):
        author_first_name = author.get('given', None)
        author_last_name = author.get('family', None)
        author_order = idx + 1

        details = common_details.copy()
        details['Author First Name'] = author_first_name
        details['Author Last Name'] = author_last_name
        details['Author Order'] = author_order

        details_list.append(details)

    return details_list

# Step 1: Query Crossref for a broad set of results (you can refine this as needed)
URL = "https://api.crossref.org/works?rows=1000"
response = requests.get(URL)
data = response.json()

# Extract subjects from the returned works and add to a set for uniqueness
subjects_set = set()
for item in data['message']['items']:
    if 'subject' in item:
        subjects_set.update(item['subject'])

# Convert the set to a list
subjects_list = list(subjects_set)

# Initialize a DataFrame to store DOIs
doi_df = pd.DataFrame(columns=['DOI'])

# Initialize a cache for storing details of already processed DOIs
doi_cache = {}

# Initialize tqdm for the progress bar
pbar = tqdm(total=len(subjects_list), desc="Fetching DOIs")

# Step 2: Iterate through each subject title and fetch 25 unique DOIs
for subject_name in subjects_list:
    doi_url = f"https://api.crossref.org/works?query=subject:{subject_name}&rows=25"
    response = requests.get(doi_url)
    data = response.json()
    subject_dois = [item['DOI'] for item in data.get('message', {}).get('items', [])]
    
    # Add the fetched DOIs to the doi_df DataFrame
    temp_doi_df = pd.DataFrame(subject_dois, columns=['DOI'])
    doi_df = pd.concat([doi_df, temp_doi_df], ignore_index=True)
    
    # Fetch details for each DOI and populate the full_df DataFrame
    data_list = []
    for doi in subject_dois:
        details = fetch_details(doi, doi_cache)
        data_list.extend(details)
    
    # Convert list of dictionaries to DataFrame
    new_data_df = pd.DataFrame(data_list)
    
    # Update the progress bar
    pbar.update(1)

    # Break the loop if we have fetched details for all DOIs
    if len(doi_df) >= 25 * len(subjects_list):
        break

# Close the progress bar
pbar.close()

# Display the resulting full_df DataFrame with DOI details
print(new_data_df.head())

In [None]:
import requests
from tqdm import tqdm
import pandas as pd

def fetch_details(doi, cache):
    if doi in cache:
        return cache[doi]

    url = f"https://api.crossref.org/works/{doi}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        message = data.get('message', {})

        title = message.get('title', [None])[0]
        abstract = message.get('abstract', None)
        journal = message.get('container-title', [None])[0]
        field = message.get('subject', [None])[0]
        citation_count = message.get('is-referenced-by-count', None)
        date_received = message.get('created', {}).get('date-time', None)
        date_published = message.get('published-online', {}).get('date-parts', [None])[0]
        address = message.get('publisher-location', None)
        language = message.get('language', None)

        common_details = {
            'DOI': doi,
            'Title': title,
            'Abstract': abstract,
            'Journal': journal,
            'Field': field,
            'Citation Count': citation_count,
            'Date Received': date_received,
            'Date Published': date_published,
            'Address': address,
            'Language': language
        }

        authors = message.get('author', [])
        details = {'common': common_details, 'authors': authors}
        cache[doi] = details
        return details

    except requests.RequestException as e:
        print(f"Error fetching DOI {doi}: {e}")
        return {'common': None, 'authors': []}
    except (KeyError, IndexError) as e:
        print(f"Unexpected structure in response for DOI {doi}: {e}")
        return {'common': None, 'authors': []}


URL = "https://api.crossref.org/works?rows=1000"
response = requests.get(URL)
data = response.json()

subjects_list = set()
for item in data['message']['items']:
    if 'subject' in item:
        subjects_list.update(item['subject'])

subjects_list = list(subjects_list)
doi_df = pd.DataFrame(columns=['DOI'])
doi_cache = {}
pbar = tqdm(total=len(subjects_list), desc="Fetching DOIs")

for subject_name in subjects_list:
    doi_url = f"https://api.crossref.org/works?query=subject:{subject_name}&rows=25"
    response = requests.get(doi_url)
    data = response.json()
    subject_dois = [item['DOI'] for item in data.get('message', {}).get('items', [])]

    temp_doi_df = pd.DataFrame(subject_dois, columns=['DOI'])
    doi_df = pd.concat([doi_df, temp_doi_df], ignore_index=True)

    data_list = []
    for doi in subject_dois:
        details = fetch_details(doi, doi_cache)
        if details['common']:
            data_list.extend(details)

    new_data_df = pd.DataFrame(data_list)

    pbar.update(1)

    if len(doi_df) >= 25 * len(subjects_list):
        break

pbar.close()
print(new_data_df.head())

In [20]:
import requests

# DOI to fetch details for
doi = "10.3403/30087604"

# URL to fetch details from CrossRef API
url = f"https://api.crossref.org/works/{doi}"

# Fetch the data
response = requests.get(url)
data = response.json()

# Display the entire message
print(data['message'])


{'indexed': {'date-parts': [[2022, 3, 31]], 'date-time': '2022-03-31T15:25:27Z', 'timestamp': 1648740327902}, 'publisher-location': 'London', 'standards-body': {'name': 'BSI British Standards', 'acronym': 'BSI'}, 'reference-count': 0, 'publisher': 'BSI British Standards', 'content-domain': {'domain': [], 'crossmark-restriction': False}, 'short-container-title': [], 'DOI': '10.3403/30087604', 'type': 'standard', 'created': {'date-parts': [[2013, 11, 13]], 'date-time': '2013-11-13T16:07:16Z', 'timestamp': 1384358836000}, 'approved': {'date-parts': [[2008, 12, 31]]}, 'source': 'Crossref', 'is-referenced-by-count': 1, 'title': ['Building and civil engineering. Vocabulary'], 'prefix': '10.3403', 'member': '1988', 'container-title': [], 'original-title': [], 'deposited': {'date-parts': [[2017, 12, 5]], 'date-time': '2017-12-05T07:43:55Z', 'timestamp': 1512459835000}, 'score': 1, 'resource': {'primary': {'URL': 'https://linkresolver.bsigroup.com/junction/resolve/000000000030087604?restype=sta

In [None]:
import requests

# DOI to fetch details for
doi = "10.5194/angeo-2018-95-rc4"

# URL to fetch details from CrossRef API
url = f"https://api.crossref.org/works/{doi}"

# Fetch the data
response = requests.get(url)
data = response.json()

# Extracting common details
message = data['message']
title = message['title'][0] if 'title' in message else None
abstract = message.get('abstract', None)
journal = message.get('container-title', [None])[0]
field = message.get('subject', [None])[0]
citation_count = message.get('is-referenced-by-count', None)
date_received = message.get('created', {}).get('date-time', None)
date_published = message.get('published-online', {}).get('date-parts', [None])[0]
address = message.get('publisher-location', None)
language = message.get('language', None)

# Display the extracted details
print("Title:", title)
print("Abstract:", abstract)
print("Journal:", journal)
print("Field:", field)
print("Citation Count:", citation_count)
print("Date Received:", date_received)
print("Date Published:", date_published)
print("Address:", address)
print("Language:", language)


In [None]:
import requests

# DOI to fetch details for
doi = "10.5194/angeo-2018-95-rc4"

# URL to fetch details from CrossRef API
url = f"https://api.crossref.org/works/{doi}"

# Fetch the data
response = requests.get(url)
data = response.json()

# Accessing specific fields and handling empty values
indexed_date = message.get('indexed', {}).get('date-time', None)
posted_date = message.get('posted', {}).get('date-parts', [])[0] if 'posted' in message else None
publisher = message.get('publisher', None)
reference_count = message.get('reference-count', None)
content_domain = message.get('content-domain', {}).get('domain', [])
short_container_title = message.get('short-container-title', [])
type_ = message.get('type', None)
created_date = message.get('created', {}).get('date-time', None)
source = message.get('source', None)
is_referenced_by_count = message.get('is-referenced-by-count', None)
title = message.get('title', [])
prefix = message.get('prefix', None)
authors = message.get('author', [])
member = message.get('member', None)
container_title = message.get('container-title', [])
deposited_date = message.get('deposited', {}).get('date-time', None)
score = message.get('score', None)
resource_url = message.get('resource', {}).get('primary', {}).get('URL', None)
subtitle = message.get('subtitle', [])
short_title = message.get('short-title', [])
issued_date = message.get('issued', {}).get('date-parts', [])[0] if 'issued' in message else None
references_count = message.get('references-count', None)
url = message.get('URL', None)
relation_is_reply_to = message.get('relation', {}).get('is-reply-to', [])
relation_has_reply = message.get('relation', {}).get('has-reply', [])
published_date = message.get('published', {}).get('date-parts', [])[0] if 'published' in message else None
subtype = message.get('subtype', None)

# Displaying the extracted fields
print("Indexed Date:", indexed_date)
print("Posted Date:", posted_date)
print("Publisher:", publisher)
print("Reference Count:", reference_count)
print("Content Domain:", content_domain)
print("Short Container Title:", short_container_title)
print("Type:", type_)
print("Created Date:", created_date)
print("Source:", source)
print("Is Referenced By Count:", is_referenced_by_count)
print("Title:", title)
print("Prefix:", prefix)
print("Authors:", authors)
print("Member:", member)
print("Container Title:", container_title)
print("Deposited Date:", deposited_date)
print("Score:", score)
print("Resource URL:", resource_url)
print("Subtitle:", subtitle)
print("Short Title:", short_title)
print("Issued Date:", issued_date)
print("References Count:", references_count)
print("URL:", url)
print("Relation (is reply to):", relation_is_reply_to)
print("Relation (has reply):", relation_has_reply)
print("Published Date:", published_date)
print("Subtype:", subtype)


In [18]:
import requests

# DOI to fetch details for
doi = "10.3403/30087604"

# URL to fetch details from CrossRef API
url = f"https://api.crossref.org/works/{doi}"

# Fetch the data
response = requests.get(url)
data = response.json()

# Accessing specific fields and handling empty values
indexed_date = message.get('indexed', {}).get('date-time', None)
posted_date = message.get('posted', {}).get('date-parts', [])[0] if 'posted' in message else None
publisher = message.get('publisher', None)
reference_count = message.get('reference-count', None)
content_domain = message.get('content-domain', {}).get('domain', [])
short_container_title = message.get('short-container-title', [])
type_ = message.get('type', None)
created_date = message.get('created', {}).get('date-time', None)
source = message.get('source', None)
is_referenced_by_count = message.get('is-referenced-by-count', None)
title = message.get('title', [])
prefix = message.get('prefix', None)
authors = message.get('author', [])
member = message.get('member', None)
container_title = message.get('container-title', [])
deposited_date = message.get('deposited', {}).get('date-time', None)
score = message.get('score', None)
resource_url = message.get('resource', {}).get('primary', {}).get('URL', None)
subtitle = message.get('subtitle', [])
short_title = message.get('short-title', [])
issued_date = message.get('issued', {}).get('date-parts', [])[0] if 'issued' in message else None
references_count = message.get('references-count', None)
url = message.get('URL', None)
relation_is_reply_to = message.get('relation', {}).get('is-reply-to', [])
relation_has_reply = message.get('relation', {}).get('has-reply', [])
published_date = message.get('published', {}).get('date-parts', [])[0] if 'published' in message else None
subtype = message.get('subtype', None)

# Displaying the extracted fields
print("Indexed Date:", indexed_date)
print("Posted Date:", posted_date)
print("Publisher:", publisher)
print("Reference Count:", reference_count)
print("Content Domain:", content_domain)
print("Short Container Title:", short_container_title)
print("Type:", type_)
print("Created Date:", created_date)
print("Source:", source)
print("Is Referenced By Count:", is_referenced_by_count)
print("Title:", title)
print("Prefix:", prefix)
print("Authors:", authors)
print("Member:", member)
print("Container Title:", container_title)
print("Deposited Date:", deposited_date)
print("Score:", score)
print("Resource URL:", resource_url)
print("Subtitle:", subtitle)
print("Short Title:", short_title)
print("Issued Date:", issued_date)
print("References Count:", references_count)
print("URL:", url)
print("Relation (is reply to):", relation_is_reply_to)
print("Relation (has reply):", relation_has_reply)
print("Published Date:", published_date)
print("Subtype:", subtype)


NameError: name 'message' is not defined

In [21]:
import requests
from tqdm import tqdm
import pandas as pd

def fetch_details(doi, cache):
    if doi in cache:
        common_details = cache[doi]['common']
        authors = cache[doi]['authors']
    else:
        url = f"https://api.crossref.org/works/{doi}"
        try:
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            message = data.get('message', {})
            
            # Extract common details
            title = message.get('title', [None])[0]
            abstract = message.get('abstract', None)
            journal = message.get('container-title', [None])[0]
            field = message.get('subject', [None])[0]
            citation_count = message.get('is-referenced-by-count', None)
            date_received = message.get('created', {}).get('date-time', None)
            date_published = message.get('published-online', {}).get('date-parts', [None])[0]
            address = message.get('publisher-location', None)
            language = message.get('language', None)
            
            common_details = {
                'DOI': doi,
                'Title': title,
                'Abstract': abstract,
                'Journal': journal,
                'Field': field,
                'Citation Count': citation_count,
                'Date Received': date_received,
                'Date Published': date_published,
                'Address': address,
                'Language': language
            }
            
            authors = message.get('author', [])
            cache[doi] = {'common': common_details, 'authors': authors}
            
        except requests.RequestException as e:
            print(f"Error fetching DOI {doi}: {e}")
            return []
        except (KeyError, IndexError) as e:
            print(f"Unexpected structure in response for DOI {doi}: {e}")
            return []

    # Getting authors details
    details_list = []
    for idx, author in enumerate(authors):
        author_first_name = author.get('given', None)
        author_last_name = author.get('family', None)
        author_order = idx + 1

        details = common_details.copy()
        details['Author First Name'] = author_first_name
        details['Author Last Name'] = author_last_name
        details['Author Order'] = author_order

        details_list.append(details)

    return details_list

# Rest of your code (Step 1, Step 2, and the progress bar)...
# Step 1: Query Crossref for a broad set of results (you can refine this as needed)
URL = "https://api.crossref.org/works?rows=1000"
response = requests.get(URL)
data = response.json()

# Extract subjects from the returned works and add to a set for uniqueness
subjects_set = set()
for item in data['message']['items']:
    if 'subject' in item:
        subjects_set.update(item['subject'])

# Convert the set to a list
subjects_list = list(subjects_set)

# Initialize a DataFrame to store DOIs
doi_df = pd.DataFrame(columns=['DOI'])

# Initialize a cache for storing details of already processed DOIs
doi_cache = {}

# Initialize tqdm for the progress bar
pbar = tqdm(total=len(subjects_list), desc="Fetching DOIs")

# Step 2: Iterate through each subject title and fetch 25 unique DOIs
for subject_name in subjects_list:
    doi_url = f"https://api.crossref.org/works?query=subject:{subject_name}&rows=25"
    response = requests.get(doi_url)
    data = response.json()
    subject_dois = [item['DOI'] for item in data.get('message', {}).get('items', [])]
    
    # Add the fetched DOIs to the doi_df DataFrame
    temp_doi_df = pd.DataFrame(subject_dois, columns=['DOI'])
    doi_df = pd.concat([doi_df, temp_doi_df], ignore_index=True)
    
    # Fetch details for each DOI and populate the full_df DataFrame
    data_list = []
    for doi in subject_dois:
        details = fetch_details(doi, doi_cache)
        data_list.extend(details)
    
    # Convert list of dictionaries to DataFrame
    new_data_df = pd.DataFrame(data_list)
    
    # Update the progress bar
    pbar.update(1)

    # Break the loop if we have fetched details for all DOIs
    if len(doi_df) >= 25 * len(subjects_list):
        break

# Close the progress bar
pbar.close()

# Display the resulting full_df DataFrame with DOI details
print(new_data_df.head())

Fetching DOIs:   0%|          | 0/233 [00:00<?, ?it/s]

Unexpected structure in response for DOI 10.5194/angeo-2018-95-rc4: list index out of range
Unexpected structure in response for DOI 10.5194/angeo-2018-95-ac2: list index out of range
Unexpected structure in response for DOI 10.5194/angeo-2018-95-rc1: list index out of range
Unexpected structure in response for DOI 10.5194/angeo-2018-95-rc2: list index out of range
Unexpected structure in response for DOI 10.47688/rba_archives_2006/18711: list index out of range
Unexpected structure in response for DOI 10.5194/angeo-2018-95-ac5: list index out of range
Unexpected structure in response for DOI 10.5194/angeo-2018-95-ac1: list index out of range
Unexpected structure in response for DOI 10.5194/angeo-2018-95-ac4: list index out of range
Unexpected structure in response for DOI 10.3403/02855532: list index out of range
Unexpected structure in response for DOI 10.5194/angeo-2018-95-ac3: list index out of range
Unexpected structure in response for DOI 10.3403/bsen60838-2: list index out of ra

Fetching DOIs:   1%|▏         | 3/233 [00:06<05:27,  1.42s/it]

Unexpected structure in response for DOI 10.4000/interfaces: list index out of range
Unexpected structure in response for DOI 10.4000/interfaces.3227: list index out of range


Fetching DOIs:   3%|▎         | 7/233 [00:17<08:34,  2.28s/it]

Unexpected structure in response for DOI 10.19064/2015.119: list index out of range
Unexpected structure in response for DOI 10.13188/2327-204x.1000004: list index out of range
Unexpected structure in response for DOI 10.3403/00064522u: list index out of range
Unexpected structure in response for DOI 10.3403/00064522: list index out of range
Unexpected structure in response for DOI 10.31579/2693-7247/030: list index out of range
Unexpected structure in response for DOI 10.25107/2573-6051: list index out of range


KeyboardInterrupt: 

In [29]:
import pandas as pd
from tqdm import tqdm
from crossref.restful import Works
from crossref.restful import Journals
from habanero import Crossref
from requests.exceptions import HTTPError

def fetch_details(doi, cache):
    if doi in cache:
        common_details = cache[doi]['common']
        authors = cache[doi]['authors']
    else:
        try:
            work = Journals().works(ids=doi)
            message = work['message']
            
            # Extract common details
            title = message.get('title', [None])[0]
            abstract = message.get('abstract', None)
            journal = message.get('container-title', [None])[0]
            field = message.get('subject', [None])[0]
            citation_count = message.get('is-referenced-by-count', None)
            date_received = message.get('created', {}).get('date-time', None)
            date_published = message.get('published-online', {}).get('date-parts', [None])[0]
            address = message.get('publisher-location', None)
            language = message.get('language', None)
            
            common_details = {
                'DOI': doi,
                'Title': title,
                'Abstract': abstract,
                'Journal': journal,
                'Field': field,
                'Citation Count': citation_count,
                'Date Received': date_received,
                'Date Published': date_published,
                'Address': address,
                'Language': language
            }
            
            authors = message.get('author', [])
            cache[doi] = {'common': common_details, 'authors': authors}
            
        except Exception as e:
            print(f"Error fetching DOI {doi}: {e}")
            return []

    # Getting authors details
    details_list = []
    for idx, author in enumerate(authors):
        author_first_name = author.get('given', None)
        author_last_name = author.get('family', None)
        author_order = idx + 1

        details = common_details.copy()
        details['Author First Name'] = author_first_name
        details['Author Last Name'] = author_last_name
        details['Author Order'] = author_order

        details_list.append(details)

    return details_list

# Rest of your code (Step 1, Step 2, and the progress bar)...

def fetch_dois(subject_list, cr):
    doi_list = []
    for subject_name in subject_list:
        try:
            items = cr.works(query=f'subject:{subject_name}', rows=25)
            subject_dois = [item['DOI'] for item in items['message']['items']]
        except HTTPError as e:
            print(f"Error querying Crossref: {e}")
            subject_dois = []

        doi_list.extend(subject_dois)
    
        if len(doi_list) >= 25 * len(subject_list):
            break
    
    return doi_list


def fetch_dois(subject_list, cr):
    doi_list = []
    for subject_name in subject_list:
        try:
            items = cr.works(filter={'subject': subject_name}, rows=25)
            subject_dois = [item['DOI'] for item in items['message']['items']]
        except HTTPError as e:
            print(f"Error querying Crossref: {e}")
            subject_dois = []

        doi_list.extend(subject_dois)
    
        if len(doi_list) >= 25 * len(subject_list):
            break
    
    return doi_list

# Rest of your code (Step 1, Step 2, and the progress bar)...

# Step 1: Query Crossref for a broad set of results (you can refine this as needed)
cr = Crossref()

try:
    items = cr.works(filter={'has-abstract': True}, rows=1000)
    items = items['message']['items']
except HTTPError as e:
    print(f"Error querying Crossref: {e}")
    items = []

# Extract subjects from the returned works and add to a set for uniqueness
subjects_set = set()
for item in items:
    if 'subject' in item:
        subjects_set.update(item['subject'])

# Convert the set to a list
subjects_list = list(subjects_set)

# Fetch DOIs for each subject
doi_list = fetch_dois(subjects_list, cr)

# Initialize a DataFrame to store DOIs
doi_df = pd.DataFrame({'DOI': doi_list})

# Initialize a cache for storing details of already processed DOIs
doi_cache = {}

# Initialize tqdm for the progress bar
pbar = tqdm(total=len(doi_list), desc="Fetching DOI Details")

# Fetch details for each DOI and populate the new_data_df DataFrame
data_list = []
for doi in doi_list:
    details = fetch_details(doi, doi_cache)
    data_list.extend(details)
    pbar.update(1)

# Convert list of dictionaries to DataFrame
new_data_df = pd.DataFrame(data_list)

# Close the progress bar
pbar.close()

# Display the resulting new_data_df DataFrame with DOI details
print(new_data_df.head())

Error querying Crossref: 400 Client Error: Bad Request for url: https://api.crossref.org/works?filter=subject%3AHematology
Error querying Crossref: 400 Client Error: Bad Request for url: https://api.crossref.org/works?filter=subject%3APhysiology
Error querying Crossref: 400 Client Error: Bad Request for url: https://api.crossref.org/works?filter=subject%3AGeneral+Environmental+Science
Error querying Crossref: 400 Client Error: Bad Request for url: https://api.crossref.org/works?filter=subject%3AGeneral+Medicine
Error querying Crossref: 400 Client Error: Bad Request for url: https://api.crossref.org/works?filter=subject%3AGeneral+Business%2C+Management+and+Accounting
Error querying Crossref: 400 Client Error: Bad Request for url: https://api.crossref.org/works?filter=subject%3AGeneral+Chemistry
Error querying Crossref: 400 Client Error: Bad Request for url: https://api.crossref.org/works?filter=subject%3AReligious+studies
Error querying Crossref: 400 Client Error: Bad Request for url: h

Fetching DOI Details: 0it [00:00, ?it/s]

Empty DataFrame
Columns: []
Index: []





In [23]:
details_list.describe

NameError: name 'details_list' is not defined

In [30]:
import requests
import pandas as pd
from tqdm import tqdm
from requests.exceptions import HTTPError

def fetch_details(doi, cache):
    if doi in cache:
        common_details = cache[doi]['common']
        authors = cache[doi]['authors']
    else:
        try:
            response = requests.get(f"https://api.crossref.org/works/{doi}")
            response.raise_for_status()
            data = response.json()
            message = data['message']
            
            # Extract common details
            title = message.get('title', [None])[0]
            abstract = message.get('abstract', None)
            journal = message.get('container-title', [None])
            field = message.get('subject', [None])
            citation_count = message.get('is-referenced-by-count', None)
            date_received = message.get('created', {}).get('date-time', None)
            date_published = message.get('published-online', {}).get('date-parts', [None])[0]
            address = message.get('publisher-location', None)
            language = message.get('language', None)
            
            common_details = {
                'DOI': doi,
                'Title': title,
                'Abstract': abstract,
                'Journal': journal[0] if journal else None,
                'Field': field[0] if field else None,
                'Citation Count': citation_count,
                'Date Received': date_received,
                'Date Published': date_published,
                'Address': address,
                'Language': language
            }
            
            authors = message.get('author', [])
            cache[doi] = {'common': common_details, 'authors': authors}
            
        except HTTPError as e:
            print(f"Error fetching DOI {doi}: {e}")
            return []

    # Getting authors details
    details_list = []
    for idx, author in enumerate(authors):
        author_first_name = author.get('given', None)
        author_last_name = author.get('family', None)
        author_order = idx + 1

        details = common_details.copy()
        details['Author First Name'] = author_first_name
        details['Author Last Name'] = author_last_name
        details['Author Order'] = author_order

        details_list.append(details)

    return details_list


# Rest of your code (Step 1, Step 2, and the progress bar)...

# Step 1: Query CrossRef for a broad set of results (you can refine this as needed)
URL = "https://api.crossref.org/works?rows=1000"
response = requests.get(URL)
data = response.json()

# Extract items from the returned data
items = data.get('message', {}).get('items', [])

# Extract subjects from the returned items and add to a set for uniqueness
subjects_set = set()
for item in items:
    if 'subject' in item:
        subjects_set.update(item['subject'])

# Convert the set to a list
subjects_list = list(subjects_set)

# Fetch DOIs for each subject
doi_list = []
for subject_name in tqdm(subjects_list, desc="Fetching DOIs"):
    try:
        doi_url = f"https://api.crossref.org/works?query=subject:{subject_name}&rows=25"
        response = requests.get(doi_url)
        data = response.json()
        subject_dois = [item['DOI'] for item in data.get('message', {}).get('items', [])]
        doi_list.extend(subject_dois)
    except HTTPError as e:
        print(f"Error querying Crossref for subject {subject_name}: {e}")

# Initialize a DataFrame to store DOIs
doi_df = pd.DataFrame({'DOI': doi_list})

# Initialize a cache for storing details of already processed DOIs
doi_cache = {}

# Initialize tqdm for the progress bar
pbar = tqdm(total=len(doi_list), desc="Fetching DOI Details")

# Fetch details for each DOI and populate the new_data_df DataFrame
data_list = []
for doi in doi_list:
    details = fetch_details(doi, doi_cache)
    data_list.extend(details)
    pbar.update(1)

# Convert list of dictionaries to DataFrame
new_data_df = pd.DataFrame(data_list)

# Close the progress bar
pbar.close()

# Display the resulting new_data_df DataFrame with DOI details
print(new_data_df.head())


Fetching DOIs: 100%|██████████| 233/233 [01:30<00:00,  2.57it/s]
Fetching DOI Details:   0%|          | 3/4325 [00:00<14:52,  4.84it/s]

IndexError: list index out of range

In [31]:
# Step 1: Query Crossref for a broad set of results (you can refine this as needed)
URL = "https://api.crossref.org/works?rows=1000"
response = requests.get(URL)
data = response.json()

# Extract subjects from the returned works and add to a set for uniqueness
subjects_set = set()
for item in data['message']['items']:
    if 'subject' in item:
        subjects_set.update(item['subject'])

# Convert the set to a list
subjects_list = list(subjects_set)

In [32]:
subjects_list

['Nursing (miscellaneous)',
 'Hematology',
 'Gerontology',
 'Surfaces and Interfaces',
 'Toxicology',
 'Health, Toxicology and Mutagenesis',
 'Pollution',
 'General Pharmacology, Toxicology and Pharmaceutics',
 'Leadership and Management',
 'Business and International Management',
 'Nutrition and Dietetics',
 'Soil Science',
 'Plant Science',
 'Applied Mathematics',
 'Dermatology',
 'Radiation',
 'Chemical Engineering (miscellaneous)',
 'Pulmonary and Respiratory Medicine',
 'Mechanics of Materials',
 'Statistics and Probability',
 'General Business, Management and Accounting',
 'Colloid and Surface Chemistry',
 'Space and Planetary Science',
 'Geophysics',
 'General Physics and Astronomy',
 'Aerospace Engineering',
 'General Social Sciences',
 'Molecular Medicine',
 'Law',
 'History and Philosophy of Science',
 'Nuclear and High Energy Physics',
 'Catalysis',
 'Algebra and Number Theory',
 'Pharmacology',
 'Spectroscopy',
 'Multidisciplinary',
 'Education',
 'Social Sciences (miscella

In [34]:
import pandas as pd
from tqdm import tqdm
from crossref.restful import Works
from habanero import Crossref
from requests.exceptions import HTTPError

def fetch_details(doi, cache):
    if doi in cache:
        common_details = cache[doi]['common']
        authors = cache[doi]['authors']
    else:
        try:
            work = Works().doi(doi)
            message = work['message']
            
            # Extract common details
            title = message.get('title', [None])[0]
            abstract = message.get('abstract', None)
            journal = message.get('container-title', [None])[0]
            field = message.get('subject', [None])[0]
            citation_count = message.get('is-referenced-by-count', None)
            date_received = message.get('created', {}).get('date-time', None)
            date_published = message.get('published-online', {}).get('date-parts', [None])[0]
            address = message.get('publisher-location', None)
            language = message.get('language', None)
            
            common_details = {
                'DOI': doi,
                'Title': title,
                'Abstract': abstract,
                'Journal': journal,
                'Field': field,
                'Citation Count': citation_count,
                'Date Received': date_received,
                'Date Published': date_published,
                'Address': address,
                'Language': language
            }
            
            authors = message.get('author', [])
            cache[doi] = {'common': common_details, 'authors': authors}
            
        except HTTPError as e:
            print(f"Error fetching DOI {doi}: {e}")
            return []

    # Getting authors details
    details_list = []
    for idx, author in enumerate(authors):
        author_first_name = author.get('given', None)
        author_last_name = author.get('family', None)
        author_order = idx + 1

        details = common_details.copy()
        details['Author First Name'] = author_first_name
        details['Author Last Name'] = author_last_name
        details['Author Order'] = author_order

        details_list.append(details)

    return details_list

# Step 1: Specify the subject(s) you want to search for
subject_list = ['Immunology', 'Cell Biology']

# Initialize a cache for storing details of already processed DOIs
doi_cache = {}

# Initialize tqdm for the progress bar
pbar = tqdm(total=len(subject_list), desc="Fetching DOIs")

# Fetch DOIs for each subject
doi_list = []
for subject_name in subject_list:
    try:
        items = Works().query(subject=subject_name, rows=25)
        subject_dois = [item['DOI'] for item in items['message']['items']]
    except HTTPError as e:
        print(f"Error querying Crossref: {e}")
        subject_dois = []

    doi_list.extend(subject_dois)
    
    pbar.update(1)

# Close the progress bar
pbar.close()

# Initialize a DataFrame to store DOIs
doi_df = pd.DataFrame({'DOI': doi_list})

# Initialize tqdm for the progress bar
pbar = tqdm(total=len(doi_list), desc="Fetching DOI Details")

# Fetch details for each DOI and populate the new_data_df DataFrame
data_list = []
for doi in doi_list:
    details = fetch_details(doi, doi_cache)
    data_list.extend(details)
    pbar.update(1)

# Convert list of dictionaries to DataFrame
new_data_df = pd.DataFrame(data_list)

# Close the progress bar
pbar.close()

# Display the resulting new_data_df DataFrame with DOI details
print(new_data_df.head())


Fetching DOIs:   0%|          | 0/2 [00:54<?, ?it/s]


UrlSyntaxError: Field query subject specified but there is no such field query for this route. Valid field queries for this route are: affiliation, author, bibliographic, chair, container_title, contributor, editor, event_acronym, event_location, event_name, event_sponsor, event_theme, funder_name, publisher_location, publisher_name, translator

In [38]:
import requests
import random

def get_random_article(subject):
    base_url = f'https://api.crossref.org/works'
    params = {
        'query.bibliographic': subject,
        'sort': 'relevance',
        'rows': 1,
        'sample': 'true'
    }

    response = requests.get(base_url, params=params)

    if response.status_code == 200:
        data = response.json()
        article = data['message']['items'][0]
        return article
    else:
        return None

subject = 'Cell Biology'
article = get_random_article(subject)

if article:
    doi = article.get('DOI', '')
    title = article.get('title', [''])[0]
    abstract = article.get('abstract', '')
    journal = article.get('container-title', [''])[0]
    field = ', '.join(article.get('subject', []))
    citation_count = article.get('is-referenced-by-count', 0)
    date_received = article.get('created', '')
    date_published = article.get('published-print', '')
    address = ', '.join(article.get('author', []))
    language = article.get('language', '')

    print(f"DOI: {doi}")
    print(f"Title: {title}")
    print(f"Abstract: {abstract}")
    print(f"Journal: {journal}")
    print(f"Field: {field}")
    print(f"Citation Count: {citation_count}")
    print(f"Date Received: {date_received}")
    print(f"Date Published: {date_published}")
    print(f"Address: {address}")
    print(f"Language: {language}")
else:
    print("No articles found for the specified subject.")


No articles found for the specified subject.


In [39]:
import requests
import random
import pandas as pd
from tqdm import tqdm  # Make sure to install the tqdm library

def get_random_doi():
    base_url = f'https://api.crossref.org/works/random'
    response = requests.get(base_url)
    if response.status_code == 200:
        data = response.json()
        doi = data['message']['DOI']
        return doi
    else:
        return None

def get_article_info(doi):
    base_url = f'https://api.crossref.org/works/{doi}'
    response = requests.get(base_url)
    if response.status_code == 200:
        data = response.json()
        return data['message']
    else:
        return None

total_articles = 2000
articles = []
progress_bar = tqdm(total=total_articles, desc='Fetching Articles')

while len(articles) < total_articles:
    doi = get_random_doi()
    if doi:
        article = get_article_info(doi)
        if article:
            articles.append(article)
            progress_bar.update(1)

progress_bar.close()

data = {
    'DOI': [],
    'Title': [],
    'Abstract': [],
    'Journal': [],
    'Field': [],
    'Citation Count': [],
    'Date Received': [],
    'Date Published': [],
    'Address': [],
    'Language': []
}

for article in articles:
    data['DOI'].append(article.get('DOI', ''))
    data['Title'].append(article.get('title', [''])[0])
    data['Abstract'].append(article.get('abstract', ''))
    data['Journal'].append(article.get('container-title', [''])[0])
    data['Field'].append(', '.join(article.get('subject', [])))
    data['Citation Count'].append(article.get('is-referenced-by-count', 0))
    data['Date Received'].append(article.get('created', ''))
    data['Date Published'].append(article.get('published-print', ''))
    data['Address'].append(', '.join(article.get('author', [])))
    data['Language'].append(article.get('language', ''))

df = pd.DataFrame(data)
print(df.head())




KeyboardInterrupt: 

In [44]:
import requests
import random
import pandas as pd
import time
from tqdm import tqdm  # Make sure to install the tqdm library

def get_random_doi():
    base_url = f'https://api.crossref.org/works/random'
    response = requests.get(base_url)
    if response.status_code == 200:
        data = response.json()
        doi = data['message']['DOI']
        return doi
    else:
        return None

def get_article_info(doi):
    base_url = f'https://api.crossref.org/works/{doi}'
    response = requests.get(base_url)
    if response.status_code == 200:
        data = response.json()
        return data['message']
    else:
        return None

total_articles = 2000
articles = []
doi_progress_bar = tqdm(total=total_articles, desc='Obtaining DOIs')

timeout = 10  # Set a timeout value in seconds

while len(articles) < total_articles:
    try:
        doi = get_random_doi()
        if doi:
            articles.append(doi)
            doi_progress_bar.update(1)
    except (requests.exceptions.Timeout, requests.exceptions.RequestException):
        pass  # Ignore timeout and other request exceptions, and continue

doi_progress_bar.close()

data = {
    'DOI': [],
    'Title': [],
    'Abstract': [],
    'Journal': [],
    'Field': [],
    'Citation Count': [],
    'Date Received': [],
    'Date Published': [],
    'Address': [],
    'Language': []
}

info_progress_bar = tqdm(articles, desc='Fetching Article Info')

requests_per_second = 0
start_time = time.time()

for doi in info_progress_bar:
    if requests_per_second >= 50:
        elapsed_time = time.time() - start_time
        if elapsed_time < 1:
            time.sleep(1 - elapsed_time)
        requests_per_second = 0
        start_time = time.time()

    article = get_article_info(doi)
    if article:
        data['DOI'].append(doi)
        data['Title'].append(article.get('title', [''])[0])
        data['Abstract'].append(article.get('abstract', ''))
        data['Journal'].append(article.get('container-title', [''])[0])
        data['Field'].append(', '.join(article.get('subject', [])))
        data['Citation Count'].append(article.get('is-referenced-by-count', 0))
        data['Date Received'].append(article.get('created', ''))
        data['Date Published'].append(article.get('published-print', ''))
        data['Address'].append(', '.join(article.get('author', [])))
        data['Language'].append(article.get('language', ''))

        requests_per_second += 1

info_progress_bar.close()

df = pd.DataFrame(data)
print(df.head())




Obtaining DOIs:   0%|          | 0/2000 [18:41<?, ?it/s]


KeyboardInterrupt: 

In [48]:
import requests

def get_articles_by_year(year):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': 1
    }

    response = requests.get(base_url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        return data.get('message', {}).get('items', [])
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(response.content)
        return []

year_to_test = 2021
articles = get_articles_by_year(year_to_test)

if articles:
    article = articles[0]
    for key, value in article.items():
        if isinstance(value, list):
            print(f"{key}:")
            for item in value:
                print(f"  - {item}")
        else:
            print(f"{key}: {value}")
else:
    print(f"No articles found for the year {year_to_test}.")


indexed: {'date-parts': [[2022, 4, 3]], 'date-time': '2022-04-03T01:46:03Z', 'timestamp': 1648950363641}
reference-count: 0
publisher: Book Publisher International (a part of SCIENCEDOMAIN International)
content-domain: {'domain': [], 'crossmark-restriction': False}
published-print: {'date-parts': [[2021, 7, 30]]}
DOI: 10.9734/bpi/hmms/v13/2889f
type: book-chapter
created: {'date-parts': [[2021, 8, 6]], 'date-time': '2021-08-06T10:42:23Z', 'timestamp': 1628246543000}
page: 108-114
source: Crossref
is-referenced-by-count: 0
title:
  - A Review on MVD for Trigeminal Neuralgia
prefix: 10.9734
author:
  - {'given': 'Renuka S.', 'family': 'Melkundi', 'sequence': 'first', 'affiliation': []}
  - {'given': 'Sateesh', 'family': 'Melkundi', 'sequence': 'additional', 'affiliation': []}
member: 4694
published-online: {'date-parts': [[2021, 7, 30]]}
container-title:
  - Highlights on Medicine and Medical Science Vol. 13
deposited: {'date-parts': [[2021, 8, 6]], 'date-time': '2021-08-06T10:42:35Z', 

In [2]:
import requests
import pandas as pd

rows = 1000

def get_articles_by_year(year, rows=rows):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows  # Adjust this number to change the number of articles per year
    }

    articles = []
    cursor = '*'

    while cursor:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])
            
            if not items:
                break
            
            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor', '')
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

    return articles

years_to_pull = [2020, 2021, 2022, 2023]
all_articles = []

for year in years_to_pull:
    articles = get_articles_by_year(year, rows = rows)  # Adjust this number for the desired articles per year
    all_articles.extend(articles)

data = {
    'DOI': [],
    'Title': [],
    'Container Title': [],
    'Publisher': [],
    'Publish Date': [],
    'Author First Name': [],
    'Author Last Name': [],
    'Author Order': [],
    'Referenced By': []
}

for article in all_articles:
    doi = article.get('DOI', '')
    title = article.get('title', [''])[0]
    container_title = article.get('container-title', [''])[0]
    publisher = article.get('publisher', '')
    publish_date = article.get('published-print', {}).get('date-parts', [[]])[0]
    authors = article.get('author', [])
    referenced_by = article.get('is-referenced-by-count', 0)

    for order, author in enumerate(authors, start=1):
        first_name = author.get('given', '')
        last_name = author.get('family', '')
        
        data['DOI'].append(doi)
        data['Title'].append(title)
        data['Container Title'].append(container_title)
        data['Publisher'].append(publisher)
        data['Publish Date'].append(publish_date)
        data['Author First Name'].append(first_name)
        data['Author Last Name'].append(last_name)
        data['Author Order'].append(order)
        data['Referenced By'].append(referenced_by)

df = pd.DataFrame(data)
df.columns = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date', 'Author First Name', 'Author Last Name', 'Author Order', 'Referenced By']
print(df.shape)
df.head(20)

(11347, 9)


Unnamed: 0,DOI,Title,Container Title,Publisher,Publish Date,Author First Name,Author Last Name,Author Order,Referenced By
0,10.35366/99957,Qué debe llevar un resumen,Cirujano General,GRAPHIMEDIC SA DE CV,[2020],Abilene Cirenia,Escamilla Ortiz,1,0
1,10.34117/bjdv6n3-358,Perfil socioeconômico de mulheres feirantes do...,Brazilian Journal of Development,Brazilian Journal of Development,[2020],Jessé Rafael Bento,Lima,1,0
2,10.34117/bjdv6n3-358,Perfil socioeconômico de mulheres feirantes do...,Brazilian Journal of Development,Brazilian Journal of Development,[2020],Clayton dos Santos,Silva,2,0
3,10.34117/bjdv6n3-358,Perfil socioeconômico de mulheres feirantes do...,Brazilian Journal of Development,Brazilian Journal of Development,[2020],Romário Guimarães Verçosa,Araújo,3,0
4,10.34117/bjdv6n3-358,Perfil socioeconômico de mulheres feirantes do...,Brazilian Journal of Development,Brazilian Journal of Development,[2020],Jonas Olimpio de Lima,Silva,4,0
5,10.34117/bjdv6n3-358,Perfil socioeconômico de mulheres feirantes do...,Brazilian Journal of Development,Brazilian Journal of Development,[2020],Arlla Katherine Xavier,Lima,5,0
6,10.34117/bjdv6n3-358,Perfil socioeconômico de mulheres feirantes do...,Brazilian Journal of Development,Brazilian Journal of Development,[2020],João Manoel,Silva,6,0
7,10.34117/bjdv6n3-358,Perfil socioeconômico de mulheres feirantes do...,Brazilian Journal of Development,Brazilian Journal of Development,[2020],Tania Marta Carvalho,Santos,7,0
8,10.34117/bjdv6n3-358,Perfil socioeconômico de mulheres feirantes do...,Brazilian Journal of Development,Brazilian Journal of Development,[2020],Jakes Halan de Queiroz,Costa,8,0
9,10.5194/egusphere-egu2020-1598,The storage and influencing factors of mercury...,,Copernicus GmbH,[],Jing,Gu,1,0


In [3]:
df.to_csv('D:/School/GitHub/SMU_MSDS_Capstone/main.csv')

# ATTEMPT GOOD EXPANDED

In [6]:
import requests
import pandas as pd

rows = 1000

def get_articles_by_year(year, rows=rows):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows  # Adjust this number to change the number of articles per year
    }

    articles = []
    cursor = '*'

    while cursor:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])
            
            if not items:
                break
            
            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor', '')
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

    return articles

years_to_pull = [2017,2018,2019,2020, 2021, 2022, 2023]
all_articles = []

for year in years_to_pull:
    articles = get_articles_by_year(year, rows = rows)  # Adjust this number for the desired articles per year
    all_articles.extend(articles)

data = {
    'DOI': [],
    'Title': [],
    'Container Title': [],
    'Publisher': [],
    'Publish Date': [],
    'Author First Name': [],
    'Author Last Name': [],
    'Author Order': [],
    'Referenced By': []
}

for article in all_articles:
    doi = article.get('DOI', '')
    title = article.get('title', [''])[0]
    container_title = article.get('container-title', [''])[0]
    publisher = article.get('publisher', '')
    publish_date = article.get('published-print', {}).get('date-parts', [[]])[0]
    authors = article.get('author', [])
    referenced_by = article.get('is-referenced-by-count', 0)

    for order, author in enumerate(authors, start=1):
        first_name = author.get('given', '')
        last_name = author.get('family', '')
        
        data['DOI'].append(doi)
        data['Title'].append(title)
        data['Container Title'].append(container_title)
        data['Publisher'].append(publisher)
        data['Publish Date'].append(publish_date)
        data['Author First Name'].append(first_name)
        data['Author Last Name'].append(last_name)
        data['Author Order'].append(order)
        data['Referenced By'].append(referenced_by)

df = pd.DataFrame(data)
df.columns = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date', 'Author First Name', 'Author Last Name', 'Author Order', 'Referenced By']
print(df.shape)
df.head(20)

(18465, 9)


Unnamed: 0,DOI,Title,Container Title,Publisher,Publish Date,Author First Name,Author Last Name,Author Order,Referenced By
0,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",L,Massaro,1,0
1,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",F,Ceccarelli,2,0
2,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",FR,Spinelli,3,0
3,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",F,Morello,4,0
4,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",C,Perricone,5,0
5,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",F,Miranda,6,0
6,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",S,Truglia,7,0
7,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",V,Orefice,8,0
8,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",IM,Rutigliano,9,0
9,10.1136/annrheumdis-2017-eular.4238,SAT0241 Early response to belimumab in sle-rel...,Poster Presentations,BMJ Publishing Group Ltd and European League A...,"[2017, 6]",C,Alessandri,10,0


# ATTEMPT BAD

In [119]:
import requests
import pandas as pd

rows = 1000  # Adjust this number for the desired articles per year

# Function to fetch articles for a given year
def get_articles_by_year(year, rows=rows):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows  
    }

    articles = []
    cursor = '*'  # Initialize cursor for pagination

    while cursor:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])
            
            if not items:
                break
            
            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor', '')  # Retrieve next cursor for pagination
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

    return articles

# List of years to pull articles for
years_to_pull = [2020, 2021, 2022, 2023]
all_articles = []

# Fetch articles for each year and add to the list
for year in years_to_pull:
    articles = get_articles_by_year(year, rows=rows)
    all_articles.extend(articles)

# Initialize data dictionary to store extracted data
data = {
    'DOI': [],
    'Title': [],
    'Container Title': [],
    'Publisher': [],
    'Publish Date': [],
    'Author First Name': [],
    'Author Last Name': [],
    'Author Order': [],
    'Referenced By': []
}

# Extract data from each article and populate the data dictionary
for article in all_articles:
    # ... (data extraction logic as before)
    # Check 'published-print', 'published-online', and 'published' for date information
    publish_date_parts = None
    for key in ['published-print', 'published-online', 'published']:
        publish_date_parts = article.get(key, {}).get('date-parts', [])
        if publish_date_parts:
            break

    # If no publish date information is found, set it as an empty list
    if not publish_date_parts:
        publish_date_parts = [[]]

    # Use the first available date as the 'Publish Date'
    publish_date = publish_date_parts[0]

    # ... (rest of the data extraction)

    data['DOI'].append(doi)
    data['Title'].append(title)
    data['Container Title'].append(container_title)
    data['Publisher'].append(publisher)
    data['Publish Date'].append(publish_date)
    data['Author First Name'].append(first_name)
    data['Author Last Name'].append(last_name)
    data['Author Order'].append(order)
    data['Referenced By'].append(referenced_by)

# Create a DataFrame from the data dictionary
df = pd.DataFrame(data)
df.columns = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date', 'Author First Name', 'Author Last Name', 'Author Order', 'Referenced By']

In [121]:
# Grouping the DataFrame by 'Publish Date' year and counting unique DOIs
unique_doi_counts = df['Publish Date'].value_counts(dropna=False)

print(unique_doi_counts)


Publish Date
[2023]           243
[2021]           226
[2023, 3, 2]     212
[2022]           211
[2020]           200
                ... 
[2020, 6, 3]       1
[2021, 4, 19]      1
[2021, 4, 15]      1
[2020, 2, 11]      1
[2021, 2, 2]       1
Name: count, Length: 709, dtype: int64


In [122]:
import pandas as pd

# Assuming you have already created the DataFrame 'df' with the metadata

# Counting the number of null values in each column
null_counts = df.isna().sum()

print("Null counts in each column:")
print(null_counts)


Null counts in each column:
DOI                  0
Title                0
Container Title      0
Publisher            0
Publish Date         0
Author First Name    0
Author Last Name     0
Author Order         0
Referenced By        0
dtype: int64


In [123]:
print(df.shape)
print(df.head())

(4000, 9)
                            DOI  \
0  10.2991/978-2-494069-95-4_49   
1  10.2991/978-2-494069-95-4_49   
2  10.2991/978-2-494069-95-4_49   
3  10.2991/978-2-494069-95-4_49   
4  10.2991/978-2-494069-95-4_49   

                                               Title  \
0  Intercultural Communicative Competence (ICC): ...   
1  Intercultural Communicative Competence (ICC): ...   
2  Intercultural Communicative Competence (ICC): ...   
3  Intercultural Communicative Competence (ICC): ...   
4  Intercultural Communicative Competence (ICC): ...   

                             Container Title            Publisher  \
0  Journal of Language Teaching and Research  Academy Publication   
1  Journal of Language Teaching and Research  Academy Publication   
2  Journal of Language Teaching and Research  Academy Publication   
3  Journal of Language Teaching and Research  Academy Publication   
4  Journal of Language Teaching and Research  Academy Publication   

    Publish Date Author Fir

In [124]:
import os
import pandas as pd

# Specify the directory and file name for the CSV file
csv_dir = 'D:/School/GitHub/SMU_MSDS_Capstone'  # Change this to the desired directory
csv_file_name = 'metadata.csv'
csv_file_path = os.path.join(csv_dir, csv_file_name)

# Create the directory if it doesn't exist
os.makedirs(csv_dir, exist_ok=True)

# Save the DataFrame as a CSV file with UTF-8 encoding
df.to_csv(csv_file_path, index=False, encoding='utf-8')

print(f"DataFrame saved as '{csv_file_path}'")


DataFrame saved as 'D:/School/GitHub/SMU_MSDS_Capstone\metadata.csv'


# ATTEMPT DEAD

In [95]:
import requests
import pandas as pd
from tqdm import tqdm
import time

# Adjust this number for the desired articles per year
rows_per_request = 1000

def get_articles_by_year(year, rows=rows_per_request, cursor=None):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows,
        'cursor': cursor  # Include the cursor for pagination
    }

    articles = []
    while True:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])

            if not items:
                break

            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor')
            if not cursor:
                break
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

        # Sleep for a short duration to stay within rate limits
        time.sleep(0.5)

    return articles

years_to_pull = [2020, 2021, 2022, 2023]
all_articles = []

# Loop through each year
for year in years_to_pull:
    cursor = None  # Initialize cursor for pagination
    # Loop until all pages of articles are retrieved
    while True:
        articles = get_articles_by_year(year, rows=rows_per_request, cursor=cursor)
        if not articles:
            break
        all_articles.extend(articles)
        # Update the cursor for the next page of articles
        cursor = articles[-1].get('message', {}).get('next-cursor')
        if not cursor:
            break
        # Sleep for a short duration to stay within rate limits
        time.sleep(0.5)

# Initialize data dictionary to store extracted data
data = {
    'DOI': [],
    'Title': [],
    'Container Title': [],
    'Publisher': [],
    'Publish Date': [],
    'Referenced By': []
}

# Extract data from each article and populate the data dictionary
for article in tqdm(all_articles, desc="Processing Articles"):
    # ... (data extraction logic as before)

    data['DOI'].append(doi)
    data['Title'].append(title)
    data['Container Title'].append(container_title)
    data['Publisher'].append(publisher)
    data['Publish Date'].append(publish_date)
    data['Referenced By'].append(referenced_by)

# Create a DataFrame from the data dictionary
df = pd.DataFrame(data)

# Extract authors as a list of dictionaries
authors_list = [article.get('author', []) for article in all_articles]
author_columns = []
for i, authors in enumerate(authors_list):
    author_data = {
        f'AuthorFirstName_{j + 1}': author.get('given', ''),
        f'AuthorLastName_{j + 1}': author.get('family', '')
    }
    author_columns.append(author_data)

# Concatenate all author columns at once
author_df = pd.concat([pd.DataFrame(ac) for ac in author_columns], axis=1)

# Reorganize the DataFrame to vertical format
vertical_data = {
    'Author First Name': [],
    'Author Last Name': [],
    'Author Order': []
}
for i in range(1, max(len(authors) for authors in authors_list) + 1):
    vertical_data['Author First Name'].extend(author_df[f'AuthorFirstName_{i}'])
    vertical_data['Author Last Name'].extend(author_df[f'AuthorLastName_{i}'])
    vertical_data['Author Order'].extend([i] * len(df))

vertical_df = pd.DataFrame(vertical_data)

# Save the vertical DataFrame as a CSV file (adjust the file path accordingly)
csv_file_path = 'metadata_vertical.csv'
vertical_df.to_csv(csv_file_path, index=False)
print(f"Vertical DataFrame saved as '{csv_file_path}'")


Processing Articles: 100%|██████████| 4000/4000 [00:00<00:00, 1998239.16it/s]
  df[f'AuthorFirstName_{j + 1}'] = author.get('given', '')
  df[f'AuthorLastName_{j + 1}'] = author.get('family', '')
  df[f'AuthorFirstName_{j + 1}'] = author.get('given', '')
  df[f'AuthorLastName_{j + 1}'] = author.get('family', '')
  df[f'AuthorFirstName_{j + 1}'] = author.get('given', '')
  df[f'AuthorLastName_{j + 1}'] = author.get('family', '')
  df[f'AuthorFirstName_{j + 1}'] = author.get('given', '')
  df[f'AuthorLastName_{j + 1}'] = author.get('family', '')
  df[f'AuthorFirstName_{j + 1}'] = author.get('given', '')
  df[f'AuthorLastName_{j + 1}'] = author.get('family', '')
  df[f'AuthorFirstName_{j + 1}'] = author.get('given', '')
  df[f'AuthorLastName_{j + 1}'] = author.get('family', '')
  df[f'AuthorFirstName_{j + 1}'] = author.get('given', '')
  df[f'AuthorLastName_{j + 1}'] = author.get('family', '')
  df[f'AuthorFirstName_{j + 1}'] = author.get('given', '')
  df[f'AuthorLastName_{j + 1}'] = au

Vertical DataFrame saved as 'metadata_vertical.csv'


# ATTEMPT DEAD

In [96]:
import requests
import pandas as pd
from tqdm import tqdm
import time

# Adjust this number for the desired articles per year
rows_per_request = 1000


In [97]:
def get_articles_by_year(year, rows=rows_per_request, cursor=None):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows,
        'cursor': cursor  # Include the cursor for pagination
    }

    articles = []
    while True:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            items = data.get('message', {}).get('items', [])

            if not items:
                break

            articles.extend(items)
            cursor = data.get('message', {}).get('next-cursor')
            if not cursor:
                break
        else:
            print(f"Request failed with status code: {response.status_code}")
            print(response.content)
            break

        # Sleep for a short duration to stay within rate limits
        time.sleep(0.5)

    return articles


In [98]:
years_to_pull = [2020, 2021, 2022, 2023]
all_articles = []

# Loop through each year
for year in years_to_pull:
    cursor = None  # Initialize cursor for pagination
    # Loop until all pages of articles are retrieved
    while True:
        articles = get_articles_by_year(year, rows=rows_per_request, cursor=cursor)
        if not articles:
            break
        all_articles.extend(articles)
        # Update the cursor for the next page of articles
        cursor = articles[-1].get('message', {}).get('next-cursor')
        if not cursor:
            break
        # Sleep for a short duration to stay within rate limits
        time.sleep(0.5)


In [99]:
# Initialize data dictionary to store extracted data
data = {
    'DOI': [],
    'Title': [],
    'Container Title': [],
    'Publisher': [],
    'Publish Date': [],
    'Referenced By': []
}

# Extract data from each article and populate the data dictionary
for article in tqdm(all_articles, desc="Processing Articles"):
    # ... (data extraction logic as before)

    data['DOI'].append(doi)
    data['Title'].append(title)
    data['Container Title'].append(container_title)
    data['Publisher'].append(publisher)
    data['Publish Date'].append(publish_date)
    data['Referenced By'].append(referenced_by)



Processing Articles: 100%|██████████| 4000/4000 [00:00<00:00, 1998477.19it/s]


In [100]:
# Create a DataFrame from the data dictionary
df = pd.DataFrame(data)

# Extract authors as a list of dictionaries
authors_list = [article.get('author', []) for article in all_articles]
author_columns = []
for i, authors in enumerate(authors_list):
    author_data = {
        f'AuthorFirstName_{j + 1}': author.get('given', ''),
        f'AuthorLastName_{j + 1}': author.get('family', '')
    }
    author_columns.append(author_data)


In [116]:
df.head()

Unnamed: 0,DOI,Title,Container Title,Publisher,Publish Date,Author First Name,Author Last Name,Author Order,Referenced By
0,10.2991/978-2-494069-95-4_49,Intercultural Communicative Competence (ICC): ...,Journal of Language Teaching and Research,Academy Publication,[2023],Narathip,Thumawongsa,2,0
1,10.2991/978-2-494069-95-4_49,Intercultural Communicative Competence (ICC): ...,Journal of Language Teaching and Research,Academy Publication,[2023],Narathip,Thumawongsa,2,0
2,10.2991/978-2-494069-95-4_49,Intercultural Communicative Competence (ICC): ...,Journal of Language Teaching and Research,Academy Publication,[2023],Narathip,Thumawongsa,2,0
3,10.2991/978-2-494069-95-4_49,Intercultural Communicative Competence (ICC): ...,Journal of Language Teaching and Research,Academy Publication,[2023],Narathip,Thumawongsa,2,0
4,10.2991/978-2-494069-95-4_49,Intercultural Communicative Competence (ICC): ...,Journal of Language Teaching and Research,Academy Publication,[2023],Narathip,Thumawongsa,2,0


In [102]:
# Create a list of dictionaries for authors
authors_data = []

for authors in authors_list:
    author_data = {}
    for j, author in enumerate(authors, start=1):
        author_data[f'AuthorFirstName_{j}'] = author.get('given', '')
        author_data[f'AuthorLastName_{j}'] = author.get('family', '')
    authors_data.append(author_data)

# Create a DataFrame from the list of author dictionaries
author_df = pd.DataFrame(authors_data)


In [115]:
author_df.head()

Unnamed: 0,AuthorFirstName_1,AuthorLastName_1,AuthorFirstName_2,AuthorLastName_2,AuthorFirstName_3,AuthorLastName_3,AuthorFirstName_4,AuthorLastName_4,AuthorFirstName_5,AuthorLastName_5,...,AuthorFirstName_92,AuthorLastName_92,AuthorFirstName_93,AuthorLastName_93,AuthorFirstName_94,AuthorLastName_94,AuthorFirstName_95,AuthorLastName_95,AuthorFirstName_96,AuthorLastName_96
0,,,,,,,,,,,...,,,,,,,,,,
1,Abilene Cirenia,Escamilla Ortiz,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,Jessé Rafael Bento,Lima,Clayton dos Santos,Silva,Romário Guimarães Verçosa,Araújo,Jonas Olimpio de Lima,Silva,Arlla Katherine Xavier,Lima,...,,,,,,,,,,
4,Jing,Gu,Qiaotong,Pang,Jinzhi,Ding,Runsheng,Yin,Yuanhe,Yang,...,,,,,,,,,,


In [118]:
# Reorganize the DataFrame to vertical format
vertical_data = {
    'Author First Name': [],
    'Author Last Name': [],
    'Author Order': []
}
for i in range(1, max(len(authors) for authors in authors_list) + 1):
    vertical_data['Author First Name'].extend(author_df[f'AuthorFirstName_{i}'])
    vertical_data['Author Last Name'].extend(author_df[f'AuthorLastName_{i}'])
    vertical_data['Author Order'].extend([i] * len(df))

vertical_df = pd.DataFrame(vertical_data)


ValueError: array length 384000 does not match index length 4000

In [104]:
# Save the vertical DataFrame as a CSV file (adjust the file path accordingly)
csv_file_path = 'metadata_vertical.csv'
vertical_df.to_csv(csv_file_path, index=False)
print(f"Vertical DataFrame saved as '{csv_file_path}'")


Vertical DataFrame saved as 'metadata_vertical.csv'


# ATTEMPT

In [112]:
import requests
import pandas as pd

rows = 1000  # Adjust this number for the desired articles per year

# Function to fetch articles for a given year and cursor
def get_articles_by_year(year, cursor=None, rows=rows):
    base_url = 'https://api.crossref.org/works'
    params = {
        'filter': f'from-pub-date:{year}-01-01,until-pub-date:{year}-12-31',
        'sort': 'relevance',
        'rows': rows,
        'cursor': cursor
    }

    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = response.json()
        items = data.get('message', {}).get('items', [])
        next_cursor = data.get('message', {}).get('next-cursor', None)
        return items, next_cursor
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(response.content)
        return [], None

# List of years to pull articles for
years_to_pull = [2020, 2021, 2022, 2023]
all_articles = []
unique_dois = set()  # To track unique DOIs

# Fetch articles for each year and add to the list
for year in years_to_pull:
    cursor = None
    while True:
        articles, cursor = get_articles_by_year(year, cursor, rows=rows)
        if not articles:
            break
        for article in articles:
            doi = article.get('DOI', '')
            if doi not in unique_dois:
                all_articles.append(article)
                unique_dois.add(doi)
        if cursor is None:
            break
    print(f"Total unique DOIs after {year}: {len(unique_dois)}")

# Initialize data dictionary to store extracted data
data = {
    'DOI': [],
    'Title': [],
    'Container Title': [],
    'Publisher': [],
    'Publish Date': [],
    'Author First Name': [],
    'Author Last Name': [],
    'Author Order': [],
    'Referenced By': []
}

# Extract data from each article and populate the data dictionary
for article in all_articles:
    # ... (data extraction logic as before)

    # ... (rest of the data extraction)

    data['DOI'].append(doi)
    data['Title'].append(title)
    data['Container Title'].append(container_title)
    data['Publisher'].append(publisher)
    data['Publish Date'].append(publish_date)
    data['Author First Name'].append(first_name)
    data['Author Last Name'].append(last_name)
    data['Author Order'].append(order)
    data['Referenced By'].append(referenced_by)

# Create a DataFrame from the data dictionary
df = pd.DataFrame(data)
df.columns = ['DOI', 'Title', 'Container Title', 'Publisher', 'Publish Date', 'Author First Name', 'Author Last Name', 'Author Order', 'Referenced By']

# Save the DataFrame to a CSV file
df.to_csv('article_data.csv', index=False)


Total unique DOIs after 2020: 1000
Total unique DOIs after 2021: 2000
Total unique DOIs after 2022: 3000
Total unique DOIs after 2023: 4000
