# Stream large relational data from Figshare

In [None]:
'''
Figshare API convenience functions.
Modified from code provided by Daniel Gavrila
Requires Python 3.6+ due to the use of f-strings.
Logging is now implemented.
'''

In [1]:
import hashlib
import logging
import json
import os
import requests
from requests.exceptions import HTTPError

### Set variables
Change the values below as appropriate.

In [2]:
# URL template for the Figshare v2 API; `{endpoint}` is filled in per request.
BASE_URL = 'https://api.figshare.com/v2/{endpoint}'
# Personal API token, read from the FIGSHARE_TOKEN environment variable so the
# secret never lives in the notebook. Never paste a real token here: anything
# committed to source control must be treated as leaked and revoked.
TOKEN = os.environ.get('FIGSHARE_TOKEN', '')
CHUNK_SIZE = 10485760  # 10 MiB (10 * 1024 * 1024 bytes)
# Default collection used by get_collection_articles().
COLLECTION_ID = 6076908

### Define the functions

In [None]:
## Namespace

class Namespace:
    """Plain attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Write straight into the instance dictionary so every keyword is
        # stored under its own name.
        vars(self).update(kwargs)

In [3]:
# Define functions to make API requests

## methods

def printerr(String, *args, **kwargs):
    """Print a message to standard error.

    Args:
        String: First object to print.
        *args: Further objects to print, forwarded to ``print``.
        **kwargs: Keyword arguments forwarded to ``print`` (``sep``, ``end``,
            ...). An explicit ``file`` overrides the stderr default.
    """
    # Local import keeps this helper self-contained.
    import sys

    # The stream has to be given as ``file=``; passed positionally it would be
    # printed as just another object instead of receiving the output.
    kwargs.setdefault('file', sys.stderr)
    print(String, *args, **kwargs)

def logthis(msg, logger=None):
    """Send ``msg`` to ``logger.info`` when a logger is given, otherwise print it."""
    if not logger:
        # No logger supplied: fall back to standard output.
        print(msg)
        return
    logger.info(msg)

def raw_issue_request(method, url, data=None, binary=False, logger=None, timeout=60):
    """Send an HTTP request and return the decoded response.

    Args:
        method: HTTP verb, e.g. ``'GET'``.
        url: Full URL to request.
        data: Optional payload. It is JSON-encoded before sending unless
            ``binary`` is true.
        binary: When true, ``data`` is sent unchanged.
        logger: Optional logger used to report HTTP errors; without one the
            report is printed.
        timeout: Seconds to wait for the server, passed to
            ``requests.request``. ``None`` waits indefinitely.

    Returns:
        The parsed JSON body, or the response's reason phrase when the body is
        not valid JSON.

    Raises:
        requests.exceptions.HTTPError: On a 4xx/5xx status, after reporting it.
        requests.exceptions.Timeout: When the server does not answer within
            ``timeout`` seconds.
    """
    if data is not None and not binary:
        data = json.dumps(data)
    # Without a timeout a stalled connection would block forever.
    response = requests.request(method, url, data=data, timeout=timeout)

    try:
        response.raise_for_status()
    except HTTPError as error:
        if logger:
            logger.error(f'Caught an HTTPError: {error.response.status_code}')
            logger.error('Body:\n' + response.reason)
        else:
            print('Caught an HTTPError: {}'.format(error.response.status_code))
            print('Body:\n', response.reason)
        raise

    try:
        return json.loads(response.content)
    except ValueError:
        # Body is not JSON (e.g. empty): hand back the reason phrase instead.
        return response.reason

def issue_request(method, endpoint, logger=None, *args, **kwargs):
    """Issue a request against a Figshare API endpoint.

    ``endpoint`` is substituted into ``BASE_URL``; ``logger`` and any extra
    positional or keyword arguments are handed on to ``raw_issue_request``,
    whose result is returned.
    """
    url = BASE_URL.format(endpoint=endpoint)
    return raw_issue_request(method, url, *args, logger=logger, **kwargs)

def get_collection_articles(collection_id=COLLECTION_ID, page=1, page_size=100, logger=None):
    """Return every article listed in a collection, following pagination.

    Args:
        collection_id: Collection to list; defaults to ``COLLECTION_ID``.
        page: First page to request.
        page_size: Number of articles requested per page.
        logger: Optional logger for status and error messages; messages are
            printed when omitted.

    Returns:
        A list with the results of all pages, in the order they were fetched.
    """
    endpoint = f'collections/{collection_id}/articles'
    # Forward the logger so the status line honours the caller's choice
    # instead of always going to stdout.
    logthis(f'Listing articles for {collection_id}.', logger=logger)
    output = []
    while True:
        result = issue_request('GET', f'{endpoint}?page={page}&page_size={page_size}', logger=logger)
        if not result:
            # An empty page marks the end of the listing.
            break
        output += result
        page += 1
    return output

def get_article_info(article_id, logger=None):
    """Fetch the record of a single article.

    Args:
        article_id: Identifier of the article to look up.
        logger: Optional logger for status and error messages; messages are
            printed when omitted.

    Returns:
        Whatever ``issue_request`` returns for ``articles/<article_id>``.
    """
    endpoint = f'articles/{article_id}'
    # Forward the logger so the status line is not always printed to stdout.
    logthis(f'Getting info for article {article_id}.', logger=logger)
    return issue_request('GET', endpoint, logger=logger)

# move this to other script!

In [4]:
import requests
import pandas as pd
from tqdm import tqdm  # Import the tqdm module

In [5]:
# List every article in the default collection (COLLECTION_ID); as the last
# expression of the cell, the returned list is shown as the cell output.
get_collection_articles()

Listing articles for 6076908.


[{'id': 21896802,
  'title': 'SciSciNet_Authors_Gender',
  'doi': '10.6084/m9.figshare.21896802.v1',
  'handle': '',
  'url': 'https://api.figshare.com/v2/articles/21896802',
  'published_date': '2023-05-18T14:42:20Z',
  'thumb': '',
  'defined_type': 3,
  'defined_type_name': 'dataset',
  'group_id': 2826,
  'url_private_api': 'https://api.figshare.com/v2/account/articles/21896802',
  'url_public_api': 'https://api.figshare.com/v2/articles/21896802',
  'url_private_html': 'https://figshare.com/account/articles/21896802',
  'url_public_html': 'https://springernature.figshare.com/articles/dataset/SciSciNet_Authors_Gender/21896802',
  'timeline': {'posted': '2023-05-18T14:42:20',
   'firstOnline': '2023-05-18T14:42:20'},
  'resource_title': None,
  'resource_doi': None},
 {'id': 20220093,
  'title': 'SciSciNet_Fields',
  'doi': '10.6084/m9.figshare.20220093.v1',
  'handle': '',
  'url': 'https://api.figshare.com/v2/articles/20220093',
  'published_date': '2023-05-18T14:42:20Z',
  'thumb'

In [6]:
# Look up article 20220093 and take the download URL of its first listed file.
url = get_article_info(20220093).get('files')[0]['download_url']
print(url)

Getting info for article 20220093.
https://ndownloader.figshare.com/files/36222114


In [7]:
def download_data(article_id, output_path='downloaded_data.tsv', chunk_size=1024 * 1024):
    """Download the first file of an article and load it as a DataFrame.

    The file is streamed to ``output_path`` with a progress bar and then parsed
    as tab-separated values.

    Args:
        article_id: Article whose first listed file is downloaded.
        output_path: Where the downloaded file is written.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the download or the
        parsing failed (the error is printed).
    """
    url = get_article_info(article_id).get('files')[0]['download_url']

    # stream=True fetches the body incrementally; the ``with`` block releases
    # the connection even when something below fails.
    with requests.get(url, stream=True) as response:
        try:
            # Fail early rather than saving an HTTP error page as data.
            response.raise_for_status()

            # Total size for the progress bar; 0 when the header is missing.
            total_size = int(response.headers.get('content-length', 0))

            # Both context managers close on error, so neither the file nor
            # the progress bar is left open.
            with open(output_path, 'wb') as file, \
                    tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        file.write(chunk)
                        progress_bar.update(len(chunk))

            return pd.read_csv(output_path, sep='\t')

        except Exception as e:
            # Deliberate best-effort: report the problem and signal it with None.
            print("An error occurred:", e)
            return None


In [20]:
# Download the first file of article 20220093 and load it as a DataFrame
# (download_data returns None if anything went wrong).
df = download_data(20220093)
print(df)

Getting info for article 20220093.


100%|██████████████████████████████████████| 9.32k/9.32k [00:00<00:00, 5.32MB/s]

       FieldID               Field_Name Field_Type
0      3079626  Quantum electrodynamics        Sub
1     37914503     Mathematical physics        Sub
2    159047783                 Virology        Sub
3     70410870      Clinical psychology        Sub
4    187212893               Pediatrics        Sub
..         ...                      ...        ...
306   70721500    Computational biology        Sub
307   42360764     Chemical engineering        Sub
308  178550888  Business administration        Sub
309   13736549   Industrial engineering        Sub
310   47768531    Development economics        Sub

[311 rows x 3 columns]





In [22]:
search_term = 'science'

# Keep the rows whose Field_Name contains the term, ignoring case.
# regex=False matches the term literally (characters such as '+' or '(' are
# not interpreted as a pattern); na=False treats missing names as
# non-matching, so the mask stays boolean and can be used for indexing.
filtered_df = df[df['Field_Name'].str.contains(search_term, case=False, na=False, regex=False)]

In [23]:
# Show the rows whose Field_Name matched the search term.
print(filtered_df)

        FieldID                    Field_Name Field_Type
12     31903555                  Food science        Sub
39    148383697              Regional science        Sub
49      1965285                 Earth science        Sub
59     37621935          Agricultural science        Sub
87    192562407             Materials science        Top
93     36289849                Social science        Sub
118   161191863               Library science        Sub
124   140793950                Animal science        Sub
137      459310         Computational science        Sub
152    41008148              Computer science        Top
157   169760540                  Neuroscience        Sub
160   162118730             Actuarial science        Sub
165   188147891             Cognitive science        Sub
167   159390177                  Soil science        Sub
171    17744445             Political science        Top
177   539667460            Management science        Sub
197    80444323  Theoretical co

TODO: continue this analysis here later.