## Import libraries

In [1]:
import json
import requests
import pandas as pd

## Set variables

In [8]:
BASE_URL = "https://api.figshare.com/v2/"
INST_ID = "231" #Example INST_ID = "658"

## Retrieve Metadata
1. Retrieve basic metadata
2. Pull out a list of article ids
3. Use those ids to retrieve all metadata fields for each article
4. Convert the resulting JSON to a dataframe
5. Save the dataframe to an excel file

In [9]:
#Gather basic metadata for all items (articles) from the Figshare articles API endpoint
articles = []
for i in range(1,10):
    ids = json.loads(requests.get(BASE_URL + "articles?institution="+INST_ID+"&page_size=1000&page={}".format(i)).content)
    articles.extend(ids)

In [11]:
json.loads(requests.get(BASE_URL + "articles?institution="+INST_ID+"&page_size=1000&page={}".format(3)).content)

[{'id': 14119703,
  'title': "What's It To You? A Survey of Online Privacy Concerns and Risks",
  'doi': '10.1184/r1/14119703.v1',
  'handle': '',
  'url': 'https://api.figshare.com/v2/articles/14119703',
  'published_date': '2021-03-04T21:31:48Z',
  'thumb': 'https://s3-eu-west-1.amazonaws.com/ppreviews-cmu-49810598590254/26627585/thumb.png',
  'defined_type': 18,
  'defined_type_name': 'report',
  'group_id': 9948,
  'url_private_api': 'https://api.figshare.com/v2/account/articles/14119703',
  'url_public_api': 'https://api.figshare.com/v2/articles/14119703',
  'url_private_html': 'https://figshare.com/account/articles/14119703',
  'url_public_html': 'https://kilthub.cmu.edu/articles/report/What_s_It_To_You_A_Survey_of_Online_Privacy_Concerns_and_Risks/14119703',
  'timeline': {'posted': '2021-03-04T21:31:48',
   'publisherPublication': '2006-10-01T00:00:00',
   'firstOnline': '2021-03-04T21:31:48'},
  'resource_title': "What's It To You? A Survey of Online Privacy Concerns and Risks

In [10]:
#See the number of articles
len(articles)

9000

In [None]:
#Create a list of all the article ids
article_ids = [item['id'] for item in articles]

In [None]:
#For each id in the article id list, retrieve all the metadata for the article by visiting the Figshare article API endpoint
#This may take a while- for example, 6,000 records takes about 1.5 hours
full_articles = []
for art_id in article_ids:
    article = json.loads(requests.get(BASE_URL + "articles/{}".format(art_id)).content)
    full_articles.append(article)

In [None]:
#See the number of articles
len(full_articles)

In [None]:
#View the metadata for the first article in JSON format
full_articles[0]

In [None]:
#Create a dataframe from the JSON formatted data
df = pd.DataFrame(full_articles)

## Split out custom metadata fields
If a field does not exist for an item, it will show NaN (i.e. null).
1. Create a dataframe of custom metadata fields
2. Merge that dataframe with the original metadata dataframe
3. Save to an excel file

In [None]:
#The custom fields are all contained within one column called 'custom_fields'. Flatten that column and associate the values
#with the proper article id
custom = pd.json_normalize(
    full_articles,
    record_path =['custom_fields'],
    meta=['id']
)
#This reshapes the data so that metadata field names are columns and each row is an id.
custom = custom.pivot(index="id", columns="name", values="value")

In [None]:
#Merge the dataframes so that all the custom fields are visible along with all the other metadata
custom_split_out = df.merge(custom, how='inner', on='id')

# Download Metadata

## If you are running this in Google Colab

In [None]:
#When you run this cell it will ask you to authenticate so that you can create files to download
from google.colab import drive
drive.mount('/drive')

In [None]:
from google.colab import files
custom_split_out.to_csv('public-metadata-'+str(datetime.datetime.now().strftime("%Y-%m-%d"))+'.csv',encoding='utf-8') #create the CSV
files.download('public-metadata-'+str(datetime.datetime.now().strftime("%Y-%m-%d"))+'.csv') #download to your computer

## If you are running this locally
That is you downloaded the Jupyter Notebook

In [None]:
#Save a file of all the metadata with the custom fields split out.
save_file = custom_split_out.to_excel("metadata-custom-fields-split-out.xlsx")