# Fetch SEDLS presenter materials from the OSF repository

These scripts were originally created to streamline collecting the URLs of presentation materials stored in OSF so that they can be incorporated into the Program page on the SEDLS website (e.g., [the links to materials on the 2021 SEDLS Program page](https://se-datalibrarian.github.io/2021/)).

In [164]:
import requests
import pandas as pd
from typing import Dict, List

In [160]:
def fetch_with_pagination(path: str, timeout: float = 30.0) -> List[Dict]:
    '''
    Fetch every page of an OSF API collection.

    Starting at "path", request each page and follow the URL in its "links" -> "next" entry until no further page is advertised.

    Parameters
    ----------
    path: str
        The URL path pointing to the OSF API collection of interest
    timeout: float, optional
        Number of seconds to wait on each HTTP request before giving up (default 30.0)

    Returns
    -------
    List[Dict]
        A list of Dictionaries, in request order, with each item containing the decoded JSON of one page of data from the collection

    Raises
    ------
    requests.HTTPError
        If any page request comes back with a 4xx/5xx status code
    requests.Timeout
        If a request does not complete within "timeout" seconds
    '''

    page_json_list: List[Dict] = []
    next_path = path
    # Iterate rather than recurse: recursion adds one stack frame per page
    # and can hit Python's recursion limit on long collections.
    while True:
        response = requests.get(next_path, timeout=timeout)
        # Fail with a clear HTTP error instead of a confusing KeyError when
        # the API answers with an error document.
        response.raise_for_status()
        page_json = response.json()
        page_json_list.append(page_json)
        # A response without pagination links is treated as the last page.
        next_path = (page_json.get('links') or {}).get('next')
        if next_path is None:
            break
    return page_json_list
    

In [158]:
def fetch_provider_paths(node_id: str) -> List[str]:
    '''
    Collect the files URL of every storage provider attached to an OSF project. Each returned URL can be used to fetch information on the files stored in that provider.

    Parameters
    ----------
    node_id: str
        The node id of the project whose storage providers are listed

    Returns
    -------
    List[str]
        One URL path per storage provider, in the order the API returns them
    '''

    files_endpoint = f'https://api.osf.io/v2/nodes/{node_id}/files/'

    provider_hrefs: List[str] = []
    # The provider listing may itself span several pages.
    for page in fetch_with_pagination(files_endpoint):
        for provider_entry in page['data']:
            related_link = provider_entry['relationships']['files']['links']['related']
            provider_hrefs.append(related_link['href'])
    return provider_hrefs

In [151]:
def fetch_OSF_files(list_of_file_provider_paths: List[str]) -> List[Dict]:
    '''Gets a list of all files in an OSF project

    Each given path is fetched page by page. Entries of kind "file" are recorded; entries of kind "folder" are descended into recursively through the folder's own files URL. Entries of any other kind are ignored.

    Parameters
    ----------
    list_of_file_provider_paths: List[str]
        A list of URL paths pointing to the storage providers (or folders) in the OSF repository

    Returns
    -------
    List[Dict]
        A list of dictionaries, one per file, with the keys "id" (the file's "guid" attribute), "name" and "path" (the file's "materialized_path" attribute). An empty input list yields an empty list.
    '''

    files: List[Dict] = []
    # Process one provider at a time so that only the pages of the provider
    # currently being walked are held in memory.
    for provider_path in list_of_file_provider_paths:
        for page in fetch_with_pagination(provider_path):
            for item in page['data']:
                item_attributes = item['attributes']
                kind = item_attributes['kind']
                if kind == 'folder':
                    # A folder exposes its contents through the same kind of
                    # "files" relationship link as a provider does, so it can
                    # be walked by a recursive call.
                    folder_path = item['relationships']['files']['links']['related']['href']
                    files.extend(fetch_OSF_files([folder_path]))
                elif kind == 'file':
                    # NOTE(review): "guid" may be null for files that have not
                    # been assigned a GUID yet -- confirm against the OSF API.
                    files.append(
                        {
                            'id': item_attributes['guid'],
                            'name': item_attributes['name'],
                            'path': item_attributes['materialized_path'],
                        }
                    )
    return files

In [162]:
# Node ID of the SEDLS 2021 OSF project, obtained from the root page of the
# project: https://osf.io/pyscb/
sedls21_osf_id = 'pyscb'
# Fetch the paths for the storage providers in the SEDLS 2021 OSF project
# (this sends requests to the OSF API)
sedls21_providers_paths = fetch_provider_paths(sedls21_osf_id)
# Fetch the files in the SEDLS 2021 OSF project, descending into folders;
# each entry is a dictionary with the keys 'id', 'name' and 'path'
sedls21_files = fetch_OSF_files(sedls21_providers_paths)

In [168]:
# Add the URL of each presentation material to its record, built from the
# file's 'id'. A record without an id gets None instead of a broken
# "https://osf.io/None" link.
for materials in sedls21_files:
    file_id = materials['id']
    materials['url'] = f"https://osf.io/{file_id}" if file_id else None

In [169]:
# Create a pandas DataFrame from the list of file dictionaries (one row per
# file, one column per dictionary key)
sedls21_files_df = pd.DataFrame(sedls21_files)
# Bare expression: displays the DataFrame as the notebook cell's output
sedls21_files_df

Unnamed: 0,id,name,path,url
0,gjwy8,7_Bauder_Data-literacy-as-critical-information...,/Poster presentations/7_Bauder_Data-literacy-a...,https://osf.io/gjwy8
1,pwv2d,1_Smith-and-Davis_Escape-data-horror_Creating-...,/Poster presentations/1_Smith-and-Davis_Escape...,https://osf.io/pwv2d
2,rtz7y,5_Castro-et-al_dLOC-as-data.pdf,/Poster presentations/5_Castro-et-al_dLOC-as-d...,https://osf.io/rtz7y
3,uc5j2,2_Mckenna-Foster_ETData.pdf,/Poster presentations/2_Mckenna-Foster_ETData.pdf,https://osf.io/uc5j2
4,vgfu8,4_Klein-Carlton_iNaturalist-for-data-literacy.pdf,/Poster presentations/4_Klein-Carlton_iNatural...,https://osf.io/vgfu8
5,rkn2w,6-Holsapple_Data-sharing-for-Masters-theses.pdf,/Poster presentations/6-Holsapple_Data-sharing...,https://osf.io/rkn2w
6,bwqfm,9-Kechner_Propel-UVA-library-working-with-care...,/Poster presentations/9-Kechner_Propel-UVA-lib...,https://osf.io/bwqfm
7,z49xu,8_Dolan_Visualizing_Global_Tweet_Activity_for_...,/Poster presentations/8_Dolan_Visualizing_Glob...,https://osf.io/z49xu
8,9wjkn,3_Jones-Parker_Analyzing-UVA-Health-Publicatio...,/Poster presentations/3_Jones-Parker_Analyzing...,https://osf.io/9wjkn
9,n5723,Rockwell - Public Interest Data Literacy (PIDL...,/Presentation slides/Day 2 20211014/Rockwell -...,https://osf.io/n5723


In [170]:
# Save a CSV file for easy assimilation into spreadsheet of program data.
# The file is written relative to the current working directory; with the
# default to_csv arguments the DataFrame index is written as the first,
# unnamed column.
sedls21_files_df.to_csv('sedls21_osf_files.csv')