In [1]:
# Optional setup: uncomment to install the pyDataverse release pinned below
# (presumably the version this notebook was developed against; confirm).
#!pip install -U "pyDataverse==0.2.1"

## Dataverse Cites

In [2]:
import os
import requests
from pyDataverse.api import Api
import pandas as pd
from tqdm import tqdm

In [3]:
def download_file(url, fn, chunk_size=8192, timeout=60):
    """Stream the resource at ``url`` to the local path ``fn``.

    The body is first written to a sibling ``<fn>.part`` file and moved onto
    ``fn`` only once the whole response has been received, so an interrupted
    transfer never leaves a truncated file at the final path.

    Parameters
    ----------
    url : str
        Address fetched with an HTTP GET.
    fn : str or path-like
        Destination path; its parent directory must already exist.
    chunk_size : int, optional
        Number of bytes requested per streamed chunk.
    timeout : float, tuple or None, optional
        Forwarded to ``requests.get``; ``None`` disables the timeout.

    Returns
    -------
    The value passed as ``fn``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    local_filename = fn
    partial_filename = f'{local_filename}.part'
    try:
        # stream=True avoids loading the whole body into memory at once.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
        # Publish the file only after it has been fully written and closed.
        os.replace(partial_filename, local_filename)
    except BaseException:
        # Drop the incomplete temporary file, then let the error propagate.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        raise
    return local_filename

In [12]:
# Read the API token text from token.txt in the working directory so the
# secret itself never appears in the notebook.
with open('token.txt', mode='r') as f:
    contents = f.read()

In [13]:
# Strip surrounding whitespace: a trailing newline in the token file would
# otherwise be embedded in the API key and in the download URLs built from it.
token = contents.strip()

In [14]:
# Set the base URL of your Dataverse instance
base_url = 'https://dataverse.harvard.edu/api'

# Set the Dataverse alias or identifier
dataverse_id = 'jop'

# Construct the API endpoint URL
url = f'{base_url}/dataverses/{dataverse_id}/contents'

# Make a GET request to retrieve the dataset information.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# later as a KeyError on 'data'.
response = requests.get(url, timeout=60)
response.raise_for_status()
dataset_json = response.json()

# One row per item in the dataverse; saved to CSV so the listing can be reused.
df = pd.DataFrame(dataset_json['data'])
df.to_csv("jop_datasets.csv", index = False)
df.head()

Unnamed: 0,id,identifier,persistentUrl,protocol,authority,publisher,publicationDate,storageIdentifier,type
0,66299,DVN/29108,https://doi.org/10.7910/DVN/29108,doi,10.791,Harvard Dataverse,2015-02-13,s3://10.7910/DVN/29108,dataset
1,66300,DVN/29566,https://doi.org/10.7910/DVN/29566,doi,10.791,Harvard Dataverse,2015-03-23,s3://10.7910/DVN/29566,dataset
2,66301,DVN/29446,https://doi.org/10.7910/DVN/29446,doi,10.791,Harvard Dataverse,2015-03-10,s3://10.7910/DVN/29446,dataset
3,66302,DVN/29101,https://doi.org/10.7910/DVN/29101,doi,10.791,Harvard Dataverse,2015-02-12,s3://10.7910/DVN/29101,dataset
4,66303,DVN/29487,https://doi.org/10.7910/DVN/29487,doi,10.791,Harvard Dataverse,2015-03-17,s3://10.7910/DVN/29487,dataset


In [6]:
# Create the pyDataverse client for Harvard Dataverse using the loaded token,
# then show its status attribute as the cell output.
api = Api('https://dataverse.harvard.edu/', api_token=token)
api.status

'OK'

In [8]:
# Collect the R scripts attached to the latest version of every listed dataset.
# Each record keeps the dataset DOI, the datafile id and the filename.
files = []
for persistent_url in tqdm(df['persistentUrl'], total=df.shape[0]):
    # Rows without a DOI URL (e.g. a missing value) cannot be looked up as
    # datasets; skip them instead of failing on .replace().
    if not isinstance(persistent_url, str):
        continue
    doi = persistent_url.replace('https://doi.org/', 'doi:')
    dataset = api.get_dataset(doi)
    # Best effort: datasets that cannot be fetched are left out.
    if dataset.status_code != 200:
        continue
    payload = dataset.json()
    latest = payload['data'].get('latestVersion')
    if not latest:
        continue
    for file in latest.get('files', []):
        fid = file['dataFile']['id']
        fn = file['dataFile']['filename']
        # Match the extension case-insensitively so scripts saved as ".r"
        # are collected as well as ".R".
        if fn.lower().endswith('.r'):
            files.append({'doi': doi, 'fid': fid, 'fn': fn})
            

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 826/826 [08:55<00:00,  1.54it/s]


In [9]:
# Tabulate the collected records (doi, fid, fn); the bare name on the last
# line renders the frame as the cell output.
fdf = pd.DataFrame(data=files)
fdf

Unnamed: 0,doi,fid,fn
0,doi:10.7910/DVN/29566,2547198,Replication1.R
1,doi:10.7910/DVN/29566,2547197,Replication2.R
2,doi:10.7910/DVN/29205,2539124,CCOA regs and graphs 2014.R
3,doi:10.7910/DVN/28930,2531720,Hainmueller_Hall_Snyder_replication.R
4,doi:10.7910/DVN/29314,2543064,analysis_main.R
...,...,...,...
1984,doi:10.7910/DVN/ZA6YEN,7211863,kitagawa_shenbayh_main_analysis_replication.R
1985,doi:10.7910/DVN/7PCO1L,7235222,build_data.R
1986,doi:10.7910/DVN/7PCO1L,7235257,figure_a3_table_a9.R
1987,doi:10.7910/DVN/7PCO1L,7235258,table_a16.R


In [10]:
# Download every collected R script to output/<last DOI segment>/<filename>,
# skipping files that are already present at that destination.
for f in tqdm(files, total=len(files)):
    fid = f['fid']
    fn = f['fn']
    doi = f['doi']
    # NOTE(review): the token is sent in the query string and can end up in
    # logs or tracebacks; consider sending it in a request header instead.
    url = 'https://dataverse.harvard.edu//api/v1/access/datafile/%s?key=%s' % (fid, token)
    path = os.path.join('output', doi.split('/')[-1])
    os.makedirs(path, exist_ok=True)
    lfn = os.path.join(path, fn)
    # Test the destination path (lfn), not the bare filename: checking fn
    # looked in the working directory, so existing downloads were never skipped.
    if not os.path.exists(lfn):
        download_file(url, lfn)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1989/1989 [34:17<00:00,  1.03s/it]
