## Get (Relevant) Scripts

In [1]:
#!pip install -U "pyDataverse==0.2.1"
import os
import requests
from pyDataverse.api import Api
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
import logging

In [2]:
# Load the Dataverse API token from a local file that is kept out of the notebook.
# The content is stripped because a trailing newline (which most editors append)
# would otherwise become part of the token and of every URL built from it.
with open('token.txt', 'r') as f:
    token = f.read().strip()

In [3]:
# Build the shared Dataverse client used by the cells below; ending the cell
# on `api.status` displays the connection status reported by the client.
DATAVERSE_BASE_URL = 'https://dataverse.harvard.edu/'
api = Api(DATAVERSE_BASE_URL, token)
api.status

'OK'

In [4]:
def extract_files_info(file_name, df, extensions=('.R', '.py', '.do'), out_dir='files_dfs'):
    """Record the script files attached to each dataset in ``df`` and save them as CSV.

    Every ``persistentUrl`` value has its ``https://doi.org/`` prefix rewritten
    to ``doi:`` and is looked up through the module-level ``api`` client. Files
    of the dataset's ``latestVersion`` whose name ends with one of
    ``extensions`` are collected as rows with the keys ``doi``, ``fid`` and
    ``fn``.

    Args:
        file_name: Stem of the output file ``<out_dir>/<file_name>_files.csv``.
        df: DataFrame with a ``persistentUrl`` column holding string values.
        extensions: Filename suffixes to keep (matched case-sensitively).
        out_dir: Directory the CSV is written to; created if it is missing.

    Returns:
        None. The result is written to disk.

    Note:
        Responses with a status code other than 200, or without a
        ``latestVersion`` entry, are skipped. When no file matches, the
        DataFrame has no columns and the CSV is written without a header;
        ``pd.read_csv`` raises ``EmptyDataError`` on such a file.
    """
    suffixes = tuple(extensions)  # str.endswith only accepts a str or a tuple
    files = []

    # Only the URL column is needed, so iterate over it directly instead of
    # materialising a Series per row with iterrows().
    for url in df['persistentUrl']:
        doi = url.replace('https://doi.org/', 'doi:')
        dataset = api.get_dataset(doi)
        if dataset.status_code != 200:
            continue

        data = dataset.json()['data']
        if 'latestVersion' not in data:
            continue

        for file in data['latestVersion']['files']:
            data_file = file['dataFile']
            fn = data_file['filename']
            if fn.endswith(suffixes):
                files.append({'doi': doi, 'fid': data_file['id'], 'fn': fn})

    # Convert the list of dictionaries to a DataFrame
    files_df = pd.DataFrame(files)

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(out_dir, exist_ok=True)

    # Write the DataFrame to a CSV file
    files_df.to_csv(os.path.join(out_dir, f'{file_name}_files.csv'), index=False)

In [5]:
def read_csv_file(file_path):
    """Load a dataset CSV and return it together with its bare file name.

    Args:
        file_path: Path to the CSV file to read.

    Returns:
        A ``(file_name, df)`` tuple: ``file_name`` is the base name of
        ``file_path`` without its extension, and ``df`` is the file's content
        with every row lacking a ``persistentUrl`` value removed.
    """
    # Strip the directory part first, then the extension
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)

    # Rows without a persistent URL are dropped
    frame = pd.read_csv(file_path).dropna(subset=['persistentUrl'])

    return stem, frame

In [6]:
# Every entry of the "datasets" folder is visited; each CSV found there is
# turned into a script listing by extract_files_info.
datasets_folder = 'datasets'
all_files = os.listdir(datasets_folder)

for file in tqdm(all_files, desc="Processing files", unit="file"):
    # Non-CSV entries still advance the progress bar but are otherwise ignored
    if not file.endswith('.csv'):
        continue
    file_path = os.path.join(datasets_folder, file)
    file_name, datasets_df = read_csv_file(file_path)
    extract_files_info(file_name, datasets_df)

Processing files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 76/76 [1:57:30<00:00, 92.77s/file]


### Download Files

In [7]:
def download_file(url, fn, timeout=60):
    """Stream ``url`` to the local path ``fn`` without leaving partial files behind.

    The body is written to ``<fn>.part`` and only moved to ``fn`` once the
    whole response has been received. A failed transfer therefore never
    leaves a truncated file at ``fn``, which matters for callers that skip
    paths that already exist.

    Args:
        url: Address to download from.
        fn: Destination path of the downloaded file.
        timeout: Value passed to ``requests.get`` as ``timeout`` so that a
            stalled connection cannot block forever.

    Returns:
        ``fn`` on success, ``None`` if anything went wrong (the error is
        logged, not raised).
    """
    local_filename = fn
    partial_path = f"{local_filename}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        # Publish the file under its final name only after a complete transfer
        os.replace(partial_path, local_filename)
        return local_filename
    except Exception as e:
        # Best-effort cleanup of the incomplete temporary file
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        # The query string may carry an API key, so keep it out of the log,
        # including when the exception text repeats the full URL.
        safe_url = url.split('?', 1)[0]
        reason = str(e).replace(url, safe_url)
        logging.error(f"Failed to download file: {safe_url}, Error: {reason}")
        return None

In [8]:
def download_file_wrapper(args):
    """Download one Dataverse file described by a packed argument tuple.

    Args:
        args: ``(fid, fn, doi, token, file_name)``. The file is stored at
            ``scripts/<file_name>/<last '/'-separated part of doi>/<fn>``.

    Returns:
        None. Nothing is downloaded when the target path already exists.
    """
    fid, fn, doi, token, file_name = args

    doi_suffix = doi.split('/')[-1]
    target_dir = os.path.join('scripts', file_name, doi_suffix)
    try:
        os.makedirs(target_dir)
    except FileExistsError:
        # Directory is already there; nothing to create
        pass

    lfn = os.path.join(target_dir, fn)
    if os.path.exists(lfn):
        return

    url = 'https://dataverse.harvard.edu/api/v1/access/datafile/%s?key=%s' % (fid, token)
    download_file(url, lfn)
        
def download_files(file_path, num_workers=4):
    """Download every file listed in a ``*_files.csv`` listing, in parallel.

    The CSV at ``file_path`` must provide the columns ``fid``, ``fn`` and
    ``doi``. One task per row is handed to ``download_file_wrapper`` together
    with the module-level ``token`` and the listing's base name (without
    extension).

    Args:
        file_path: Path to the CSV listing.
        num_workers: Number of concurrent worker threads.

    Returns:
        None.

    Raises:
        pandas.errors.EmptyDataError: Propagated from ``pd.read_csv`` when the
            file has no columns to parse.
    """
    # Threads instead of processes: the work is network-bound, and a process
    # pool needs the worker function to be importable by the child processes,
    # which is not guaranteed for functions defined interactively.
    from multiprocessing.pool import ThreadPool

    file_name = os.path.splitext(os.path.basename(file_path))[0]
    df = pd.read_csv(file_path)
    file_list = [
        (fid, fn, doi, token, file_name)
        for fid, fn, doi in zip(df['fid'], df['fn'], df['doi'])
    ]

    # A header-only listing has nothing to fetch; skip starting a pool
    if not file_list:
        return

    with ThreadPool(num_workers) as pool:
        pool.map(download_file_wrapper, file_list)

In [9]:
# Walk the "files_dfs" folder and download the files named in each CSV listing
dataset_files_folder = 'files_dfs'
all_files = os.listdir(dataset_files_folder)

for file in tqdm(all_files, desc="Processing Files", unit="file"):
    # Non-CSV entries still advance the progress bar but are otherwise ignored
    if not file.endswith('.csv'):
        continue
    file_path = os.path.join(dataset_files_folder, file)
    try:
        download_files(file_path)
    except pd.errors.EmptyDataError:
        # Listings that pandas cannot parse because they are empty are
        # reported and skipped rather than aborting the run
        logging.error(f"Ignoring {file} as it is empty.")

Processing Files:   0%|                                                                                                             | 0/76 [00:00<?, ?file/s]ERROR:root:Ignoring PittJWSR_datasets_files.csv as it is empty.
Processing Files:  11%|██████████▋                                                                                          | 8/76 [00:01<00:09,  7.01file/s]ERROR:root:Ignoring NegotiationJournal_datasets_files.csv as it is empty.
Processing Files:  16%|███████████████▊                                                                                    | 12/76 [00:01<00:08,  7.69file/s]ERROR:root:Ignoring regionalstatistics_datasets_files.csv as it is empty.
ERROR:root:Ignoring joad_datasets_files.csv as it is empty.
Processing Files:  29%|████████████████████████████▉                                                                       | 22/76 [00:02<00:08,  6.43file/s]ERROR:root:Ignoring dib_datasets_files.csv as it is empty.
Processing Files:  39%|██████████████████