In [1]:
## Download Zenodo Repo.

In [30]:
import git
import requests
import os
import pandas as pd
import zenodo_get as zget
import tempfile
import zipfile
from zipfile import ZipFile
from urllib.request import urlretrieve

In [10]:
with open('zenodo_token.txt', 'r') as f:
    zenodo_token = f.read()

### Zenodo 'Community'

In [13]:
def search_zenodo(community, access_token, size=10, page=1):
    base_url = 'https://zenodo.org/api/records'

    try:
        # Search for records with the specified title query
        params = {
            'access_token': access_token,
            'communities': community,
            'size': size,
            'page': page
        }

        response = requests.get(base_url, params=params)
        response.raise_for_status()

    except requests.exceptions.RequestException as e:
        print(f"Error searching Zenodo records: {e}")

    return response.json()

def get_all_records(community, access_token, page_size=100, max_pages=None):
    all_records = []
    page_number = 1

    while True:
        result = search_zenodo(community, access_token, size=page_size, page=page_number)

        if not result['hits']['hits']:
            # No more records
            break

        all_records.extend(result['hits']['hits'])
        
        if max_pages and page_number >= max_pages:
            # Stop if reached the maximum specified pages
            break

        page_number += 1

    return all_records

In [15]:
community_name = 'es-replication-repository'
access_token = 'your_access_token_here'
max_pages_to_fetch = None  # Set to None if you want to fetch all pages

# Get all records from the Zenodo community
all_records = get_all_records(community_name, access_token, max_pages=max_pages_to_fetch)

# Normalize the JSON data and create a DataFrame
df = pd.json_normalize(all_records)

In [20]:
communities = ['es-replication-repository', 
               'restud-replication',
               'ej-replication-repository',
               'pemj']

res = []

for community in communities:
    rep_json = get_all_records(community, 
                             zenodo_token, max_pages=max_pages_to_fetch)
    rep_df = pd.json_normalize(rep_json)
    rep_df['community'] = community
    res.append(rep_df)

res_df = pd.concat(res)
res_df

Unnamed: 0,created,modified,id,conceptrecid,doi,conceptdoi,doi_url,title,updated,recid,...,metadata.references,metadata.alternate_identifiers,metadata.version,community,metadata.notes,metadata.journal.issue,metadata.journal.pages,metadata.journal.volume,metadata.dates,metadata.method
0,2023-11-17T15:41:56.526774+00:00,2023-11-17T15:41:56.887045+00:00,10145562,10145561,10.5281/zenodo.10145562,10.5281/zenodo.10145561,https://doi.org/10.5281/zenodo.10145562,"Replication Package for ""A Demand Curve For Di...",2023-11-17T15:41:56.887045+00:00,10145562,...,,,,es-replication-repository,,,,,,
1,2023-10-23T16:30:52.508471+00:00,2023-10-23T16:30:53.536996+00:00,10034618,10034617,10.5281/zenodo.10034618,10.5281/zenodo.10034617,https://doi.org/10.5281/zenodo.10034618,"Replication package for: ""Production and Learn...",2023-10-23T16:30:53.536996+00:00,10034618,...,,,,es-replication-repository,,,,,,
2,2023-10-17T15:47:53.560458+00:00,2023-10-17T15:47:53.859869+00:00,10012820,10012819,10.5281/zenodo.10012820,10.5281/zenodo.10012819,https://doi.org/10.5281/zenodo.10012820,"Replication package for: ""Drilling Deadlines a...",2023-10-17T15:47:53.859869+00:00,10012820,...,,,,es-replication-repository,,,,,,
3,2023-10-09T22:00:51.659856+00:00,2023-10-10T12:50:15.067969+00:00,8423395,8423394,10.5281/zenodo.8423395,10.5281/zenodo.8423394,https://doi.org/10.5281/zenodo.8423395,"Replication package for: ""Same Root Different ...",2023-10-10T12:50:15.067969+00:00,8423395,...,,,,es-replication-repository,,,,,,
4,2023-10-09T19:02:32.581978+00:00,2023-10-10T02:27:07.038243+00:00,8422960,8422959,10.5281/zenodo.8422960,10.5281/zenodo.8422959,https://doi.org/10.5281/zenodo.8422960,"Replication package for: ""A Robust Permutation...",2023-10-10T02:27:07.038243+00:00,8422960,...,,,,es-replication-repository,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,2022-05-06T04:40:30.445208+00:00,2022-05-06T13:49:09.394070+00:00,6523118,6523117,10.5281/zenodo.6523118,10.5281/zenodo.6523117,https://doi.org/10.5281/zenodo.6523118,Library Resources Utilization and Students' Sa...,2022-05-06T13:49:09.394070+00:00,6523118,...,,,,pemj,,,,,,
1427,2022-05-06T03:29:35.470085+00:00,2022-05-06T13:49:38.888730+00:00,6523050,6523049,10.5281/zenodo.6523050,10.5281/zenodo.6523049,https://doi.org/10.5281/zenodo.6523050,"Academic Performance, Personality Types, and S...",2022-05-06T13:49:38.888730+00:00,6523050,...,,,,pemj,,,,,,
1428,2022-05-06T02:57:59.320267+00:00,2022-05-06T13:49:25.754767+00:00,6523038,6523037,10.5281/zenodo.6523038,10.5281/zenodo.6523037,https://doi.org/10.5281/zenodo.6523038,The Learners' Learning Strategies in the Acqui...,2022-05-06T13:49:25.754767+00:00,6523038,...,,,,pemj,,,,,,
1429,2022-05-06T02:39:45.217354+00:00,2022-05-06T13:49:09.336458+00:00,6523025,6523024,10.5281/zenodo.6523025,10.5281/zenodo.6523024,https://doi.org/10.5281/zenodo.6523025,Hospitality Workers' Interaction with Multinat...,2022-05-06T13:49:09.336458+00:00,6523025,...,,,,pemj,,,,,,


In [27]:
## Dowload the data (no other recourse)

for i in res_df['id'][0:10]:
    urlretrieve(f"https://zenodo.org/api/records/{i}/files-archive", f"temp/{i}.zip")

In [73]:
def unzip_folder(zip_file_path, valid_extensions):
    extraction_folder = os.path.splitext(zip_file_path)[0] + "_extracted"
    os.makedirs(extraction_folder, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extraction_folder)

    process_nested_zip_files(extraction_folder, valid_extensions)

def process_nested_zip_files(folder_path, valid_extensions):
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.zip'):
                nested_zip_file_path = os.path.join(root, file)
                unzip_folder(nested_zip_file_path, valid_extensions)

    # Remove non-matching files in the current folder
    remove_non_matching_files(folder_path, valid_extensions)

def remove_non_matching_files(folder_path, valid_extensions):
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if not file.endswith(tuple(valid_extensions)):
                file_path = os.path.join(root, file)
                os.remove(file_path)

In [74]:
valid_extensions = ['.R', '.py', '.do', '.Rscript', '.ipynb']
folder_path = "temp/"

zip_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.zip')]

for zip_file in zip_files[0:5]:
    unzip_folder(zip_file, valid_extensions)

In [57]:
zip_files

['temp/8335264.zip',
 'temp/8199884.zip',
 'temp/8336416.zip',
 'temp/8423395.zip',
 'temp/8322609.zip',
 'temp/10034618.zip',
 'temp/10145562.zip',
 'temp/8326559.zip',
 'temp/10012820.zip',
 'temp/8422960.zip']