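This notebook follows the example from the [Semantic Scholar (SS) API tutorial](https://www.semanticscholar.org/product/api/tutorial#retrieving-releases-datasets-and-download-links): it lists the available dataset releases, inspects the datasets in one release, and downloads files from the `papers` dataset.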

In [1]:
import json
import os

import requests
from dotenv import load_dotenv

In [2]:
# Base URL of the Semantic Scholar Datasets API release endpoint.
# Requested as-is it returns the list of releases; later cells append a
# release ID (and a dataset path) to address a specific release.
base_url = "https://api.semanticscholar.org/datasets/v1/release/"

In [3]:
# Load environment variables through python-dotenv, then build the request
# headers that carry the API key read from the S2_API_KEY variable.
load_dotenv(verbose=True)
api_key = os.environ.get("S2_API_KEY")
headers = {"x-api-key": api_key}

In [4]:
# Fetch the list of available release IDs from the base endpoint.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces an HTTP error here rather than as a confusing
# indexing failure in a later cell.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
data = response.json()

In [5]:
# Show the release IDs returned by the API (a list of date strings).
data

['2022-05-10',
 '2022-05-17',
 '2022-05-24',
 '2022-05-31',
 '2022-06-07',
 '2022-06-14',
 '2022-06-21',
 '2022-06-28',
 '2022-07-05',
 '2022-07-19',
 '2022-07-28',
 '2022-08-02',
 '2022-08-09',
 '2022-08-16',
 '2022-08-23',
 '2022-08-30',
 '2022-09-06',
 '2022-09-13',
 '2022-09-28',
 '2022-10-05',
 '2022-10-28',
 '2022-11-02',
 '2022-11-11',
 '2022-11-15',
 '2022-11-22',
 '2022-12-02',
 '2022-12-06',
 '2022-12-13',
 '2022-12-20',
 '2022-12-27',
 '2023-01-03',
 '2023-01-10',
 '2023-01-17',
 '2023-01-24',
 '2023-01-31',
 '2023-02-07',
 '2023-02-14',
 '2023-02-21',
 '2023-02-28',
 '2023-03-07',
 '2023-03-14',
 '2023-03-21',
 '2023-03-28',
 '2023-04-06',
 '2023-04-11',
 '2023-04-18',
 '2023-05-09',
 '2023-05-16',
 '2023-05-23',
 '2023-05-30',
 '2023-06-06',
 '2023-06-13',
 '2023-06-20',
 '2023-07-04',
 '2023-07-11',
 '2023-07-25',
 '2023-08-01',
 '2023-08-08',
 '2023-08-15',
 '2023-08-29',
 '2023-09-05',
 '2023-09-12',
 '2023-09-19',
 '2023-09-26',
 '2023-10-10',
 '2023-10-19',
 '2023-10-

In [6]:
# Select the second-to-last entry of the release list.
# NOTE(review): presumably chosen over data[-1] (the last entry) on purpose — confirm.
release_id = data[-2]

In [7]:
# Show which release was selected.
release_id

'2024-05-07'

In [8]:
# Ask the API for the metadata of the selected release.
release_url = base_url + release_id
datasets_response = requests.get(release_url)

In [9]:
# Check that the release request succeeded (200 means OK).
datasets_response.status_code

200

In [10]:
# The release response is a JSON object; its 'datasets' entry holds one
# record per dataset (name, description, README, ...).
release_metadata = datasets_response.json()
datasets = release_metadata['datasets']
datasets

[{'name': 'abstracts',
  'description': 'Paper abstract text, where available.\n100M records in 30 1.8GB files.',
  'README': 'Semantic Scholar Academic Graph Datasets\n\nThe "abstracts" dataset provides abstract text for selected papers.\n\nSCHEMA\n - openAccessInfo\n   - externalIds: IDs of this paper in different catalogs\n   - license/url/status: open-access information provided by Unpaywall, linked by DOI or PubMed Central ID\n\nLICENSE\nThis collection is licensed under ODC-BY. (https://opendatacommons.org/licenses/by/1.0/)\n\nBy downloading this data you acknowledge that you have read and agreed to all the terms in this license.\n\nATTRIBUTION\nWhen using this data in a product or service, or including data in a redistribution, please cite the following paper:\n\nBibTex format:\n@misc{https://doi.org/10.48550/arxiv.2301.10140,\n  title = {The Semantic Scholar Open Data Platform},\n  author = {Kinney, Rodney and Anastasiades, Chloe and Authur, Russell and Beltagy, Iz and Bragg, J

In [11]:
# Save each dataset's metadata record to "<name>.json" in the current working
# directory so it can be read outside the notebook.
# json.dump writes straight to the file instead of building the whole string first;
# the encoding is given explicitly so the result does not depend on the platform default.
for dataset in datasets:
    with open(f"{dataset['name']}.json", "w", encoding="utf-8") as jfile:
        json.dump(dataset, jfile, indent=4)

In [12]:
# Request the download links for one dataset of the selected release.
# This call sends the API-key headers, unlike the earlier requests.
dataset_name = 'papers'
dataset_url = base_url + release_id + '/dataset/' + dataset_name
download_links_response = requests.get(dataset_url, headers=headers)


In [13]:
# Check that the download-links request succeeded (200 means OK).
download_links_response.status_code

200

In [14]:
# The response is a JSON object; its "files" entry is the list of download URLs.
links_payload = download_links_response.json()
download_links = links_payload["files"]

In [15]:
# Show all download URLs.
# NOTE(review): the rendered URLs include AWSAccessKeyId, Signature and
# x-amz-security-token query parameters — consider clearing this output
# before sharing or committing the notebook.
download_links

['https://ai2-s2ag.s3.amazonaws.com/staging/2024-05-07/papers/20240510_070337_00047_jyg5a_00b0e403-bf1c-4ba4-b317-2caae124ce41.gz?AWSAccessKeyId=ASIA5BJLZJPW4UF4GP6W&Signature=Gpoc3a5wGuzoANyQUk7qYDFMneE%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEMv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDqZcQP4TILyiYRs1V1CImRwES3cpj6drMg%2F3s4lCTWQQIhAMnAxPNIkaM%2BdD0ZQXUilUcy6%2FqGaRxI0HWYLbwDlcK5Kv8DCEQQABoMODk2MTI5Mzg3NTAxIgz%2BwS6P63UIxRFv3Ecq3AN6MhLvPlBleb%2BfBb8ofACuGFO8jlNXdt0hawwFHt8UbwykB%2BScONIe1F5w0dFRkeuT0sebJyPu2DBvWc6HOhoGPJ1uvVmKBwvNEoujqLN9zhRzLb8WNvrrTcPYqTJkAeNFBVFqMui2y0gf1YWzPkpnhwTJGuVxPDfP382BNipiSmJqEJClRM0BK2iEk4m6EXWjjbDLxWGOqu3RChWuha9pe%2FxzhQcxPl0mZE3Cv5aQYoBT4YqCg9PpYhGhgDbRVc2JGT0PLTKBrMACyQAKc3Za1nj0nCTxS81qaIAc5hzCCLTfhEfEKRzh%2FwdD4Hm0KcaAawWtqz1CqMkAFywSYU1Bdh%2BBLfO6zNj5kyPD0iiskZiVmBQ9%2B0g8nE7En2nT%2FI%2BLrA09SAR6WjwVkjoRgU8oEA0%2BegJKMU%2BbbTgBMo%2FA5hEDMrrB5RIeabpIMtJ4vdxzEVW96%2FNe7nYSuRcpS7bMXuUUEOAeUbQskkIP311C0HA8hl1LG3TOLVPZtn3GcEkZtlZ0TalBoQyu

In [16]:
# Show the first download URL on its own.
download_links[0]

'https://ai2-s2ag.s3.amazonaws.com/staging/2024-05-07/papers/20240510_070337_00047_jyg5a_00b0e403-bf1c-4ba4-b317-2caae124ce41.gz?AWSAccessKeyId=ASIA5BJLZJPW4UF4GP6W&Signature=Gpoc3a5wGuzoANyQUk7qYDFMneE%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEMv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDqZcQP4TILyiYRs1V1CImRwES3cpj6drMg%2F3s4lCTWQQIhAMnAxPNIkaM%2BdD0ZQXUilUcy6%2FqGaRxI0HWYLbwDlcK5Kv8DCEQQABoMODk2MTI5Mzg3NTAxIgz%2BwS6P63UIxRFv3Ecq3AN6MhLvPlBleb%2BfBb8ofACuGFO8jlNXdt0hawwFHt8UbwykB%2BScONIe1F5w0dFRkeuT0sebJyPu2DBvWc6HOhoGPJ1uvVmKBwvNEoujqLN9zhRzLb8WNvrrTcPYqTJkAeNFBVFqMui2y0gf1YWzPkpnhwTJGuVxPDfP382BNipiSmJqEJClRM0BK2iEk4m6EXWjjbDLxWGOqu3RChWuha9pe%2FxzhQcxPl0mZE3Cv5aQYoBT4YqCg9PpYhGhgDbRVc2JGT0PLTKBrMACyQAKc3Za1nj0nCTxS81qaIAc5hzCCLTfhEfEKRzh%2FwdD4Hm0KcaAawWtqz1CqMkAFywSYU1Bdh%2BBLfO6zNj5kyPD0iiskZiVmBQ9%2B0g8nE7En2nT%2FI%2BLrA09SAR6WjwVkjoRgU8oEA0%2BegJKMU%2BbbTgBMo%2FA5hEDMrrB5RIeabpIMtJ4vdxzEVW96%2FNe7nYSuRcpS7bMXuUUEOAeUbQskkIP311C0HA8hl1LG3TOLVPZtn3GcEkZtlZ0TalBoQyuW

In [17]:
# Download the dataset files one by one, streaming each to disk.
#
# NOTE(review): links are taken from index 3 onwards, but the output files are
# numbered from 2 (link 3 -> file2.gz, link 4 -> file3.gz, ...). That mapping is
# kept exactly as it was; confirm the offset is intentional (it looks like a
# manual resume of an earlier run).
FIRST_LINK_INDEX = 3
FIRST_FILE_NUMBER = 2
CORPUS_DIR = "/home/romilly/corpus202404"
CHUNK_SIZE = 1024 * 1024  # stream in 1 MiB pieces instead of holding a whole file in memory

os.makedirs(CORPUS_DIR, exist_ok=True)

for file_number, link in enumerate(download_links[FIRST_LINK_INDEX:], start=FIRST_FILE_NUMBER):
    target_path = f"{CORPUS_DIR}/file{file_number}.gz"
    # Write to a temporary name first so an interrupted transfer never leaves a
    # truncated file under the final name.
    partial_path = target_path + ".part"
    # A separate name is used for the download response so the earlier
    # `response` variable is not overwritten.
    with requests.get(link, stream=True, timeout=60) as download_response:
        if download_response.status_code != 200:
            # Stop at the first failed link, as before, but say why.
            print(f"Stopping: HTTP {download_response.status_code} while fetching file{file_number}.gz")
            break
        with open(partial_path, "wb") as pf:
            for chunk in download_response.iter_content(chunk_size=CHUNK_SIZE):
                pf.write(chunk)
    os.replace(partial_path, target_path)
    