In [1]:
# we will create a dataset for github issues for the dataset project in hf
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)


In [2]:
from dotenv import dotenv_values
__secrets = dotenv_values('../.env')

In [3]:
git_token = __secrets["GITHUB_TOKEN"]

In [4]:
headers = {"Authorization": f"token {git_token}"}

In [40]:
import json
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

def get_data(post_id):
    url = f"https://hacker-news.firebaseio.com/v0/item/{post_id}.json"
    resp = requests.get(url)
    
    if resp.status_code == 200:
        json_data = resp.json()
        return json_data
    
    raise ValueError(f"Cannot find article with post id {post_id}")

# fetching hackernews titles 
def fetch_titles(size=1000, issues_path=Path(".")) :
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    top_posts_resp = requests.get("https://hacker-news.firebaseio.com/v0/topstories.json")
    
    top_titles = []
    if top_posts_resp.status_code == 200:
        top_ids = top_posts_resp.json()[:size]
        
        try:
            for idx in tqdm(range(len(top_ids))):
                _id = top_ids[idx]                
                top_titles.append(get_data(_id))
        except Exception as e:
            print(e)
            pass
    
    print(top_titles)
    df = pd.DataFrame.from_records(top_titles)
    df.to_json(f"{issues_path}/hn_posts.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for hackernews! Dataset stored at {issues_path}/hn_posts.jsonl"
    )            

In [42]:
fetch_titles()

  0%|          | 0/500 [00:00<?, ?it/s]

[{'by': 'oumua_don17', 'descendants': 45, 'id': 40329388, 'kids': [40330406, 40330686, 40330401, 40331976, 40332106, 40331207, 40331570, 40330433, 40330175, 40330460, 40331190, 40329853, 40331467, 40331183, 40331634, 40329921, 40329923, 40331992, 40330563, 40329868, 40329755], 'score': 507, 'time': 1715447768, 'title': 'Immersive Math', 'type': 'story', 'url': 'http://immersivemath.com/ila/index.html'}, {'by': 'smitty1e', 'descendants': 4, 'id': 40331594, 'kids': [40331985, 40332029, 40331956], 'score': 23, 'time': 1715474918, 'title': 'Ancient Egyptian Stone-Drilling (1983)', 'type': 'story', 'url': 'https://www.penn.museum/sites/expedition/ancient-egyptian-stone-drilling/'}, {'by': 'goldenskye', 'descendants': 0, 'id': 40331886, 'score': 20, 'time': 1715479223, 'title': 'Virtualizing the 6502 on a 6502 with 6o6', 'type': 'story', 'url': 'http://oldvcr.blogspot.com/2024/04/virtualizing-6502-with-6o6-and.html'}, {'by': 'roykishony', 'descendants': 0, 'id': 40331850, 'score': 15, 'time'

In [55]:
from datasets import load_dataset
hn_dataset = load_dataset("json", data_files="hn_posts.jsonl", split="train")
hn_dataset

Dataset({
    features: ['by', 'descendants', 'id', 'kids', 'score', 'time', 'title', 'type', 'url', 'text'],
    num_rows: 500
})

In [57]:
# cleaning up the data a bit, eg: removing descendants column and id
hn_dataset = hn_dataset.remove_columns(["kids", "descendants", "time"])

In [58]:
hn_dataset[0]


{'by': 'oumua_don17',
 'id': 40329388,
 'score': 507,
 'title': 'Immersive Math',
 'type': 'story',
 'url': 'http://immersivemath.com/ila/index.html',
 'text': None}

In [63]:
# uploading the dataset to HF
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [64]:
hn_dataset.push_to_hub("hn-posts")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/riaz/hn-posts/commit/ef4fa3cb3729bb8bb11780840dc447472c531a19', commit_message='Upload dataset', commit_description='', oid='ef4fa3cb3729bb8bb11780840dc447472c531a19', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
### Now that we have uploaded our dataset, next we will load it

remote_dataset = load_dataset("riaz/hn_posts", split="train")