<a href="https://colab.research.google.com/github/pravsels/DistilLM/blob/main/manim/create_dataset_from_git_repos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %pip installs into the environment of the running kernel; a plain `!pip`
# may resolve to a different Python installation.
%pip install datasets evaluate transformers[sentencepiece] requests
# -y answers apt's confirmation prompt so the cell never waits for input.
!apt install -y git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
import requests
from datasets import load_dataset, Dataset
import os
import subprocess

In [None]:
# GitHub personal access token (PAT). It is read from the GITHUB_TOKEN
# environment variable so the secret is never saved inside the notebook.
GIT_TOKEN = os.environ.get('GITHUB_TOKEN', '')
# Attach the Authorization header only when a token is available; without one
# the requests go out unauthenticated instead of carrying a blank credential.
headers = {'Authorization': f'token {GIT_TOKEN}'} if GIT_TOKEN else {}

# Hugging Face account (namespace) that the datasets are pushed to.
hf_username='pravsels'

In [None]:
# Branch name; it is not referenced by any other cell visible in this notebook.
branch = "main"

# GitHub owner -> repository whose issues get scraped.
# The mapping is keyed by owner, so a second entry for the same owner would
# overwrite the first one; enable at most one repository per owner.
owner_repo_dict = dict(
    [
        # ("3b1b", "manim"),
        # ("ManimCommunity", "manim"),
        ("3b1b", "videos"),
        ("helblazer811", "ManimML"),
        ("Elteoremadebeethoven", "AnimationsWithManim"),
        ("Matheart", "manim-physics"),
        ("brianamedee", "Manim-Tutorials-2021"),
    ]
)

In [None]:
def fetch_issues(
    owner="3b1b",
    repo="videos",
    issue_state='closed',
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    """Download issues of a GitHub repository and store them as JSON Lines.

    Every record has two fields: ``number`` (the issue number) and ``content``
    (``"<title> : <body>"``, with a missing title or body replaced by an empty
    string). The records are written to
    ``{issues_path}/{repo}_{owner}_issues.jsonl``.

    Args:
        owner: GitHub account that owns the repository.
        repo: Repository name.
        issue_state: Value sent as the ``state`` query parameter.
        num_issues: Upper bound on issues to request; it determines how many
            pages of 100 are asked for.
        rate_limit: Once more than this many issues have been buffered, the
            function sleeps for an hour before requesting further pages.
        issues_path: Directory for the output file; created when missing.

    Raises:
        requests.HTTPError: If GitHub answers with an error status (bad
            credentials, rate limiting, unknown repository, ...).
    """
    issues_path = Path(issues_path)
    issues_path.mkdir(parents=True, exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"

    # GitHub's `page` parameter starts at 1 (its documented default), so the
    # loop starts there rather than at 0.
    for page in tqdm(range(1, num_pages + 1)):
        # Only issues in `issue_state` are requested.
        response = requests.get(
            url,
            headers=headers,
            params={"page": page, "per_page": per_page, "state": issue_state},
        )
        # An error response carries a JSON object, not a list of issues; fail
        # here with the HTTP status instead of a TypeError further down.
        response.raise_for_status()
        issues = response.json()

        batch.extend(
            {
                'number': x['number'],
                'content': (x['title'] or "") + ' : ' + (x['body'] or ""),
            }
            for x in issues
        )

        # A short (or empty) page means there is nothing left to fetch.
        if len(issues) < per_page:
            break

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print("Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    output_file = f"{issues_path}/{repo}_{owner}_issues.jsonl"
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(output_file, orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {output_file}"
    )

In [None]:
def get_comments(owner, repo, issue_number):
  """Return the body text of every comment on one GitHub issue.

  Comments are requested 100 per page and all pages are followed, so long
  threads are not cut off after the first page.

  Args:
    owner: GitHub account that owns the repository.
    repo: Repository name.
    issue_number: Number of the issue whose comments are fetched.

  Returns:
    A list with one ``body`` value per comment, in the order GitHub returns them.

  Raises:
    requests.HTTPError: If GitHub answers with an error status. Without this
      check an error payload (a JSON object) would be iterated key by key and
      fail with "string indices must be integers".
  """
  url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/comments"
  per_page = 100
  bodies = []
  page = 1
  while True:
    response = requests.get(
        url, headers=headers, params={"per_page": per_page, "page": page}
    )
    response.raise_for_status()
    comments = response.json()
    bodies.extend(comment['body'] for comment in comments)
    # A page that is not full is the last one.
    if len(comments) < per_page:
      return bodies
    page += 1

In [None]:
# Log in to the Hugging Face Hub from inside the notebook (shows a login widget).
# The push_to_hub calls in the cells below are made after this login.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# For every configured repository: download its issues, attach the comments of
# each issue, and upload the result to the Hugging Face Hub.
for owner, repo in owner_repo_dict.items():
  # fetch issues into df, then load as dataset
  fetch_issues(owner, repo)

  # fetch_issues wrote this file into the current directory (its default path).
  issues_dataset = load_dataset("json",
                                data_files=f"./{repo}_{owner}_issues.jsonl",
                                split="train")

  print('Adding comments from the issues!')
  # adding a comments column (one GitHub API call per issue)
  issues_with_comments = issues_dataset.map(
      lambda x: {'comments': get_comments(owner, repo, x['number'])}
  )

  # Name the target namespace explicitly, the same way the other push_to_hub
  # calls in this notebook do, so the destination does not depend on which
  # account happens to be logged in.
  issues_with_comments.push_to_hub(f"{hf_username}/{repo}_{owner}_issues")

  print(f'Pushed to repo {owner}/{repo} !')

  0%|          | 0/100 [00:00<?, ?it/s]

Downloaded all the issues for videos! Dataset stored at ./videos_3b1b_issues.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed to repo 3b1b/videos


  0%|          | 0/100 [00:00<?, ?it/s]

Downloaded all the issues for manim! Dataset stored at ./manim_ManimCommunity_issues.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3115 [00:00<?, ? examples/s]

TypeError: string indices must be integers

In [None]:
#############################################################################
# code above is to turn issues (including PRs) and comments into HF datasets
#############################################################################

In [None]:
###########################################################
# code below is to turn repos into datasets (NOT DONE YET)
###########################################################

In [None]:
# `owner` and `repo` are normally left behind by the issues loop (its last
# pair). On a fresh kernel where that loop has not run they are undefined and
# this cell used to stop with a NameError, so fall back to 3b1b/manim, the
# repository of the recorded code-dataset upload. Existing values are kept.
owner = globals().get("owner", "3b1b")
repo = globals().get("repo", "manim")

repo_url = f"https://github.com/{owner}/{repo}.git"
local_dir = f"{repo}"

# Clone only when there is no local copy yet; an existing directory is reused.
if not Path(local_dir).exists():
    subprocess.run(["git", "clone", repo_url, local_dir], check=True)
else:
    print(f"Directory '{local_dir}' already exists. Please remove it or choose a different directory.")

NameError: name 'owner' is not defined

In [None]:
def process_file(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return all of it."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Collect one record per source/documentation file found in the cloned repo.
data = []

for root, dirs, files in os.walk(local_dir):
  # Skip git's internal bookkeeping directory; os.walk honours in-place edits
  # of `dirs` and will not descend into removed entries.
  dirs[:] = [d for d in dirs if d != '.git']

  for name in files:
    file_path = Path(root) / name

    if file_path.suffix in ['.py', '.rst', '.md', '.yml']:
      file_content = process_file(file_path)

      # Store the actual path; a bare str() would record an empty string.
      data.append({'file_path': str(file_path), 'content': file_content})


# Explicit columns keep the schema stable even when no file matched.
df = pd.DataFrame(data, columns=['file_path', 'content'])

In [None]:
# Convert the pandas DataFrame of file records into a Hugging Face Dataset.
code_dataset = Dataset.from_pandas(df)

In [None]:
# Display the dataset summary (its features and number of rows).
code_dataset

Dataset({
    features: ['file_path', 'content'],
    num_rows: 123
})

In [None]:
# Upload the code dataset to the Hub as "<hf_username>/<repo>_<owner>_code".
# `repo` and `owner` are not set in this cell; they come from earlier cells.
code_dataset.push_to_hub(f"{hf_username}/{repo}_{owner}_code")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/pravsels/manim_3b1b_code/commit/897f7355500d86965a150dd8c2a04754c4cab6ac', commit_message='Upload dataset', commit_description='', oid='897f7355500d86965a150dd8c2a04754c4cab6ac', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# NOTE(review): this cell repeats the earlier notebook_login cell and then
# pushes the issues dataset under the `hf_username` namespace.
# `issues_with_comments`, `repo` and `owner` are not defined here; they are
# whatever the issues loop last assigned, so only that one repository's dataset
# is pushed, and the cell fails on a kernel where the loop has not run.
from huggingface_hub import notebook_login

notebook_login()
issues_with_comments.push_to_hub(f"{hf_username}/{repo}_{owner}_issues")