<a href="https://colab.research.google.com/github/pravsels/DistilLM/blob/main/manim/create_dataset_from_git_repos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets evaluate transformers[sentencepiece] requests
!apt install git-lfs

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: dill, responses, multiproc

In [2]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
import requests
from datasets import load_dataset, Dataset
import os
import subprocess

In [3]:
# GitHub personal access token (PAT). It is read from the GITHUB_TOKEN environment
# variable so the secret never has to be typed into (and saved with) the notebook.
GIT_TOKEN = os.environ.get('GITHUB_TOKEN', '')

# Only attach the Authorization header when a token is actually available; an empty
# token would otherwise produce the malformed header value "token ".
headers = {'Authorization': f'token {GIT_TOKEN}'} if GIT_TOKEN else {}

# Hugging Face account that owns the datasets pushed by this notebook.
hf_username='pravsels'

In [25]:
branch = "main"

# (owner, repo) pairs of the GitHub repositories that feed the datasets.
_owner_repo_pairs = [
    ("3b1b", "manim"),
    ("ManimCommunity", "manim"),
    ("3b1b", "videos"),
    ("helblazer811", "ManimML"),
    ("Elteoremadebeethoven", "AnimationsWithManim"),
    ("Matheart", "manim-physics"),
    ("brianamedee", "Manim-Tutorials-2021"),  # no closed issues for this repo
]

# Same information in the dict form the rest of the notebook iterates over.
owner_repo_dict_list = [
    {'owner': owner_name, 'repo': repo_name}
    for owner_name, repo_name in _owner_repo_pairs
]

In [5]:
# Authenticate against the Hugging Face Hub; this renders an interactive login
# widget in the notebook. The `push_to_hub` calls further down presumably rely
# on this login — TODO confirm no other credential source is configured.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#############################################################################
# code to turn issues (including PRs) and comments into HF datasets (BEGIN)
#############################################################################

In [26]:
def fetch_issues(
    owner="3b1b",
    repo="videos",
    issue_state='closed',
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    """Download a repository's issues (including PRs) into a JSON Lines file.

    Each stored record has two fields: ``number`` (the issue number) and
    ``content`` (``"<title> : <body>"``, with missing parts replaced by "").
    The file is written to ``{issues_path}/{repo}_{owner}_issues.jsonl``.

    Args:
        owner: GitHub account that owns the repository.
        repo: Repository name.
        issue_state: Value passed as the ``state`` query parameter.
        num_issues: Upper bound on the number of issues to download.
        rate_limit: Currently unused; kept so existing callers keep working.
        issues_path: Directory in which the JSON Lines file is written.

    Raises:
        requests.HTTPError: If GitHub answers with an error status (for example
            bad credentials or an exhausted rate limit).
    """
    issues_path = Path(issues_path)
    issues_path.mkdir(parents=True, exist_ok=True)

    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    # GitHub pagination is 1-indexed; starting at 0 risks fetching the first
    # page twice and storing duplicate issues.
    for page in tqdm(range(1, num_pages + 1)):
        query = f"issues?page={page}&per_page={per_page}&state={issue_state}"
        response = requests.get(
            f"{base_url}/{owner}/{repo}/{query}", headers=headers, timeout=30
        )
        # Fail loudly on an error response instead of trying to iterate over
        # the error payload as if it were a list of issues.
        response.raise_for_status()
        page_issues = response.json()

        all_issues.extend(
            {
                'number': issue['number'],
                'content': (issue['title'] or "") + ' : ' + (issue['body'] or ""),
            }
            for issue in page_issues
        )

        # A short (or empty) page means there is nothing left to fetch, so stop
        # instead of spending the remaining requests on empty pages.
        if len(page_issues) < per_page:
            break

    # Never return more than the caller asked for.
    all_issues = all_issues[:num_issues]

    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}_{owner}_issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}_{owner}_issues.jsonl"
    )

In [27]:
def get_comments(owner, repo, issue_number):
  """Return the bodies of all comments on one issue, in the order GitHub returns them.

  Args:
    owner: GitHub account that owns the repository.
    repo: Repository name.
    issue_number: Number of the issue whose comments are fetched.

  Returns:
    A list with the ``body`` field of every comment.

  Raises:
    requests.HTTPError: If GitHub answers with an error status.
  """
  url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/comments"
  per_page = 100
  bodies = []
  page = 1
  # Walk every page: a single un-paginated request only returns the first page
  # of comments, silently truncating long discussions.
  while True:
    response = requests.get(
        url,
        headers=headers,
        params={'per_page': per_page, 'page': page},
        timeout=30,
    )
    # An error payload is a dict, not a list of comments; surface it clearly.
    response.raise_for_status()
    page_comments = response.json()
    bodies.extend(comment['body'] for comment in page_comments)
    if len(page_comments) < per_page:
      break
    page += 1
  return bodies

In [28]:
# For every configured repository: download its issues, attach the comments of
# each issue, and publish the result as a Hugging Face dataset.
for element in owner_repo_dict_list:
  owner, repo = element['owner'], element['repo']
  # fetch issues into df, then load as dataset
  fetch_issues(owner, repo)

  issues_file = f"./{repo}_{owner}_issues.jsonl"
  # A repo without matching issues yields an empty file, which cannot be loaded
  # as a dataset; skip it instead of aborting the remaining repositories.
  if not Path(issues_file).read_text(encoding='utf-8').strip():
    print(f'No issues found for {owner}/{repo}, skipping.')
    continue

  issues_dataset = load_dataset("json",
                                data_files=issues_file,
                                split="train")

  print('Adding comments from the issues!')
  # adding a comments column (one GitHub API call per issue)
  issues_with_comments = issues_dataset.map(
      lambda x: {'comments': get_comments(owner, repo, x['number'])}
  )

  # Use the explicit namespace, matching the code datasets and the ids that
  # the collation step later loads.
  issues_with_comments.push_to_hub(f"{hf_username}/{repo}_{owner}_issues")

  print(f'Pushed to repo {owner}/{repo} !')

  0%|          | 0/100 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
#############################################################################
# code to turn issues (including PRs) and comments into HF datasets (END)
#############################################################################

In [None]:
###########################################################
# code to turn repos into datasets (BEGIN)
###########################################################

In [29]:
def process_file(file_path):
  """Read a text file and return its content as a string.

  The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
  with U+FFFD instead of raising, so one oddly encoded file cannot abort the
  processing of a whole repository.

  Args:
    file_path: Path of the file to read.

  Returns:
    The decoded file content.
  """
  with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
    return file.read()

In [30]:
# For every configured repository: clone it (unless a local copy is already
# present), collect its source/doc files, and publish them as a dataset.
for element in owner_repo_dict_list:
  owner, repo = element['owner'], element['repo']

  repo_url = f"https://github.com/{owner}/{repo}.git"
  local_dir = f"{repo}_{owner}"

  if not Path(local_dir).exists():
    subprocess.run(["git", "clone", repo_url, local_dir], check=True)
  else:
    # The existing directory is used as-is by the walk below.
    print(f"Directory '{local_dir}' already exists; reusing it without cloning.")

  data = []

  for root, dirs, files in os.walk(local_dir):
    # Prune git's internal directory and visit the rest in sorted order so the
    # resulting dataset rows are reproducible across runs.
    dirs[:] = sorted(d for d in dirs if d != '.git')
    for name in sorted(files):
      file_path = Path(root) / name

      if file_path.suffix in ['.py', '.rst', '.md', '.yml']:
        file_content = process_file(file_path)

        data.append({'file_path': str(file_path), 'content': file_content})

  df = pd.DataFrame(data)
  code_dataset = Dataset.from_pandas(df)

  code_dataset.push_to_hub(f"{hf_username}/{repo}_{owner}_code")

  print(f'Pushed to repo {owner}/{repo} !')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed to repo 3b1b/manim !
Directory 'manim_ManimCommunity' already exists. Please remove it or choose a different directory.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed to repo ManimCommunity/manim !
Directory 'videos_3b1b' already exists. Please remove it or choose a different directory.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed to repo 3b1b/videos !
Directory 'ManimML_helblazer811' already exists. Please remove it or choose a different directory.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed to repo helblazer811/ManimML !
Directory 'AnimationsWithManim_Elteoremadebeethoven' already exists. Please remove it or choose a different directory.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed to repo Elteoremadebeethoven/AnimationsWithManim !
Directory 'manim-physics_Matheart' already exists. Please remove it or choose a different directory.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed to repo Matheart/manim-physics !
Directory 'Manim-Tutorials-2021_brianamedee' already exists. Please remove it or choose a different directory.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushed to repo brianamedee/Manim-Tutorials-2021 !


In [None]:
###########################################################
# code to turn repos into datasets (END)
###########################################################

In [None]:
###########################################################
# code to collate HF datasets into a single pile (BEGIN)
###########################################################

In [51]:
# Extra third-party datasets; currently disabled here.
hf_misc_datasets = [
    # 'mediciresearch/manimation',
    # 'Edoh/manim_python',
]

# "<repo>_<owner>" stems shared by the issue and code dataset ids.
_hub_repo_stems = [
    'manim_3b1b',
    'manim_ManimCommunity',
    'ManimML_helblazer811',
    'videos_3b1b',
    'AnimationsWithManim_Elteoremadebeethoven',
    'manim-physics_Matheart',
]

hf_issues_datasets = [f'pravsels/{stem}_issues' for stem in _hub_repo_stems]

# The code list has one extra repository that has no issues dataset.
hf_code_datasets = [
    f'pravsels/{stem}_code'
    for stem in _hub_repo_stems + ['Manim-Tutorials-2021_brianamedee']
]

In [54]:
def process_dataset(dataset_id, split='train'):
  """Load one Hub dataset and convert its rows into uniform pile entries.

  Args:
    dataset_id: Hub id of the dataset to load.
    split: Dataset split to load.

  Returns:
    A list of dicts, one per row, each with a ``metadata`` dict
    (``dataset_name`` and ``filename``) and a ``text`` string made of the row's
    ``content`` followed by its space-joined ``comments`` (if any).
  """
  dataset = load_dataset(dataset_id, split=split)
  processed_entries = []
  for entry in tqdm(dataset):
    # `or` also covers columns that exist but hold None for this row.
    context = entry.get('content') or ''
    comments = entry.get('comments') or []

    comments_text = ' '.join(c or '' for c in comments) if comments else ' '
    # Combine context and comments
    text = context + ' ' + comments_text

    # The code datasets built in this notebook store the path under
    # 'file_path'; fall back to it so their metadata is not always 'N/A'.
    filename = entry.get('filename') or entry.get('file_path') or 'N/A'

    # Create a dictionary for each entry
    processed_entries.append({
      'metadata': {
        'dataset_name': dataset_id,
        'filename': filename
      },
      'text': text
    })

  return processed_entries

# Flatten the entries of every issue dataset, then every code dataset, into
# one list that forms the start of the pile.
all_processed_entries = [
    pile_entry
    for hub_id in hf_issues_datasets + hf_code_datasets
    for pile_entry in process_dataset(hub_id)
]


  0%|          | 0/1595 [00:00<?, ?it/s]

  0%|          | 0/3115 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/123 [00:00<?, ?it/s]

  0%|          | 0/411 [00:00<?, ?it/s]

  0%|          | 0/148 [00:00<?, ?it/s]

  0%|          | 0/353 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

In [55]:
# Append the 'mediciresearch/manimation' dataset to the pile; its 'text'
# column is taken over as-is.
dataset_id = 'mediciresearch/manimation'
dataset = load_dataset(dataset_id, split='train')

processed_entries = [
    {
        'metadata': {
            'dataset_name': dataset_id,
            'filename': row.get('filename', 'N/A'),
        },
        'text': row.get('text', ''),
    }
    for row in tqdm(dataset)
]
all_processed_entries += processed_entries

  0%|          | 0/1023 [00:00<?, ?it/s]

In [56]:
# Append the 'Edoh/manim_python' dataset to the pile; each row's text is its
# 'instruction' and 'output' joined by ' : '.
dataset_id = 'Edoh/manim_python'
dataset = load_dataset(dataset_id, split='train')

processed_entries = [
    {
        'metadata': {
            'dataset_name': dataset_id,
            'filename': row.get('filename', 'N/A'),
        },
        'text': row.get('instruction', '') + ' : ' + row.get('output', ''),
    }
    for row in tqdm(dataset)
]
all_processed_entries += processed_entries

  0%|          | 0/599 [00:00<?, ?it/s]

In [57]:
# Assemble the unified 'manim_pile' dataset: tabulate the collected entries,
# then wrap the table as a Hugging Face Dataset.
manim_pile_df = pd.DataFrame(all_processed_entries)
manim_pile_dataset = Dataset.from_pandas(manim_pile_df)

In [58]:
# Publish the pile under the configured Hugging Face account.
manim_pile_repo_id = f"{hf_username}/manim_pile"
manim_pile_dataset.push_to_hub(manim_pile_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/391 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/pravsels/manim_pile/commit/a3ce7c9794d7beae50f3ec56f5f8a5528adeb3b5', commit_message='Upload dataset', commit_description='', oid='a3ce7c9794d7beae50f3ec56f5f8a5528adeb3b5', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
###########################################################
# code to collate HF datasets into a single pile (END)
###########################################################