Skip to content

Commit

Permalink
feat: Sample script to download movie data from HF datasets
Browse files Browse the repository at this point in the history
Can be used for development purposes
  • Loading branch information
prosto committed Dec 19, 2023
1 parent f801a10 commit ed44127
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 0 deletions.
1 change: 1 addition & 0 deletions data/movies.json

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions scripts/load_movies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import json
from typing import Any, Dict, Set

from datasets import load_dataset

# Number of rows taken from the shuffled, filtered dataset and written out.
NUM_ROWS_TO_SAVE = 1000
# Path of the JSON file the converted movie documents are written to.
DATA_FILE = "data/movies.json"


def valid_examples(example: Dict[str, Any]) -> bool:
    """Return True when a movie row is English and has all required fields.

    A row qualifies when its ``original_language`` equals ``"en"`` and each of
    ``genres``, ``overview``, ``title`` and ``release_date`` is truthy (so
    ``None`` and empty strings are rejected).

    The result is always a real ``bool``. A bare ``and`` chain would instead
    return the last operand evaluated (for example the release-date string, or
    ``None``), which contradicts the declared return type.

    Args:
        example: A single dataset row, accessed by column name.

    Returns:
        ``True`` if the row should be kept, ``False`` otherwise.

    Raises:
        KeyError: If one of the inspected columns is missing from ``example``.
    """
    required_fields = ("genres", "overview", "title", "release_date")
    # Checks short-circuit in the listed order: language first, then fields.
    return example["original_language"] == "en" and all(
        example[field] for field in required_fields
    )


# Ids already accepted by is_unique(); module-level so it persists across calls.
unique_ids: Set[Any] = set()


def is_unique(example: Dict[str, Any]) -> bool:
    """Report whether this row's ``id`` is being seen for the first time.

    The first call for a given id records it in ``unique_ids`` and returns
    ``True``; any later call with the same id returns ``False``.

    Args:
        example: A single dataset row containing an ``id`` entry.

    Returns:
        ``True`` for the first occurrence of an id, ``False`` for repeats.
    """
    movie_id = example["id"]
    first_occurrence = movie_id not in unique_ids
    if first_occurrence:
        unique_ids.add(movie_id)
    return first_occurrence


def convert_to_document(example: Dict[str, Any]) -> Dict[str, Any]:
    """Reshape a movie row into an ``id`` / ``content`` / ``meta`` document.

    Args:
        example: A single dataset row with ``id``, ``overview``, ``title``,
            ``runtime``, ``vote_average``, ``release_date`` and ``genres``.

    Returns:
        A dict whose ``id`` is the stringified row id, whose ``content`` is
        the overview, and whose ``meta`` holds the remaining fields, with
        ``genres`` split on ``"-"`` into a list.
    """
    document_id = str(example["id"])
    overview_text = example["overview"]
    metadata = {
        "title": example["title"],
        "runtime": example["runtime"],
        "vote_average": example["vote_average"],
        "release_date": example["release_date"],
        # The genres column is a single "-"-separated string.
        "genres": example["genres"].split("-"),
    }
    return {"id": document_id, "content": overview_text, "meta": metadata}


# Build the sample one stage at a time: load the train split, keep only the
# valid rows, drop repeated ids, shuffle with a fixed seed, then take the
# first NUM_ROWS_TO_SAVE rows.
raw_movies = load_dataset("wykonos/movies", split="train")
valid_movies = raw_movies.filter(valid_examples)
deduplicated_movies = valid_movies.filter(is_unique)
shuffled_movies = deduplicated_movies.shuffle(seed=42)
movies_dataset = shuffled_movies.select(range(NUM_ROWS_TO_SAVE))

# Convert every selected row into the document layout.
movie_documents = list(map(convert_to_document, movies_dataset))

# Serialize all documents as a single JSON array.
with open(DATA_FILE, "w") as outfile:
    json.dump(movie_documents, outfile)

0 comments on commit ed44127

Please sign in to comment.