import json
from pathlib import Path
from typing import Any, Iterator, List, Optional

from dataset_reader.base_reader import BaseReader, Record, Query


# Names of the JSON-lines files JSONReader looks for inside a dataset directory.
# The vectors and queries files are opened unconditionally; the payloads and
# neighbours files are only read when they exist.
VECTORS_FILE = 'vectors.jsonl'
PAYLOADS_FILE = 'payloads.jsonl'
QUERIES_FILE = 'queries.jsonl'
NEIGHBOURS_FILE = 'neighbours.jsonl'


class JSONReader(BaseReader):
    """Read a dataset stored as JSON-lines files inside a single directory.

    ``path`` is joined with the module-level file-name constants.
    ``VECTORS_FILE`` and ``QUERIES_FILE`` are opened unconditionally, while
    ``PAYLOADS_FILE`` and ``NEIGHBOURS_FILE`` are optional: when one of them
    is missing, the matching reader yields placeholder values instead.
    """

    def __init__(self, path: Path):
        """Remember the dataset directory.

        Args:
            path: Directory holding the ``*.jsonl`` files. It is coerced to
                ``Path`` so a plain string also works with the ``/`` joins
                used by the readers.
        """
        self.path = Path(path)

    def _read_jsonl(self, file_name: str) -> Iterator[Any]:
        """Yield one decoded JSON value per non-blank line of ``path / file_name``."""
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(self.path / file_name, "r", encoding="utf-8") as json_fp:
            for json_line in json_fp:
                # A blank (e.g. trailing) line is not a record, and
                # json.loads would raise on it.
                if json_line.strip():
                    yield json.loads(json_line)

    def read_payloads(self) -> Iterator[dict]:
        """Yield the payload stored on each line of ``PAYLOADS_FILE``.

        When the file does not exist, this yields a fresh empty dict forever,
        so it must be paired with a finite iterator (as ``read_data`` does
        through ``zip``).
        """
        if (self.path / PAYLOADS_FILE).exists():
            yield from self._read_jsonl(PAYLOADS_FILE)
            return
        while True:
            yield {}

    def read_vectors(self) -> Iterator[List[float]]:
        """Yield the value decoded from each line of ``VECTORS_FILE``."""
        yield from self._read_jsonl(VECTORS_FILE)

    def read_neighbours(self) -> Iterator[Optional[List[int]]]:
        """Yield the value decoded from each line of ``NEIGHBOURS_FILE``.

        When the file does not exist, this yields ``None`` forever, so it must
        be paired with a finite iterator (as ``read_queries`` does through
        ``zip``).
        """
        if (self.path / NEIGHBOURS_FILE).exists():
            yield from self._read_jsonl(NEIGHBOURS_FILE)
            return
        while True:
            yield None

    def read_query_vectors(self) -> Iterator[List[float]]:
        """Yield the value decoded from each line of ``QUERIES_FILE``."""
        yield from self._read_jsonl(QUERIES_FILE)

    def read_queries(self) -> Iterator[Query]:
        """Yield one ``Query`` per query vector, paired with its neighbours.

        ``expected_result`` is ``None`` for every query when the neighbours
        file is absent. ``meta_conditions`` is always ``None`` for now.
        """
        # NOTE: zip stops at the shorter input, so a neighbours file with
        # fewer lines than the queries file silently truncates the output.
        for vector, neighbours in zip(self.read_query_vectors(), self.read_neighbours()):
            # ToDo: add meta_conditions
            yield Query(vector=vector, meta_conditions=None, expected_result=neighbours)

    def read_data(self) -> Iterator[Record]:
        """Yield one ``Record`` per vector; ``id`` is the 0-based line index.

        ``metadata`` is the payload at the same position, or an empty dict
        when the payloads file is absent.
        """
        # NOTE: zip stops at the shorter input, so a payloads file with fewer
        # lines than the vectors file silently truncates the output.
        for idx, (vector, payload) in enumerate(zip(self.read_vectors(), self.read_payloads())):
            yield Record(id=idx, vector=vector, metadata=payload)


if __name__ == "__main__":
    # Manual smoke test: print the first record of the "random-100" dataset.
    from benchmark import DATASETS_DIR

    test_path = DATASETS_DIR / "random-100"
    record = next(JSONReader(test_path).read_data())
    print(record, end="\n\n")
|
0 commit comments