-
Notifications
You must be signed in to change notification settings - Fork 0
/
insuranceqa.py
53 lines (34 loc) · 1.41 KB
/
insuranceqa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import requests
import logging
import fire
import sys
import tarfile
import shutil
# Configure the root logger so INFO-level records and above are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# URL template for the question files; the "{}" placeholder is filled in
# with str.format() by the download code.
BASE_URL = (
    "https://github.com/rafaelvp-db/word2vec-get-started/raw/master/"
    "corpus/insuranceqa/questions/{}.questions.txt"
)
def fetch_dataset(split: str = "train", download_path: str = "/tmp", timeout: float = 30.0):
    """Download one split of the InsuranceQA questions file, then exit.

    The file is fetched from ``BASE_URL`` formatted with ``split`` and written
    to ``{download_path}/insuranceqa_{split}.txt``.

    This function always terminates the interpreter via ``sys.exit``: status
    ``0`` on success, ``-1`` after logging the error on any failure.

    Args:
        split: Value substituted into the ``BASE_URL`` template and into the
            output file name.
        download_path: Existing directory the text file is written into.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the process forever.
    """
    try:
        logger.info("Downloading Insurance QA dataset")
        response = requests.get(BASE_URL.format(split), timeout=timeout)
        # Treat 4xx/5xx as failures; otherwise the server's error page would
        # be saved as if it were the dataset and the process would exit 0.
        response.raise_for_status()
        # Explicit UTF-8 so the write does not depend on the platform's
        # locale encoding.
        with open(
            f"{download_path}/insuranceqa_{split}.txt", "w", encoding="utf-8"
        ) as file:
            logger.info(f"Saving InsuranceQA dataset ({split}) to {download_path}...")
            file.write(response.text)
    except Exception as exception:
        logger.error(f"Error: {str(exception)}")
        sys.exit(-1)
    else:
        # Reached only when the download and the write both succeeded.
        sys.exit(0)
def fetch_answers(download_path: str = "/tmp", timeout: float = 30.0):
    """Download the answers archive and unpack it.

    The remote file is streamed to ``{download_path}/answers.tar.gz`` and then
    opened as a gzip-compressed tar archive whose members are extracted into
    the current working directory.

    Args:
        download_path: Existing directory the downloaded archive is saved in.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the process forever.

    Raises:
        Exception: If the server answers with any status other than 200.
    """
    archive_path = f"{download_path}/answers.tar.gz"
    url = "https://github.com/chatopera/insuranceqa-corpus-zh/raw/release/corpus/pool/train.json.gz"

    # Using the response as a context manager guarantees the streamed
    # connection is released, including when the status check or the copy
    # below raises.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception("HTTP Error")
        with open(archive_path, "wb") as file:
            # Have urllib3 undo any Content-Encoding while copying the raw
            # stream, so the bytes on disk are the file itself.
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, file)

    # Decompress
    # NOTE(review): the URL names a ".json.gz" file; confirm it really is a
    # tar archive, otherwise tarfile.open() raises tarfile.ReadError.
    # NOTE(review): members are extracted into the current working directory,
    # not into download_path, and no extraction filter is applied - confirm
    # both are intended.
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall()
if __name__ == "__main__":
    # Expose fetch_answers as the command-line interface; python-fire maps
    # command-line flags onto the function's parameters.
    fire.Fire(component=fetch_answers)