# question_answering.py
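"""Fine-tune a DistilBERT encoder for extractive question answering on SQuAD."""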
import logging
import os
from pathlib import Path
from datetime import datetime
from transformers import TrainingArguments
from datasets import DatasetDict
from nlp_training.encoder.train_question_answering import train_encoder
from nlp_datasets.preprocessing.common_utils import filter_texts_over_tokens_threshold
from nlp_datasets.get_dataset import get_squad_dataset
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s")
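# Each run writes to its own timestamped results directory.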
date = datetime.now().strftime("%H-%M-%S-%d-%m-%Y")
results_dir = Path(__file__).parent.parent.parent / "results" / \
    "question_answering" / "train" / f"exp_{date}"
print(f"Results directory: {results_dir}")
### START CONFIGURATION PARAMETERS ###
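# Hugging Face Trainer hyperparameters. Float values below 1 for eval_steps and
# save_steps are interpreted as a fraction of total training steps, so the model
# is evaluated and checkpointed every 1% of training.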
training_arguments = TrainingArguments(
    output_dir=str(results_dir),  # TrainingArguments expects a string path
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=0.0001,
    lr_scheduler_type="linear",
    optim="adamw_torch",
    eval_accumulation_steps=1,
    evaluation_strategy="steps",
    eval_steps=0.01,
    save_strategy="steps",
    save_steps=0.01,
    logging_strategy="steps",
    logging_steps=1,
    report_to="tensorboard",
    do_train=True,
    do_eval=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=True,
    dataloader_drop_last=False,
    fp16=False,
    bf16=False
)
base_model_name_or_path: str = "distilbert-base-uncased"  # Hugging Face Hub model ID
max_tokens_length: int = 512  # DistilBERT's maximum input sequence length
### END CONFIGURATION PARAMETERS ###
### START DATASET ###
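# Load SQuAD as a DatasetDict; only the train and validation splits are used.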
dataset: DatasetDict = get_squad_dataset()
del dataset["test"]  # the test split is not used during training
context_feat: str = "context"  # dataset column holding the passage text
# PREPROCESS DATASET: drop examples whose context exceeds the model's token limit
for split in dataset.keys():
    print(f"Preprocessing {split} split...")
    dataset[split] = filter_texts_over_tokens_threshold(dataset=dataset[split],
                                                        text_feat=context_feat,
                                                        tokenizer_name=base_model_name_or_path,
                                                        tokens_threshold=max_tokens_length)
# START TRAINING
print(f"Dataset:\n{dataset}")
print(f"Training {base_model_name_or_path} model...")
os.makedirs(results_dir, exist_ok=True)  # ensure the output directory exists before training
train_encoder(base_model_name_or_path=base_model_name_or_path,
              train_dataset=dataset["train"],
              eval_dataset=dataset["validation"],
              max_tokens_length=max_tokens_length,
              training_args=training_arguments)
print("Training succesfully completed")