# 7b.yaml
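
# A config like this is normally consumed by a Composer training script, with ${...}
# values interpolated by OmegaConf and keys overridable from the command line.
# Hypothetical invocation (script name and paths are illustrative, not from this repo):
#   composer train.py yamls/7b.yaml run_name=prune_7b save_folder=out/prune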
data_local: /scratch/gpfs/mengzhou/space2/examples/examples/llm/llama-dedup
data_remote: # If blank, files must be present in data_local
tokenizer_name: /projects/DANQIC/mengzhou/LLaMA2/hf-7B
max_seq_len: 4096
global_seed: 17

# Run Name
run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME
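
# The model block below defines the source network (a 7B LLaMA-2 shape loaded from the
# state_dict.pt checkpoint given in path), and the nested l0_module appears to configure
# L0-regularization-based structured pruning: masks over the granularities listed in
# pruning_modules (attention heads, attention layers, MLP layers, MLP intermediate dims)
# are learned so the network shrinks toward the target_model shape. Durations with a
# "ba" suffix are batch counts in Composer's time notation, so
# lagrangian_warmup_steps: 320ba ramps the sparsity constraint over 320 training batches.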

model:
  name: mosaic_llama2_7b
  path: /projects/DANQIC/mengzhou/LLaMA2/mosaic-7B/state_dict.pt
  init_device: "cpu"
  tokenizer_name: ${tokenizer_name}
  d_model: 4096
  n_heads: 32
  n_layers: 32
  intermediate_size: 11008
  max_seq_len: ${max_seq_len}
  vocab_size: 32000
  init_std: 0.02
  attn_pdrop: 0.0
  resid_pdrop: 0.0
  emb_pdrop: 0.0
  attn_impl: flash
  rms_norm_eps: 1e-5
  l0_module:
    start_sparsity: 0.0
    target_sparsity: 0.5
    pruning_modules: ["head", "head_layer", "mlp", "intermediate"]
    lagrangian_warmup_steps: 320ba
    target_model:
      d_model: 2560
      n_layers: 32
      n_heads: 20
      intermediate_size: 6912
      vocab_size: 32000
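
# Rough size of the target_model shape above (assuming a standard LLaMA layer):
#   per layer: attention 4*d^2 = 4*2560^2 ~ 26.2M, MLP 3*d*intermediate = 3*2560*6912 ~ 53.1M
#   32 layers ~ 2.54B, plus ~0.08B embeddings (32000*2560) and ~0.08B LM head,
#   i.e. roughly a 2.7B-parameter pruned model.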

# Tokenizer
tokenizer:
  type: hftokenizer
  args:
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}

# Dataloaders
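# Both loaders below appear to read pre-tokenized MosaicML streaming shards: local points
# at ${data_local}, remote is optional, and is_uint16: true suggests token ids are stored
# as uint16 (sufficient for a 32000-token vocabulary). ${...} values are interpolated
# from the top-level keys defined at the head of this file.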
train_loader:
  name: text
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: train_small
    shuffle: true
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle_seed: ${global_seed}
    is_uint16: true
  drop_last: true
  num_workers: 8

eval_loader:
  name: text
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: val_small
    shuffle: false
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle_seed: ${global_seed}
    is_uint16: true
  drop_last: false
  num_workers: 8

# Optimization
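# "ba" again counts batches, so the cosine schedule warms up for 100 batches and decays
# to alpha_f * lr. decoupled_adamw is Composer's AdamW variant with weight decay
# decoupled from the learning rate; lag_lr looks like a separate learning rate for the
# Lagrangian multipliers of the pruning constraint rather than a standard AdamW argument.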
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 1e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0
  lag_lr: 1.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 800ba # ~105M tokens (800 batches x 32 sequences x 4096 tokens)
eval_interval: 100ba
eval_subset_num_batches: 1000
global_train_batch_size: 32

# System
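# device_train_microbatch_size is the per-device micro-batch; Composer derives the
# gradient-accumulation factor so that devices x microbatch x accumulation equals
# global_train_batch_size (32 here). amp_bf16 trains in bfloat16 mixed precision.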
seed: ${global_seed}
device_eval_batch_size: 8
device_train_microbatch_size: 8
precision: amp_bf16
# FSDP
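# FULL_SHARD shards parameters, gradients, and optimizer state across ranks
# (ZeRO-3-style); activation_checkpointing trades recomputation for activation memory.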
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: DEFAULT
  activation_checkpointing: true
  activation_cpu_offload: false
  verbose: false

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba
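
# speed_monitor and memory_monitor are standard Composer callbacks; data_loading appears
# to be this repo's dynamic batch-loading callback. With dynamic: false the domain
# mixture stays fixed; update_type: doremi suggests DoReMi-style domain-weight updates
# when it is enabled, with proportion, set_names, and target_loss left unset here.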
callbacks:
  speed_monitor:
    window_size: 10
  memory_monitor: {}
  data_loading:
    dynamic: false
    update_type: doremi
    proportion:
    set_names:
    target_loss:
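
# Runs log to Weights & Biases in offline mode (written under wandb_dir); offline runs
# can typically be uploaded later with the `wandb sync` CLI command.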
loggers:
  wandb:
    project: pruning
    name: ${run_name}
    entity: pruning
    init_kwargs:
      mode: offline
      dir: wandb_dir

# Checkpoint to local filesystem or remote object store
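# save_interval matches max_duration (800ba), so in effect only a final checkpoint is
# written; save_folder here is a local placeholder path rather than an object-store URI.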
save_interval: 800ba
save_folder: save_dir