### Below are some snapshots of the steps for training and generating from a model using the toy datasets in test/toy_data/. 
Note: To demonstrate how a model learns to fit a particular training set, the same data (the files prefixed with "train") is used for training, validation, and generation. To evaluate a model in a "real" experimental setting, it should instead be applied to held-out data (e.g. the files prefixed with "eval").

#### Infilling

In [2]:
'''Paths for the infilling toy data, the model configuration, and the outputs'''

# Infilling toy data: parallel source/target files that are read line by line
train_src_file, train_tgt_file = (
    "test/toy_data/infilling/train.src",
    "test/toy_data/infilling/train.tgt",
)
# File the generation script writes to (passed as -gen_texts_file)
gen_output_file = "test/toy_data/infilling/train.gen"
# Configuration handed to the training script (passed as -config_file)
model_config_file = "test/test_configs/gpt2_lm_config.json"
# Directory used as -save_dir when training and -model_dir when generating
save_model_dir = "infilling_test_model"



In [3]:
'''Print every training source text next to its target text'''

# Read both files, trimming surrounding whitespace from each line
with open(train_src_file) as src_f, open(train_tgt_file) as tgt_f:
    src_texts = list(map(str.strip, src_f))
    tgt_texts = list(map(str.strip, tgt_f))

# Labels are left-justified in a 15-character column; the trailing "\n"
# on the second row leaves a blank line between pairs
for src, tgt in zip(src_texts, tgt_texts):
    print("{:15s} {}".format("SOURCE:", src),
          "{:15s} {}\n".format("TARGET:", tgt),
          sep="\n")

SOURCE:         Harry Potter Philosopher's Stone
TARGET:         Harry Potter and the Philosopher's Stone

SOURCE:         Harry Potter and Secrets
TARGET:         Harry Potter and the Chamber of Secrets

SOURCE:         Harry the Prisoner of Azkaban
TARGET:         Harry Potter and the Prisoner of Azkaban

SOURCE:         Potter and the Goblet of
TARGET:         Harry Potter and the Goblet of Fire

SOURCE:         Harry Potter the Order the Phoenix
TARGET:         Harry Potter and the Order of the Phoenix

SOURCE:         and the Half-Blood Prince
TARGET:         Harry Potter and the Half-Blood Prince

SOURCE:         Harry Hallows
TARGET:         Harry Potter and the Deathly Hallows



Visualize early training results

In [7]:
'''Train a model to predict target texts from source inputs'''

# IPython expands each {name} below to the value of that Python variable
# before handing the command to the shell.
# The training files are also passed as the eval files, so the validation
# result measures fit to the training set rather than generalization.
# -patience 5 presumably limits early stopping to 5 validation rounds without
# improvement, and -valid_epoch_end presumably validates after every epoch
# (confirm both in train_script.py).
!python train_script.py\
    -train_src_file {train_src_file}\
    -train_tgt_file {train_tgt_file}\
    -eval_src_file {train_src_file}\
    -eval_tgt_file {train_tgt_file}\
    -config_file {model_config_file}\
    -save_dir {save_model_dir}\
    -patience 5\
    -valid_epoch_end


{'save_dir': 'infilling_test_model', 'config_file': 'test/test_configs/gpt2_lm_config.json', 'train_src_file': 'test/toy_data/infilling/train.src', 'train_tgt_file': 'test/toy_data/infilling/train.tgt', 'eval_src_file': 'test/toy_data/infilling/train.src', 'eval_tgt_file': 'test/toy_data/infilling/train.tgt', 'train_ref_file': None, 'eval_ref_file': None, 'max_src_length': 25, 'max_tgt_length': 75, 'load_from_dir': None, 'pg_metrics': [], 'eval_metrics': [], 'batch_size': 32, 'max_epochs': 100, 'learning_rate': 0.001, 'patience': 5, 'dynamic_lr': False, 'warmup_steps': 4000, 'max_grad_norm': 5.0, 'accum_steps': 1, 'log_iterations': 100, 'valid_iterations': 1000, 'valid_epoch_end': True}
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
INFO:texgen.construct:Saved model configuration to infilling_test_model/hparams.json
INFO:texgen.construct:Created LM mode

In [8]:
'''Generate texts for source inputs'''

!python generation_script.py\
    -src_texts_file {train_src_file}\
    -model_dir {save_model_dir}\
    -gen_texts_file {gen_output_file}\
    -infer_method sample\
    -sample_p 0.7\
    -verbose\


{'model_dir': 'infilling_test_model', 'gen_texts_file': 'test/toy_data/infilling/train.gen', 'src_texts_file': 'test/toy_data/infilling/train.src', 'max_decoding_length': 100, 'n_gen_per_src': 1, 'batch_size': 64, 'min_postproc_length': None, 'max_postproc_length': None, 'max_redundancy_rate': None, 'block_repeat': False, 'block_quotes': False, 'block_profanity': False, 'require_paired_punct': False, 'require_eos_punct': False, 'require_src_in_gen': False, 'force_src_in_regen': False, 'max_gen_attempts': 1, 'fallback_to_src': False, 'infer_method': 'sample', 'sample_top_k': 0, 'sample_p': 0.7, 'sample_temperature': 1.0, 'verbose': True}
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
INFO:texgen.construct:Loaded LM model from infilling_test_model/1637107510.10613.pt
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
INFO:texgen.generate:Starting generation round 1, attempt 1...
INFO:texgen.generate:round

In [9]:
'''Print every source text next to the text generated for it'''

# Read both files, trimming surrounding whitespace from each line
with open(train_src_file) as src_f, open(gen_output_file) as gen_f:
    src_texts = list(map(str.strip, src_f))
    gen_texts = list(map(str.strip, gen_f))

# Labels are left-justified in a 15-character column; the trailing "\n"
# on the second row leaves a blank line between pairs
for src, gen in zip(src_texts, gen_texts):
    print("{:15s} {}".format("SOURCE:", src),
          "{:15s} {}\n".format("GENERATED:", gen),
          sep="\n")

SOURCE:         Harry Potter Philosopher's Stone
GENERATED:      and of the and Potter

SOURCE:         Harry Potter and Secrets
GENERATED:      and Potter the the theHarry and theHarry's and the of the

SOURCE:         Harry the Prisoner of Azkaban
GENERATED:      PotterHarry

SOURCE:         Potter and the Goblet of
GENERATED:      the PotterHarryHarry the the the

SOURCE:         Harry Potter the Order the Phoenix
GENERATED:      of the and the of Potter and the of of and and of and of and PotterHarry the

SOURCE:         and the Half-Blood Prince
GENERATED:      

SOURCE:         Harry Hallows
GENERATED:      Potter the theHarry and and of and and the and of PotterHarryHarry of the of and and the and of the of the PotterHarry the the theHarry the and the and



Continue training to improve generation

In [10]:
'''Resume training'''

# Same command as the initial training run, plus -load_from_dir pointing at
# the directory the first run saved to, so training continues from that model
# and keeps saving into the same directory.
# According to the script's own log message, hyperparameter settings are read
# from the loaded model directory and override command-line values.
# -patience is raised from 5 to 50 so training runs longer before early stopping.
!python train_script.py\
    -train_src_file {train_src_file}\
    -train_tgt_file {train_tgt_file}\
    -eval_src_file {train_src_file}\
    -eval_tgt_file {train_tgt_file}\
    -config_file {model_config_file}\
    -save_dir {save_model_dir}\
    -load_from_dir {save_model_dir}\
    -patience 50\
    -valid_epoch_end


{'save_dir': 'infilling_test_model', 'config_file': 'test/test_configs/gpt2_lm_config.json', 'train_src_file': 'test/toy_data/infilling/train.src', 'train_tgt_file': 'test/toy_data/infilling/train.tgt', 'eval_src_file': 'test/toy_data/infilling/train.src', 'eval_tgt_file': 'test/toy_data/infilling/train.tgt', 'train_ref_file': None, 'eval_ref_file': None, 'max_src_length': 25, 'max_tgt_length': 75, 'load_from_dir': 'infilling_test_model', 'pg_metrics': [], 'eval_metrics': [], 'batch_size': 32, 'max_epochs': 100, 'learning_rate': 0.001, 'patience': 50, 'dynamic_lr': False, 'warmup_steps': 4000, 'max_grad_norm': 5.0, 'accum_steps': 1, 'log_iterations': 100, 'valid_iterations': 1000, 'valid_epoch_end': True}
INFO:texgen.train:Loading model configuration from infilling_test_model. All hyperparameter settings will be read from here and will override any settings provided as command-line arguments.
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
Usi

2021-11-16 16:19:28 : Epoch 17, valid result = {Average: 16.470}
[32mINFO 2021-11-16 16:19:28 : [0mPrevious checkpoint 1637108360.819128.pt removed due to `max_to_keep`(=1) limit
[32mINFO 2021-11-16 16:19:30 : [0mCurrent checkpoint saved to infilling_test_model/1637108368.203233.pt
2021-11-16 16:19:35 : Epoch 18, valid result = {Average: 15.319}
[32mINFO 2021-11-16 16:19:35 : [0mPrevious checkpoint 1637108368.203233.pt removed due to `max_to_keep`(=1) limit
[32mINFO 2021-11-16 16:19:39 : [0mCurrent checkpoint saved to infilling_test_model/1637108375.242122.pt
2021-11-16 16:19:46 : Epoch 19, valid result = {Average: 15.402}
[32mINFO 2021-11-16 16:19:46 : [0mEarly stopping patience decrease to 48
2021-11-16 16:19:50 : Epoch 20, valid result = {Average: 12.980}
[32mINFO 2021-11-16 16:19:50 : [0mPrevious checkpoint 1637108375.242122.pt removed due to `max_to_keep`(=1) limit
[32mINFO 2021-11-16 16:19:53 : [0mCurrent checkpoint saved to infilling_test_model/1637108390.387085.pt

[32mINFO 2021-11-16 16:23:11 : [0mCurrent checkpoint saved to infilling_test_model/1637108589.292416.pt
2021-11-16 16:23:15 : Epoch 52, valid result = {Average: 0.099}
[32mINFO 2021-11-16 16:23:15 : [0mPrevious checkpoint 1637108589.292416.pt removed due to `max_to_keep`(=1) limit
[32mINFO 2021-11-16 16:23:17 : [0mCurrent checkpoint saved to infilling_test_model/1637108595.282659.pt
2021-11-16 16:23:21 : Epoch 53, valid result = {Average: 0.251}
[32mINFO 2021-11-16 16:23:21 : [0mEarly stopping patience decrease to 37
2021-11-16 16:23:25 : Epoch 54, valid result = {Average: 0.155}
[32mINFO 2021-11-16 16:23:25 : [0mEarly stopping patience decrease to 36
2021-11-16 16:23:28 : Epoch 55, valid result = {Average: 0.127}
[32mINFO 2021-11-16 16:23:28 : [0mEarly stopping patience decrease to 35
2021-11-16 16:23:32 : Epoch 56, valid result = {Average: 2.236}
[32mINFO 2021-11-16 16:23:32 : [0mEarly stopping patience decrease to 34
2021-11-16 16:23:37 : Epoch 57, valid result = {Aver

In [14]:
'''Generate texts for source inputs'''

!python generation_script.py\
    -src_texts_file {train_src_file}\
    -model_dir {save_model_dir}\
    -gen_texts_file {gen_output_file}\
    -infer_method sample\
    -sample_p 0.7\
    -verbose\


{'model_dir': 'infilling_test_model', 'gen_texts_file': 'test/toy_data/infilling/train.gen', 'src_texts_file': 'test/toy_data/infilling/train.src', 'max_decoding_length': 100, 'n_gen_per_src': 1, 'batch_size': 64, 'min_postproc_length': None, 'max_postproc_length': None, 'max_redundancy_rate': None, 'block_repeat': False, 'block_quotes': False, 'block_profanity': False, 'require_paired_punct': False, 'require_eos_punct': False, 'require_src_in_gen': False, 'force_src_in_regen': False, 'max_gen_attempts': 1, 'fallback_to_src': False, 'infer_method': 'sample', 'sample_top_k': 0, 'sample_p': 0.7, 'sample_temperature': 1.0, 'verbose': True}
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
INFO:texgen.construct:Loaded LM model from infilling_test_model/1637108773.7886982.pt
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
INFO:texgen.generate:Starting generation round 1, attempt 1...
INFO:texgen.generate:rou

In [15]:
'''Print every source text next to the text generated for it'''

# Read both files, trimming surrounding whitespace from each line
with open(train_src_file) as src_f, open(gen_output_file) as gen_f:
    src_texts = list(map(str.strip, src_f))
    gen_texts = list(map(str.strip, gen_f))

# Labels are left-justified in a 15-character column; the trailing "\n"
# on the second row leaves a blank line between pairs
for src, gen in zip(src_texts, gen_texts):
    print("{:15s} {}".format("SOURCE:", src),
          "{:15s} {}\n".format("GENERATED:", gen),
          sep="\n")

SOURCE:         Harry Potter Philosopher's Stone
GENERATED:      Harry Potter and the Philosopher's Stone

SOURCE:         Harry Potter and Secrets
GENERATED:      Harry Potter and the Chamber of Secrets

SOURCE:         Harry the Prisoner of Azkaban
GENERATED:      Harry Potter and the Prisoner of Azkaban

SOURCE:         Potter and the Goblet of
GENERATED:      Harry Potter and the Goblet of Fire

SOURCE:         Harry Potter the Order the Phoenix
GENERATED:      Harry Potter and the Order of the Phoenix

SOURCE:         and the Half-Blood Prince
GENERATED:      Harry Potter and the Half-Blood Prince

SOURCE:         Harry Hallows
GENERATED:      Harry Potter and the Deathly Hallows



##### For fun: demo infilling model already trained on 10K fiction books

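You can download the model loaded below [here](https://drive.google.com/file/d/18E8IT__33bU24Nqws-9amY_obHZ0jVNG/view?usp=sharing). After downloading, set `model_dir` in the next cell to the local directory that contains the model.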

In [16]:
'''Generate texts for source inputs'''

model_dir = "../Documents/insentivize_trained_models/bookcorpus_10K_rand_drop/"

!python generation_script.py\
    -src_texts_file {train_src_file}\
    -model_dir {model_dir}\
    -gen_texts_file {gen_output_file}\
    -infer_method sample\
    -sample_p 0.7\
    -max_gen_attempts 5\
    -require_src_in_gen\
    -verbose\


{'model_dir': '../Documents/insentivize_trained_models/bookcorpus_10K_rand_drop/', 'gen_texts_file': 'test/toy_data/infilling/train.gen', 'src_texts_file': 'test/toy_data/infilling/train.src', 'max_decoding_length': 100, 'n_gen_per_src': 1, 'batch_size': 64, 'min_postproc_length': None, 'max_postproc_length': None, 'max_redundancy_rate': None, 'block_repeat': False, 'block_quotes': False, 'block_profanity': False, 'require_paired_punct': False, 'require_eos_punct': False, 'require_src_in_gen': True, 'force_src_in_regen': False, 'max_gen_attempts': 5, 'fallback_to_src': False, 'infer_method': 'sample', 'sample_top_k': 0, 'sample_p': 0.7, 'sample_temperature': 1.0, 'verbose': True}
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
INFO:texgen.construct:Loaded LM model from ../Documents/insentivize_trained_models/bookcorpus_10K_rand_drop/1607536509.0371842.pt
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.

In [17]:
'''Print every source text next to the text generated for it'''

# Read both files, trimming surrounding whitespace from each line
with open(train_src_file) as src_f, open(gen_output_file) as gen_f:
    src_texts = list(map(str.strip, src_f))
    gen_texts = list(map(str.strip, gen_f))

# Labels are left-justified in a 15-character column; the trailing "\n"
# on the second row leaves a blank line between pairs
for src, gen in zip(src_texts, gen_texts):
    print("{:15s} {}".format("SOURCE:", src),
          "{:15s} {}\n".format("GENERATED:", gen),
          sep="\n")

SOURCE:         Harry Potter Philosopher's Stone
GENERATED:      The Adventures of Harry Potter and the Philosopher's Stone

SOURCE:         Harry Potter and Secrets
GENERATED:      There were some that said 'Harry Potter and Secrets'.

SOURCE:         Harry the Prisoner of Azkaban
GENERATED:      Harry remembered that the Prisoner was now in the care of Azkaban.

SOURCE:         Potter and the Goblet of
GENERATED:      "Potter's gory: true and true, in that it is true that the Goblet of the Dead

SOURCE:         Harry Potter the Order the Phoenix
GENERATED:      The first thing I saw was the back of the door of the residence of Harry Potter and the Order of the Phoenix.

SOURCE:         and the Half-Blood Prince
GENERATED:      She was holding out the green and white stripes, which were also the real face of the half-blood prince.

SOURCE:         Harry Hallows
GENERATED:      Harry I. Glenn Hallows: In his "life of innovation".



#### Completion

In [18]:
# Completion toy data: parallel source/target files that are read line by line
train_src_file, train_tgt_file = (
    "test/toy_data/completion/train.src",
    "test/toy_data/completion/train.tgt",
)
# File the generation script writes to (passed as -gen_texts_file)
gen_output_file = "test/toy_data/completion/train.gen"
# Configuration handed to the training script (passed as -config_file)
model_config_file = "test/test_configs/gpt2_lm_config.json"
# Directory used as -save_dir when training and -model_dir when generating
save_model_dir = "completion_test_model"

In [19]:
'''Print every training source text next to its target text'''

# Read both files, trimming surrounding whitespace from each line
with open(train_src_file) as src_f, open(train_tgt_file) as tgt_f:
    src_texts = list(map(str.strip, src_f))
    tgt_texts = list(map(str.strip, tgt_f))

# Labels are left-justified in a 15-character column; the trailing "\n"
# on the second row leaves a blank line between pairs
for src, tgt in zip(src_texts, tgt_texts):
    print("{:15s} {}".format("SOURCE:", src),
          "{:15s} {}\n".format("TARGET:", tgt),
          sep="\n")

SOURCE:         Harry Potter and the Philosopher's
TARGET:         Stone

SOURCE:         Harry Potter and the Chamber
TARGET:         of Secrets

SOURCE:         Harry Potter and the Prisoner
TARGET:         of Azkaban

SOURCE:         Harry Potter and the Goblet of
TARGET:         Fire

SOURCE:         Harry
TARGET:         Potter and the Order of the Phoenix

SOURCE:         Harry Potter and the Half-Blood
TARGET:         Prince

SOURCE:         Harry Potter and the
TARGET:         Deathly Hallows



In [24]:
'''Train a model to predict target texts from source inputs'''

# The training files are also passed as the eval files, so the validation
# result measures fit to the training set rather than generalization.
# NOTE(review): the saved output of this cell reports 'load_from_dir':
# 'completion_test_model', 'patience': 50 and 'max_epochs': 50, none of which
# match the arguments below, and this cell's execution count (24) is higher
# than that of the cells after it (20, 21). The output appears to come from a
# different invocation; restart the kernel and run all cells to refresh it.
!python train_script.py\
    -train_src_file {train_src_file}\
    -train_tgt_file {train_tgt_file}\
    -eval_src_file {train_src_file}\
    -eval_tgt_file {train_tgt_file}\
    -config_file {model_config_file}\
    -save_dir {save_model_dir}\
    -patience 100\
    -valid_epoch_end


{'save_dir': 'completion_test_model', 'config_file': 'test/test_configs/gpt2_lm_config.json', 'train_src_file': 'test/toy_data/completion/train.src', 'train_tgt_file': 'test/toy_data/completion/train.tgt', 'eval_src_file': 'test/toy_data/completion/train.src', 'eval_tgt_file': 'test/toy_data/completion/train.tgt', 'train_ref_file': None, 'eval_ref_file': None, 'max_src_length': 25, 'max_tgt_length': 75, 'load_from_dir': 'completion_test_model', 'pg_metrics': [], 'eval_metrics': [], 'batch_size': 32, 'max_epochs': 50, 'learning_rate': 0.001, 'patience': 50, 'dynamic_lr': False, 'warmup_steps': 4000, 'max_grad_norm': 5.0, 'accum_steps': 1, 'log_iterations': 100, 'valid_iterations': 1000, 'valid_epoch_end': True}
INFO:texgen.train:Loading model configuration from completion_test_model. All hyperparameter settings will be read from here and will override any settings provided as command-line arguments.
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-smal

2021-11-16 15:52:47 : Epoch 29, valid result = {Average: 0.568}
[32mINFO 2021-11-16 15:52:47 : [0mEarly stopping patience decrease to 25
2021-11-16 15:52:51 : Epoch 30, valid result = {Average: 0.706}
[32mINFO 2021-11-16 15:52:51 : [0mEarly stopping patience decrease to 24
2021-11-16 15:52:55 : Epoch 31, valid result = {Average: 0.854}
[32mINFO 2021-11-16 15:52:55 : [0mEarly stopping patience decrease to 23
2021-11-16 15:52:59 : Epoch 32, valid result = {Average: 0.624}
[32mINFO 2021-11-16 15:52:59 : [0mEarly stopping patience decrease to 22
2021-11-16 15:53:03 : Epoch 33, valid result = {Average: 0.429}
[32mINFO 2021-11-16 15:53:03 : [0mEarly stopping patience decrease to 21
2021-11-16 15:53:07 : Epoch 34, valid result = {Average: 0.172}
[32mINFO 2021-11-16 15:53:07 : [0mPrevious checkpoint 1637106688.171586.pt removed due to `max_to_keep`(=1) limit
[32mINFO 2021-11-16 15:53:09 : [0mCurrent checkpoint saved to completion_test_model/1637106787.693065.pt
2021-11-16 15:53:1

In [20]:
'''Generate texts for source inputs'''

!python generation_script.py\
    -src_texts_file {train_src_file}\
    -model_dir {save_model_dir}\
    -gen_texts_file {gen_output_file}\
    -infer_method sample\
    -sample_p 0.7\
    -verbose\

{'model_dir': 'completion_test_model', 'gen_texts_file': 'test/toy_data/completion/train.gen', 'src_texts_file': 'test/toy_data/completion/train.src', 'max_decoding_length': 100, 'n_gen_per_src': 1, 'batch_size': 64, 'min_postproc_length': None, 'max_postproc_length': None, 'max_redundancy_rate': None, 'block_repeat': False, 'block_quotes': False, 'block_profanity': False, 'require_paired_punct': False, 'require_eos_punct': False, 'require_src_in_gen': False, 'force_src_in_regen': False, 'max_gen_attempts': 1, 'fallback_to_src': False, 'infer_method': 'sample', 'sample_top_k': 0, 'sample_p': 0.7, 'sample_temperature': 1.0, 'verbose': True}
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
INFO:texgen.construct:Loaded LM model from completion_test_model/1637106865.263202.pt
Using cached pre-trained GPT2 checkpoint from /Users/mroemmele/texar_data/GPT2/gpt2-small.
INFO:texgen.generate:Starting generation round 1, attempt 1...
INFO:texgen.generate:

In [21]:
'''Print every source text next to the text generated for it'''

# Read both files, trimming surrounding whitespace from each line
with open(train_src_file) as src_f, open(gen_output_file) as gen_f:
    src_texts = list(map(str.strip, src_f))
    gen_texts = list(map(str.strip, gen_f))

# Labels are left-justified in a 15-character column; the trailing "\n"
# on the second row leaves a blank line between pairs
for src, gen in zip(src_texts, gen_texts):
    print("{:15s} {}".format("SOURCE:", src),
          "{:15s} {}\n".format("GENERATED:", gen),
          sep="\n")

SOURCE:         Harry Potter and the Philosopher's
GENERATED:      Stone

SOURCE:         Harry Potter and the Chamber
GENERATED:      of Secrets

SOURCE:         Harry Potter and the Prisoner
GENERATED:      of Azkaban

SOURCE:         Harry Potter and the Goblet of
GENERATED:      Fire

SOURCE:         Harry
GENERATED:      Potter and the Order of the Phoenix

SOURCE:         Harry Potter and the Half-Blood
GENERATED:      Prince

SOURCE:         Harry Potter and the
GENERATED:      Deathly Hallows

