# Custom data

In [3]:
import re
from sklearn.model_selection import train_test_split

# Script parses RAW_INPUT_PATH, which is a dump from dynalist, and outputs
# two files: OUTPUT_TRAIN_PATH, OUTPUT_TEST_PATH, which are preprocessed sentences, split into train and test
RAW_INPUT_PATH = 'research_dump.txt'
TRAIN_PATH = 'train.txt'
TEST_PATH = 'test.txt'
MODEL_NAME = 'gpt2'

"""
@param sentence: string.
@return bool. Whether or not the sentence should be included as part of training.
"""
def is_valid_sentence(sentence):
    return len(sentence) > 10

"""
@param sentence: string.
@return string. The preprocessed sentence.
"""
def preprocess_sentence(sentence):
    return re.sub(r"\*\*", "", sentence).strip()

sentences = []
with open(RAW_INPUT_PATH, encoding='utf-8') as f:
    for sentence in f:
        sentence = preprocess_sentence(sentence)
        if not is_valid_sentence(sentence):
            continue
        sentences.append(sentence)

train_sentences, test_sentences = train_test_split(sentences, test_size=0.10)

with open(TRAIN_PATH, "w", encoding='utf-8') as outfile:
    outfile.write("\n".join(train_sentences))
    
with open(TEST_PATH, "w", encoding='utf-8') as outfile:
    outfile.write("\n".join(test_sentences))

In [4]:
# 15K sentences
print(len(sentences))

15088


# Fine tuning

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [6]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=4)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=4)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator
# TODO: What is data collator?
train_dataset, test_dataset, data_collator = load_dataset(TEST_PATH, TEST_PATH, tokenizer)

In [7]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

MODEL_OUTPUT_DIR = "./finetuned-gpt2"
model = AutoModelWithLMHead.from_pretrained(MODEL_NAME)

training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=1, # batch size for training
    per_device_eval_batch_size=1,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    prediction_loss_only=True,
)



# Train the model

In [8]:
trainer.train()

Step,Training Loss
500,6.203875
1000,6.108259
1500,6.00191
2000,5.939924
2500,6.023035
3000,5.87452
3500,5.874719
4000,5.739141
4500,5.800875
5000,5.816199


TrainOutput(global_step=23325, training_loss=4.615244172025723)

In [9]:
trainer.save_model()

# Text generation demo

In [10]:
from transformers import pipeline

MODEL_OUTPUT_DIR = "./finetuned-gpt2"
# TODO: Set [model, tokenizer]
# TODO: Set config. See https://huggingface.co/transformers/main_classes/configuration.html#transformers.PretrainedConfig
# max_length, min_length, temperature, do_sample=True, top_k = 50, repetition_penalty, num_return_sequences
config = {
    'max_length': 800,
    'num_return_sequences': 3
}
generator = pipeline('text-generation',model=MODEL_OUTPUT_DIR, tokenizer='gpt2',config=config)


[]

In [34]:
generator("The number of neurons in C elegans is")[0]['generated_text']

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


'The number of neurons in C elegans is represented by an integer, not the number of knobs, not the number of knobs, not the fixed point, if no fixed, and can be varied, one can have the number of different fixed'

In [35]:
generator("The number of neurons in the human brain is")[0]['generated_text']

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


'The number of neurons in the human brain is fixed and the number of knobs can be modelled for each knobs. The number of knobs are then fixed and constant.\nzero is defined for each state constant constant, not constant. If'

In [None]:
"""
TODO:
break \n's into separate sentences and retrain
Play with the temperature, and autogenerate lots of text so you can just read
"""