# Fine Tune Distilgpt2 
This notebook takes DistilGPT2 and fine tunes it with a short blurb.  That result is compared with the pre-trained model. 
Next step in 008A is to substitute the short blurb with a Q&A document.  

In [1]:
# Load Libraries 
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx

In [2]:
# Functions to read different file types
def read_pdf(file_path):
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

def read_word(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text

def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif filename.endswith(".docx"):
            combined_text += read_word(file_path)
        elif filename.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text


In [4]:
# Read documents from the directory
train_directory = "C:\\Users\\patri\\projects\\GenAI\\data\\privacy\\train_directory"
text_data = read_documents_from_directory(train_directory)
text_data = re.sub(r'\n+', '\n', text_data).strip()  # Remove excess newline characters
text_data = re.sub(r'[^A-Za-z0-9 ]+', '', text_data) # Remove any non-alphameric characters 

In [5]:
with open("C:\\Users\\patri\\projects\\GenAI\\data\\privacy\\model_cache\\train.txt", "w") as f:
    f.write(text_data)

In [6]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [7]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

In [8]:
def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator

In [9]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)
      
  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )
      
  trainer.train()
  trainer.save_model()

In [10]:
# Set the hyper-parameters 
train_file_path = "C:\\Users\\patri\\projects\\GenAI\\data\\privacy\\model_cache\\train.txt"
model_name = 'gpt2'
output_dir = "C:\\Users\\patri\\projects\\GenAI\\data\\privacy\\refinded_models"
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000

In [11]:
# Train
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss


Inference

In [12]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [13]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

def generate_text(model_path, sequence, max_length):
    
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

This model got trained on the entire text and took much longer to train, but gives a decent answer.  

In [15]:
model1_path = "C:\\Users\\patri\\projects\\GenAI\\data\\privacy\\refinded_models"
sequence1 = "[Q] What is the Privacy Act of 1974?"
max_len = 256
generate_text(model1_path, sequence1, max_len) 

[Q] What is the Privacy Act of 1974?  The Privacy Act of 1974 was a bill Congress enacted to provide for the management of personal information  It was the first comprehensive privacy law in the country to allow the government to regulate the use of personal information by federal agencies  It was later modified by the Computer Matching and Privacy Protection Act of 1988 gifmitting the Privacy Act jurisdiction over computermatching and matching activities by agencies  These included the Department of Health and Human Services  Office of Privacy and Civil Liberties OVERVIEW OF THE PRIVACY ACT Regulation EU 2016679 of the European Parliament and of the Council of 27 October 2016 on the Protection of Natural Persons with Regard to the Processing of Personal Data and on the Free Movement of Such Data known as the General Data Protection Regulation GDPR The GrammLeachBliley Act 14 USC  6801 The Health Insurance Portability and Accountability Act of 1974 Pub L No 100503 102 Stat 2507 extendi

The following was from an untrained model.   

In [18]:
from transformers import pipeline
generator = pipeline('text-generation', model='gpt2')
generator("What is the Privacy Act of 1974?", max_length=256, num_return_sequences=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'What is the Privacy Act of 1974?\n\nPowers of the Privacy Act\n\nThe powers, powers, and procedures of the Privacy Act of 1974 may be broadly construed to cover any and all matters concerning:\n\npersonal information collected to fulfil the Privacy Act;\n\ninformation that is provided in any form, process, or form for the dissemination of certain data, or\n\nthe collection of personal information by a third‑party; or\n\nthe collection of personal information through surveillance to ensure the integrity of the privacy of certain individuals.\n\nThis is based on the principle that access to personal information through surveillance warrants should also ensure confidentiality, confidentiality protected by law, and also should not be used for the collection of personal information.\n\nHow can I prevent collecting personal data by the Privacy Act?\n\nThe Privacy Act should enable people to do absolutely no harm. The Act does allow for the police to obtain information th