# GPT-3.5-Turbo Model
Creating a question answering chatbot using GPT-3.5. Adapted from: https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

**TODO:** `num_tokens()` is often called in `knowledge.py` (and possibly in other modules) without specifying the embedding model.

In [1]:
# !pip install datasets sentencepiece transformers accelerate tiktoken rouge_score evaluate bleu_score
import sys

import openai.error

sys.path.append("modules")
from modules.config import *
from modules.knowledge import *
from modules.chatbot import *
from modules.embedding_functions import *
from modules.data_extraction import *
from modules.data_preprocessing import *
from modules.gpt_ans_extraction import *
from modules.query import *
sys.path.remove("modules")

No GPU available, using a CPU


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\point\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\point\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\point\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\point\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


No GPU available, using a CPU


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\point\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\point\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\point\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\point\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Create Knowledge Base

In [None]:
# Build the knowledge base from the configured Wikipedia pages plus the two
# textbook PDFs, export it to CSV, and display the resulting frame.
textbooks = ['Digital_Image_Processing_Textbook', 'Fundamentals_of_Digital_Image_Processing_Textbook']
CompVisionKnowledge = Knowledge('CompVisionPDF', 'BERT') # Use BERT embeddings to calculate cosine similarity
for page in WIKI_PAGES:
    # Append the page currently being iterated (the loop variable was previously unused)
    CompVisionKnowledge.append_wikipedia_page(page)
for textbook in textbooks:
    CompVisionKnowledge.append_pdf(f'assets/knowledge/{textbook}.pdf', textbook)
CompVisionKnowledge.export_to_csv(GPT_KNOWLEDGE_FILENAME)
CompVisionKnowledge.df

### NQ Data Extraction and Preprocessing

In [None]:
### Data extraction
# NEED TO CORRECT THIS (train/test separation for efficiency)
nq_cache_dir = '/content/drive/MyDrive/Diss/Datasets'
simplified_csv_path = "/content/drive/MyDrive/Diss/Output/simplified_dataset_validation_new.csv"
training = AllData(cache_dir=nq_cache_dir, default='dev')
training.export_simplified_dataset(path=simplified_csv_path)

### Data preprocessing
# NOTE(review): OUTPUT_DIR and short_model_name must already exist when this cell runs;
# in this notebook they are assigned in the following cell - confirm whether the
# star-imported config also provides them.
preprocessed_dir = f'{OUTPUT_DIR}/all_data_{short_model_name}'
training_data = TrainingData(save_dir=preprocessed_dir)

In [3]:
# Load the dataset and extract only the answerable questions
# Raw string: the backslashes in this Windows path are literal characters, not escapes
# (the resulting value is unchanged, but the invalid-escape warning goes away).
OUTPUT_DIR = r'G:\My Drive\Diss\Output'
short_model_name = "bart-large-xsum"
# Shuffle with a fixed seed so the row order is reproducible between runs
all_data = load_from_disk(f'{OUTPUT_DIR}/all_data_{short_model_name}').shuffle(seed=9)
# Keep only rows whose answer is not the "no answer" sentinel
all_ans_data = all_data.filter(lambda row: (row["answer"] != NO_ANS_TOKEN))
all_ans_data

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'url', 'content', 'question', 'answer', 'start_token', 'end_token', 'length', 'input_ids', 'attention_mask', 'labels', 'num_tokens'],
        num_rows: 11504
    })
    test: Dataset({
        features: ['id', 'title', 'url', 'content', 'question', 'answer', 'start_token', 'end_token', 'length', 'input_ids', 'attention_mask', 'labels', 'num_tokens'],
        num_rows: 2876
    })
})

In [None]:
# Iterate through the dataset and query GPT to extract a natural-language answer.
# For every split the answers are saved twice: one per line in "<split>_all.txt",
# and as a 'gpt_ans' column in a HF dataset written under OUTPUT_DIR.
splits = ['test', 'train']
for s in splits:
    split_rows = all_ans_data[s]
    all_timestamps = []
    responses = []

    # Obtain GPT answers/responses
    for example in tqdm(split_rows):
        # Throttle before each request (due to the RPM)
        all_timestamps = pause_if_needed(all_timestamps)

        formatted_text = format_request(example)
        system_message = {"role": "system", "content": "You answer questions by only using a provided context."}
        user_message = {"role": "user", "content": formatted_text}
        response, all_timestamps = query_gpt([system_message, user_message], all_timestamps)
        responses.append(response.choices[0].message.content)

    # Export to txt file: one response per line
    file_name = f"{s}_all.txt"
    with open(file_name, "w", encoding="utf-8") as file:
        file.writelines(item + "\n" for item in responses)

    # Export as HF dataset with the responses attached as a new column
    df_pandas = split_rows.to_pandas()
    df_pandas['gpt_ans'] = responses
    Dataset.from_pandas(df_pandas).save_to_disk(f"{OUTPUT_DIR}/{short_model_name}_{s}_split")

  3%|▎         | 86/2876 [02:16<38:52,  1.20it/s]   

Major error The server is overloaded or not ready yet.


 18%|█▊        | 506/2876 [15:25<36:59,  1.07it/s]   

Major error The server is overloaded or not ready yet.


 27%|██▋       | 771/2876 [23:47<31:57,  1.10it/s]   

In [None]:
# Now the new (GPT) answers can be merged with the original dataset.
# Original data comes from `training_data`; the GPT-augmented splits are read
# back from OUTPUT_DIR, where the querying cell saved them.
train_dataset_pandas = training_data.training_data['train'].to_pandas()
test_dataset_pandas = training_data.training_data['test'].to_pandas()

# Alternative location for the GPT splits:
# updated_train_dataset_pandas = load_from_disk(f"assets/{short_model_name}_train_split").to_pandas()
# updated_test_dataset_pandas = load_from_disk(f"assets/{short_model_name}_test_split").to_pandas()
updated_train_dataset_pandas = load_from_disk(f"{OUTPUT_DIR}/{short_model_name}_train_split").to_pandas()
updated_test_dataset_pandas = load_from_disk(f"{OUTPUT_DIR}/{short_model_name}_test_split").to_pandas()

# Merge on id and keep all rows (outer join); only the 'gpt_ans' column is brought across
gpt_columns = ['id', 'gpt_ans']
train_merged = train_dataset_pandas.merge(updated_train_dataset_pandas[gpt_columns], on='id', how='outer')
test_merged = test_dataset_pandas.merge(updated_test_dataset_pandas[gpt_columns], on='id', how='outer')

In [None]:
# Remove any examples where it apparently had an answer but GPT couldn't extract one.
# .copy() makes each filtered frame independent, so the .loc assignments below write
# to it directly instead of to a slice of the pre-filter frame.
train_merged = train_merged[train_merged['gpt_ans'] != NO_ANS_TOKEN].copy()
test_merged = test_merged[test_merged['gpt_ans'] != NO_ANS_TOKEN].copy()

# Update the answers to match the GPT ones (rows marked NO_ANS_TOKEN are left untouched)
train_answerable = train_merged['answer'] != NO_ANS_TOKEN
test_answerable = test_merged['answer'] != NO_ANS_TOKEN
train_merged.loc[train_answerable, 'answer'] = train_merged.loc[train_answerable, 'gpt_ans']
test_merged.loc[test_answerable, 'answer'] = test_merged.loc[test_answerable, 'gpt_ans']

# preserve_index=False: after the filtering above the frames no longer have a plain
# range index, and it would otherwise be stored as an extra '__index_level_0__' column.
merged_dataset = DatasetDict({
        "train": Dataset.from_pandas(train_merged, preserve_index=False),
        "test": Dataset.from_pandas(test_merged, preserve_index=False),
    })

# Maintain the answerable/non-answerable balance
def ensure_ans_non_ans_balance(dataset, dataset_splits=('training', 'validation'), proportion=0.3, seed=SEED):
    """Down-sample the unanswerable examples of every requested split.

    For each split, all answerable rows (answer != NO_ANS_TOKEN) are kept and
    enough unanswerable rows are sampled so that they make up roughly
    `proportion` of the split. If a split holds fewer unanswerable rows than
    that target (including none at all), every one of them is kept.

    Args:
        dataset: mapping of split name -> dataset, each with an "answer" column.
        dataset_splits: names of the splits to process; any number of splits is accepted.
        proportion: target fraction of unanswerable rows in each processed split.
        seed: seed used for both the sampling and the final shuffle.

    Returns:
        A shuffled DatasetDict holding one entry per name in `dataset_splits`.
    """
    # Separate the unanswerable examples from the answerable ones
    no_ans = dataset.filter(lambda row: (row["answer"] == NO_ANS_TOKEN))
    good_ans = dataset.filter(lambda row: (row["answer"] != NO_ANS_TOKEN))

    # Discard some unanswerable examples so the answer/no-answer ratio is favourable in each split
    processed_datasets_dict = {}
    for split in dataset_splits:
        num_available = len(no_ans[split])
        num_no_ans = proportion * len(good_ans[split]) / (1 - proportion)
        if num_no_ans >= num_available:
            # Not enough unanswerable rows to reach the target (or none at all):
            # keep them all rather than requesting a train fraction >= 1 or dividing by zero.
            no_ans_keep = no_ans[split]
        else:
            keep_fraction = num_no_ans / num_available
            no_ans_keep = no_ans[split].train_test_split(train_size=keep_fraction, seed=seed)['train']
        processed_datasets_dict[split] = concatenate_datasets([no_ans_keep, good_ans[split]])

    # Build the result from every processed split, in the order they were requested
    processed_dataset = DatasetDict(processed_datasets_dict)
    return processed_dataset.shuffle(seed=seed)

final_dataset = ensure_ans_non_ans_balance(merged_dataset, dataset_splits=['train', 'test'])

# Fraction of all examples that sit in the train split; the displayed value
# shows that the 80/20 train-test split has been maintained.
num_train_rows = len(final_dataset['train'])
num_test_rows = len(final_dataset['test'])
num_train_rows / (num_train_rows + num_test_rows)

In [None]:
# Finally, the questions/contexts and answers need to be tokenised to override previous values
def tokenise(data):
    """Tokenise a batch of examples without truncation.

    The 'content' and 'question' columns are encoded together as the model
    input, 'answer' is encoded as the target, and the returned encoding also
    carries 'labels' (the target input ids) and 'num_tokens' (the length of
    each input id sequence).
    """
    # Inputs: contexts paired with their questions
    encoded = tokeniser(data['content'], data['question'], truncation=False)

    # Targets: the answers, whose ids become the labels
    encoded['labels'] = tokeniser(text_target=data['answer'], truncation=False)['input_ids']

    # Per-example input length
    encoded['num_tokens'] = list(map(len, encoded["input_ids"]))
    return encoded

# Re-tokenise every split in batches, then save the result next to the original data
final_dataset = final_dataset.map(tokenise, batched=True)
gpt_updated_dir = f'{OUTPUT_DIR}/all_data_{short_model_name}_gpt_updated'
final_dataset.save_to_disk(gpt_updated_dir)

# NQ Model Training

In [None]:
# Load data
all_data = load_from_disk(f'{OUTPUT_DIR}/all_data_{short_model_name}_gpt_updated') # add or remove the suffix a required

# Connect to HuggingFace
HfFolder.save_token("ADD_TOKEN_HERE")
!git config --global user.email "pointon.joel@gmail.com"
!git config --global user.name "Joel Pointon"

# Training arguments/config
batch_size = 8 # 64
num_train_epochs = 8
logging_steps = len(all_data["training"]) // batch_size # Show the training loss with every epoch
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}_updated",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    push_to_hub=True,
    seed=9,
    # optim="adafactor"
)

def compute_metrics(eval_pred):
    """Score generated text against the references with ROUGE.

    `eval_pred` unpacks into (predictions, labels). Both are decoded to text
    with special tokens skipped; label entries equal to -100 are replaced by
    the pad token id first because they cannot be decoded. Returns a dict of
    the ROUGE results, each multiplied by 100 and rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    def _sentence_per_line(text):
        # ROUGE expects a newline after each sentence
        return "\n".join(sent_tokenize(text.strip()))

    # Decode generated summaries into text
    pred_texts = tokeniser.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them, then decode the references
    labels = np.where(labels != -100, labels, tokeniser.pad_token_id)
    label_texts = tokeniser.batch_decode(labels, skip_special_tokens=True)

    pred_texts = [_sentence_per_line(text) for text in pred_texts]
    label_texts = [_sentence_per_line(text) for text in label_texts]

    # Compute ROUGE scores
    scores = rouge_score.compute(
        predictions=pred_texts, references=label_texts, use_stemmer=True
    )
    # Previous form, extracting the median scores:
    # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {name: round(value * 100, 4) for name, value in scores.items()}

# Collator that assembles the seq2seq batches for the tokeniser/model pair
data_collator = DataCollatorForSeq2Seq(tokeniser, model=model)


def model_init():
    """Return a freshly loaded pretrained seq2seq model for the trainer."""
    return AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype="auto",
    )


# Train on the 'train' split and evaluate on the 'test' split
trainer_kwargs = dict(
    model_init=model_init,
    args=args,
    train_dataset=all_data["train"],
    eval_dataset=all_data["test"],
    data_collator=data_collator,
    tokenizer=tokeniser,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

# Fine-tune, publish to the Hub, then keep a copy of the model at `path`
path = f"/content/drive/MyDrive/Diss/Output/{short_model_name}-finetuned-natural-questions"
trainer.train()
trainer.push_to_hub()
trainer.save_model(path)

In [None]:
# NEED TO FIX THIS
# Extract the text blocks of a textbook PDF and concatenate them into `all_text`.
filename = 'assets/knowledge/Fundamentals_of_Digital_Image_Processing_Textbook.pdf'
page_limit = None  # Highest page.number to include; None means every page
text_blocks = []
# Open the document in a `with` block so it is closed once extraction finishes
with fitz.open(filename) as doc:
    # Resolve the limit once, rather than on every page
    page_limit = page_limit or doc.page_count
    # Iterate through the content
    for page in doc:
        if page.number > page_limit:
            break  # Past the limit: nothing further is extracted
        for block in page.get_text("blocks"):
            if block[6] == 0:  # I.e. only extract text
                text_blocks.append(unidecode(block[4]))
# Join once at the end instead of growing a string block by block
all_text = ''.join(text_blocks)

### MLM Training

In [None]:
# Extracting training content: write each textbook's knowledge-base 'Content'
# rows, concatenated, to assets/knowledge/<textbook>.txt.
# The file is opened without an explicit encoding, so the platform default applies.
for textbook in textbooks:
    from_this_textbook = CompVisionKnowledge.df['Source'] == textbook
    all_text = ''.join(CompVisionKnowledge.df.loc[from_this_textbook, 'Content'])
    with open(f'assets/knowledge/{textbook}.txt', "w") as f:
        f.write(all_text)

In [None]:
# Read txt files
def read_txt(file_path, encoding=None):
    """Return the full contents of a text file.

    Args:
        file_path: path of the file to read.
        encoding: text encoding passed to `open`. The default, None, keeps the
            platform-default behaviour; pass e.g. "utf-8" to read the file the
            same way on every machine.

    Returns:
        The file's contents as a single string.
    """
    with open(file_path, "r", encoding=encoding) as f:
        return f.read()

# For formatting a PDF as a HF datasets
def get_text_dataset(path):
  """Build a TextDataset over the file at `path` using the notebook's tokeniser and a block size of 512."""
  return TextDataset(tokenizer=tokeniser, file_path=path, block_size=512)

# Read documents from the directory
training_file = 'assets/knowledge/Digital_Image_Processing_Textbook.txt'
validation_file = 'assets/knowledge/Fundamentals_of_Digital_Image_Processing_Textbook.txt'
train_data = read_txt(training_file)
validation_data = read_txt(validation_file)
train_data = re.sub(r'\n+', '\n', train_data).strip()  # Remove excess newline characters
validation_data = re.sub(r'\n+', '\n', validation_data).strip()  # Remove excess newline characters

# Write the cleaned text as UTF-8: these files are only consumed by TextDataset below,
# which reads its input as UTF-8, so the platform-default encoding must not be used here.
with open(f'{training_file}_formatted', "w", encoding="utf-8") as f:
    f.write(train_data)
with open(f'{validation_file}_formatted', "w", encoding="utf-8") as f:
    f.write(validation_data)

model_name = 'gpt2'
output_dir = 'models/custom_q_and_a'
batch_size = 8
num_train_epochs = 8

# Train
tokeniser = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Add the missing special tokens and grow the embedding matrix to match the new vocabulary size
if tokeniser.pad_token is None:
    tokeniser.add_special_tokens({'pad_token': '[PAD]', 'mask_token': '[MASK]'})
    model.resize_token_embeddings(len(tokeniser))


train_dataset = get_text_dataset(f'{training_file}_formatted')
validation_dataset = get_text_dataset(f'{validation_file}_formatted')

# NOTE(review): mlm=True selects the masked-language-modelling objective, while
# GPT2LMHeadModel is normally trained with mlm=False - confirm this is intended.
data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokeniser,
        mlm=True,
    )

HF_REFERENCE='mlm'
# max(1, ...) keeps logging_steps valid when the corpus yields only a few blocks
logging_steps = max(1, len(train_dataset) // batch_size // 4)
training_args = TrainingArguments(
    output_dir=HF_REFERENCE,
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    logging_steps=logging_steps,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    push_to_hub=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)
trainer.train()
import math
# eval_results = trainer.evaluate()
# print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
trainer.push_to_hub()
tokeniser.push_to_hub(f'psxjp5/{HF_REFERENCE}')
print('Finished training and pushed to hub!')

# Usage

In [None]:
# Todo:
# I need to make it more efficient on the number of tokens
# Check num_tokens usage and specifying the encoding model
# Fix the potential issue of GPT sections being longer than 1024 tokens when using BART

In [None]:
# MLM model: load the fine-tuned model from the Hub and print the text it generates for one question
formatted_text = 'What is PCA?'
question_answering = pipeline(model='psxjp5/mlm')
mlm_outputs = question_answering(formatted_text)
print(mlm_outputs[0]['generated_text'])

In [None]:
# Ask the chatbot built on the GPT knowledge file, with show_source=True
gpt_knowledge_path = 'assets/' + GPT_KNOWLEDGE_FILENAME
CompVisionGPT = ChatBot("Computer Vision", gpt_knowledge_path)
gpt_answer = Query.ask('What is PCA?', CompVisionGPT, show_source=True)
print(gpt_answer)

In [None]:
# Ask the chatbot built on the BERT knowledge file via Query.ask_bert
bert_knowledge_path = 'assets/' + BERT_KNOWLEDGE_FILENAME
CompVisionBERT = ChatBot("Computer Vision", bert_knowledge_path)
bert_answer = Query.ask_bert('When did universities begin teaching Computer Vision?', CompVisionBERT)
print(bert_answer)

In [None]:
# Ask the chatbot built on the GPT knowledge file via Query.ask_bart, with show_source=True.
# What if the GPT knowledge sections are longer than 1024 tokens?? Need to account for this!
gpt_knowledge_path = 'assets/' + GPT_KNOWLEDGE_FILENAME
CompVisionGPT = ChatBot("Computer Vision", gpt_knowledge_path)
bart_answer = Query.ask_bart('When did Universities begin teaching Computer Vision?', CompVisionGPT, show_source=True)
print(bart_answer)