<a href="https://colab.research.google.com/github/propenster/GPTExamLLM/blob/main/GPTExamLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Welcome to ExamLLM**
ExamLLM is a large language model to generate exam/mock questions and multiple choice answers from any corpus provided in prompt.

**CodeSection - models.py**

In [5]:
! pip install -U accelerate
! pip install -U transformers

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.30.0
    Uninstalling transformers-4.30.0:
      Successfully uninstalled transformers-4.30.0
Successfully installed tokenizers-0.15.0 transformers-4.35.2


In [27]:
"""
Fine-tuning the library models for examination on a text file (GPT, GPT-2).
We fine-tuned GPT-2 to generate multiple-choice type questions and answer from any text corpus
.
"""

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

model_name = "gpt2"
train_input_file = "sample_data/train_gptexam.txt"
out_dir = "./out_model/fine-tuned-gptexam-model"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

#eos_token = '\n'

# Customize the tokenizer with the new end-of-stream token
#special_tokens_dict = {'eos_token': eos_token}
#num_added_tokens = tokenizer.add_tokens(eos_token, special_tokens=True)

# Resize model embeddings to match the new tokenizer
#model.resize_token_embeddings(len(tokenizer))



# loading and preprocessing tokenized train data
train_dataset = TextDataset(
    tokenizer = tokenizer,
    file_path = train_input_file,
    block_size = 128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = False
)

#setup trainig args
training_args = TrainingArguments(
    output_dir = out_dir,
    overwrite_output_dir = True,
    num_train_epochs = 3,
    per_device_train_batch_size = 4,
    save_steps = 10_000,
    save_total_limit = 2
)

# set trainer and fine_tune
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
)

#train
trainer.train()

#save the fine-tuned model to outdir
model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)


















Step,Training Loss


('./out_model/fine-tuned-gptexam-model/tokenizer_config.json',
 './out_model/fine-tuned-gptexam-model/special_tokens_map.json',
 './out_model/fine-tuned-gptexam-model/vocab.json',
 './out_model/fine-tuned-gptexam-model/merges.txt',
 './out_model/fine-tuned-gptexam-model/added_tokens.json')

**Our Evaluator** - RunGen.py

In [28]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import random


def generate_multiple_choice_questions(input_text, model, tokenizer, num_questions=3):
  #tokenize prompt
  input_ids = tokenizer.encode(input_text, return_tensors="pt")

  eos_token_id = model.config.eos_token_id

  print(f"eos_token_id: {eos_token_id}")
  # Generate
  output = model.generate(
      input_ids,
      max_length=len(input_ids[0]) + 50,
      num_beams = 5,
      num_return_sequences = num_questions,
      no_repeat_ngram_size = 2,
      pad_token_id = eos_token_id,
      attention_mask = input_ids != eos_token_id,
  )

  # decode and return
  generated_question = tokenizer.decode(output[0], skip_special_tokens=True)

  # Generate options (a), (b), (c) randomly
  #options = [f"({chr(97 + i)}) {tokenizer.decode(model.generate(input_ids, max_length=20)[0], skip_special_tokens=True)}" for i in range(3)]

  # Randomly choose one option as the correct answer
  correct_answer = "(b) " if "b)" in generated_question else "(a) "

  # Generate options (a), (b), (c)
  options = [
        f"(a) {tokenizer.decode(model.generate(input_ids, max_length=20)[0], skip_special_tokens=True)}",
        f"(b) {tokenizer.decode(model.generate(input_ids, max_length=20)[0], skip_special_tokens=True)}",
        f"(c) {tokenizer.decode(model.generate(input_ids, max_length=20)[0], skip_special_tokens=True)}"
    ]

  return generated_question, options, correct_answer



def main():
    # Load fine-tuned model and tokenizer
    model_path = "./out_model/fine-tuned-gptexam-model"
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)

    # Example input text
    input_text = "The organelle responsible for energy or power generation in the cell is the mitochondria. The greenhouse effect is a natural process that warms the Earth's surface. Ribosomes are located in the cytoplasm of the cell and they help me with protein synthesis."

    # Use our fine-tuned model to generate multiple-choice questions

    generated_question, options, correct_answer = generate_multiple_choice_questions(input_text, model, tokenizer)

    # Print the generated question, options, and correct answer
    print(f"Generated Question: {generated_question}")
    print(f"Options: {', '.join(options)}")
    print(f"Correct Answer: {correct_answer}")

    # # Print the generated questions
    # for i, question in enumerate(generated_questions, 1):
    #     print(f"Question {i}: {question}")

if __name__ == "__main__":
    main()






eos_token_id: 50256


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Question: The organelle responsible for energy or power generation in the cell is the mitochondria. The greenhouse effect is a natural process that warms the Earth's surface. Ribosomes are located in the cytoplasm of the cell and they help me with protein synthesis.

Ribosome formation is an important part of cell function. It is important for the formation of proteins and nucleic acids that are needed for cell division and cell growth. In the body, the ribosomal protein, which is responsible
Options: (a) The organelle responsible for energy or power generation in the cell is the mitochondria. The greenhouse effect is a natural process that warms the Earth's surface. Ribosomes are located in the cytoplasm of the cell and they help me with protein synthesis. The, (b) The organelle responsible for energy or power generation in the cell is the mitochondria. The greenhouse effect is a natural process that warms the Earth's surface. Ribosomes are located in the cytoplasm of the ce