In [13]:
import os
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from dotenv import load_dotenv
import gc


# Free memory left over from earlier runs in this kernel: collect unreachable
# Python objects first, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# Load variables from a .env file into the process environment; main() later
# reads PROJECT_DIR through os.getenv.
load_dotenv()


True

In [14]:
def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding,
            which is what ``open`` falls back to when no encoding is given.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

def read_documents_from_directory(directory):
    """Concatenate the text of every regular file directly inside ``directory``.

    Files are read in sorted filename order. ``os.listdir`` returns entries in
    arbitrary order, and callers split the combined text by position, so a
    stable order is needed for the split to be reproducible.

    Entries that are not regular files (for example the ``.ipynb_checkpoints``
    folder Jupyter creates) are skipped instead of being passed to
    ``read_txt``, which would fail on them.

    Args:
        directory: Path of the directory holding the documents.

    Returns:
        The contents of all files joined together with no separator; an empty
        string when the directory contains no regular files.
    """
    documents = []
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        documents.append(read_txt(file_path))
    # Join once at the end rather than growing a string inside the loop.
    return "".join(documents)


In [15]:
def train_chatbot(directory, model_output_path, train_fraction=0.8,
                  num_train_epochs=100, block_size=128):
    """Fine-tune ``gpt2-medium`` on the documents in ``directory`` and save it.

    The documents are concatenated, runs of newlines are collapsed to a single
    newline, and the text is split by character position: the first
    ``train_fraction`` becomes the training text and the remainder the
    validation text. Both parts are written to ``train.txt`` and ``val.txt``
    in the current working directory, loaded as ``TextDataset`` objects and
    handed to a ``Trainer``. The trained model and the tokenizer are saved to
    ``model_output_path``.

    Training resumes from the newest checkpoint found in ``model_output_path``
    when there is one, and starts from the pretrained weights otherwise.

    Args:
        directory: Directory containing the training documents.
        model_output_path: Directory for checkpoints, the final model and the
            tokenizer.
        train_fraction: Fraction of the characters used for training.
        num_train_epochs: Number of training epochs.
        block_size: Number of tokens per training example.

    Raises:
        ValueError: If the documents contain no text after whitespace cleanup.
    """
    # Imported here because the notebook's import cell does not bring it in.
    from transformers.trainer_utils import get_last_checkpoint

    combined_text = read_documents_from_directory(directory)
    combined_text = re.sub(r'\n+', '\n', combined_text).strip()
    if not combined_text:
        raise ValueError(f"No training text found in {directory!r}")

    split_index = int(train_fraction * len(combined_text))
    train_text = combined_text[:split_index]
    val_text = combined_text[split_index:]

    # Write with an explicit encoding instead of the platform default.
    # NOTE(review): TextDataset appears to read its input as UTF-8 - confirm
    # against the installed transformers version.
    with open("train.txt", "w", encoding="utf-8") as f:
        f.write(train_text)
    with open("val.txt", "w", encoding="utf-8") as f:
        f.write(val_text)

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
    model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

    # train.txt / val.txt are regenerated on every call, so force
    # re-tokenization; otherwise TextDataset may reuse a cached feature file
    # built from an earlier version of the data.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path="train.txt",
        block_size=block_size,
        overwrite_cache=True,
    )
    val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path="val.txt",
        block_size=block_size,
        overwrite_cache=True,
    )
    # mlm=False selects causal (next-token) language modeling.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=model_output_path,
        overwrite_output_dir=True,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=num_train_epochs,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        hub_strategy="checkpoint",
    )

    # NOTE(review): no evaluation schedule is configured above, so the
    # validation set is presumably only used if evaluate() is called
    # explicitly - confirm.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # resume_from_checkpoint=True raises when the output directory holds no
    # checkpoint (e.g. on the very first run). Resolve the newest checkpoint
    # explicitly and fall back to a fresh start when there is none.
    last_checkpoint = None
    if os.path.isdir(model_output_path):
        last_checkpoint = get_last_checkpoint(model_output_path)

    trainer.train(resume_from_checkpoint=last_checkpoint)
    trainer.save_model(model_output_path)

    tokenizer.save_pretrained(model_output_path)


In [16]:
def generate_response(model, tokenizer, prompt, max_length=100):
    """Generate a continuation of ``prompt`` and return it as text.

    Args:
        model: Language model exposing ``generate``. When it has a ``device``
            attribute, the encoded prompt is moved to that device first.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # The tokenizer returns CPU tensors; a model that lives on another device
    # (e.g. CUDA) would otherwise fail with a device-mismatch error.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # Single unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        # The end-of-sequence id is reused as the padding id.
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [17]:
def main():
    """Train the chatbot on the project's cleaned data and print a sample reply.

    The project root is taken from the ``PROJECT_DIR`` environment variable.
    Training data is read from ``Data/cleaned_data`` under it, and the model
    is written to, then reloaded from, ``Models/`` under it.

    Raises:
        RuntimeError: If ``PROJECT_DIR`` is not set.
    """
    project_directory = os.getenv("PROJECT_DIR")
    if project_directory is None:
        # Without this check os.path.join(None, ...) fails with a TypeError
        # that does not mention the missing variable.
        raise RuntimeError(
            "PROJECT_DIR is not set; define it in the environment or in the .env file."
        )
    data_directory = os.path.join(project_directory, r"Data/cleaned_data")
    model_output_path = os.path.join(project_directory, r"Models/")

    train_chatbot(data_directory, model_output_path)

    # Reload from disk so the sample reply comes from the saved artifacts.
    model = GPT2LMHeadModel.from_pretrained(model_output_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)

    prompt = "What is Canvas?"
    response = generate_response(model, tokenizer, prompt)
    print("Generated response:", response)

In [18]:
# Entry point: calls main(), which trains the model and then prints one sample
# response, so running this starts a full training run.
if __name__ == "__main__":
    main()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
  state_dict = torch.load(weights_file, map_location="cpu")
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss
240500,0.0299
241000,0.0256
241500,0.0276
242000,0.0261
242500,0.0267
243000,0.0278
243500,0.029
244000,0.0292
244500,0.0296
245000,0.0242


  return torch.load(checkpoint_file, map_location=map_location)


Generated response: What is Canvas?
Canvas is a Learning Management System. To learn more about Canvas terminology and definitions, visit How does Canvas define the terms used to describe its features and functions?
Because Canvas is a web-based system, it doesn’t need to be installed on your computer. However, you’ll want to make sure that your computer and web browser meet the basic requirements to run Canvas.
What is the Instructor/Teacher role?



In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; PROJECT_DIR is
# read through os.getenv in a later cell.
load_dotenv()

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


True

In [3]:
def generate_response(model, tokenizer, prompt, max_length=250):
    """Generate a continuation of ``prompt`` and return it as text.

    Args:
        model: Language model exposing ``generate``. When it has a ``device``
            attribute, the encoded prompt is moved to that device first.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # The tokenizer returns CPU tensors; a model that lives on another device
    # (e.g. CUDA) would otherwise fail with a device-mismatch error.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # Single unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        # The end-of-sequence id is reused as the padding id.
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [4]:
# Locate the fine-tuned model under the project root given by PROJECT_DIR.
project_directory = os.getenv("PROJECT_DIR")
if project_directory is None:
    # Without this check os.path.join(None, ...) fails with a TypeError that
    # does not mention the missing variable.
    raise RuntimeError(
        "PROJECT_DIR is not set; define it in the environment or in the .env file."
    )
model_path = os.path.join(project_directory, r"Models/")

# Load the model and its tokenizer from the same directory.
my_chat_model = GPT2LMHeadModel.from_pretrained(model_path)
my_chat_tokenizer = GPT2Tokenizer.from_pretrained(model_path)

  return torch.load(checkpoint_file, map_location=map_location)


In [6]:
# Sample question to send to the loaded model.
prompt = "Can students view unpublished assignments?"  

# max_length=1024 overrides generate_response's default of 250.
# NOTE(review): 1024 looks chosen to match GPT-2's context window - confirm.
response = generate_response(my_chat_model, my_chat_tokenizer, prompt, max_length=1024)  
print("Generated response:", response)

Generated response: Can students view unpublished assignments?
Yes. Students can view unpublished assignments on the Assignments page. Learn more about publishing assignments.
Unpublished assignments are identified by the Unpublished icon. You can view the assignment name, but students cannot view the assignment.
Learn more about publishing assignments. 
Review Assignment Details
Students can view details about an assignment. When an assignment is created, the assignment details display. 
Depending on the assignment submission type, students may be able to view various options for the assignment, including the assignment description.
Students may also be able to view details about the assignment using the Edit button.
View Feedback
If your assignment has been affected by a feedback from your instructor, students may be able to view that feedback in their Grades page.
You can view the feedback in the Grades page by clicking the View Feedback link. How do I archive a grading scheme in an