# Training

In [None]:
# ! pip install -U accelerate
! pip install -U transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m102.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
Co

In [None]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.34.1', '0.24.0')

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import pandas as pd

# Load the pre-trained model and tokenizer
model_name = "microsoft/DialoGPT-medium"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set the pad_token to be the EOS token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load your text dataset

with open('human_chat.txt', 'r') as file:
    dataset = file.readlines()
print(dataset)


['Human 1: Hi!\n', 'Human 2: What is your favorite holiday?\n', 'Human 1: one where I get to meet lots of different people.\n', 'Human 2: What was the most number of people you have ever met during a holiday?\n', 'Human 1: Hard to keep a count. Maybe 25.\n', 'Human 2: Which holiday was that?\n', 'Human 1: I think it was Australia\n', 'Human 2: Do you still talk to the people you met?\n', "Human 1: Not really. The interactions are usually short-lived but it's fascinating to learn where people are coming from and what matters to them\n", 'Human 2: Yea, me too. I feel like God often puts strangers in front of you, and gives you an opportunity to connect with them in that moment in deeply meaningful ways. Do you ever feel like you know things about strangers without them telling you?\n', 'Human 1: what do you mean?\n', 'Human 2: I think it\'s like a 6th sense, often seen as "cold readings" to people, but can be remarkably accurate. I once sat next to a man in a coffee and I felt a pain in 

In [None]:
# Create user-message and model-response pairs
conversations = []
user_message = ""
for line in dataset:
    line = line.strip()
    if user_message:
        conversations.append((user_message, line))
        user_message = ""
    else:
        user_message = line

print(conversations)

[('Human 1: Hi!', 'Human 2: What is your favorite holiday?'), ('Human 1: one where I get to meet lots of different people.', 'Human 2: What was the most number of people you have ever met during a holiday?'), ('Human 1: Hard to keep a count. Maybe 25.', 'Human 2: Which holiday was that?'), ('Human 1: I think it was Australia', 'Human 2: Do you still talk to the people you met?'), ("Human 1: Not really. The interactions are usually short-lived but it's fascinating to learn where people are coming from and what matters to them", 'Human 2: Yea, me too. I feel like God often puts strangers in front of you, and gives you an opportunity to connect with them in that moment in deeply meaningful ways. Do you ever feel like you know things about strangers without them telling you?'), ('Human 1: what do you mean?', 'Human 2: I think it\'s like a 6th sense, often seen as "cold readings" to people, but can be remarkably accurate. I once sat next to a man in a coffee and I felt a pain in my back. I 

In [None]:
print(tokenizer.sep_token)

None


In [None]:
# Tokenize and preprocess the dataset
def tokenize_and_preprocess(examples):
    text = examples[0][9:] + tokenizer.sep_token + examples[1][9:]
    inputs = tokenizer(text,padding="max_length", max_length=200, truncation=True)
    # inputs = tokenizer(examples[0][9:], tokenizer.eos_token + examples[1][9:],padding="max_length", max_length=200, truncation=True)
    return inputs

tokenized_datasets = list(map(tokenize_and_preprocess, conversations))

tokenized_datasets


In [None]:
# Define training arguments
training_args = TrainingArguments(
    num_train_epochs=3,  # Adjust the number of training epochs
    per_device_train_batch_size=3,  # Adjust batch size
    save_steps=5,
    evaluation_strategy="no",
    eval_steps=5,
    output_dir="./fine-tuned-model",
    label_names=["label1", "label2"]
)

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine-tuned-model")
# tokenizer.save_pretrained("./fine-tuned-model")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
import pandas as pd

# Load the pre-trained model and tokenizer
model_name = "microsoft/DialoGPT-medium"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set the pad_token to be the EOS token
tokenizer.pad_token = tokenizer.eos_token

# Load your text dataset
# Adjust the file path to your text dataset
with open('human_chat.txt', 'r') as file:
    dataset = file.readlines()

# Create user-message and model-response pairs
conversations = []
user_message = ""
for line in dataset:
    line = line.strip()
    if user_message:
        conversations.append((user_message, line))
        user_message = ""
    else:
        user_message = line

# Tokenize and preprocess the dataset
tokenized_conversations = tokenizer(
    [f"{user}: {model}" for user, model in conversations],
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Define training parameters
learning_rate = 3e-5
num_train_epochs = 3
batch_size = 4

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Fine-tuning loop
for epoch in range(num_train_epochs):
    for i in range(0, len(tokenized_conversations.input_ids), batch_size):
        input_ids = tokenized_conversations.input_ids[i:i+batch_size]
        attention_mask = tokenized_conversations.attention_mask[i:i+batch_size]

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=input_ids,  # Use input as target (autoregressive training)
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}, Batch {i//batch_size+1}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./fine-tuned-model")
# tokenizer.save_pretrained("./fine-tuned-model")


Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



Epoch 1, Batch 1, Loss: 6.8820576667785645
Epoch 1, Batch 2, Loss: 4.390382766723633
Epoch 1, Batch 3, Loss: 1.877872347831726
Epoch 1, Batch 4, Loss: 1.6741822957992554
Epoch 1, Batch 5, Loss: 1.5908619165420532
Epoch 1, Batch 6, Loss: 1.48786199092865
Epoch 1, Batch 7, Loss: 1.5854464769363403
Epoch 1, Batch 8, Loss: 1.3574233055114746
Epoch 1, Batch 9, Loss: 1.2592653036117554
Epoch 1, Batch 10, Loss: 1.4250916242599487
Epoch 1, Batch 11, Loss: 0.9503046274185181
Epoch 1, Batch 12, Loss: 0.9130662679672241
Epoch 1, Batch 13, Loss: 0.7253589034080505
Epoch 1, Batch 14, Loss: 1.502406120300293
Epoch 1, Batch 15, Loss: 0.952154815196991
Epoch 1, Batch 16, Loss: 1.0331279039382935
Epoch 1, Batch 17, Loss: 1.1424572467803955
Epoch 1, Batch 18, Loss: 1.7479043006896973
Epoch 1, Batch 19, Loss: 2.0676136016845703
Epoch 1, Batch 20, Loss: 0.6097062230110168
Epoch 1, Batch 21, Loss: 0.7241286635398865
Epoch 1, Batch 22, Loss: 0.8887230157852173
Epoch 1, Batch 23, Loss: 1.4286171197891235
Epo

# Upload model to Hugging face

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub('NEMO_AI_Compainon_v0.2')
tokenizer.push_to_hub('NEMO_AI_Compainon_v0.2')

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pyro-glitch/NEMO_AI_Compainon_v0.2/commit/381e2742cbc0a88423480e72a340304314b658f1', commit_message='Upload tokenizer', commit_description='', oid='381e2742cbc0a88423480e72a340304314b658f1', pr_url=None, pr_revision=None, pr_num=None)

# Testing

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
model = AutoModelForCausalLM.from_pretrained('pyro-glitch/NEMO-DialoGPT-medium-model')

Downloading (…)lve/main/config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [None]:
for step in range(10):

  #input and tokenize
  prompt=input()
  prompt_input_ids=tokenizer.encode(prompt+tokenizer.eos_token, return_tensors='pt')
  prompt_input_ids

  # add to history
  if(step > 0):
    input_ids = torch.cat([response, prompt_input_ids], dim=-1)
  else:
      input_ids = prompt_input_ids

  #generate response
  response = model.generate(
      input_ids,
      max_length=70,
      pad_token_id=tokenizer.eos_token_id,
      early_stopping=True)
  # response

  #decode response
  output=tokenizer.decode(response[0])
  print(output)

hi, how are you?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


hi, how are you?<|endoftext|>I'm good, how are you?<|endoftext|>
I'm am fine.


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


I'm am fine.<|endoftext|>I'm glad to hear that.<|endoftext|>
what is the best place for a vacation.


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


what is the best place for a vacation.<|endoftext|>I would say the Caribbean is a good place to vacation.<|endoftext|>
have you ever been to Caribbean


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


have you ever been to Caribbean<|endoftext|>I have not. I'm not a big fan of the Caribbean.<|endoftext|>
have you ever been to Caribbean<|endoftext|>I have not. I'm not a big fan of the Caribbean.<|endoftext|> where would you like to go on a vacation then?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


have you ever been to Caribbean<|endoftext|>I have not. I'm not a big fan of the Caribbean.<|endoftext|> where would you like to go on a vacation then?<|endoftext|>I would love to go to Jamaica or somewhere tropical.<|endoftext|>
what is your name?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


what is your name?<|endoftext|>I'm not sure, but I think it's a reference to the movie The Big Lebowski<|endoftext|>
can you tell me a joke?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


can you tell me a joke?<|endoftext|>What's the most embarrassing moment you've ever seen?<|endoftext|>
can you tell me a joke?<|endoftext|>What's the most embarrassing moment you've ever seen?<|endoftext|> that does not sound like a joke


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


can you tell me a joke?<|endoftext|>What's the most embarrassing moment you've ever seen?<|endoftext|> that does not sound like a joke<|endoftext|>It's a very good joke<|endoftext|>
can you tell me a joke?<|endoftext|>What's the most embarrassing moment you've ever seen?<|endoftext|> that does not sound like a joke<|endoftext|>It's a very good joke<|endoftext|>what is it?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


can you tell me a joke?<|endoftext|>What's the most embarrassing moment you've ever seen?<|endoftext|> that does not sound like a joke<|endoftext|>It's a very good joke<|endoftext|>what is it?<|endoftext|>A joke<|endoftext|>
can you play music?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


can you play music?<|endoftext|>I can play music, but I can't play the game.<|endoftext|>
