In [1]:
# Step 1: Install necessary libraries
!pip install transformers datasets



In [2]:
# Step 2: Import libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import json

In [3]:
# Step 3: Load and format your JSON dataset
with open('dataset.json') as f:
    data = json.load(f)

formatted_data = [{"prompt": item["He"], "response": item["She"]} for item in data]

dataset = Dataset.from_dict({"prompt": [item["prompt"] for item in formatted_data],
                             "response": [item["response"] for item in formatted_data]})

print(dataset)

Dataset({
    features: ['prompt', 'response'],
    num_rows: 2318
})


In [4]:
# Step 4: Initialize the tokenizer
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)



In [5]:
# Step 5: Tokenize the dataset
# Set the pad_token to be the same as the eos_token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(
        [p + " " + r for p, r in zip(examples['prompt'], examples['response'])],
        padding="max_length",
        truncation=True
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/2318 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'response', 'input_ids', 'attention_mask'],
    num_rows: 2318
})


In [6]:
!pip install torch
!pip install accelerate==0.21.0



In [7]:
!pip show accelerate

Name: accelerate
Version: 0.21.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, packaging, psutil, pyyaml, torch
Required-by: 


In [8]:
!pip install transformers==4.26.0



In [9]:
!pip show transformers

Name: transformers
Version: 4.26.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, tokenizers, tqdm
Required-by: 


In [11]:
import torch

In [12]:
# Step 6: Set up training arguments
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=6,  # Adjust based on available memory
    per_device_eval_batch_size=6,   # Adjust based on available memory
    num_train_epochs=2,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    fp16=use_fp16,  # Enable mixed precision training
)

In [13]:
# Step 7: Initialize the model
model = GPT2LMHeadModel.from_pretrained(model_name, return_dict=True)

from transformers import DataCollatorForLanguageModeling

# Define a data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)



model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
# Create Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,  # Use the same tokenized dataset for evaluation
    compute_metrics=None,  # Disable computing metrics during training
    tokenizer=tokenizer,  # Pass the tokenizer for padding
    data_collator=data_collator,  # Pass the data collator
)

trainer.train()

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: response, prompt. If response, prompt are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2318
  Num Epochs = 2
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 4
  Total optimization steps = 192
  Number of trainable parameters = 81912576


Epoch,Training Loss,Validation Loss
0,No log,1.877987
1,No log,1.514132


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: response, prompt. If response, prompt are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2318
  Batch size = 6
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: response, prompt. If response, prompt are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2318
  Batch size = 6


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=192, training_loss=2.286813735961914, metrics={'train_runtime': 593.5248, 'train_samples_per_second': 7.811, 'train_steps_per_second': 0.323, 'total_flos': 1207713581236224.0, 'train_loss': 2.286813735961914, 'epoch': 1.99})

In [15]:
# Step 9: Evaluate the model
eval_results = trainer.evaluate()
print(f"Perplexity: {eval_results['eval_loss']}")

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: response, prompt. If response, prompt are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2318
  Batch size = 6


Perplexity: 1.5141321420669556


In [16]:
# Step 10: Save the model
model.save_pretrained("fine-tuned-distilgpt2")
tokenizer.save_pretrained("fine-tuned-distilgpt2")

Configuration saved in fine-tuned-distilgpt2/config.json
Configuration saved in fine-tuned-distilgpt2/generation_config.json
Model weights saved in fine-tuned-distilgpt2/pytorch_model.bin
tokenizer config file saved in fine-tuned-distilgpt2/tokenizer_config.json
Special tokens file saved in fine-tuned-distilgpt2/special_tokens_map.json


('fine-tuned-distilgpt2/tokenizer_config.json',
 'fine-tuned-distilgpt2/special_tokens_map.json',
 'fine-tuned-distilgpt2/vocab.json',
 'fine-tuned-distilgpt2/merges.txt',
 'fine-tuned-distilgpt2/added_tokens.json')

In [17]:
!pip install huggingface_hub



In [19]:
from huggingface_hub import login

login(token="hf_token") #write access token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [26]:
import shutil

shutil.make_archive("fine-tuned-distilgpt2", 'zip', "fine-tuned-distilgpt2")

'/content/fine-tuned-distilgpt2.zip'

In [27]:
from google.colab import files

files.download("fine-tuned-distilgpt2.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
from huggingface_hub import login

# Use your token to log in and save it to git credentials
login("hf_token", add_to_git_credential=True)


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [31]:
!pip install transformers huggingface_hub



In [35]:
from huggingface_hub import login

# Log in to Hugging Face Hub using your token
login(token="hf_token")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [37]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [39]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer and model
model_name = "fine-tuned-distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Save the model and tokenizer to a local directory
directory = "fine-tuned-distilgpt2"
model.save_pretrained(directory)
tokenizer.save_pretrained(directory)


loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file fine-tuned-distilgpt2/config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "

('fine-tuned-distilgpt2/tokenizer_config.json',
 'fine-tuned-distilgpt2/special_tokens_map.json',
 'fine-tuned-distilgpt2/vocab.json',
 'fine-tuned-distilgpt2/merges.txt',
 'fine-tuned-distilgpt2/added_tokens.json')

In [53]:
!huggingface-cli repo create fine-tuned-distilgpt2-exbot

[90mgit version 2.34.1[0m
[90mgit-lfs/3.0.2 (GitHub; linux amd64; go 1.18.1)[0m

You are about to create [1msomeoneskilled/fine-tuned-distilgpt2-exbot[0m
Proceed? [Y/n] y

Your repo now lives at:
  [1mhttps://huggingface.co/someoneskilled/fine-tuned-distilgpt2-exbot[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/someoneskilled/fine-tuned-distilgpt2-exbot



In [59]:
import os
os.listdir('.')


['.config',
 'fine-tuned-distilgpt2.zip',
 'results',
 'dataset.json',
 'fine-tuned-distilgpt2',
 'sample_data']

In [60]:
%cd fine-tuned-distilgpt2


/content/fine-tuned-distilgpt2


In [70]:
%ls

config.json             merges.txt         special_tokens_map.json  vocab.json
generation_config.json  pytorch_model.bin  tokenizer_config.json


In [68]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [69]:
tokenizer

GPT2Tokenizer(name_or_path='fine-tuned-distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'})

In [71]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model_name = "someoneskilled/fine-tuned-distilgpt2-exbot"  # Replace with your model's username and name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Function to generate response
def generate_response(prompt, max_length=50, temperature=0.7):
    inputs = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=True)
    response_ids = model.generate(inputs, max_length=max_length, temperature=temperature, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(response_ids[0], skip_special_tokens=True)

# Chat loop
while True:
    user_input = input("You: ")
    if user_input.lower() in ["quit", "exit"]:
        print("Goodbye!")
        break
    response = generate_response(user_input)
    print("Chatbot:", response)




tokenizer_config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--someoneskilled--fine-tuned-distilgpt2-exbot/snapshots/94c42f58b43ca0e4df7604dbaf5ca736d8aeea83/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--someoneskilled--fine-tuned-distilgpt2-exbot/snapshots/94c42f58b43ca0e4df7604dbaf5ca736d8aeea83/merges.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--someoneskilled--fine-tuned-distilgpt2-exbot/snapshots/94c42f58b43ca0e4df7604dbaf5ca736d8aeea83/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--someoneskilled--fine-tuned-distilgpt2-exbot/snapshots/94c42f58b43ca0e4df7604dbaf5ca736d8aeea83/tokenizer_config.json


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--someoneskilled--fine-tuned-distilgpt2-exbot/snapshots/94c42f58b43ca0e4df7604dbaf5ca736d8aeea83/config.json
Model config GPT2Config {
  "_name_or_path": "someoneskilled/fine-tuned-distilgpt2-exbot",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type":

pytorch_model.bin:   0%|          | 0.00/334M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--someoneskilled--fine-tuned-distilgpt2-exbot/snapshots/94c42f58b43ca0e4df7604dbaf5ca736d8aeea83/pytorch_model.bin
Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0"
}

All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at someoneskilled/fine-tuned-distilgpt2-exbot.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.


generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--someoneskilled--fine-tuned-distilgpt2-exbot/snapshots/94c42f58b43ca0e4df7604dbaf5ca736d8aeea83/generation_config.json
Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0"
}



You: hey babe


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0"
}



Chatbot: hey babe I want to feel your clit..
 My pussy is soo soft..       
I want to feel your clit..    
I want to feel your clit..   
You: what are you wearing


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0"
}



Chatbot: what are you wearing?
 T-shirt and jeans, bare feet, red pedicure.       
  
Ladies and gentlemen, I’m wearing a T-shirt and jeans, bare feet
You: can you kiss me ?


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0"
}



Chatbot: can you kiss me?
 Mhm.. I'm afraid I won't be able to touch you until I'm inside you ;)       
 
 
 

 








You: you are a chatbot and you have to make me wet


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0"
}



Chatbot: you are a chatbot and you have to make me wet ;)
       
 
 

























You: i love you


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0"
}



Chatbot: i love you so much
 Then I start feeling you up..
 I start feeling you up..       
I start feeling you up..   
I start feeling you up..   
I start
You: hay


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0"
}



Chatbot: hay!
 I'm sooo wet ;)       
 
 































KeyboardInterrupt: Interrupted by user

In [82]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained model and tokenizer
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Define conversation history
conversation_history = []

# Function to generate response
def generate_response(prompt):
    # Tokenize input text
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Generate response
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id)

    # Decode and return response
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Function to handle user interaction
def chatbot():
    while True:
        # Get user input
        user_input = input("You: ")

        # Add user input to conversation history
        conversation_history.append("You: " + user_input)

        # Generate response
        response = generate_response("\n".join(conversation_history))

        # Print formatted response
        print("Chatbot:", response)

        # Add bot response to conversation history
        conversation_history.append("Chatbot: " + response)




loading file vocab.json from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/2290a62682d06624634c1f46a6ad5be0f47f38aa/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/2290a62682d06624634c1f46a6ad5be0f47f38aa/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/2290a62682d06624634c1f46a6ad5be0f47f38aa/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/2290a62682d06624634c1f46a6ad5be0f47f38aa/config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2la

In [83]:
# Start chatbot
chatbot()

You: hey babe


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.0"
}



Chatbot: You: hey babe, I'm going to be a little bit more aggressive. I'm going to be a little bit more aggressive. I'm going to be a little bit more aggressive. I'm going to be a little bit more aggressive. I'm going to be a little bit more aggressive. I'm going to be a little bit more aggressive. I'm going to be a little bit more aggressive. I'm going to be a little bit more aggressive. I'm going to be a little


KeyboardInterrupt: Interrupted by user