In [17]:
import pandas as pd
from datasets import Dataset


In [18]:
data = pd.read_csv('1_10_seasons_tbbt.csv')

In [19]:
data.sample(6)

Unnamed: 0,episode_name,dialogue,person_scene
27372,Series 06 Episode 06 – The Extract Obliteration,I don’t want you to make a big deal out of it.,Penny
12435,Series 03 Episode 12 – The Psychic Vortex,"Oh, yes. I just discovered I don’t have enoug...",Sheldon
37454,Series 08 Episode 02 – The Junior Professor So...,"Oh, it’s no trouble, it’s actually a pleasure.",Sheldon
10347,Series 03 Episode 03 – The Gothowitz Deviation,Why are you carrying extras?,Leonard
51379,Series 10 Episode 12 – The Holiday Summation,Do you need help?,Amy
23101,Series 05 Episode 10 – The Flaming Spittoon Ac...,"No, I’m having a nice time.",Amy


In [20]:
CHARACTER_NAME = 'Sheldon'

In [21]:
sheldon_df = data[data['person_scene'] == 'Sheldon']

In [22]:
# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(sheldon_df)

In [23]:
from transformers import BertTokenizer

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a function to tokenize the dialogues
def tokenize_function(examples):
    return tokenizer(examples['dialogue'], padding='max_length', truncation=True, max_length=128)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True);

Map: 100%|██████████████████████████████████████████████████████████████| 11484/11484 [00:05<00:00, 2044.06 examples/s]


In [24]:
from transformers import DataCollatorForLanguageModeling, BertForMaskedLM, Trainer, TrainingArguments

# Load the model
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
import torch

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:


# Define a data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets
)


In [None]:
# Train the model
trainer.train()



Step,Training Loss


In [None]:
model.save_pretrained('./results')
tokenizer.save_pretrained('./results2')

In [18]:

def generate_response(input_text, model, tokenizer, max_length=1000):
    # Encode the input text
    inputs = tokenizer.encode_plus(input_text, return_tensors='pt')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate response using the model
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id
        )

    # Decode the generated tokens
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response


In [None]:
def chat_with_sheldon(model, tokenizer):
    print("Chatbot: Hello! I am Sheldon. How can I assist you today?")

    conversation_history = ""

    while True:
        user_input = input("You: ")

        if user_input.lower() in ['exit', 'quit']:
            print("Chatbot: Goodbye!")
            break

        # Append user input to the conversation history
        conversation_history += f"User: {user_input} "

        # Generate a response
        response = generate_response(conversation_history, model, tokenizer)

        # Append the bot's response to the conversation history
        conversation_history += f"Sheldon: {response} "

        print(f"Chatbot: {response}")

# Start the chat
chat_with_sheldon(model, tokenizer)


In [34]:
chat_with_sheldon(model, tokenizer)


Chatbot: Hello! I am Sheldon. How can I assist you today?
You: heyy


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


TypeError: '>' not supported between instances of 'int' and 'BertForMaskedLM'

In [37]:
# Let's chat for 4 lines
for step in range(4):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    # inputs = tokenizer.encode_plus(input_text, return_tensors='pt')
    new_user_input_ids = tokenizer.encode(input(">> User:"),
    #+ tokenizer.eos_token
                                          return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens,
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8
    )

    # pretty print last ouput tokens from bot
    print("JoshuaBot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:hello
JoshuaBot: well. well, now, now. now, oh, uh, oh. uh,, oh! uh, uh. oh,, uh! oh, oh uh, um, uhh, uh - uh, huh - uhh - uh - oh, um - uh uh uhh uh uh, and uh, eh, uh uh - huh, uhhh uh uhhhhh, uh huh uh uh huh - oh uhhhh, ohh uhhhh uhhh huh uhhh, ohhhhhhh uhohhhhhhhhhhohhhh uhhhohohhhoh uhhhahhhhh - uhhhuhhhhhuhh uhoh uhhhhhhuhhh uhhuhhhhohh, um uhhh ehhhhh ohhhhh - ohhhohuhhhhohhh uhahhhhahhhohahhh uhuhhhahhhhuhohhhahahhhahohhh,hhhh


KeyboardInterrupt: Interrupted by user

In [38]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def generate_response(prompt, max_length=50, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=10.2):
    # Encode the prompt
    model_name = "microsoft/DialoGPT-small"  # You can choose other models as well

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate response with adjusted decoding parameters
    response_ids = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        # pad_token_id=tokenizer.eos_token_id,
        # eos_token_id=tokenizer.eos_token_id,
        num_return_sequences=1,
        # stop_token=tokenizer.eos_token_id
    )

    # Decode the generated response
    response = tokenizer.decode(response_ids[0], skip_special_tokens=True)

    # Post-process the response to remove the prompt and clean up the output
    response = response.replace(prompt, "").strip()

    return response

# Example usage
prompt = "Hello Sheldon, how are you today?"
response = generate_response(prompt)
print("Sheldon:", response)



tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sheldon: 


In [39]:
generate_response("Yes, what are you doing?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


''

In [40]:
generate_response("Come on you can tell me.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'what I said.'

In [41]:
generate_response("")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


IndexError: index -1 is out of bounds for dimension 1 with size 0