In [17]:
import pandas as pd
from datasets import Dataset


In [18]:
data = pd.read_csv('1_10_seasons_tbbt.csv')

In [19]:
data.sample(6)

Unnamed: 0,episode_name,dialogue,person_scene
27372,Series 06 Episode 06 – The Extract Obliteration,I don’t want you to make a big deal out of it.,Penny
12435,Series 03 Episode 12 – The Psychic Vortex,"Oh, yes. I just discovered I don’t have enoug...",Sheldon
37454,Series 08 Episode 02 – The Junior Professor So...,"Oh, it’s no trouble, it’s actually a pleasure.",Sheldon
10347,Series 03 Episode 03 – The Gothowitz Deviation,Why are you carrying extras?,Leonard
51379,Series 10 Episode 12 – The Holiday Summation,Do you need help?,Amy
23101,Series 05 Episode 10 – The Flaming Spittoon Ac...,"No, I’m having a nice time.",Amy


In [20]:
CHARACTER_NAME = 'Sheldon'

In [21]:
sheldon_df = data[data['person_scene'] == 'Sheldon']

In [22]:
# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(sheldon_df)

In [23]:
from transformers import BertTokenizer

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a function to tokenize the dialogues
def tokenize_function(examples):
    return tokenizer(examples['dialogue'], padding='max_length', truncation=True, max_length=128)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True);

Map: 100%|██████████████████████████████████████████████████████████████| 11484/11484 [00:05<00:00, 2044.06 examples/s]


In [24]:
from transformers import DataCollatorForLanguageModeling, BertForMaskedLM, Trainer, TrainingArguments

# Load the model
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
import torch

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:


# Define a data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets
)


In [26]:
# Train the model
trainer.train()



Step,Training Loss
500,2.3317
1000,2.1439
1500,2.1335
2000,1.9693
2500,1.9637
3000,1.911
3500,1.8351
4000,1.836


TrainOutput(global_step=4308, training_loss=2.000197131653682, metrics={'train_runtime': 45736.9555, 'train_samples_per_second': 0.753, 'train_steps_per_second': 0.094, 'total_flos': 2266983052646400.0, 'train_loss': 2.000197131653682, 'epoch': 3.0})

In [28]:
model.save_pretrained('./results')
tokenizer.save_pretrained('./results2')

('./results2\\tokenizer_config.json',
 './results2\\special_tokens_map.json',
 './results2\\vocab.txt',
 './results2\\added_tokens.json')

In [35]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def generate_response(prompt, max_length=50, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=10.2):
    # Encode the prompt
    # model_name = "microsoft/DialoGPT-small"  # You can choose other models as well

    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    # model = AutoModelForCausalLM.from_pretrained(model_name)

    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate response with adjusted decoding parameters
    response_ids = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        # pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        num_return_sequences=1,
        # stop_token=tokenizer.eos_token_id
    )

    # Decode the generated response
    response = tokenizer.decode(response_ids[0], skip_special_tokens=True)

    # Post-process the response to remove the prompt and clean up the output
    response = response.replace(prompt, "").strip()

    return response

prompt = "Hello Sheldon, how are you today?"
response = generate_response(prompt)
print("Sheldon:", response)

Sheldon: hello sheldon, how are you today?. ) hi! ( bye - night … knock and then i hear : hey there now right here is what happens to me ’ s boyfriends tonight again so goody a we anotherrs they say


In [36]:
generate_response("Yes, what are you doing?")

'yes, what are you doing? oh. hello! hi leonard - bye good and very well … done that has been all right so much for me because it ’ s late ) ( : hey penny here there is a getting just get have'

In [33]:
generate_response("Come on you can tell me.")

'come on you can tell me. i ’ ll be right back, wait here and watch the movie while we make it happen so they know that something happened to us …?! now what ‘ s happening is this thing again in between them'

In [34]:
generate_response("")

'. …, now go home again and you can ’ t longer be any more annoying or irritating than me tonight! okay? fine then - well all we do is talk together here right in the morning of being caught sneaking anything by leonard cooper'