In [1]:
# Install necessary libraries (if not already installed)
!pip install pandas transformers datasets torch

# Import libraries
import pandas as pd
from datasets import Dataset


Defaulting to user installation because normal site-packages is not writeable


In [3]:
# Read the three SAMSum CSV splits (expected in the working directory).
train_data, test_data, validation_data = map(
    pd.read_csv,
    ("samsum-train.csv", "samsum-test.csv", "samsum-validation.csv"),
)

# Preview the first rows of every split, each under its own heading.
for heading, frame in (
    ("Train Data Sample:", train_data),
    ("\nTest Data Sample:", test_data),
    ("\nValidation Data Sample:", validation_data),
):
    print(heading)
    print(frame.head())


Train Data Sample:
         id                                           dialogue  \
0  13818513  Amanda: I baked  cookies. Do you want some?\r\...   
1  13728867  Olivia: Who are you voting for in this electio...   
2  13681000  Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...   
3  13730747  Edward: Rachel, I think I'm in ove with Bella....   
4  13728094  Sam: hey  overheard rick say something\r\nSam:...   

                                             summary  
0  Amanda baked cookies and will bring Jerry some...  
1  Olivia and Olivier are voting for liberals in ...  
2  Kim may try the pomodoro technique recommended...  
3  Edward thinks he is in love with Bella. Rachel...  
4  Sam is confused, because he overheard Rick com...  

Test Data Sample:
         id                                           dialogue  \
0  13862856  Hannah: Hey, do you have Betty's number?\nAman...   
1  13729565  Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric:...   
2  13680171  Lenny: Babe, can you hel

In [5]:
# Wrap each pandas split in a Hugging Face `Dataset`.
train_dataset, test_dataset, validation_dataset = [
    Dataset.from_pandas(frame)
    for frame in (train_data, test_data, validation_data)
]

# Slicing a `Dataset` yields a dict of columns; show the first five train records.
first_train_records = train_dataset[:5]
print("\nHugging Face Train Dataset Sample:")
print(first_train_records)



Hugging Face Train Dataset Sample:
{'id': ['13818513', '13728867', '13681000', '13730747', '13728094'], 'dialogue': ["Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great', "Tim: Hi, what's up?\r\nKim: Bad mood tbh, I was going to do lots of stuff but ended up procrastinating\r\nTim: What did you plan on doing?\r\nKim: Oh you know, uni stuff and unfucking my room\r\nKim: Maybe tomorrow I'll move my ass and do everything\r\nKim: We were going to defrost a fridge so instead of shopping I'll eat some defrosted veggies\r\nTim: For doing stuff I recommend Pomodoro technique where u use breaks for doing chores\r\nTim: It really helps\r\nKim: thanks, maybe I'll do that\r\nTim: I also like using post-its in kaban style", "Edward: Rachel, I think I'm in ove with Bella..\r\nrachel: Dont say anything else..\r\nEdward: What do you m

In [7]:
# Confirm that all three splits expose the same column names.
for caption, frame in (
    ("Train Columns:", train_data),
    ("Test Columns:", test_data),
    ("Validation Columns:", validation_data),
):
    print(caption, frame.columns)


Train Columns: Index(['id', 'dialogue', 'summary'], dtype='object')
Test Columns: Index(['id', 'dialogue', 'summary'], dtype='object')
Validation Columns: Index(['id', 'dialogue', 'summary'], dtype='object')


In [9]:
def filter_invalid_examples(example):
    """Tell whether an example carries a usable dialogue.

    Args:
        example: Mapping with a ``"dialogue"`` entry.

    Returns:
        ``True`` when ``example["dialogue"]`` is a string that still has
        characters after stripping whitespace, ``False`` otherwise.
    """
    dialogue = example["dialogue"]
    # Non-string values (e.g. missing cells) are rejected before any string method is used.
    if not isinstance(dialogue, str):
        return False
    return dialogue.strip() != ""

# Drop unusable examples from every split (train, then test, then validation).
train_dataset, test_dataset, validation_dataset = [
    split.filter(filter_invalid_examples)
    for split in (train_dataset, test_dataset, validation_dataset)
]


Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/819 [00:00<?, ? examples/s]

Filter:   0%|          | 0/818 [00:00<?, ? examples/s]

In [11]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the `bert-base-uncased` checkpoint.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

# Define a tokenization function
def tokenize_function(examples):
    """Tokenize the ``dialogue`` field of a batch with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping whose ``"dialogue"`` entry is a list of strings
            (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to exactly 128 tokens.

    No ``return_tensors`` is requested: ``Dataset.map`` stores the returned
    columns in the dataset and they read back as plain lists, so building
    PyTorch tensors here only adds conversion work. Select a tensor format on
    the dataset itself (e.g. ``with_format("torch")``) when tensors are needed.
    """
    return tokenizer(
        examples["dialogue"],  # Input text
        padding="max_length",  # Pad every sequence up to max_length
        truncation=True,       # Cut sequences longer than max_length
        max_length=128,        # Sequence length chosen for this notebook
    )

# Run the batched tokenizer over every split (train, then test, then validation).
tokenized_train, tokenized_test, tokenized_validation = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset, validation_dataset)
]

# Show the first tokenized training record as a sanity check.
first_tokenized_entry = tokenized_train[0]
print("\nSample Tokenized Train Entry:")
print(first_tokenized_entry)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]


Sample Tokenized Train Entry:
{'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.', 'input_ids': [101, 8282, 1024, 1045, 17776, 16324, 1012, 2079, 2017, 2215, 2070, 1029, 6128, 1024, 2469, 999, 8282, 1024, 1045, 1005, 2222, 3288, 2017, 4826, 1024, 1011, 1007, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [13]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Read the three SAMSum CSV splits again so this cell starts from the raw frames.
train_data, test_data, validation_data = map(
    pd.read_csv,
    ("samsum-train.csv", "samsum-test.csv", "samsum-validation.csv"),
)

# Format data for GPT-2 fine-tuning
def format_data(df):
    """Render every usable row of ``df`` as one GPT-2 training string.

    Args:
        df: DataFrame with ``dialogue`` and ``summary`` columns.

    Returns:
        A pandas Series of strings of the form ``Dialogue: <dialogue>``,
        a newline, ``Summary: <summary>`` and a trailing newline. The Series
        is indexed by the labels of the rows that were kept.

    Rows whose dialogue or summary is not a string, or is blank after
    stripping, are skipped. Formatting them would otherwise put literal
    ``nan`` text into the training data. An empty frame yields an empty Series.
    """
    def _is_nonblank_text(value):
        # Missing CSV cells arrive as float NaN, hence the type check first.
        return isinstance(value, str) and value.strip() != ""

    kept_labels = []
    texts = []
    for label, dialogue, summary in zip(df.index, df["dialogue"], df["summary"]):
        if _is_nonblank_text(dialogue) and _is_nonblank_text(summary):
            kept_labels.append(label)
            texts.append(f"Dialogue: {dialogue}\nSummary: {summary}\n")
    return pd.Series(texts, index=kept_labels, dtype=object)

# Turn every split into its formatted training strings (train, test, validation order).
train_texts, test_texts, validation_texts = [
    format_data(frame)
    for frame in (train_data, test_data, validation_data)
]

# Local import keeps this cell runnable on its own; `datasets` is already a dependency.
from datasets import Dataset, DatasetDict

split_texts = {
    'train': train_texts,
    'test': test_texts,
    'validation': validation_texts,
}

# Keep the plain-text dumps for manual inspection.
# - encoding='utf-8': the platform default codec may be unable to encode some characters.
# - newline='': write the strings verbatim; text-mode newline translation would
#   otherwise rewrite the '\r\n' sequences already present in the dialogues.
for split_name, texts in split_texts.items():
    with open(f'{split_name}.txt', 'w', encoding='utf-8', newline='') as f:
        f.writelines(texts)

# Build the datasets in memory, one example per formatted dialogue/summary pair.
# Reading the dumps back with the line-based `text` loader would create one
# example per line, splitting every multi-line dialogue from its summary.
dataset = DatasetDict({
    split_name: Dataset.from_dict({'text': list(texts)})
    for split_name, texts in split_texts.items()
})

# Tokenizer first: load it and reuse the end-of-sequence token as the padding
# token, so that padding="max_length" can be used during tokenization.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Then the pretrained GPT-2 language-model head that will be fine-tuned.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenize dataset and add labels
def tokenize_function(examples):
    """Tokenize a batch of ``text`` entries and attach language-modelling labels.

    Args:
        examples: Batch mapping whose ``"text"`` entry is a list of strings.

    Returns:
        The output of the module-level ``tokenizer`` (sequences padded or
        truncated to 512 tokens) with an added ``"labels"`` entry.

    Labels equal ``input_ids`` on real tokens and ``-100`` on padding, the
    value the loss ignores. The padding positions are taken from
    ``attention_mask`` rather than by comparing token ids, because the pad
    token is the same id as the end-of-sequence token and a genuine
    end-of-sequence token must stay in the loss.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize every split of `dataset` in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations
    output_dir="./gpt2-fine-tuned",
    logging_dir="./logs",
    # Optimisation
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluation, checkpointing and logging cadence
    eval_strategy="epoch",
    save_steps=500,
    save_total_limit=2,
    logging_steps=10,
    # External integrations: no experiment tracker, no upload to the Hub
    report_to="none",
    push_to_hub=False,
)

# Collect the Trainer inputs first, then build it: train on the tokenized
# "train" split and evaluate on the tokenized "validation" split.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

# Directory that receives both the final model weights and the tokenizer files.
final_model_dir = "./gpt2-fine-tuned"

# 1) Fine-tune.
trainer.train()

# 2) Persist the model and tokenizer side by side so they can be reloaded together.
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# 3) Score the held-out test split and report the metrics.
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("Evaluation Results:", results)

# Use the fine-tuned model for inference
from transformers import pipeline

generator = pipeline("text-generation", model="./gpt2-fine-tuned", tokenizer="./gpt2-fine-tuned")

# The prompt mirrors the training template ("Dialogue: ...\nSummary: ..."):
# ending on "Summary:" asks the model to write the summary. Without that cue
# it would simply continue the dialogue.
input_text = "Dialogue: Hey, do you know where my keys are?\nSummary:"

# max_new_tokens limits only the generated continuation, so a longer dialogue
# in the prompt does not use up the generation budget (max_length counts the prompt too).
output = generator(input_text, max_new_tokens=64, num_return_sequences=1)
print("Generated Text:")
print(output[0]["generated_text"])


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.