In [None]:
import pandas as pd 
import tensorflow as tf 
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader


In [None]:
# Load the first 3000 rows of the preprocessed dataset.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed rows are still skipped.
df = pd.read_csv('./data/prepocessed.csv', delimiter=',',
                 engine='python', on_bad_lines='skip', nrows=3000)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()

# Keep only the columns used for summarisation and discard incomplete rows.
drop_cols = ['overview', 'sectionLabel', 'title']
df = df.drop(drop_cols, axis=1)
df = df.dropna()

# Reassign instead of mutating in place so the cell reads as a simple pipeline.
df = df.rename(columns={"headline": "summary", "text": "article"})
df.info()



In [None]:
# Train, Test, Val split (60, 20, 20)
SPLIT_SEED = 42  # fixed seed so a re-run reproduces exactly the same split

train_data = df.sample(frac=0.60, random_state=SPLIT_SEED)  # 60%
rest_part_40 = df.drop(train_data.index)
test_data = rest_part_40.sample(frac=0.50, random_state=SPLIT_SEED)  # 20%
validation_data = rest_part_40.drop(test_data.index)  # 20%
print("Shapes: ", train_data.shape, validation_data.shape, test_data.shape)


def _write_tsv(split, path):
    """Write the article/summary columns of `split` to `path` as a TSV file.

    The file has a `text<TAB>summary` header. `DataFrame.to_csv` quotes any
    field that contains a tab, a newline or a quote character, so such rows
    survive the round trip through `pd.read_csv(path, sep="\t")`; writing the
    raw concatenated strings would split them into broken rows.
    """
    (
        split[["article", "summary"]]
        .rename(columns={"article": "text"})
        .to_csv(path, sep="\t", index=False, encoding="utf-8")
    )


_write_tsv(train_data, "./data/train_gpt2.txt")
_write_tsv(validation_data, "./data/val_gpt2.txt")



In [None]:
class WikiHowDataset(Dataset):
    """Article/summary pairs read from a TSV file, tokenized for causal-LM training.

    The file must be tab-separated with a header row containing the columns
    `text` and `summary`.

    Args:
        file_path: Path of the tab-separated file to load.
        tokenizer: Object exposing `encode(text, text_pair, ...)` plus the
            `pad_token_id` / `eos_token_id` attributes (e.g. a Hugging Face
            tokenizer).
        max_length: Fixed length every example is truncated or padded to.

    Each item is a dict of three equally long Python lists:
        `input_ids`      - token ids, right-padded to `max_length`
        `attention_mask` - 1 for real tokens, 0 for padding
        `labels`         - copy of the token ids with padding positions set
                           to `IGNORE_INDEX` so they do not contribute to
                           the language-modelling loss
    """

    # Label value that the cross-entropy loss skips (PyTorch's default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, file_path, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = pd.read_csv(file_path, sep="\t")

    def __len__(self):
        return len(self.data)

    def _pad_id(self):
        """Return the id used for padding, falling back to EOS when no pad token is set.

        Raises:
            ValueError: if the tokenizer defines neither a pad nor an EOS token id.
        """
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            # GPT-2 style tokenizers have no pad token; EOS is the usual stand-in.
            pad_id = getattr(self.tokenizer, "eos_token_id", None)
        if pad_id is None:
            raise ValueError(
                "tokenizer defines neither pad_token_id nor eos_token_id; "
                "cannot pad sequences"
            )
        return pad_id

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = row['text']
        summary = row['summary']

        # Tokenize text and summary as one sequence, truncated to max_length
        token_ids = self.tokenizer.encode(text, summary, add_special_tokens=True, max_length=self.max_length, truncation=True)

        # Right-pad up to max_length; the pad id is only resolved when needed
        n_tokens = len(token_ids)
        n_padding = max(self.max_length - n_tokens, 0)
        padding = [self._pad_id()] * n_padding if n_padding else []

        input_ids = token_ids + padding
        # The mask must be built from the unpadded length, otherwise padding is attended to
        attention_mask = [1] * n_tokens + [0] * n_padding
        # Without labels the LM head returns no loss; padding is excluded from it
        labels = token_ids + [self.IGNORE_INDEX] * n_padding

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


In [None]:
# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token, so `tokenizer.pad_token_id` would be
# None and the dataset would pad its sequences with None. Reuse EOS as the pad
# token, matching the `pad_token_id` handed to the model below.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

# Fixed sequence length (in tokens) shared by the training and validation sets
MAX_LENGTH = 56

# Define the training and validation datasets
train_dataset = WikiHowDataset("./data/train_gpt2.txt", tokenizer, max_length=MAX_LENGTH)
val_dataset = WikiHowDataset("./data/val_gpt2.txt", tokenizer, max_length=MAX_LENGTH)

# Define the training arguments
# `save_steps` is kept a multiple of `eval_steps`, so every saved checkpoint
# coincides with an evaluation step.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=100,
    save_total_limit=2,
    save_steps=500,
    logging_steps=100,
    learning_rate=2e-5,
    overwrite_output_dir=True,
    evaluation_strategy='steps',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()
# Save the trained model together with its tokenizer, so the directory can be
# reloaded on its own with the same pad-token configuration.
model.save_pretrained('./models/gpt2')
tokenizer.save_pretrained('./models/gpt2')

In [None]:
# NOTE: disabled TensorFlow (tf.data) variant of the input pipeline, kept for
# reference only. Every line below is commented out, so this cell does nothing
# when run.
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token
# max_length = 512

# def create_training_example(article, summary):
#     if isinstance(article, tf.Tensor):
#         article = article.numpy().decode("utf-8")
#     if isinstance(summary, tf.Tensor):
#         summary = summary.numpy().decode("utf-8")
#     input_ids = tokenizer.encode(article, truncation=True, padding='longest', max_length=512, return_tensors='tf')[0]
#     target_ids = tokenizer.encode(summary, truncation=True, padding='longest', max_length=128, return_tensors='tf')[0]
#     return input_ids, target_ids

# # Create the training dataset
# train_dataset = tf.data.Dataset.from_tensor_slices(
#     (df["article"], df["summary"])
# ).map(create_training_example, num_parallel_calls=tf.data.AUTOTUNE).batch(8)




In [None]:
# NOTE: disabled TensorFlow training loop, kept for reference only. Every line
# below is commented out, so this cell does nothing when run.
# NOTE(review): it refers to `TFGPT2LMHeadModel` and `train_dataset_dict`,
# neither of which is imported or defined in the cells shown here; both would
# have to be provided before this code could be re-enabled.
# model = TFGPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

# # Define the training loop
# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# @tf.function
# def train_step(inputs, labels):
#     with tf.GradientTape() as tape:
#         logits = model(inputs, training=True).logits
#         loss = loss_fn(labels, logits)
#     grads = tape.gradient(loss, model.trainable_variables)
#     optimizer.apply_gradients(zip(grads, model.trainable_variables))
#     return loss

# # Fine-tuning the model on the training dataset
# num_epochs = 3
# for epoch in range(num_epochs):
#     print(f"Epoch {epoch+1}")
#     total_loss = 0.0
#     for batch in train_dataset_dict:
#         inputs = {k: batch[k] for k in batch if k != 'labels'}
#         labels = batch['labels']
#         loss = train_step(inputs, labels)
#         total_loss += loss
#     print(f"Loss: {total_loss}")
    
# # Save the fine-tuned model
# model.save_pretrained("./models/fine_tuned_gpt2_model")