In [None]:
import pandas as pd
# Read the cleaned Enron subject/body CSV from the Kaggle input mount and
# preview the first 20 rows.
DATA_PATH = "/kaggle/input/enron-email/enron_cleaned_subject_body_3.csv"
df = pd.read_csv(DATA_PATH)
df.head(20)

In [None]:
# Derive the two text columns used for training: the email body prefixed with
# the task instruction (model input) and the subject line (target).
TASK_PREFIX = "generate subject: "
df = df.assign(
    input_text=TASK_PREFIX + df["clean_body"],
    target_text=df["clean_subject"],
)

df.shape

In [None]:
# Preview the first 20 rows to check the newly added input_text / target_text columns.
df.head(20)

In [None]:
# Count missing values per column before any rows are dropped.
df.isna().sum()

In [None]:
# Drop every row that has a missing value in ANY column (dropna defaults).
# NOTE(review): if the CSV carries columns other than the body/subject pair,
# rows are also discarded for gaps in those columns - confirm this is intended.
df = df.dropna()

In [None]:
# Re-count missing values per column; every count should now be zero.
df.isna().sum()

In [None]:
# (rows, columns) remaining after the incomplete rows were dropped.
df.shape

In [None]:
# Preview the first 20 surviving rows. dropna() keeps the original index
# labels, so the index may show gaps where rows were removed.
df.head(20)

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: hold out 30 % of the rows, then cut that
# hold-out in half to obtain validation and test sets (70 / 15 / 15 overall).
SPLIT_SEED = 42
HOLDOUT_FRACTION = 0.3
TEST_SHARE_OF_HOLDOUT = 0.5

train_df, temp_df = train_test_split(
    df, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)
val_df, test_df = train_test_split(
    temp_df, test_size=TEST_SHARE_OF_HOLDOUT, random_state=SPLIT_SEED
)

print(train_df.shape, val_df.shape, test_df.shape)

In [None]:
from datasets import Dataset


def _as_hf_dataset(frame):
    """Wrap the two text columns of ``frame`` in a Dataset with a fresh 0..n-1 index."""
    text_columns = frame[['input_text', 'target_text']]
    return Dataset.from_pandas(text_columns.reset_index(drop=True))


# Convert each pandas split into a Hugging Face Dataset, in train/val/test order.
train_dataset, val_dataset, test_dataset = (
    _as_hf_dataset(split_df) for split_df in (train_df, val_df, test_df)
)


In [None]:
from transformers import T5Tokenizer
# Load the pretrained tokenizer published with the "t5-base" checkpoint; the
# tokenize() function and the metric code below read it as a global.
tokenizer = T5Tokenizer.from_pretrained("t5-base")



In [None]:
def tokenize(row):
    """Tokenize one example into encoder inputs and decoder labels.

    Uses the notebook-level ``tokenizer``.

    Args:
        row: Mapping with the string fields ``"input_text"`` (prefixed email
            body) and ``"target_text"`` (subject line).

    Returns:
        dict with three equal-purpose lists of ints:
        ``input_ids`` and ``attention_mask`` padded/truncated to 512 tokens,
        and ``labels`` padded/truncated to 64 tokens in which every padding
        position is replaced by -100.
    """
    input_enc = tokenizer(
        row["input_text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    target_enc = tokenizer(
        row["target_text"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )

    # -100 is the ignore index of the cross-entropy loss used by the model, so
    # padded target positions must carry it; otherwise the model is trained to
    # predict padding. The masked list (not the raw ids) has to be returned.
    pad_id = tokenizer.pad_token_id
    labels = [
        (-100 if token == pad_id else token)
        for token in target_enc["input_ids"]
    ]

    return {
        "input_ids": input_enc["input_ids"],
        "attention_mask": input_enc["attention_mask"],
        "labels": labels,
    }

In [None]:
# Run tokenize() over every split, one example per call (batched=False),
# in train / val / test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize, batched=False)
    for split in (train_dataset, val_dataset, test_dataset)
)


In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
# Load the pretrained "t5-base" weights (same checkpoint name as the tokenizer)
# as the starting point for fine-tuning.
model = T5ForConditionalGeneration.from_pretrained("t5-base") 


In [None]:
# NOTE(review): earlier cells have already imported transformers, so the
# upgraded package is only picked up after a kernel restart. The version is
# also unpinned, so results may differ between runs - consider moving this to
# the top of the notebook with an explicit version.
!pip install --upgrade transformers


In [None]:
from transformers import TrainingArguments
# Exploration aid: prints the full TrainingArguments documentation (very long
# output). NOTE(review): not needed for the pipeline; candidate for removal.
help(TrainingArguments)


In [None]:
# Fine-tuning configuration passed to the Trainer below.
training_args = TrainingArguments(
    # Checkpoints are written under this directory.
    output_dir="./results",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # because load_best_model_at_end is enabled below.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,  # Smaller batch size for t5-base on Kaggle GPU
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    # Limit how many checkpoints are kept on disk.
    save_total_limit=1,
    logging_dir='./logs',
    # Log training metrics every 5 optimizer steps.
    logging_steps=5,
    # After training, reload the checkpoint with the best (lowest) validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)


In [None]:
# Install the metric libraries used below: `evaluate` provides the metric
# loader and `rouge_score` is the backend the "rouge" metric needs.
# NOTE(review): versions are unpinned.
!pip install -q evaluate  
!pip install rouge_score

In [None]:
from evaluate import load
# Load the ROUGE metric once; compute_metrics() reads it as a global.
rouge = load("rouge")


def compute_metrics(eval_preds):
    """Compute ROUGE F-scores between predicted and reference subject lines.

    Uses the notebook-level ``tokenizer`` and ``rouge`` objects.

    Args:
        eval_preds: ``(predictions, labels)`` pair. ``predictions`` may be
            token ids of shape (batch, seq), raw logits of shape
            (batch, seq, vocab), or a tuple whose first element is one of
            those. ``labels`` are token ids with -100 at ignored positions.

    Returns:
        dict mapping each ROUGE key to a float score.
    """
    # Local import so the function works regardless of which cells ran before.
    import numpy as np

    preds, labels = eval_preds
    labels = np.asarray(labels)
    pad_id = tokenizer.pad_token_id

    # Models that return several outputs make the Trainer hand over a tuple;
    # the language-model output comes first.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)

    if preds.ndim == 3:
        # Raw logits: take the most likely token at each position. These are
        # aligned with the labels, so positions the labels ignore carry no
        # meaningful prediction and are blanked out before decoding.
        preds = preds.argmax(axis=-1)
        if preds.shape == labels.shape:
            preds = np.where(labels != -100, preds, pad_id)

    # -100 is not a valid token id; swap it for padding so decoding succeeds.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Depending on the metric implementation, each value is either a plain
    # float or an aggregate object exposing ``.mid.fmeasure``; accept both.
    return {
        key: (value.mid.fmeasure if hasattr(value, "mid") else float(value))
        for key, value in result.items()
    }


In [None]:
def _logits_to_token_ids(logits, labels):
    """Reduce raw model outputs to predicted token ids during evaluation.

    Without this hook the Trainer keeps the full (batch, seq, vocab) logits of
    every validation batch in memory until evaluation ends, and the metric
    function receives logits instead of decodable token ids.

    Args:
        logits: Model output tensor, or a tuple whose first element is the
            language-model logits.
        labels: Label tensor aligned with the logits (-100 = ignored position).

    Returns:
        Tensor of token ids; positions ignored by the labels are set to the
        tokenizer's padding id so they vanish when decoded.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    token_ids = logits.argmax(dim=-1)
    return token_ids.masked_fill(labels == -100, tokenizer.pad_token_id)


# Assemble the fine-tuning loop: train on the tokenized training split and
# evaluate on the validation split according to training_args.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

trainer.train()


In [None]:
# Persist the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded later with from_pretrained().
EXPORT_DIR = "/kaggle/working/t5_base_subject_model"
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)
