<a href="https://colab.research.google.com/github/seungjun-green/articleGeneratorModel/blob/main/fine-tune_t5_for_articleGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **1. Install libraries and packages**

In [None]:
!pip install tensorflow-datasets
!pip install datasets
!pip install transformers
!pip install rouge_score

### **2. load the dataset and convert it to DatasetDict format**

In [None]:
import json

# Parse wiki_total.json into `data`. The encoding is pinned to UTF-8 so the file is
# decoded the same way regardless of the platform's locale default.
with open('wiki_total.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


In [None]:
import csv
csv_file_path = 'wiki_total.csv'

# Write the CSV file: one row per JSON record, with the record's "title" stored
# under the "prompt" column.
# encoding='utf-8' is explicit because pandas.read_csv (used later to load this
# file) decodes as UTF-8 by default; newline='' lets the csv module control line
# endings itself.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['id', 'prompt', 'text'])
    writer.writeheader()

    for curr in data:
        row = {'id': curr["id"], 'prompt': curr["title"], 'text': curr["text"]}
        writer.writerow(row)

In [None]:
# Script used to combine two CSV files; if you already have wiki_total.csv you can skip this cell.

import csv

# Raise the csv module's per-field size limit to 1048576 (2**20) characters.
# NOTE(review): presumably needed because the "text" fields are long — confirm.
csv.field_size_limit(1048576)

# Define the file paths for the two CSV files
csv_file_path1 = 'wiki_01.csv'
csv_file_path2 = 'wiki_02.csv'

# Define the file path for the combined CSV file
combined_csv_file_path = 'wiki_total.csv'


def _read_csv_rows(path):
    """Return every data row of the CSV file at `path` as a list of dicts keyed by the header."""
    # newline='' is what the csv module asks for on reads as well, so that line
    # breaks embedded in quoted fields are handled by the parser.
    with open(path, 'r', newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))


# Read in the data from both CSV files
data1 = _read_csv_rows(csv_file_path1)
data2 = _read_csv_rows(csv_file_path2)

# Combine the data from the two CSV files
combined_data = data1 + data2

# The header of the output is taken from the first row, so there must be one.
if not combined_data:
    raise ValueError(
        f"No rows found in {csv_file_path1!r} or {csv_file_path2!r}; nothing to combine."
    )

# Write the combined data to a new CSV file
with open(combined_csv_file_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=list(combined_data[0].keys()))
    writer.writeheader()
    writer.writerows(combined_data)


In [None]:
# Start from here if you imported cleaned dataset

import pandas as pd
import numpy as np

# Load the csv file into a pandas DataFrame
df = pd.read_csv('wiki_total.csv')

# Shuffle the rows. A fixed seed keeps the membership of the train/val/test
# splits identical across runs, so results stay comparable.
SEED = 42
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Calculate the number of rows for each split (80% train, 10% validation;
# the test split receives whatever remains after integer truncation).
num_rows = df.shape[0]
train_rows = int(num_rows * 0.8)
val_rows = int(num_rows * 0.1)

# Split the DataFrame into training, validation, and test sets
train_df = df.iloc[:train_rows, :]
val_df = df.iloc[train_rows:train_rows+val_rows, :]
test_df = df.iloc[train_rows+val_rows:, :]

# The three slices are contiguous, so together they must cover every row exactly once.
assert len(train_df) + len(val_df) + len(test_df) == num_rows

# Save the split data to separate csv files with the header
train_df.to_csv('train.csv', index=False, header=True)
val_df.to_csv('val.csv', index=False, header=True)
test_df.to_csv('test.csv', index=False, header=True)

In [None]:
import datasets
from datasets import load_dataset
import pandas as pd
 
# Map each split name to the CSV file written for it, then load all three
# splits with a single call to the 'csv' loader.
split_files = {
    'train': 'train.csv',
    'test': 'test.csv',
    'validation': 'val.csv',
}
raw_datasets = load_dataset('csv', data_files=split_files)

In [None]:
# Print the object returned by load_dataset to check the loaded splits before preprocessing.
print(raw_datasets)

### **3. Pre-process the raw_datasets**

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the model are both
# loaded from it with from_pretrained further down in the notebook.
model_checkpoint = "t5-small"

In [None]:
# T5 checkpoints get a task prefix in front of every prompt; any other
# checkpoint gets none.
# The trailing space matters: preprocess_function builds `prefix + doc + ":"`
# by plain concatenation, so without it the prefix fuses with the title
# (e.g. "Write aboutParis:"). Prompts used at inference time must use the same prefix.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "Write about "
else:
    prefix = ""

In [None]:
# Token caps passed to the tokenizer as max_length (with truncation=True) in
# preprocess_function.
# Cap for the prefixed prompt:
max_input_length = 20
# Cap for the "text" column that becomes the labels:
max_target_length = 1024

def preprocess_function(examples):
    """Tokenize a batch of prompt/text pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "prompt" column and a "text" column
            (as produced by the CSV files written earlier in the notebook).

    Returns:
        The tokenizer output for the prefixed prompts, with an added "labels"
        entry holding the input ids of the tokenized "text" column.
    """
    # Each prompt becomes "<prefix><prompt>:" before tokenization.
    inputs = [prefix + doc + ":" for doc in examples["prompt"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Tokenize the targets through the `text_target` argument; this replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=examples["text"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# preprocess_function reads the global `tokenizer`, which the notebook otherwise
# creates only in section 4. Load it here so that a fresh kernel running the
# cells top to bottom does not fail with NameError at this point.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Apply the preprocessing to every split, one batch of examples at a time.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

### **4. Load the model and tokenizer and set arguments**

In [None]:
# Same checkpoint name as assigned in section 3; it is re-assigned here, so
# keep the two values in sync when switching checkpoints.
model_checkpoint = "t5-small"

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; the training arguments
# in this notebook set push_to_hub=True.
notebook_login()

In [None]:
import torch

# Used for both the per-device train and eval batch sizes.
batch_size = 16

# Last path component of the checkpoint id (not referenced by the arguments below).
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    "textGeneration_01",              # output directory
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=5,
    num_train_epochs=5,
    # Generate sequences during evaluation so compute_metrics receives token ids
    # it can decode; the cap equals max_target_length used in preprocessing.
    predict_with_generate=True,
    generation_max_length=1024,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes the
    # arguments fail to construct on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
)

In [None]:
# Collator handed to the trainer as `data_collator`; built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from datasets import load_metric
# ROUGE metric object; compute_metrics calls metric.compute(...) on the decoded text.
# NOTE(review): load_metric is believed to be deprecated in later `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
metric = load_metric("rouge")

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        Dict mapping each key returned by the metric to its mid F-measure
        scaled to 0-100, plus "gen_len" (mean count of non-pad tokens per
        prediction); every value is rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Defensive: unwrap the ids if the predictions arrive wrapped in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is not a valid token id and cannot be decoded. It is replaced in the
    # predictions as well as the labels, since padded generation output can
    # carry it too.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (pad tokens excluded)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Assemble the trainer from the model, the training arguments, the tokenized
# train/validation splits, the batch collator and the metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import nltk
# Sentence-tokenizer data needed by nltk.sent_tokenize in compute_metrics.
nltk.download('punkt')
# Newer NLTK releases look up the 'punkt_tab' resource for sentence splitting
# instead of 'punkt'; requesting both keeps this cell working across versions.
nltk.download('punkt_tab')

In [None]:
# Start fine-tuning with the trainer built from the arguments and datasets in this notebook.
trainer.train()