<a href="https://colab.research.google.com/github/seungjun-green/articleGeneratorModel/blob/main/version02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **1. Install libraries and packages**

In [None]:
!pip install tensorflow-datasets
!pip install datasets
!pip install transformers
!pip install rouge_score

### **2. Load the dataset and convert it to DatasetDict format**

In [None]:
import csv

# Filter 'wiki_total.csv' into 'wiki_total_01.csv', keeping only rows whose
# third column is shorter than 5000 characters and contains neither "==" nor "|".
# NOTE(review): 'wiki_total.csv' is written by combine_csv_files, which is
# defined and called in later cells - run those first on a fresh kernel.
# NOTE(review): the train/val/test split cell reads 'wiki_total.csv', not the
# filtered file produced here - confirm which file is meant to be used.
with open('wiki_total.csv', 'r', newline='', encoding='utf-8') as given_file:
    reader = csv.reader(given_file)

    # The input starts with its own header row (combine_csv_files writes one);
    # consume it so it is not copied into the output as a data row.
    next(reader, None)

    # Open a new CSV file for writing
    with open('wiki_total_01.csv', 'w', newline='', encoding='utf-8') as new_file:
        writer = csv.writer(new_file)

        # Write the header row to the new file
        writer.writerow(['id', 'prompt', 'article'])

        for row in reader:
            # Skip blank or truncated rows instead of failing on row[2]
            if len(row) < 3:
                continue

            article = row[2]
            if len(article) < 5000 and "==" not in article and "|" not in article:
                writer.writerow(row)

In [None]:
import json
import csv
def convert_json_to_csv(file_name, max_words=1024):
    """Convert '<file_name>.json' into '<file_name>.csv'.

    The JSON file must hold a list of objects with the keys "id", "title"
    and "text". Each object becomes one CSV row with the columns
    'id', 'prompt' (taken from "title") and 'text'. Objects whose text has
    more than ``max_words`` whitespace-separated words are left out.

    Both files are handled as UTF-8 so the result does not depend on the
    platform's default encoding.

    Args:
        file_name: Path of the input file without the '.json' extension;
            the same stem is used for the output '.csv' file.
        max_words: Largest allowed word count of "text" (inclusive).
    """
    csv_file_path = f'{file_name}.csv'

    with open(f'{file_name}.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # newline='' lets the csv module control line endings itself
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['id', 'prompt', 'text'])
        writer.writeheader()

        for curr in data:
            row = {'id': curr["id"], 'prompt': curr["title"], 'text': curr["text"]}
            if len(row['text'].split()) <= max_words:
                writer.writerow(row)

In [None]:
# Convert 'wiki_03.json' into 'wiki_03.csv'.
# NOTE(review): the combine step further down also reads 'wiki_01.csv' and
# 'wiki_02.csv'; presumably this cell was re-run with "wiki_01" / "wiki_02" -
# confirm, since a fresh run only produces 'wiki_03.csv'.
convert_json_to_csv("wiki_03")

In [None]:
import csv

def combine_csv_files(files):
    """Concatenate the rows of several CSV files into 'wiki_total.csv'.

    Every input file is read with ``csv.DictReader`` (so its first line is
    taken as the header) and all rows are written, in order, to
    'wiki_total.csv' in the current working directory.

    The output header is the key set of the first data row. When the inputs
    contain no data rows at all, the first header found is written on its
    own; when there is no header either (e.g. ``files`` is empty), an empty
    file is produced instead of raising ``IndexError``.

    Args:
        files: Iterable of paths of the CSV files to combine.
    """
    # Raise the per-field size limit (process-wide setting of the csv module)
    csv.field_size_limit(1048576)
    combined_csv_file_path = 'wiki_total.csv'
    combined_data = []
    first_header = None

    for file_path in files:
        # newline='' keeps line breaks embedded in quoted fields untouched
        with open(file_path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            if first_header is None:
                # None for a completely empty file; the next file is tried then
                first_header = reader.fieldnames
            combined_data.extend(reader)

    if combined_data:
        fieldnames = list(combined_data[0].keys())
    else:
        fieldnames = list(first_header or [])

    # Write the combined data to a new CSV file
    with open(combined_csv_file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if fieldnames:
            writer.writeheader()
        writer.writerows(combined_data)


In [None]:
# Merge the three converted dumps into 'wiki_total.csv' (the output path is
# fixed inside combine_csv_files).
combine_csv_files(['wiki_01.csv', 'wiki_02.csv', 'wiki_03.csv'])

In [None]:
# Start from here if you imported cleaned dataset

import pandas as pd
import numpy as np

# Load the combined CSV into a pandas DataFrame
df = pd.read_csv('wiki_total.csv')

# Shuffle the rows. The seed is fixed so that the train/val/test membership is
# identical after every kernel restart.
SPLIT_SEED = 42
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# 80% train, 10% validation; int() truncates, the test split takes the remainder
num_rows = df.shape[0]
train_rows = int(num_rows * 0.8)
val_rows = int(num_rows * 0.1)

# Split the DataFrame into training, validation, and test sets
train_df = df.iloc[:train_rows, :]
val_df = df.iloc[train_rows:train_rows + val_rows, :]
test_df = df.iloc[train_rows + val_rows:, :]

# Sanity check: the three splits partition the data, no row lost or duplicated
assert len(train_df) + len(val_df) + len(test_df) == num_rows

# Save the split data to separate csv files with the header
train_df.to_csv('train.csv', index=False, header=True)
val_df.to_csv('val.csv', index=False, header=True)
test_df.to_csv('test.csv', index=False, header=True)

In [None]:
import datasets
from datasets import load_dataset
import pandas as pd
 
# Read the three split files through the 'csv' loader; each key of
# `data_files` names the split its file is loaded into.
raw_datasets = load_dataset(
    'csv',
    data_files={
        'train': 'train.csv',
        'test': 'test.csv',
        'validation': 'val.csv',
    },
)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-29de889c6ebf1364/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-29de889c6ebf1364/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Show the splits with their column names and row counts
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'text'],
        num_rows: 28548
    })
    test: Dataset({
        features: ['id', 'prompt', 'text'],
        num_rows: 3570
    })
    validation: Dataset({
        features: ['id', 'prompt', 'text'],
        num_rows: 3568
    })
})


### **3. Pre-process the raw_datasets**

In [None]:
# Identifier of the pretrained checkpoint; used below to pick the prompt prefix
# and to load the tokenizer and the model.
model_checkpoint = "t5-small"

In [None]:
# Pick the text that is put in front of every prompt: the listed T5
# checkpoints get an instruction, any other checkpoint gets nothing.
# The instruction ends with a space because preprocess_function builds its
# inputs as prefix + title + ":" with no separator of its own; without it the
# last word of the prefix and the title run together.
# NOTE(review): prompts used at inference time must be built the same way as
# the ones the model was trained on.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "Write an article about "
else:
    prefix = ""

In [None]:
from transformers import AutoTokenizer

# Token limits: prompts are cut to max_input_length, articles to max_target_length
max_input_length = 20
max_target_length = 1024

# The tokenizer is loaded here because this section is the first to use it;
# the cell that also loads it (together with the model) only comes later, so
# on a fresh kernel the .map() call below would otherwise hit a NameError.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: Batch (dict of lists) with the columns "prompt" and "text".

    Returns:
        The tokenizer output for the prompts (each built as
        prefix + prompt + ":" and truncated to ``max_input_length``) with an
        added "labels" entry holding the token ids of "text", truncated to
        ``max_target_length``.
    """
    inputs = [prefix + doc + ":" for doc in examples["prompt"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager
    labels = tokenizer(text_target=examples["text"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply preprocess_function to every split; batched=True passes it batches of
# examples instead of one example at a time.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/28548 [00:00<?, ? examples/s]



Map:   0%|          | 0/3570 [00:00<?, ? examples/s]

Map:   0%|          | 0/3568 [00:00<?, ? examples/s]

### **4. Load the model and tokenizer and set arguments**

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained tokenizer and seq2seq model for `model_checkpoint`
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the training arguments below set
# push_to_hub=True.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Batch size per device, shared by training and evaluation
batch_size = 32

# Last path component of the checkpoint id
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Training configuration; checkpoints are written to "textGeneration_02"
args = Seq2SeqTrainingArguments(
    output_dir="textGeneration_02",
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # evaluation: once per epoch, on generated text of up to 1024 tokens
    evaluation_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=1024,
    # checkpoints and publishing
    save_total_limit=5,
    push_to_hub=True,
)

In [None]:
# Collator that assembles tokenized examples into batches for the trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from datasets import load_metric
# ROUGE metric used by compute_metrics.
# NOTE(review): the captured output of this cell shows a warning for this call;
# `load_metric` is deprecated in `datasets` - confirm the installed version
# still provides it before re-running.
metric = load_metric("rouge")

  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first item holds the
            generated ids.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure in percent, plus
        "gen_len" (mean number of non-padding tokens per prediction); all
        values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and cannot be decoded; replace it by the pad
    # id in the labels and in the predictions (generated sequences can be
    # padded with -100 as well when batches are gathered).
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep the mid F-measure of every score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding excluded)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Bundle model, configuration, data and metric function into the trainer;
# training uses the "train" split, evaluation the "validation" split.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/Seungjun/textGeneration_02 into local empty directory.


In [None]:
import nltk

# nltk.sent_tokenize (called in compute_metrics) needs the Punkt sentence
# tokenizer data. Newer NLTK releases look it up as 'punkt_tab', older ones as
# 'punkt', so both are requested.
# NOTE(review): a package name unknown to the installed NLTK is expected to be
# reported and return False rather than raise - confirm on the target version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Start fine-tuning with the configuration set up above
trainer.train()