<a href="https://colab.research.google.com/github/sl-Zhou/NLP-project/blob/armin/KD_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Colab-only setup: the `google.colab` import and the `!` shell escapes below
# only work inside a Colab/IPython runtime, not as a plain Python script.
from google.colab import drive
import os

# Mount Google Drive at /content/drive; later steps read and write under
# /content/drive/MyDrive/project/.
drive.mount('/content/drive')

# Show the GPU(s) visible to this runtime.
!nvidia-smi
# Install the libraries used below (`datasets` is pinned, the rest are not).
!pip install datasets==2.11.0 --quiet
!pip install accelerate  --quiet
!pip install evaluate   --quiet
!pip install rouge_score --quiet
# Data Preparation
import pandas as pd
import numpy as np
import datasets

# Source CSV holding the examples used for fine-tuning.
data = pd.read_csv('./data/Jfleg4-2-4.csv')
# Preview of the first rows (only rendered when this is the last expression of
# a notebook cell).
data.head()

# JSON Lines file that is loaded again further down with
# `datasets.load_dataset("json", ...)`.
filename = './data/output.jsonl'

# Write one JSON object per row, one row per line, in a single call.
# This replaces a Python-level `iterrows()` loop: it is faster and serialises
# every column with its own dtype instead of coercing each row to a common one.
data.to_json(filename, orient='records', lines=True)
from transformers import AutoTokenizer
import datasets

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

# Fixed-length padding needs a pad token; the EOS token doubles as one.
# This is configured once, next to the tokenizer, rather than inside
# `tokenize_function`: the setting is loop-invariant, and it must also be in
# place for later users of `tokenizer` even if the mapped function never runs.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of examples to fixed-length sequences.

    Args:
        examples: Mapping of column name to list of values, as passed by
            `Dataset.map(..., batched=True)`. If both "input" and "target"
            are present, each pair is concatenated directly (no separator is
            inserted); otherwise the "text" column is used.

    Returns:
        The tokenizer output for the batch. Every sequence is truncated and
        padded to exactly 128 tokens.
    """
    if "input" in examples and "target" in examples:
        text = [inp + tgt for inp, tgt in zip(examples["input"], examples["target"])]
    else:
        text = examples["text"]

    return tokenizer(
        text,
        padding='max_length',  # Always pad up to `max_length`, not to the longest item in the batch
        truncation=True,
        max_length=128,
        return_tensors="pt",  # Return PyTorch tensors
    )

# Load dataset
finetuning_dataset_loaded = datasets.load_dataset("json", data_files=filename, split="train")

# Apply tokenize function. Every example is padded to the same fixed length
# inside `tokenize_function`, so the result does not depend on how rows are
# grouped; the default batch size is used instead of one row per call.
tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
)

# Label value that the loss function skips.
IGNORE_INDEX = -100


def _mask_padding_in_labels(example):
    """Build causal-LM labels for one example with padding excluded.

    Labels are a copy of `input_ids`, except that padded positions
    (attention mask 0) are replaced by `IGNORE_INDEX`. The single pad token
    directly following the last real token is kept: the pad token is the EOS
    token, so it serves as the end-of-sequence target.
    """
    ids = example["input_ids"]
    mask = example["attention_mask"]
    labels = []
    for position, (token_id, is_real) in enumerate(zip(ids, mask)):
        follows_real_token = position > 0 and mask[position - 1] == 1
        if is_real == 1 or follows_real_token:
            labels.append(token_id)
        else:
            labels.append(IGNORE_INDEX)
    return {"labels": labels}


# Add labels. Copying `input_ids` verbatim would compute the loss over the
# padding too, which makes up most of a short sentence padded to 128 tokens.
tokenized_dataset = tokenized_dataset.map(_mask_padding_in_labels)

# Sanity check on the first example (row access avoids materialising whole columns).
first_example = tokenized_dataset[0]

print(first_example["input"])

print(first_example["target"])

print(first_example["input_ids"])

# Train test split
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print(split_dataset)
# Before Training
import datasets
import logging
import random
import logging
import torch
import transformers
import pandas as pd

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
# Checkpoint that serves as the starting point for fine-tuning.
model_name = "EleutherAI/pythia-70m"
base_model = AutoModelForCausalLM.from_pretrained(model_name)

# Run on the GPU when at least one CUDA device is visible, else on the CPU.
device_count = torch.cuda.device_count()
device = torch.device("cuda" if device_count > 0 else "cpu")

base_model.to(device)
print(device)
# Quick qualitative check of the base model on one held-out example.
test_text = test_dataset[1]['input']
max_input_tokens = 1000
max_output_tokens = 100

# Tokenize. Calling the tokenizer (rather than `.encode`) also yields the
# attention mask, which is passed to `generate` explicitly below.
encoded_prompt = tokenizer(
    test_text,
    return_tensors="pt",
    truncation=True,
    max_length=max_input_tokens,
)
input_ids = encoded_prompt["input_ids"]

# Generate. `max_new_tokens` limits only the continuation; `max_length` would
# count the prompt as well, so a prompt near that limit would leave little or
# no room for an answer.
device = base_model.device
generated_tokens_with_prompt = base_model.generate(
    input_ids=input_ids.to(device),
    attention_mask=encoded_prompt["attention_mask"].to(device),
    max_new_tokens=max_output_tokens,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode
generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

# Strip the prompt at token level: slicing the decoded string by
# `len(test_text)` is only correct when decoding reproduces the prompt
# character for character (and the prompt was not truncated).
prompt_length = input_ids.shape[1]
generated_text_answer = tokenizer.decode(
    generated_tokens_with_prompt[0][prompt_length:],
    skip_special_tokens=True,
)


print("input:", test_text)
print(f"Correct answer from dataset: {test_dataset[1]['target']}")
print("Model's answer: ")
print(generated_text_answer)

# finetuning
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

# Step cap for training; -1 means "no cap", so `num_train_epochs` decides how
# long training runs.
max_steps = -1

# Checkpoints are written to `output_dir`, the final model to `save_dir`.
trained_model_name = f"pythia_ft_{max_steps}_steps"
output_dir = f'/content/drive/MyDrive/project/{trained_model_name}'
save_dir = output_dir + '/final'

training_config = dict(
    # --- optimisation schedule ---
    learning_rate=1.0e-5,
    num_train_epochs=3,
    max_steps=max_steps,  # overrides num_train_epochs unless it is -1
    per_device_train_batch_size=64,

    # --- where checkpoints go ---
    output_dir=output_dir,
    overwrite_output_dir=False,  # keep whatever is already in output_dir

    # --- progress, evaluation, saving and logging cadence ---
    disable_tqdm=False,  # progress bars stay enabled
    eval_steps=120,  # update steps between two evaluations
    save_steps=120,  # update steps between two checkpoints
    warmup_steps=1,  # warmup steps for the learning-rate scheduler
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    logging_steps=1,

    # --- optimizer and memory ---
    optim="adafactor",
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,

    # --- best-checkpoint selection (lowest eval loss wins) ---
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
training_args = TrainingArguments(**training_config)

# Collates tokenized examples into batches using the tokenizer's padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)

trainer = Trainer(
    model=base_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning, then store the resulting model under `save_dir`.
training_output = trainer.train()

trainer.save_model(save_dir)
print("Saved model to:", save_dir)
# Run on the GPU when at least one CUDA device is visible, else on the CPU.
device_count = torch.cuda.device_count()
device = torch.device("cuda" if device_count > 0 else "cpu")

# Directory that `trainer.save_model(...)` wrote the fine-tuned model to.
save_dir = '/content/drive/MyDrive/project/pythia_ft_-1_steps/final'

# Reload the base checkpoint from scratch: the `base_model` object was handed
# to the Trainer above, so a fresh copy is needed for the comparison against
# the fine-tuned model.
model_name = "EleutherAI/pythia-70m"
base_model = AutoModelForCausalLM.from_pretrained(model_name)
base_model.to(device)

# Load the fine-tuned model exactly once, from local files only.
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_slightly_model.to(device)
from tqdm import tqdm

max_input_tokens = 1000
max_output_tokens = 400
text_list = []
tuned_predicted_text_list = []
actual_test_list = []
base_predicted_text_list = []

# Evaluate on at most 100 examples, without indexing past the end of a
# smaller test split.
num_eval_examples = min(100, len(test_dataset))

for i in tqdm(range(num_eval_examples), desc="Processing test dataset"):
    example = test_dataset[i]
    test_text = example['input']

    # Tokenize. Calling the tokenizer (rather than `.encode`) also yields the
    # attention mask, which is passed to `generate` explicitly below.
    encoded_prompt = tokenizer(
        test_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    input_ids = encoded_prompt["input_ids"]

    # Generate with both models from the same inputs. `max_new_tokens` limits
    # only the continuation, whereas `max_length` would count the prompt too.
    device = base_model.device
    generation_inputs = {
        "input_ids": input_ids.to(device),
        "attention_mask": encoded_prompt["attention_mask"].to(device),
        "max_new_tokens": max_output_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    generated_tokens_with_prompt = base_model.generate(**generation_inputs)
    generated_tokens_with_prompt_ft = finetuned_slightly_model.generate(**generation_inputs)

    # Strip the prompt at token level, then decode only the continuation.
    # Slicing the decoded string by `len(test_text)` is only correct when
    # decoding reproduces the prompt character for character.
    prompt_length = input_ids.shape[1]
    generated_text_answer = tokenizer.decode(
        generated_tokens_with_prompt[0][prompt_length:],
        skip_special_tokens=True,
    )
    generated_text_answer_ft = tokenizer.decode(
        generated_tokens_with_prompt_ft[0][prompt_length:],
        skip_special_tokens=True,
    )

    # Append results to lists
    actual_test_list.append(example['target'])
    tuned_predicted_text_list.append(generated_text_answer_ft)
    base_predicted_text_list.append(generated_text_answer)
    text_list.append(test_text)


print("Data processing complete.")

# Create a DataFrame from the lists
data = {
    'input': text_list,
    'correct': actual_test_list,
    'Tuned Prediction': tuned_predicted_text_list,
    'Base Prediction': base_predicted_text_list
}

df = pd.DataFrame(data)

# Write the DataFrame to a CSV file
df.to_csv('/content/drive/MyDrive/project/dataset_ft_100.csv', index=False)

import pandas as pd
import evaluate

# Read every column as text and disable NaN detection: an empty prediction is
# stored as an empty CSV field, which `read_csv` would otherwise return as a
# float NaN (and strings such as "NA" or "null" likewise). The text metrics
# below need plain strings.
df = pd.read_csv(
    '/content/drive/MyDrive/project/dataset_ft_100.csv',
    dtype=str,
    keep_default_na=False,
)


base_predicted_text_list = df['Base Prediction'].tolist()
tuned_predicted_text_list = df['Tuned Prediction'].tolist()
actual_test_list = df['correct'].tolist()


bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# BLEU is given a list of references per prediction; here there is exactly one.
bleu_references = [[ref] for ref in actual_test_list]

print("Base Model Predictions BLEU Results:")
base_bleu_results = bleu.compute(predictions=base_predicted_text_list, references=bleu_references)
print(base_bleu_results)

print("Fine Tuned Model Predictions BLEU Results:")
tuned_bleu_results = bleu.compute(predictions=tuned_predicted_text_list, references=bleu_references)
print(tuned_bleu_results)

print("Base Model Predictions ROUGE Results:")
base_rouge_results = rouge.compute(predictions=base_predicted_text_list, references=actual_test_list)
print(base_rouge_results)

print("Fine Tuned Model Predictions ROUGE Results:")
tuned_rouge_results = rouge.compute(predictions=tuned_predicted_text_list, references=actual_test_list)
print(tuned_rouge_results)
