In [None]:
from google.colab import drive
import os

# Drop any existing Drive mount before mounting again.
# NOTE(review): presumably this makes the cell safe to re-run in a live session — confirm it is needed.
drive.flush_and_unmount()

# Mount Google Drive at /content/drive; the dataset CSV read later in the notebook lives under this path.
drive.mount('/content/drive')

# IPython shell escape: run nvidia-smi to show the GPU (if any) attached to this runtime.
!nvidia-smi

In [None]:
# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns


# Load the sentence-pair CSV from the mounted Drive folder.
# NOTE(review): GrammarCorrectionDataset below reads the 'input' and 'target' columns, so the file must provide both.
data = pd.read_csv('/content/drive/MyDrive/project/Jfleg4/Jfleg4-2-4.csv')

# Two-stage split with a fixed seed: first hold out 20% of the rows as the test set ...
train, test_data = train_test_split(data, test_size=0.2, random_state=42,shuffle=True)
# ... then take 12.5% of the remaining 80% (10% of all rows) for validation, leaving roughly 70% for training.
train_data, val_data = train_test_split(train, test_size=0.125, random_state=42, shuffle=True)

# Dataset wrapper: tokenizes (input, target) sentence pairs for the models below
class GrammarCorrectionDataset(Dataset):
    """Map-style dataset of tokenized (source sentence, corrected sentence) pairs.

    Args:
        data: pandas DataFrame with an ``'input'`` column (text to correct) and a
            ``'target'`` column (reference correction). Rows are read positionally
            via ``.iloc``.
        tokenizer: callable tokenizer (Hugging Face style) accepting ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'``.
        max_length: every sequence is padded or truncated to exactly this many tokens.

    Each item is a dict with 1-D tensors of length ``max_length``:
    ``'input_ids'`` and ``'attention_mask'`` for the source text, and ``'labels'``
    holding the token ids of the target text.

    NOTE: padded label positions contain whatever id the tokenizer pads with, not
    -100, so a loss computed directly on ``'labels'`` also covers the padding.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to a fixed-length encoding with a leading batch axis of 1."""
        # Use the tokenizer stored on this instance, not a module-level global:
        # the notebook rebinds the global ``tokenizer`` when it switches models, and
        # a dataset must keep encoding with the tokenizer it was constructed with.
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tokenized source/target pair stored at positional index ``idx``."""
        row = self.data.iloc[idx]
        input_encoding = self._encode(row['input'])
        target_encoding = self._encode(row['target'])

        # return_tensors='pt' yields shape (1, max_length); drop the batch axis so
        # the DataLoader can stack items into (batch, max_length).
        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': target_encoding['input_ids'].squeeze(0)
        }

In [None]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

# Load the pythia-70m-deduped causal LM pinned to the "step3000" revision, cached in a local directory.
# NOTE(review): "step3000" looks like an intermediate training checkpoint rather than the final weights — confirm that is intended.
model_pythia = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step3000",
  cache_dir="./pythia-70m-deduped/step3000",
)

# Matching tokenizer, pinned to the same revision and cache directory as the model.
tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step3000",
  cache_dir="./pythia-70m-deduped/step3000",
)
# Reuse EOS as the pad token so padding='max_length' in GrammarCorrectionDataset has a token to pad with
# (presumably this tokenizer ships without a pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
from torch.optim import AdamW
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup

# Hyperparameters shared by every fine-tuning run in this notebook:
# optimizer learning rate, passes over the training set, examples per batch.
lr, epochs, batch_size = 4e-4, 1, 32

In [None]:
def _mask_label_padding(labels, pad_token_id, eos_token_id):
    """Return ``labels`` with padding positions replaced by -100 (the ignored label id)."""
    if pad_token_id is None:
        return labels
    is_padding = labels == pad_token_id
    if pad_token_id == eos_token_id:
        # Pad and EOS share one id, so they cannot be told apart by value. Keep the
        # first occurrence in each row as a real target (the end-of-sequence token)
        # and mask only the repeats that follow it.
        first_occurrence = is_padding & (is_padding.long().cumsum(dim=-1) == 1)
        is_padding = is_padding & ~first_occurrence
    return labels.masked_fill(is_padding, -100)


def _run_epoch(model, loader, device, description, pad_token_id, eos_token_id,
               shift_labels, optimizer=None, scheduler=None):
    """Run one pass over ``loader``; train when ``optimizer`` is given, else evaluate.

    Returns ``(mean batch loss, token accuracy over non-ignored label positions)``.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    n_correct = 0
    n_scored = 0

    for batch in tqdm(loader, desc=description):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = _mask_label_padding(batch['labels'].to(device), pad_token_id, eos_token_id)

        if training:
            optimizer.zero_grad()

        with torch.set_grad_enabled(training):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        if training:
            loss.backward()
            optimizer.step()
            scheduler.step()

        total_loss += loss.item()

        # Count hits on the device instead of copying every token into Python lists.
        preds = outputs.logits.detach().argmax(dim=-1)
        targets = labels
        if shift_labels:
            # Decoder-only LMs score position i against label i + 1 (the model
            # shifts internally for the loss), so align the accuracy the same way.
            preds, targets = preds[:, :-1], targets[:, 1:]
        scored = targets != -100
        n_correct += int(((preds == targets) & scored).sum().item())
        n_scored += int(scored.sum().item())

    # Guard the empty-loader / fully-masked cases instead of dividing by zero.
    mean_loss = total_loss / max(len(loader), 1)
    accuracy = n_correct / n_scored if n_scored else 0.0
    return mean_loss, accuracy


def train_and_evaluate(model, train_loader, val_loader, device, tokenizer, epochs=3, learning_rate=None):
    """Fine-tune ``model`` and print loss and token accuracy after every epoch.

    Args:
        model: Hugging Face style model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with ``.loss``
            and ``.logits``.
        train_loader: iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors) used for optimization.
        val_loader: iterable of batches in the same format, used for evaluation only.
        device: torch device the model and the batches are moved to.
        tokenizer: supplies ``pad_token_id`` / ``eos_token_id`` so padded label
            positions can be excluded from both the loss and the accuracy.
        epochs: number of passes over ``train_loader``.
        learning_rate: AdamW learning rate; when ``None`` the notebook-level ``lr``
            constant is used, as before.

    Padded label positions are set to -100 before the forward pass, so they are
    ignored by the loss and left out of the accuracy. Accuracy is the fraction of
    remaining label tokens matched by the arg-max prediction.

    NOTE(review): for a decoder-only model the batches pair the *source* sentence as
    ``input_ids`` with the *target* sentence as ``labels``; that pairing is kept
    unchanged here — confirm it is the intended training setup.
    """
    step_size = lr if learning_rate is None else learning_rate

    # Linear decay to zero over all optimizer steps, with no warm-up.
    optimizer = AdamW(model.parameters(), lr=step_size)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * epochs,
    )

    model.to(device)

    pad_token_id = getattr(tokenizer, 'pad_token_id', None)
    eos_token_id = getattr(tokenizer, 'eos_token_id', None)
    # Encoder-decoder models already align logits[i] with labels[i]; decoder-only
    # models need the one-position shift when computing accuracy.
    shift_labels = not getattr(getattr(model, 'config', None), 'is_encoder_decoder', False)

    for epoch in range(epochs):
        avg_train_loss, train_accuracy = _run_epoch(
            model, train_loader, device,
            f"Training Epoch {epoch + 1}/{epochs}",
            pad_token_id, eos_token_id, shift_labels,
            optimizer=optimizer, scheduler=scheduler,
        )

        avg_val_loss, val_accuracy = _run_epoch(
            model, val_loader, device,
            f"Validation Epoch {epoch + 1}/{epochs}",
            pad_token_id, eos_token_id, shift_labels,
        )

        # Print results for the epoch
        print(f"Epoch {epoch + 1}/{epochs}:")
        print(f"  T-Loss: {avg_train_loss:.4f} | T-Acc: {train_accuracy:.4f}")
        print(f"  V-Loss: {avg_val_loss:.4f} | V-Acc: {val_accuracy:.4f}")


In [None]:
# Fine-tune the Pythia checkpoint: place it on the target device first.
model = model_pythia.to(device)

# Wrap each split in a dataset that tokenizes with the currently loaded tokenizer.
train_dataset, val_dataset, test_dataset = (
    GrammarCorrectionDataset(split, tokenizer)
    for split in (train_data, val_data, test_data)
)

# Batch the splits; the training and validation loaders reshuffle, the test loader keeps row order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

# Run training with per-epoch validation.
train_and_evaluate(
    model,
    train_loader,
    val_loader,
    device,
    tokenizer,
    epochs=epochs,
)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Switch to Flan-T5: this rebinds the notebook-wide `tokenizer` to the Flan-T5 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
# Same pad-token convention as the Pythia cell: pad with the EOS token.
tokenizer.pad_token = tokenizer.eos_token
model_flan_T5_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Rebuild the datasets so the three splits are bound to the new tokenizer.
train_dataset, val_dataset, test_dataset = (
    GrammarCorrectionDataset(split, tokenizer)
    for split in (train_data, val_data, test_data)
)

# Fresh loaders over the rebuilt datasets (training and validation reshuffle, test keeps row order).
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

In [None]:
# Fine-tune Flan-T5 with the loaders built in the previous cell.
model = model_flan_T5_base.to(device)

train_and_evaluate(
    model,
    train_loader,
    val_loader,
    device,
    tokenizer,
    epochs=epochs,
)