In [None]:
from google.colab import drive
import os

# Colab environment setup: (re)attach Google Drive and show the GPU status.
# NOTE(review): flush_and_unmount() is called before any mount in this notebook;
# presumably it clears a stale mount left over from a previous session — confirm
# it is harmless on a fresh runtime.
drive.flush_and_unmount()

# Mount Google Drive at /content/drive; the dataset CSV is read from under this path later.
drive.mount('/content/drive')

# IPython shell escape (not plain Python): runs the `nvidia-smi` command.
!nvidia-smi

In [None]:
# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

# Load the grammar-correction dataset from the mounted Google Drive.
# The Dataset class in this notebook reads the 'input' and 'target' columns of each row.
data = pd.read_csv('/content/drive/MyDrive/project/Jfleg4-2-4.csv')

# Two-stage split with a fixed seed for reproducibility:
#   1) hold out 20% of all rows as the test set;
#   2) take 12.5% of the remaining 80% (= 10% of all rows) as the validation set.
# The result is a 70% / 10% / 20% train / validation / test partition.
train, test_data = train_test_split(data, test_size=0.2, random_state=42,shuffle=True)
train_data, val_data = train_test_split(train, test_size=0.125, random_state=42, shuffle=True)

# Dataset wrapper: tokenizes each (input, target) sentence pair for the model
class GrammarCorrectionDataset(Dataset):
    """Map-style dataset that tokenizes (input, target) sentence pairs.

    Each row of ``data`` must provide an ``'input'`` column (the source
    sentence) and a ``'target'`` column (the reference sentence). Both are
    tokenized to exactly ``max_length`` tokens (padded or truncated).

    Args:
        data: A pandas DataFrame (anything supporting ``len()`` and ``.iloc``)
            with ``'input'`` and ``'target'`` columns.
        tokenizer: Tokenizer object exposing ``encode_plus`` and ``encode``
            with the Hugging Face keyword arguments used below. The instance
            stored here is the one used for encoding; no notebook-level
            global is consulted.
        max_length: Fixed sequence length for both inputs and labels.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th row.

        Returns:
            dict with 1-D tensors ``'input_ids'``, ``'attention_mask'``
            (both from the source sentence) and ``'labels'`` (token ids of
            the target sentence), each with the leading batch dimension
            of size 1 removed.
        """
        # Single positional lookup instead of one per column.
        row = self.data.iloc[idx]
        input_text = row['input']
        output_text = row['target']

        # Use the tokenizer handed to the constructor, not a module-level
        # global, so the dataset works regardless of cell execution order
        # and honours whichever tokenizer the caller supplied.
        input_encoding = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target_encoding = self.tokenizer.encode(
            output_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch dim of 1; drop it
        # so the DataLoader can stack items into (batch, max_length).
        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': target_encoding.squeeze(0)
        }

In [None]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

# Load the Pythia-70M (deduped) causal language model at the "step3000"
# checkpoint revision; downloaded files are cached under the given local directory.
model_pythia = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step3000",
  cache_dir="./pythia-70m-deduped/step3000",
)

# Load the tokenizer from the same repository and revision so it matches the model.
tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step3000",
  cache_dir="./pythia-70m-deduped/step3000",
)
# Reuse the end-of-sequence token as the padding token; padding='max_length'
# is requested when tokenizing, which needs a pad token to be defined.
# NOTE(review): presumably this tokenizer ships without a pad token — confirm.
# Side effect: pad_token_id == eos_token_id, so any code that ignores pad ids
# (e.g. in the loss) also ignores genuine EOS tokens.
tokenizer.pad_token = tokenizer.eos_token

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
def evaluate(model, data_loader, device, pad_token_id=None):
    """Compute token-level cross-entropy loss and accuracy over a data loader.

    For every batch the model is run on ``input_ids``/``attention_mask`` and the
    logits at position *i* are scored against ``labels`` at position *i*;
    positions whose label equals ``pad_token_id`` are excluded from both metrics.

    NOTE(review): labels are compared position-by-position with no one-token
    shift. For a causal LM the logit at position i normally predicts token
    i+1 — confirm this alignment is intended.

    Args:
        model: Model whose call returns an object with a ``.logits`` tensor of
            shape (batch, seq_len, vocab). It is moved to ``device`` and put in
            eval mode.
        data_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Torch device to run on.
        pad_token_id: Label id to ignore. When ``None`` (the default, matching
            the original behaviour) it is taken from the notebook-level
            ``tokenizer``, falling back to -100 if that tokenizer has no pad id.

    Returns:
        ``(avg_loss, accuracy)`` as Python floats, both averaged over all
        non-padding label tokens in the loader (token-weighted, so batches with
        more real tokens count proportionally more). If there are no
        non-padding tokens at all (e.g. an empty loader) both values are NaN.
    """
    model.to(device).eval()  # Set the model to evaluation mode

    if pad_token_id is None:
        # Fall back to the notebook-level tokenizer; -100 is the conventional
        # "ignore" label when no pad token is configured.
        pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100

    # Built once (loop-invariant). Summing instead of averaging per batch lets
    # us divide by the true token count at the end, which avoids the bias of a
    # mean-of-batch-means and the NaN produced by an all-padding batch.
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id, reduction='sum')

    loss_sum = 0.0
    token_count = 0
    correct_count = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Flatten labels to a vector of length batch_size * seq_len.
            labels = batch['labels'].to(device).reshape(-1)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # Flatten to (batch_size * seq_len, vocab) using the logits' own
            # last dimension so the rows always line up with the labels.
            logits = logits.reshape(-1, logits.size(-1))

            loss_sum += loss_fct(logits, labels).item()

            # Running counts instead of storing every prediction/label keeps
            # memory constant and avoids per-token Python objects.
            valid = labels != pad_token_id
            preds = torch.argmax(logits, dim=1)
            correct_count += ((preds == labels) & valid).sum().item()
            token_count += valid.sum().item()

    if token_count == 0:
        # Nothing to score: report NaN rather than dividing by zero.
        return float('nan'), float('nan')

    avg_loss = loss_sum / token_count
    accuracy = correct_count / token_count

    return avg_loss, accuracy


In [None]:
# Create the DataLoaders for the train and validation sets.
# Both datasets share the notebook-level tokenizer and the Dataset's default max_length.
batch_size = 8  # You can set the batch size to a suitable value
train_dataset = GrammarCorrectionDataset(train_data, tokenizer)
# shuffle=True only changes batch order here; no training happens in this cell.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = GrammarCorrectionDataset(val_data, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Evaluate on the train set.
# No training step appears before this point in the notebook, so these numbers
# describe the loaded checkpoint as-is.
train_loss, train_accuracy = evaluate(model_pythia, train_loader, device)
print(f"Training Loss: {train_loss:.4f} | Training Accuracy: {train_accuracy:.4f}")

# Evaluate on the validation set
val_loss, val_accuracy = evaluate(model_pythia, val_loader, device)
print(f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}")