In [1]:
!pip install sentencepiece

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from tqdm.auto import tqdm


# Training pairs come from the Kaggle grammatical-error-correction CSV; change
# TRAIN_CSV to point at a different file.  The 'input' column holds the source
# sentences and receives the task prefix below.
TRAIN_CSV = '/kaggle/input/grammaratical-error-correction-dataset/train_updated.csv'
df = pd.read_csv(TRAIN_CSV).assign(
    # Prepend the task prefix to every source sentence.
    input=lambda frame: "correct grammar: " + frame['input'],
)
class GrammarCorrectionDataset(Dataset):
    """Map-style dataset yielding tokenized (input, target) sentence pairs.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``
            whose result exposes ``input_ids`` and ``attention_mask``
            tensors with a leading batch dimension of 1.  Its
            ``pad_token_id`` attribute, when present and not ``None``, is
            used to mask padding out of the labels.
        data: ``pandas.DataFrame`` with string columns ``'input'`` and
            ``'target'``; rows are addressed positionally.
        max_length: Length every sequence is padded or truncated to.
    """

    # Label value passed for padded positions so that they are excluded from
    # the loss (the ignore index documented for the T5 `labels` argument).
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, data, max_length=256):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize *text* to exactly ``max_length`` positions."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row *idx*.

        Each value is a 1-D tensor of length ``max_length``.  Padding
        positions in ``labels`` are replaced by ``IGNORE_INDEX`` so the model
        is not trained to predict padding.
        """
        row = self.data.iloc[idx]
        input_encoding = self._encode(row['input'])
        target_encoding = self._encode(row['target'])

        # squeeze(0) drops only the batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        labels = target_encoding.input_ids.squeeze(0)
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            # masked_fill returns a new tensor, leaving the encoding untouched.
            labels = labels.masked_fill(labels == pad_token_id, self.IGNORE_INDEX)

        return {
            'input_ids': input_encoding.input_ids.squeeze(0),
            'attention_mask': input_encoding.attention_mask.squeeze(0),
            'labels': labels,
        }

# Tokenizer plus a shuffled, batched view over the training frame.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
dataset = GrammarCorrectionDataset(tokenizer, df)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Pretrained T5, moved to the GPU when one is available and put in train mode.
model = T5ForConditionalGeneration.from_pretrained('t5-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation, so build the optimizer from torch.optim directly.
# eps and weight_decay are pinned to the values the transformers class used
# by default so the optimisation settings stay the same.
# NOTE(review): the file-level `from transformers import ... AdamW` is now
# unused by this cell and can be dropped if nothing else needs it.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)
num_epochs = 8

# Fine-tuning: one optimizer step per batch, with a tqdm bar per epoch that
# shows the epoch number and the most recent batch loss.
for epoch in range(num_epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # Move the three tensors of the batch onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(loss=loss.item())

# Persist the fine-tuned weights (as a state_dict) and the tokenizer files.
model_save_path = '/kaggle/working/model.pt'
tokenizer_save_path = '/kaggle/working/tokenizer'

torch.save(model.state_dict(), model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

# Reload both from disk so the steps that follow run against the saved
# artefacts rather than the in-memory training objects.
tokenizer = T5Tokenizer.from_pretrained(tokenizer_save_path)

# Rebuild the architecture from the base checkpoint, then overwrite its
# weights with the fine-tuned ones.
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# map_location="cpu": the state_dict may have been written from CUDA tensors;
# without remapping, loading it in a session with no GPU raises.  The freshly
# built model is on the CPU at this point anyway and is moved to `device`
# right after.
model.load_state_dict(torch.load(model_save_path, map_location="cpu"))
model.to(device)



# Prediction example: correct one hand-written sentence with the reloaded model.
model.eval()
input_text = "correct grammar: I want to talk about nocive or bad products like alcohol , hair spray and cigarrets ."
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_encoding = encoded_prompt.input_ids.to(device)

# Sampling settings for generation (top-k / nucleus sampling, one candidate).
generation_kwargs = dict(
    max_length=40,
    temperature=1.0,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
    num_return_sequences=1,
)

# No gradients are needed for inference.
with torch.no_grad():
    output_sequences = model.generate(input_ids=input_encoding, **generation_kwargs)

# Only one sequence is requested, so decode the first (and only) result.
first_sequence = output_sequences[0]
corrected_text = tokenizer.decode(first_sequence, skip_special_tokens=True)
print("Corrected Text:", corrected_text)




NameError: name 'uu' is not defined

In [None]:
model