<a href="https://colab.research.google.com/github/pranavkokati/GrammarFixer/blob/main/GrammarFixer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from tqdm.auto import tqdm

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [None]:
# Load the raw sentence pairs from the CSV file in the working directory.
bea19_df = pd.read_csv(filepath_or_buffer='bea_dataset.csv')

In [None]:
def clean_text(text):
    """Normalize whitespace in a single text value.

    Args:
        text: The value to clean; anything that is not a ``str`` is treated
            as missing.

    Returns:
        The input with leading/trailing whitespace removed and every run of
        whitespace collapsed to a single space, or the placeholder string
        ``"missing values"`` when ``text`` is not a string.
    """
    # Guard clause: non-string cells (e.g. NaN floats) map to the placeholder.
    if not isinstance(text, str):
        return "missing values"

    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)


In [None]:
# Drop rows where either side of the pair is missing BEFORE cleaning.
# clean_text() maps non-string cells to the literal "missing values", so
# keeping such rows would put placeholder text into the training pairs.
sentence_pairs = bea19_df[["broken", "sentence"]].dropna()

# Build the (incorrect -> corrected) frame with whitespace-normalized text and
# a fresh 0..n-1 index after the dropped rows.
data = pd.DataFrame({
    "incorrect": sentence_pairs["broken"].apply(clean_text),
    "corrected": sentence_pairs["sentence"].apply(clean_text),
}).reset_index(drop=True)


In [None]:
# Hold out 10% of the pairs for validation; the fixed random_state makes the
# split repeatable across runs.
train_data, val_data = train_test_split(
    data,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Custom Dataset class
class GrammarDataset(Dataset):
    """Dataset of (incorrect, corrected) sentence pairs tokenized for seq2seq training.

    Args:
        dataframe: DataFrame with the text columns ``incorrect`` and
            ``corrected``; rows are addressed by position.
        tokenizer: Callable tokenizer that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors of shape (1, max_len).
        max_len: Length every source and target sequence is padded/truncated to.
    """

    # Label value excluded from the loss (default ignore_index of cross-entropy).
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``max_len`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return the tensors for the pair at position ``index``.

        Returns:
            dict with 1-D tensors of length ``max_len``:
            ``input_ids`` and ``attention_mask`` for the incorrect sentence,
            and ``labels`` for the corrected sentence, where every padding
            position is set to -100 so it does not contribute to the loss.
        """
        row = self.data.iloc[index]  # single positional lookup for both columns
        inputs = self._encode(row['incorrect'])
        targets = self._encode(row['corrected'])

        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse the sequence axis when max_len == 1.
        target_ids = targets['input_ids'].squeeze(0)
        target_mask = targets['attention_mask'].squeeze(0)

        # Mask padded target positions; otherwise the model is trained (and
        # scored) on predicting pad tokens, which deflates the reported loss.
        labels = target_ids.masked_fill(target_mask == 0, self.IGNORE_INDEX)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels,
        }


In [None]:
# Load the pretrained T5 checkpoint and the tokenizer published under the same name.
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
# Hyperparameters
LEARNING_RATE = 5e-5  # step size handed to the optimizer
EPOCHS = 3            # number of full passes over the training set
BATCH_SIZE = 16       # sentence pairs per batch for both loaders
MAX_LEN = 128         # token length inputs and targets are padded/truncated to

In [None]:
# Prepare DataLoaders
# Training split: reshuffled every epoch.
train_dataset = GrammarDataset(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Validation split: kept in a fixed order.
val_dataset = GrammarDataset(dataframe=val_data, tokenizer=tokenizer, max_len=MAX_LEN)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Re-select the device, move the model onto it, then build the AdamW optimizer
# over the moved model's parameters.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
)

In [None]:
def train_epoch(epoch):
    """Run one optimization pass over ``train_loader`` and report the mean loss.

    Uses the module-level ``model``, ``optimizer``, ``train_loader`` and
    ``device``. The model is switched to training mode for the pass.

    Args:
        epoch: Epoch number, used only for the progress-bar label and the
            printed summary.

    Returns:
        float: Mean of the per-batch losses, or ``nan`` when the loader
        yields no batches (instead of raising ``ZeroDivisionError``).
    """
    model.train()
    total_loss = 0.0
    num_batches = len(train_loader)

    with tqdm(train_loader, unit="batch", desc=f"Epoch {epoch}") as tepoch:
        for batch in tepoch:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the running total and the bar.
            batch_loss = loss.item()
            total_loss += batch_loss
            tepoch.set_postfix(loss=batch_loss)

    mean_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Epoch {epoch}: Training Loss = {mean_loss}")
    return mean_loss

In [None]:
# Validation function with tqdm
def validate():
    """Evaluate the model on ``val_loader`` and report the mean loss.

    Uses the module-level ``model``, ``val_loader`` and ``device``. The model
    is switched to evaluation mode and gradients are disabled for the pass.

    Returns:
        float: Mean of the per-batch losses, or ``nan`` when the loader
        yields no batches (instead of raising ``ZeroDivisionError``).
    """
    model.eval()
    total_loss = 0.0
    num_batches = len(val_loader)

    with torch.no_grad(), tqdm(val_loader, unit="batch", desc="Validation") as vepoch:
        for batch in vepoch:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # Read the scalar once and reuse it for the running total and the bar.
            batch_loss = outputs.loss.item()
            total_loss += batch_loss
            vepoch.set_postfix(loss=batch_loss)

    mean_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Validation Loss = {mean_loss}")
    return mean_loss

In [None]:
# Train the model: each epoch runs one training pass followed by one validation pass.
for epoch in range(0, EPOCHS):
    train_epoch(epoch)
    validate()


Epoch 0:   0%|          | 0/4731 [00:00<?, ?batch/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 0: Training Loss = 0.14064569365570986


Validation:   0%|          | 0/526 [00:00<?, ?batch/s]

Validation Loss = 0.04668697940376888


Epoch 1:   0%|          | 0/4731 [00:00<?, ?batch/s]

Epoch 1: Training Loss = 0.04984445313373244


Validation:   0%|          | 0/526 [00:00<?, ?batch/s]

Validation Loss = 0.04285073952090718


Epoch 2:   0%|          | 0/4731 [00:00<?, ?batch/s]

Epoch 2: Training Loss = 0.045342921954387716


Validation:   0%|          | 0/526 [00:00<?, ?batch/s]

Validation Loss = 0.040740784934010114


In [None]:
# Save the model and tokenizer
# Both are written to the same directory.
tokenizer.save_pretrained("grammar_correction_t5_small")
model.save_pretrained("grammar_correction_t5_small")

print("Model saved successfully!")

Model saved successfully!


In [None]:
def correct_grammar(sentence, max_length=None):
    """Generate a corrected version of ``sentence`` with the module-level model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The model is
    put into evaluation mode before generating.

    Args:
        sentence: Text to correct.
        max_length: Token limit applied both to the (truncated) input and to
            the generated output. Defaults to the module-level ``MAX_LEN`` so
            inference matches the length used when building the datasets.

    Returns:
        str: The decoded model output with special tokens removed.
    """
    limit = MAX_LEN if max_length is None else max_length

    # Truncate over-long input to the limit, and keep the attention mask so it
    # can be passed to generate() explicitly.
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=limit)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    model.eval()
    # Without an explicit max_length, generate() falls back to the generation
    # config's short default and cuts longer corrections off mid-sentence.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=limit,
    )
    corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_sentence

# Interactive demo: read one sentence, correct it, and show both versions.
sentence = input("Enter a sentence: ")
corrected_sentence = correct_grammar(sentence)
print(f"Original sentence: {sentence}")
print(f"Corrected sentence: {corrected_sentence}")