In [None]:
# `%pip` installs into the environment of the running kernel.
# `datasets` is capped below 3.0 because the import cell pulls in
# `datasets.load_metric`, which is no longer shipped from 3.0 onwards.
%pip install transformers "datasets<3.0"
%pip install sacrebleu

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical  # Import for converting categorical labels
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from datasets import Dataset, load_metric
from torch.utils.data import DataLoader

In [None]:
# @title
# Read the training spreadsheet and keep at most its first 2,000,000 rows.
df = pd.read_excel("en_train_updated.xlsx").head(2000000)
df

In [None]:
# @title
# Cast the three columns used downstream to strings in a single call, so
# every value (including missing ones) is plain text.
df = df.astype({'before': 'str', 'class': 'str', 'after': 'str'})

In [None]:
# @title
# Keep only the columns that data_collator reads, in a fixed order, then
# wrap the frame in a Hugging Face Dataset.
df = df.loc[:, ['class', 'before', 'after']]
dataset = Dataset.from_pandas(df)

In [None]:
# Split once and take both halves from the same result. Calling
# train_test_split twice repeats the shuffle and only yields matching
# halves because the seed happens to be identical in both calls.
split_datasets = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets['train']
test_dataset = split_datasets['test']

# Tokenizer shared by the collator and the metric helper.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# NOTE(review): load_metric is deprecated in `datasets` in favour of the
# separate `evaluate` package - confirm the installed version still has it.
metric = load_metric("sacrebleu")

def data_collator(examples):
    """Collate dataset rows into one padded, tokenized seq2seq batch.

    Every row is turned into the prompt ``"convert: <class> <before> => "``
    and paired with its ``after`` text as the target.

    Args:
        examples: Sequence of mappings, each with the keys ``'class'``,
            ``'before'`` and ``'after'``.

    Returns:
        The tokenizer output for the prompts (requested as PyTorch tensors,
        padded to the longest prompt and truncated to 512 tokens) with an
        extra ``"labels"`` entry holding the ``input_ids`` of the tokenized
        targets.
    """
    inputs = [f"convert: {example['class']} {example['before']} => " for example in examples]
    targets = [example['after'] for example in examples]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True, return_tensors='pt')

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing the label side.
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding=True, return_tensors='pt')

    # NOTE(review): padded label positions keep the tokenizer's pad id here
    # instead of -100, so presumably the model loss also counts padding.
    # compute_metrics decodes these ids, so switching to -100 would need a
    # matching change there - confirm before altering.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
def compute_metrics(pred, labels):
    """Score greedy (argmax) predictions against reference labels with sacreBLEU.

    Args:
        pred: Model output exposing ``logits``; the argmax over the last
            axis is taken as the predicted token ids.
        labels: Reference token ids. Positions equal to the tokenizer's pad
            id or to -100 are treated as padding.

    Returns:
        A dict with the single key ``"accuracy"`` whose value is the
        ``"score"`` reported by the sacreBLEU metric. The key name is kept
        for existing callers even though the value is a BLEU score.
    """
    pad_id = tokenizer.pad_token_id
    label_ids = torch.as_tensor(labels)
    pred_ids = pred.logits.argmax(-1)

    # Positions that carry no reference token must not contribute to the
    # hypothesis either, otherwise whatever the model emits there is
    # decoded into the prediction string and distorts the score.
    ignored = ((label_ids == -100) | (label_ids == pad_id)).to(pred_ids.device)
    if ignored.shape == pred_ids.shape:
        pred_ids = pred_ids.masked_fill(ignored, pad_id)

    # -100 is not a vocabulary id and cannot be decoded; map it back to pad.
    label_ids = label_ids.masked_fill(label_ids == -100, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # The metric takes a list of references per prediction; each example
    # has exactly one.
    references = [[reference] for reference in labels_str]

    sacrebleu = metric.compute(predictions=pred_str, references=references)["score"]

    return {"accuracy": sacrebleu}


In [None]:
# @title
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained t5-small checkpoint and place it on that device.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model = model.to(device)

# AdamW over all model parameters with a fixed learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)

In [None]:
# Training schedule and batch iterators. Only the training split is shuffled;
# both loaders build their batches with data_collator.
num_epochs = 20
train_loader = DataLoader(train_dataset, collate_fn=data_collator, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, collate_fn=data_collator, batch_size=8, shuffle=False)

for epoch in range(num_epochs):
    # ---- Optimisation pass over the training split ----
    model.train()
    train_loss, train_accuracy = 0, 0
    for batch in train_loader:
        # Move every tensor of the batch to the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        train_loss = train_loss + loss.item()
        # "accuracy" is whatever compute_metrics returns under that key.
        train_accuracy = train_accuracy + compute_metrics(outputs, batch["labels"])["accuracy"]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Turn the running sums into per-batch averages.
    train_loss = train_loss / len(train_loader)
    train_accuracy = train_accuracy / len(train_loader)

    # ---- Evaluation pass over the held-out split, without gradients ----
    model.eval()
    with torch.no_grad():
        test_loss, test_accuracy = 0, 0
        for batch in test_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            test_loss = test_loss + loss.item()
            test_accuracy = test_accuracy + compute_metrics(outputs, batch["labels"])["accuracy"]
        test_loss = test_loss / len(test_loader)
        test_accuracy = test_accuracy / len(test_loader)

    # Per-epoch report.
    print(f"Epoch: {epoch+1}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}")
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}")
